from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
np.random.seed(42)
Nov 14, 2022
DBSCAN can identify high-density clusters using more than just spatial coordinates, as long as the features are properly normalized.
from sklearn.cluster import dbscan
from sklearn.preprocessing import StandardScaler
I've extracted data for taxi pickups and dropoffs occurring in the Williamsburg neighborhood of NYC from the NYC taxi open data.
The data includes pickup/dropoff locations and times, trip distance, and fare information (see the columns below).
Goal: identify clusters of similar taxi rides that are not only clustered spatially, but also clustered for features like hour of day and trip distance
Inspired by this CARTO blog post
taxi = pd.read_csv("./data/williamsburg_taxi_trips.csv")
taxi.head()
|  | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | pickup_x | pickup_y | dropoff_x | dropoff_y | fare_amount | tip_amount | dropoff_hour | pickup_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-15 19:05:41 | 2015-01-15 19:20:22 | 2 | 7.13 | -8223667.0 | 4979065.0 | -8232341.0 | 4970922.0 | 21.5 | 4.50 | 19 | 19 |
| 1 | 2015-01-15 19:05:44 | 2015-01-15 19:17:44 | 1 | 2.92 | -8237459.0 | 4971133.5 | -8232725.0 | 4970482.5 | 12.5 | 2.70 | 19 | 19 |
| 2 | 2015-01-25 00:13:06 | 2015-01-25 00:34:32 | 1 | 3.05 | -8236711.5 | 4972170.5 | -8232267.0 | 4970362.0 | 16.5 | 5.34 | 0 | 0 |
| 3 | 2015-01-26 12:41:15 | 2015-01-26 12:59:22 | 1 | 8.10 | -8222485.5 | 4978445.5 | -8233442.5 | 4969903.5 | 24.5 | 5.05 | 12 | 12 |
| 4 | 2015-01-20 22:49:11 | 2015-01-20 22:58:46 | 1 | 3.50 | -8236294.5 | 4970916.5 | -8231820.5 | 4971722.0 | 12.5 | 2.00 | 22 | 22 |
We will focus on the following columns:
- pickup_x and pickup_y
- dropoff_x and dropoff_y
- trip_distance
- pickup_hour

Use the StandardScaler to normalize these features.
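For reference, StandardScaler standardizes each feature column by subtracting its mean and dividing by its standard deviation:

$$ z = \frac{x - \mu}{\sigma} $$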
feature_columns = [
"pickup_x",
"pickup_y",
"dropoff_x",
"dropoff_y",
"trip_distance",
"pickup_hour",
]
features = taxi[feature_columns].copy()
features.head()
|  | pickup_x | pickup_y | dropoff_x | dropoff_y | trip_distance | pickup_hour |
|---|---|---|---|---|---|---|
| 0 | -8223667.0 | 4979065.0 | -8232341.0 | 4970922.0 | 7.13 | 19 |
| 1 | -8237459.0 | 4971133.5 | -8232725.0 | 4970482.5 | 2.92 | 19 |
| 2 | -8236711.5 | 4972170.5 | -8232267.0 | 4970362.0 | 3.05 | 0 |
| 3 | -8222485.5 | 4978445.5 | -8233442.5 | 4969903.5 | 8.10 | 12 |
| 4 | -8236294.5 | 4970916.5 | -8231820.5 | 4971722.0 | 3.50 | 22 |
# Scale these features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_features
array([[ 3.35254171e+00,  2.18196697e+00,  8.59345108e-02,  1.89871932e-01,
        -2.58698769e-03,  8.16908067e-01],
       [-9.37728802e-01, -4.08167622e-01, -9.76176333e-02, -9.77141849e-03,
        -2.81690985e-03,  8.16908067e-01],
       [-7.05204353e-01, -6.95217705e-02,  1.21306539e-01, -6.45086739e-02,
        -2.80981012e-03, -1.32713022e+00],
       ...,
       [-1.32952083e+00, -1.14848599e+00, -3.37095821e-01, -1.09933782e-01,
        -2.76011198e-03,  7.04063946e-01],
       [-7.52953521e-01, -7.01094651e-01, -2.61571762e-01, -3.00037860e-01,
        -2.84530879e-03,  7.04063946e-01],
       [-3.97090015e-01, -1.71084059e-02, -1.11647543e+00,  2.84810408e-01,
        -2.93269014e-03,  8.16908067e-01]])
print(scaled_features.shape)
print(features.shape)
(223722, 6)
(223722, 6)
We want the highest density clusters, ideally no more than about 30-50 clusters.
Run DBSCAN and experiment with different values of eps and min_samples; a good combination is an eps of 0.25 and min_samples of 50.
Add the labels to the original data frame and calculate the number of clusters. It should be less than 50 or so.
Hint: If the algorithm is taking a long time to run (more than a few minutes), the eps is probably too big!
# Run DBSCAN
cores, labels = dbscan(scaled_features, eps=0.25, min_samples=50)
# Add the labels back to the original (unscaled) dataset
features['label'] = labels
# Extract the number of clusters (subtract one to exclude the noise label of -1)
num_clusters = features['label'].nunique() - 1
print(num_clusters)
27
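As a side note, the same clustering can be run through the DBSCAN estimator class instead of the dbscan() function (a sketch using the same parameters as above):

from sklearn.cluster import DBSCAN

# Fit the estimator on the scaled features with the same eps/min_samples
db = DBSCAN(eps=0.25, min_samples=50).fit(scaled_features)

# Cluster labels for each trip; -1 marks noise, same convention as above
estimator_labels = db.labels_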
Group by the label, then calculate and sort the sizes to find the label numbers of the top 5 largest clusters.
# Get cluster sizes, from largest to smallest
N = features.groupby('label').size().sort_values(ascending=False)
print(N)
label
-1     101292
 1      50673
 2      33277
 3      24360
 0       4481
 6       2270
 5       2215
 7       1459
 4        912
 9        519
11        414
 8        254
12        224
13        211
10        183
17        143
14        116
23         97
20         86
16         85
18         76
15         70
19         69
24         52
22         51
21         49
26         43
25         41
dtype: int64
# Extract labels (ignoring label -1 for noise)
top5 = list(N.iloc[1:6].index)
print(top5)
[1, 2, 3, 0, 6]
To better identify trends in the top 5 clusters, calculate the mean trip distance and pickup_hour for each of the clusters.
# get the features for the top 5 labels
selection = features['label'].isin(top5)
# select top 5 and groupby by the label
grps = features.loc[selection].groupby('label')
# calculate average pickup hour and trip distance per cluster
avg_values = grps[['pickup_hour', 'trip_distance']].mean()
avg_values.loc[top5]
| label | pickup_hour | trip_distance |
|---|---|---|
| 1 | 20.127405 | 4.025859 |
| 2 | 1.699943 | 3.915581 |
| 3 | 9.536905 | 1.175154 |
| 0 | 18.599643 | 7.508730 |
| 6 | 1.494714 | 2.620546 |
Now visualize the top 5 largest clusters:
Hints:
# a good color scheme for a black background
colors = ['aqua', 'lime', 'red', 'fuchsia', 'yellow']
# EXAMPLE: enumerating a list
example_list = [10, 12, 5, 13, 40]
for i, label_num in enumerate(example_list):
print(f"i = {i}")
print(f"label_num = {label_num}")
i = 0
label_num = 10
i = 1
label_num = 12
i = 2
label_num = 5
i = 3
label_num = 13
i = 4
label_num = 40
# Setup figure and axis
f, ax = plt.subplots(figsize=(10, 10), facecolor="black")
# Plot noise in grey
noise = features.loc[features["label"] == -1]
ax.scatter(noise["pickup_x"], noise["pickup_y"], c="grey", s=5, linewidth=0)
# specify colors for each of the top 5 clusters
colors = ["aqua", "lime", "red", "fuchsia", "yellow"]
# loop over top 5 largest clusters
for i, label_num in enumerate(top5):
print(f"Plotting cluster #{label_num}...")
# select all the samples with label equals "label_num"
this_cluster = features.loc[features["label"] == label_num]
# plot pickups
ax.scatter(
this_cluster["pickup_x"],
this_cluster["pickup_y"],
linewidth=0,
color=colors[i],
s=5,
alpha=0.3,
)
# plot dropoffs
ax.scatter(
this_cluster["dropoff_x"],
this_cluster["dropoff_y"],
linewidth=0,
color=colors[i],
s=5,
alpha=0.3,
)
# Display the figure
ax.set_axis_off()
Plotting cluster #1...
Plotting cluster #2...
Plotting cluster #3...
Plotting cluster #0...
Plotting cluster #6...
Another good way to visualize the results is to explore the other clusters one at a time, plotting both the pickups and dropoffs to identify the trends.
Use different colors for pickups/dropoffs to easily identify them.
Make it a function so we can repeat it easily:
def plot_taxi_cluster(label_num):
"""
Plot the pickups and dropoffs for the input cluster label
"""
# Setup figure and axis
f, ax = plt.subplots(figsize=(10, 10), facecolor="black")
# Plot noise in grey
noise = features.loc[features["label"] == -1]
ax.scatter(noise["pickup_x"], noise["pickup_y"], c="grey", s=5, linewidth=0)
# Get the features for "label_num"
this_cluster = features.loc[features["label"] == label_num]
# Plot pickups in fuchsia
ax.scatter(
this_cluster["pickup_x"],
this_cluster["pickup_y"],
linewidth=0,
color="fuchsia",
s=5,
alpha=0.3,
)
# Plot dropoffs in aqua
ax.scatter(
this_cluster["dropoff_x"],
this_cluster["dropoff_y"],
linewidth=0,
color="aqua",
s=5,
alpha=0.3,
)
# Display the figure
ax.set_axis_off()
# Add a label
ax.text(
0.1,
0.9,
f"Cluster #{label_num}",
ha="left",
color="white",
weight='bold',
fontsize=30,
transform=ax.transAxes,
)
top5
[1, 2, 3, 0, 6]
# Plot pickups (fuchsia) and dropoffs (aqua) for a specific cluster
plot_taxi_cluster(label_num=6)
An example from the CARTO analysis:
We wanted to explore how we can use data to better understand and define communities of people, going beyond spatial borders like zip code and neighborhood boundaries.
However, their clusters include groupings by class and religion, e.g., "working class" and "Orthodox Jewish" residents. While the intention of this analysis may have been benign, the results could easily have been misused to target residents in a potentially discriminatory way.
We'll see more examples of algorithmic fairness on assignment #7 when modeling housing prices in Philadelphia.
Today, we'll walk through an end-to-end regression example to predict Philadelphia's housing prices
Given your training set of data, which model parameters best represent the observed data?
The cost function measures the difference between the model's predictions and the observed data.
In scikit-learn, you will call the fit() method on your algorithm.
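For example, a common cost function for regression problems is the mean squared error (shown here for reference; the standard form, not necessarily the exact objective of every estimator):

$$ \mathrm{MSE}(\theta) = \frac{1}{N} \sum_{i=1}^{N} \left[ y_i - \hat{y}_i(\theta) \right]^2 $$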
Key goal: how can we do this in a way that ensures the model is as generalizable as possible?
Or: "garbage in, garbage out"
Common issues:
It is common to use 80% of your data for the training set and 20% for the test set.
For more information, see the scikit-learn docs
We'll load data compiled from two data sources:
data = pd.read_csv("./data/gdp_vs_satisfaction.csv")
data.head()
|  | Country | life_satisfaction | gdp_per_capita |
|---|---|---|---|
| 0 | Australia | 7.3 | 50961.87 |
| 1 | Austria | 7.1 | 43724.03 |
| 2 | Belgium | 6.9 | 40106.63 |
| 3 | Brazil | 6.4 | 8670.00 |
| 4 | Canada | 7.4 | 43331.96 |
import hvplot.pandas
data.hvplot.scatter(
x="gdp_per_capita",
y="life_satisfaction",
hover_cols=["Country"],
ylim=(4, 9),
xlim=(1e3, 1.1e5),
)
A simple model with only two parameters: $\theta_1$ and $\theta_2$
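Reading $\theta_1$ as the intercept and $\theta_2$ as the slope (one reasonable reading of the notation), the assumed model is a straight line:

$$ \text{life satisfaction} = \theta_1 + \theta_2 \times \text{GDP per capita} $$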
Use the LinearRegression model object from scikit-learn.
This is not really machine learning — it simply finds the Ordinary Least Squares fit to the data.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model
LinearRegression()
# Our input features (in this case we only have 1)
X = data['gdp_per_capita'].values
X = X[:, np.newaxis]
# The labels (values we are trying to predict)
y = data['life_satisfaction'].values
X.shape
(40, 1)
y.shape
(40,)
Note: scikit-learn expects the features to be a 2D array with shape (number of observations, number of features). We explicitly add a second axis with np.newaxis.
Now, fit the model using the model.fit(X, y) syntax.
This will "train" our model, using an optimization algorithm to identify the best-fit parameters.
model.fit(X, y)
LinearRegression()
intercept = model.intercept_
slope = model.coef_[0]
print(f"bestfit intercept = {intercept:.2f}")
print(f"bestfit slope = {slope:.2e}")
bestfit intercept = 5.72
bestfit slope = 2.47e-05
Note: In this case, our model is the same as ordinary least squares, and no actual optimization is performed since an exact solution exists.
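For reference, the ordinary least squares parameters have a closed-form solution (the normal equation), which is why no iterative optimization is required:

$$ \hat{\theta} = (X^T X)^{-1} X^T y $$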
The model object also has a score() function that provides a score to evaluate the fit. Note: you must call the fit() function before calling the score() function.
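For regression models in scikit-learn, score() returns the coefficient of determination $R^2$:

$$ R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2} $$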
Rsq = model.score(X, y)
Rsq
0.519153782362894
Use the predict() function to predict new values.
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)
# Sklearn needs the second axis!
X_pred = gdp_pred[:, np.newaxis]
y_pred = model.predict(X_pred)
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot the predicted values
    ax.plot(X_pred / 1e5, y_pred, label="Predicted values", color="#666666")
    # Training data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Training data",
        s=100,
        zorder=10,
        color="#f40000",
    )
    ax.legend()
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
Scikit learn provides a utility function to split our input data:
from sklearn.model_selection import train_test_split
# I'll use a 70/30% split
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
These are new DataFrame objects, with lengths determined by the split percentage:
print("size of full dataset = ", len(data))
print("size of training dataset = ", len(train_set))
print("size of test dataset = ", len(test_set))
size of full dataset =  40
size of training dataset =  28
size of test dataset =  12
Now, make our feature and label arrays:
# Features
X_train = train_set['gdp_per_capita'].values
X_train = X_train[:, np.newaxis]
X_test = test_set['gdp_per_capita'].values
X_test = X_test[:, np.newaxis]
# Labels
y_train = train_set['life_satisfaction'].values
y_test = test_set['life_satisfaction'].values
Use the StandardScaler to scale the GDP per capita:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Scale the training features
X_train_scaled = scaler.fit_transform(X_train)
# Scale the test features
# NOTE: in practice you would use scaler.transform(X_test) here, so the scaler
# is fit only on the training data and no information leaks from the test set
X_test_scaled = scaler.fit_transform(X_test)
Now, let's fit on the training set and evaluate on the test set
model.fit(X_train_scaled, y_train)
LinearRegression()
model.score(X_test_scaled, y_test)
0.35959585147159556
Unsurprisingly, our fit gets worse when we test on unseen data.
Our accuracy was artificially inflated the first time, since we trained and tested on the same data.
We'll use scikit-learn's PolynomialFeatures to add new polynomial features derived from the GDP per capita.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)
# Training
X_train_scaled_poly = poly.fit_transform(scaler.fit_transform(X_train))
# Test
X_test_scaled_poly = poly.fit_transform(scaler.fit_transform(X_test))
X_train.shape
(28, 1)
X_train_scaled_poly.shape
(28, 4)
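The four columns are the bias term plus the first three powers of the (scaled) GDP feature. A minimal sketch on a toy array, assuming the default include_bias=True:

# Toy example: degree-3 polynomial features of a single column
toy = np.array([[2.0], [3.0]])
print(PolynomialFeatures(degree=3).fit_transform(toy))
# [[ 1.  2.  4.  8.]
#  [ 1.  3.  9. 27.]]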
model.fit(X_train_scaled_poly, y_train)
LinearRegression()
model.score(X_test_scaled_poly, y_test)
0.5597457659851046
The accuracy improved!
We can turn our preprocessing steps into a Pipeline object using the make_pipeline() function.
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))
pipe
Pipeline(steps=[('standardscaler', StandardScaler()), ('polynomialfeatures', PolynomialFeatures(degree=3))])
Individual steps can be accessed via their names in a dict-like fashion:
# Step 1
pipe['standardscaler']
StandardScaler()
# Step 2
pipe['polynomialfeatures']
PolynomialFeatures(degree=3)
Let's apply this pipeline to our predicted GDP values for our plot:
y_pred = model.predict(pipe.fit_transform(X_pred))
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot the predicted values
    y_pred = model.predict(pipe.fit_transform(X_pred))
    ax.plot(X_pred / 1e5, y_pred, label="Predicted values", color="#666666")
    # Training data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Training data",
        s=100,
        zorder=10,
        color="#f40000",
    )
    ax.legend()
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
The additional polynomial features introduced some curvature and improved the fit!
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))
    # Original data set
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Training data",
        s=100,
        zorder=10,
        color="#666666",
    )
    # Plot the predicted values
    for degree in [3, 5, 10]:
        print(f"degree = {degree}")
        # Create our pipeline
        p = make_pipeline(StandardScaler(), PolynomialFeatures(degree=degree))
        # Fit the model on the training set
        model.fit(p.fit_transform(X_train), y_train)
        # Evaluate on the training set
        training_score = model.score(p.fit_transform(X_train), y_train)
        print(f"Training Score = {training_score}")
        # Evaluate on the test set
        test_score = model.score(p.fit_transform(X_test), y_test)
        print(f"Test Score = {test_score}")
        # Plot
        y_pred = model.predict(p.fit_transform(X_pred))
        ax.plot(X_pred / 1e5, y_pred, label=f"Degree = {degree}")
        print()
    ax.legend(ncol=2, loc=0)
    ax.set_ylim(4, 9)
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
degree = 3
Training Score = 0.6458898101593082
Test Score = 0.5597457659851046

degree = 5
Training Score = 0.6846206186564368
Test Score = -3.9465752545551567

degree = 10
Training Score = 0.8020213670053926
Test Score = -26330.208554357912
As we increase the polynomial degree, two things happen: the training score keeps improving, while the test score gets dramatically worse.
This is the classic case of overfitting: our model does not generalize well at all.
Ridge adds regularization to the linear regression least squares model. Remember, regularization penalizes large parameter values and complex fits.
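For reference, scikit-learn's Ridge minimizes the least squares cost plus an $\ell_2$ penalty on the coefficients, so larger values of alpha push the fit toward smaller, smoother coefficients:

$$ \min_{w} \; \lVert y - X w \rVert_2^2 + \alpha \lVert w \rVert_2^2 $$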
from sklearn.linear_model import Ridge
Let's gain some intuition:
Important:
- The baseline is a linear model (LinearRegression) that scales the input features with StandardScaler
- The Ridge fits apply StandardScaler and PolynomialFeatures(degree=3) pre-processing to the features

Set up a grid of GDP per capita points to make predictions for:
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)
# Sklearn needs the second axis!
X_pred = gdp_pred[:, np.newaxis]
# Create a pre-processing pipeline
# This scales and adds polynomial features up to degree = 3
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))
# BASELINE: Setup and fit a linear model (with scaled features)
linear = LinearRegression()
scaler = StandardScaler()
linear.fit(scaler.fit_transform(X_train), y_train)
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))
    ## Plot the data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Data",
        s=100,
        zorder=10,
        color="#666666",
    )
    ## Evaluate the linear fit
    print("Linear fit")
    training_score = linear.score(scaler.fit_transform(X_train), y_train)
    print(f"Training Score = {training_score}")
    test_score = linear.score(scaler.fit_transform(X_test), y_test)
    print(f"Test Score = {test_score}")
    print()
    ## Plot the linear fit
    ax.plot(
        X_pred / 1e5,
        linear.predict(scaler.fit_transform(X_pred)),
        color="k",
        label="Linear fit",
    )
    ## Ridge regression: linear model with regularization
    # Plot the predicted values for each alpha
    for alpha in [0, 10, 100, 1e5]:
        print(f"alpha = {alpha}")
        # Create our Ridge model with this alpha
        ridge = Ridge(alpha=alpha)
        # Fit the model on the training set
        # NOTE: Use the pipeline that includes polynomial features
        ridge.fit(pipe.fit_transform(X_train), y_train)
        # Evaluate on the training set
        training_score = ridge.score(pipe.fit_transform(X_train), y_train)
        print(f"Training Score = {training_score}")
        # Evaluate on the test set
        test_score = ridge.score(pipe.fit_transform(X_test), y_test)
        print(f"Test Score = {test_score}")
        # Plot the ridge results
        y_pred = ridge.predict(pipe.fit_transform(X_pred))
        ax.plot(X_pred / 1e5, y_pred, label=f"alpha = {alpha}")
        print()
    # Plot formatting
    ax.legend(ncol=2, loc=0)
    ax.set_ylim(4, 8)
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
Linear fit
Training Score = 0.4638100579740343
Test Score = 0.35959585147159556

alpha = 0
Training Score = 0.6458898101593082
Test Score = 0.5597457659851048

alpha = 10
Training Score = 0.5120282691427858
Test Score = 0.38335642103788325

alpha = 100
Training Score = 0.1815398751108913
Test Score = -0.05242399995626967

alpha = 100000.0
Training Score = 0.0020235571180508005
Test Score = -0.26129559971586125
Note that the Ridge fit with alpha = 0 gives the same scores as the LinearRegression() with the polynomial features (compare to the degree = 3 results above): with no penalty, the two models are equivalent.
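A quick way to check that equivalence (a sketch reusing the pipe and the train/test arrays defined above):

# Ridge with alpha=0 should match plain least squares on the same features
ridge0 = Ridge(alpha=0).fit(pipe.fit_transform(X_train), y_train)
ols = LinearRegression().fit(pipe.fit_transform(X_train), y_train)

# The two test scores should agree (up to floating point precision)
print(ridge0.score(pipe.fit_transform(X_test), y_test))
print(ols.score(pipe.fit_transform(X_test), y_test))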