from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
np.random.seed(42)
Nov 14, 2022
DBSCAN can identify high-density clusters using more than just spatial coordinates, as long as the features are properly normalized.
from sklearn.cluster import dbscan
from sklearn.preprocessing import StandardScaler
I've extracted data for taxi pickups and dropoffs occurring in the Williamsburg neighborhood of NYC from the NYC taxi open data.
The data includes pickup/dropoff locations and times, trip distance, and fare information (see the columns below).
Goal: identify clusters of similar taxi rides that are not only clustered spatially, but also clustered for features like hour of day and trip distance
Inspired by this CARTO blog post
taxi = pd.read_csv("./data/williamsburg_taxi_trips.csv")
taxi.head()
|  | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | pickup_x | pickup_y | dropoff_x | dropoff_y | fare_amount | tip_amount | dropoff_hour | pickup_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-15 19:05:41 | 2015-01-15 19:20:22 | 2 | 7.13 | -8223667.0 | 4979065.0 | -8232341.0 | 4970922.0 | 21.5 | 4.50 | 19 | 19 |
| 1 | 2015-01-15 19:05:44 | 2015-01-15 19:17:44 | 1 | 2.92 | -8237459.0 | 4971133.5 | -8232725.0 | 4970482.5 | 12.5 | 2.70 | 19 | 19 |
| 2 | 2015-01-25 00:13:06 | 2015-01-25 00:34:32 | 1 | 3.05 | -8236711.5 | 4972170.5 | -8232267.0 | 4970362.0 | 16.5 | 5.34 | 0 | 0 |
| 3 | 2015-01-26 12:41:15 | 2015-01-26 12:59:22 | 1 | 8.10 | -8222485.5 | 4978445.5 | -8233442.5 | 4969903.5 | 24.5 | 5.05 | 12 | 12 |
| 4 | 2015-01-20 22:49:11 | 2015-01-20 22:58:46 | 1 | 3.50 | -8236294.5 | 4970916.5 | -8231820.5 | 4971722.0 | 12.5 | 2.00 | 22 | 22 |
We will focus on the following columns:
- pickup_x and pickup_y
- dropoff_x and dropoff_y
- trip_distance
- pickup_hour

Use the StandardScaler to normalize these features.
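For reference, StandardScaler standardizes each feature column by subtracting its mean and dividing by its standard deviation:

$$ z = \frac{x - \mu}{\sigma} $$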
feature_columns = [
"pickup_x",
"pickup_y",
"dropoff_x",
"dropoff_y",
"trip_distance",
"pickup_hour",
]
features = taxi[feature_columns].copy()
features.head()
|  | pickup_x | pickup_y | dropoff_x | dropoff_y | trip_distance | pickup_hour |
|---|---|---|---|---|---|---|
| 0 | -8223667.0 | 4979065.0 | -8232341.0 | 4970922.0 | 7.13 | 19 |
| 1 | -8237459.0 | 4971133.5 | -8232725.0 | 4970482.5 | 2.92 | 19 |
| 2 | -8236711.5 | 4972170.5 | -8232267.0 | 4970362.0 | 3.05 | 0 |
| 3 | -8222485.5 | 4978445.5 | -8233442.5 | 4969903.5 | 8.10 | 12 |
| 4 | -8236294.5 | 4970916.5 | -8231820.5 | 4971722.0 | 3.50 | 22 |
# Scale these features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_features
array([[ 3.35254171e+00,  2.18196697e+00,  8.59345108e-02,  1.89871932e-01,
        -2.58698769e-03,  8.16908067e-01],
       [-9.37728802e-01, -4.08167622e-01, -9.76176333e-02, -9.77141849e-03,
        -2.81690985e-03,  8.16908067e-01],
       [-7.05204353e-01, -6.95217705e-02,  1.21306539e-01, -6.45086739e-02,
        -2.80981012e-03, -1.32713022e+00],
       ...,
       [-1.32952083e+00, -1.14848599e+00, -3.37095821e-01, -1.09933782e-01,
        -2.76011198e-03,  7.04063946e-01],
       [-7.52953521e-01, -7.01094651e-01, -2.61571762e-01, -3.00037860e-01,
        -2.84530879e-03,  7.04063946e-01],
       [-3.97090015e-01, -1.71084059e-02, -1.11647543e+00,  2.84810408e-01,
        -2.93269014e-03,  8.16908067e-01]])
print(scaled_features.shape)
print(features.shape)
(223722, 6)
(223722, 6)
We want the highest density clusters, ideally no more than about 30-50 clusters.
Run DBSCAN and experiment with different values of eps and min_samples; a good combination is an eps of 0.25 and min_samples of 50.
Add the labels to the original data frame and calculate the number of clusters. It should be less than 50 or so.
Hint: If the algorithm is taking a long time to run (more than a few minutes), the eps is probably too big!
# Run DBSCAN
cores, labels = dbscan(scaled_features, eps=0.25, min_samples=50)
# Add the labels back to the original (unscaled) dataset
features['label'] = labels
# Extract the number of clusters (subtract one to exclude the noise label of -1)
num_clusters = features['label'].nunique() - 1
print(num_clusters)
27
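As a side note, the same clustering can be run through the DBSCAN estimator class instead of the dbscan() function (a sketch using the same parameters as above):

from sklearn.cluster import DBSCAN

# Fit the estimator on the scaled features with the same eps/min_samples
db = DBSCAN(eps=0.25, min_samples=50).fit(scaled_features)

# Cluster labels for each trip; -1 marks noise, same convention as above
estimator_labels = db.labels_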
Group by the label, then calculate and sort the sizes to find the label numbers of the top 5 largest clusters.
# Get cluster sizes, from largest to smallest
N = features.groupby('label').size().sort_values(ascending=False)
print(N)
label
-1     101292
 1      50673
 2      33277
 3      24360
 0       4481
 6       2270
 5       2215
 7       1459
 4        912
 9        519
11        414
 8        254
12        224
13        211
10        183
17        143
14        116
23         97
20         86
16         85
18         76
15         70
19         69
24         52
22         51
21         49
26         43
25         41
dtype: int64
# Extract labels (ignoring label -1 for noise)
top5 = list(N.iloc[1:6].index)
print(top5)
[1, 2, 3, 0, 6]
To better identify trends in the top 5 clusters, calculate the mean trip distance and pickup_hour for each of the clusters.
# get the features for the top 5 labels
selection = features['label'].isin(top5)
# select top 5 and groupby by the label
grps = features.loc[selection].groupby('label')
# calculate average pickup hour and trip distance per cluster
avg_values = grps[['pickup_hour', 'trip_distance']].mean()
avg_values.loc[top5]
| label | pickup_hour | trip_distance |
|---|---|---|
| 1 | 20.127405 | 4.025859 |
| 2 | 1.699943 | 3.915581 |
| 3 | 9.536905 | 1.175154 |
| 0 | 18.599643 | 7.508730 |
| 6 | 1.494714 | 2.620546 |
Now visualize the top 5 largest clusters:
Hints:
# a good color scheme for a black background
colors = ['aqua', 'lime', 'red', 'fuchsia', 'yellow']
# EXAMPLE: enumerating a list
example_list = [10, 12, 5, 13, 40]
for i, label_num in enumerate(example_list):
print(f"i = {i}")
print(f"label_num = {label_num}")
i = 0
label_num = 10
i = 1
label_num = 12
i = 2
label_num = 5
i = 3
label_num = 13
i = 4
label_num = 40
# Setup figure and axis
f, ax = plt.subplots(figsize=(10, 10), facecolor="black")
# Plot noise in grey
noise = features.loc[features["label"] == -1]
ax.scatter(noise["pickup_x"], noise["pickup_y"], c="grey", s=5, linewidth=0)
# specify colors for each of the top 5 clusters
colors = ["aqua", "lime", "red", "fuchsia", "yellow"]
# loop over top 5 largest clusters
for i, label_num in enumerate(top5):
print(f"Plotting cluster #{label_num}...")
# select all the samples with label equals "label_num"
this_cluster = features.loc[features["label"] == label_num]
# plot pickups
ax.scatter(
this_cluster["pickup_x"],
this_cluster["pickup_y"],
linewidth=0,
color=colors[i],
s=5,
alpha=0.3,
)
# plot dropoffs
ax.scatter(
this_cluster["dropoff_x"],
this_cluster["dropoff_y"],
linewidth=0,
color=colors[i],
s=5,
alpha=0.3,
)
# Display the figure
ax.set_axis_off()
Plotting cluster #1...
Plotting cluster #2...
Plotting cluster #3...
Plotting cluster #0...
Plotting cluster #6...
Another good way to visualize the results is to explore the other clusters one at a time, plotting both the pickups and dropoffs to identify the trends.
Use different colors for pickups/dropoffs to easily identify them.
Make it a function so we can repeat it easily:
def plot_taxi_cluster(label_num):
"""
Plot the pickups and dropoffs for the input cluster label
"""
# Setup figure and axis
f, ax = plt.subplots(figsize=(10, 10), facecolor="black")
# Plot noise in grey
noise = features.loc[features["label"] == -1]
ax.scatter(noise["pickup_x"], noise["pickup_y"], c="grey", s=5, linewidth=0)
# Get the features for "label_num"
this_cluster = features.loc[features["label"] == label_num]
# Plot pickups in fuchsia
ax.scatter(
this_cluster["pickup_x"],
this_cluster["pickup_y"],
linewidth=0,
color="fuchsia",
s=5,
alpha=0.3,
)
# Plot dropoffs in aqua
ax.scatter(
this_cluster["dropoff_x"],
this_cluster["dropoff_y"],
linewidth=0,
color="aqua",
s=5,
alpha=0.3,
)
# Display the figure
ax.set_axis_off()
# Add a label
ax.text(
0.1,
0.9,
f"Cluster #{label_num}",
ha="left",
color="white",
weight='bold',
fontsize=30,
transform=ax.transAxes,
)
top5
[1, 2, 3, 0, 6]
# Plot pickups (fuchsia) and dropoffs (aqua) for a specific cluster
plot_taxi_cluster(label_num=6)
An example from the CARTO analysis:
We wanted to explore how we can use data to better understand and define communities of people, going beyond spatial borders like zip code and neighborhood boundaries.
However, their clusters include groupings by class and religion, e.g., "working class" and "Orthodox Jewish" residents. While the intention of this analysis may have been benign, the results could easily have been misused to target residents in a potentially discriminatory way.
We'll see more examples of algorithmic fairness on assignment #7 when modeling housing prices in Philadelphia.
Today, we'll walk through an end-to-end regression example to predict Philadelphia's housing prices
Given your training set of data, which model parameters best represent the observed data?
The cost function measures the difference between the model's predictions and the observed data.
In scikit-learn, you will call the fit() method on your algorithm.
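For example, a common cost function for regression problems is the mean squared error (shown here for reference; the standard form, not necessarily the exact objective of every estimator):

$$ \mathrm{MSE}(\theta) = \frac{1}{N} \sum_{i=1}^{N} \left[ y_i - \hat{y}_i(\theta) \right]^2 $$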
Key goal: how can we do this in a way that ensures the model is as generalizable as possible?
Or: "garbage in, garbage out"
Common issues:
It is common to use 80% of your data for the training set and 20% for the test set.
For more information, see the scikit-learn docs
We'll load data compiled from two data sources:
data = pd.read_csv("./data/gdp_vs_satisfaction.csv")
data.head()
|  | Country | life_satisfaction | gdp_per_capita |
|---|---|---|---|
| 0 | Australia | 7.3 | 50961.87 |
| 1 | Austria | 7.1 | 43724.03 |
| 2 | Belgium | 6.9 | 40106.63 |
| 3 | Brazil | 6.4 | 8670.00 |
| 4 | Canada | 7.4 | 43331.96 |
import hvplot.pandas
data.hvplot.scatter(
x="gdp_per_capita",
y="life_satisfaction",
hover_cols=["Country"],
ylim=(4, 9),
xlim=(1e3, 1.1e5),
)
A simple model with only two parameters: $\theta_1$ and $\theta_2$
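Reading $\theta_1$ as the intercept and $\theta_2$ as the slope (one reasonable reading of the notation), the assumed model is a straight line:

$$ \text{life satisfaction} = \theta_1 + \theta_2 \times \text{GDP per capita} $$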
Use the LinearRegression model object from scikit-learn.
This is not really machine learning — it simply finds the Ordinary Least Squares fit to the data.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model
LinearRegression()
# Our input features (in this case we only have 1)
X = data['gdp_per_capita'].values
X = X[:, np.newaxis]
# The labels (values we are trying to predict)
y = data['life_satisfaction'].values
X.shape
(40, 1)
y.shape
(40,)
Note: scikit-learn expects the features to be a 2D array with shape (number of observations, number of features). We explicitly add a second axis with np.newaxis.
Now, fit the model using the model.fit(X, y) syntax.
This will "train" our model, using an optimization algorithm to identify the best-fit parameters.
model.fit(X, y)
LinearRegression()
intercept = model.intercept_
slope = model.coef_[0]
print(f"bestfit intercept = {intercept:.2f}")
print(f"bestfit slope = {slope:.2e}")
bestfit intercept = 5.72
bestfit slope = 2.47e-05
Note: In this case, our model is the same as ordinary least squares, and no actual optimization is performed since an exact solution exists.
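For reference, the ordinary least squares parameters have a closed-form solution (the normal equation), which is why no iterative optimization is required:

$$ \hat{\theta} = (X^T X)^{-1} X^T y $$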
The model object also has a score() function that provides a score to evaluate the fit. Note: you must call the fit() function before calling the score() function.
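For regression models in scikit-learn, score() returns the coefficient of determination $R^2$:

$$ R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2} $$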
Rsq = model.score(X, y)
Rsq
0.519153782362894
Use the predict() function to predict new values.
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)
# Sklearn needs the second axis!
X_pred = gdp_pred[:, np.newaxis]
y_pred = model.predict(X_pred)
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot the predicted values
    ax.plot(X_pred / 1e5, y_pred, label="Predicted values", color="#666666")
    # Training data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Training data",
        s=100,
        zorder=10,
        color="#f40000",
    )
    ax.legend()
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
Scikit learn provides a utility function to split our input data:
from sklearn.model_selection import train_test_split
# I'll use a 70/30% split
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)
These are new DataFrame objects, with lengths determined by the split percentage:
print("size of full dataset = ", len(data))
print("size of training dataset = ", len(train_set))
print("size of test dataset = ", len(test_set))
size of full dataset =  40
size of training dataset =  28
size of test dataset =  12
Now, make our feature and label arrays:
# Features
X_train = train_set['gdp_per_capita'].values
X_train = X_train[:, np.newaxis]
X_test = test_set['gdp_per_capita'].values
X_test = X_test[:, np.newaxis]
# Labels
y_train = train_set['life_satisfaction'].values
y_test = test_set['life_satisfaction'].values
Use the StandardScaler to scale the GDP per capita:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Scale the training features
X_train_scaled = scaler.fit_transform(X_train)
# Scale the test features
# NOTE: in practice you would use scaler.transform(X_test) here, so the scaler
# is fit only on the training data and no information leaks from the test set
X_test_scaled = scaler.fit_transform(X_test)
Now, let's fit on the training set and evaluate on the test set
model.fit(X_train_scaled, y_train)
LinearRegression()
model.score(X_test_scaled, y_test)
0.35959585147159556
Unsurprisingly, our fit gets worse when we test on unseen data.
Our accuracy was artificially inflated the first time, since we trained and tested on the same data.
We'll use scikit-learn's PolynomialFeatures to add new polynomial features derived from the GDP per capita.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)
# Training
X_train_scaled_poly = poly.fit_transform(scaler.fit_transform(X_train))
# Test
X_test_scaled_poly = poly.fit_transform(scaler.fit_transform(X_test))
X_train.shape
(28, 1)
X_train_scaled_poly.shape
(28, 4)
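The four columns are the bias term plus the first three powers of the (scaled) GDP feature. A minimal sketch on a toy array, assuming the default include_bias=True:

# Toy example: degree-3 polynomial features of a single column
toy = np.array([[2.0], [3.0]])
print(PolynomialFeatures(degree=3).fit_transform(toy))
# [[ 1.  2.  4.  8.]
#  [ 1.  3.  9. 27.]]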
model.fit(X_train_scaled_poly, y_train)
LinearRegression()
model.score(X_test_scaled_poly, y_test)
0.5597457659851046
The accuracy improved!
We can turn our preprocessing steps into a Pipeline object using the make_pipeline() function.
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))
pipe
Pipeline(steps=[('standardscaler', StandardScaler()), ('polynomialfeatures', PolynomialFeatures(degree=3))])
Individual steps can be accessed via their names in a dict-like fashion:
# Step 1
pipe['standardscaler']
StandardScaler()
# Step 2
pipe['polynomialfeatures']
PolynomialFeatures(degree=3)
Let's apply this pipeline to our predicted GDP values for our plot:
y_pred = model.predict(pipe.fit_transform(X_pred))
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot the predicted values
    y_pred = model.predict(pipe.fit_transform(X_pred))
    ax.plot(X_pred / 1e5, y_pred, label="Predicted values", color="#666666")
    # Training data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Training data",
        s=100,
        zorder=10,
        color="#f40000",
    )
    ax.legend()
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
The additional polynomial features introduced some curvature and improved the fit!
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))
    # Original data set
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Training data",
        s=100,
        zorder=10,
        color="#666666",
    )
    # Plot the predicted values
    for degree in [3, 5, 10]:
        print(f"degree = {degree}")
        # Create our pipeline
        p = make_pipeline(StandardScaler(), PolynomialFeatures(degree=degree))
        # Fit the model on the training set
        model.fit(p.fit_transform(X_train), y_train)
        # Evaluate on the training set
        training_score = model.score(p.fit_transform(X_train), y_train)
        print(f"Training Score = {training_score}")
        # Evaluate on the test set
        test_score = model.score(p.fit_transform(X_test), y_test)
        print(f"Test Score = {test_score}")
        # Plot
        y_pred = model.predict(p.fit_transform(X_pred))
        ax.plot(X_pred / 1e5, y_pred, label=f"Degree = {degree}")
        print()
    ax.legend(ncol=2, loc=0)
    ax.set_ylim(4, 9)
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
degree = 3
Training Score = 0.6458898101593082
Test Score = 0.5597457659851046

degree = 5
Training Score = 0.6846206186564368
Test Score = -3.9465752545551567

degree = 10
Training Score = 0.8020213670053926
Test Score = -26330.208554357912
As we increase the polynomial degree, two things happen: the training score keeps improving, while the test score gets dramatically worse.
This is the classic case of overfitting: our model does not generalize well at all.
Ridge adds regularization to the linear regression least squares model. Remember, regularization penalizes large parameter values and complex fits.
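For reference, scikit-learn's Ridge minimizes the least squares cost plus an $\ell_2$ penalty on the coefficients, so larger values of alpha push the fit toward smaller, smoother coefficients:

$$ \min_{w} \; \lVert y - X w \rVert_2^2 + \alpha \lVert w \rVert_2^2 $$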
from sklearn.linear_model import Ridge
Let's gain some intuition:
Important:
- The baseline is a linear model (LinearRegression) that scales the input features with StandardScaler
- The Ridge fits apply StandardScaler and PolynomialFeatures(degree=3) pre-processing to the features

Set up a grid of GDP per capita points to make predictions for:
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)
# Sklearn needs the second axis!
X_pred = gdp_pred[:, np.newaxis]
# Create a pre-processing pipeline
# This scales and adds polynomial features up to degree = 3
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))
# BASELINE: Setup and fit a linear model (with scaled features)
linear = LinearRegression()
scaler = StandardScaler()
linear.fit(scaler.fit_transform(X_train), y_train)
with plt.style.context("fivethirtyeight"):
    fig, ax = plt.subplots(figsize=(10, 6))
    ## Plot the data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Data",
        s=100,
        zorder=10,
        color="#666666",
    )
    ## Evaluate the linear fit
    print("Linear fit")
    training_score = linear.score(scaler.fit_transform(X_train), y_train)
    print(f"Training Score = {training_score}")
    test_score = linear.score(scaler.fit_transform(X_test), y_test)
    print(f"Test Score = {test_score}")
    print()
    ## Plot the linear fit
    ax.plot(
        X_pred / 1e5,
        linear.predict(scaler.fit_transform(X_pred)),
        color="k",
        label="Linear fit",
    )
    ## Ridge regression: linear model with regularization
    # Plot the predicted values for each alpha
    for alpha in [0, 10, 100, 1e5]:
        print(f"alpha = {alpha}")
        # Create our Ridge model with this alpha
        ridge = Ridge(alpha=alpha)
        # Fit the model on the training set
        # NOTE: Use the pipeline that includes polynomial features
        ridge.fit(pipe.fit_transform(X_train), y_train)
        # Evaluate on the training set
        training_score = ridge.score(pipe.fit_transform(X_train), y_train)
        print(f"Training Score = {training_score}")
        # Evaluate on the test set
        test_score = ridge.score(pipe.fit_transform(X_test), y_test)
        print(f"Test Score = {test_score}")
        # Plot the ridge results
        y_pred = ridge.predict(pipe.fit_transform(X_pred))
        ax.plot(X_pred / 1e5, y_pred, label=f"alpha = {alpha}")
        print()
    # Plot formatting
    ax.legend(ncol=2, loc=0)
    ax.set_ylim(4, 8)
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
Linear fit
Training Score = 0.4638100579740343
Test Score = 0.35959585147159556

alpha = 0
Training Score = 0.6458898101593082
Test Score = 0.5597457659851048

alpha = 10
Training Score = 0.5120282691427858
Test Score = 0.38335642103788325

alpha = 100
Training Score = 0.1815398751108913
Test Score = -0.05242399995626967

alpha = 100000.0
Training Score = 0.0020235571180508005
Test Score = -0.26129559971586125
Note that the Ridge fit with alpha = 0 gives the same scores as the LinearRegression() with the polynomial features (compare to the degree = 3 results above): with no penalty, the two models are equivalent.
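A quick way to check that equivalence (a sketch reusing the pipe and the train/test arrays defined above):

# Ridge with alpha=0 should match plain least squares on the same features
ridge0 = Ridge(alpha=0).fit(pipe.fit_transform(X_train), y_train)
ols = LinearRegression().fit(pipe.fit_transform(X_train), y_train)

# The two test scores should agree (up to floating point precision)
print(ridge0.score(pipe.fit_transform(X_test), y_test))
print(ols.score(pipe.fit_transform(X_test), y_test))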