In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
np.random.seed(42)

Lecture 10B: Clustering Analysis in Python¶

Nov 9, 2022

Housekeeping¶

  • Assignment #5 (optional) due Monday by the end of the day
  • Assignment #6 (optional) assigned today and due in two weeks (11/23)
    • Covers urban street networks and osmnx
    • Plotting a folium heatmap of a dataset of your choosing — similar to the building permit example from last lecture
  • Remember, you have to do one of homeworks #4, #5, or #6

Last lecture (10A)¶

  • Making a Folium Choropleth the "hard way" — using folium.GeoJson()
  • Folium heatmap of new construction permits in Philadelphia
  • Introduction to clustering with scikit-learn
  • K-means algorithm for non-spatial clustering

Today¶

  • Spatial clustering with the DBSCAN algorithm
  • Exercise on spatial clustering of NYC taxi trips
In [2]:
import altair as alt
from vega_datasets import data as vega_data
In [3]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

Last time: Clustering neighborhoods by Airbnb stats¶

I've extracted Airbnb statistics for Philadelphia neighborhoods from Tom Slee's website.

The data includes average price per person, overall satisfaction, and number of listings.

Two good references for Airbnb data¶

  • Tom Slee's website
  • Inside Airbnb

Original research study: How Airbnb's Data Hid the Facts in New York City

Step 1: Load the data with pandas¶

The data is available in CSV format ("philly_airbnb_by_neighborhoods.csv") in the "data/" folder of the repository.

In [4]:
airbnb = pd.read_csv("data/philly_airbnb_by_neighborhoods.csv")
airbnb.head()
Out[4]:
neighborhood price_per_person overall_satisfaction N
0 ALLEGHENY_WEST 120.791667 4.666667 23
1 BELLA_VISTA 87.407920 3.158333 204
2 BELMONT 69.425000 3.250000 11
3 BREWERYTOWN 71.788188 1.943182 142
4 BUSTLETON 55.833333 1.250000 19

Step 2: Perform the K-Means fit¶

  • Use our three features: price_per_person, overall_satisfaction, N
  • I used 5 clusters, but you are welcome to experiment with different values!
  • Scaling the features is recommended, but if the scales aren't too different, it probably isn't strictly necessary in this case
In [5]:
# Initialize the KMeans object
kmeans = KMeans(n_clusters=5, random_state=42)

# Scale the data features we want
scaler = StandardScaler()
scaled_airbnb_data = scaler.fit_transform(airbnb[['price_per_person', 'overall_satisfaction', 'N']])
In [6]:
# Run the fit!
kmeans.fit(scaled_airbnb_data)

# Save the cluster labels
airbnb['label'] = kmeans.labels_
In [7]:
# New column "label"!
airbnb.head()
Out[7]:
neighborhood price_per_person overall_satisfaction N label
0 ALLEGHENY_WEST 120.791667 4.666667 23 2
1 BELLA_VISTA 87.407920 3.158333 204 2
2 BELMONT 69.425000 3.250000 11 2
3 BREWERYTOWN 71.788188 1.943182 142 1
4 BUSTLETON 55.833333 1.250000 19 1

Step 3: Calculate average features per cluster¶

To gain some insight into our clusters, after calculating the K-Means labels:

  • group by the label column
  • calculate the mean() of each of our features
  • calculate the number of neighborhoods per cluster
In [8]:
airbnb.groupby('label', as_index=False).size()
Out[8]:
label size
0 0 21
1 1 23
2 2 59
3 3 1
4 4 1
In [9]:
airbnb.groupby('label', as_index=False).mean().sort_values(by='price_per_person')
Out[9]:
label price_per_person overall_satisfaction N
2 2 67.526755 3.232236 69.864407
1 1 78.935232 0.920238 38.347826
0 0 119.329173 2.947605 313.000000
4 4 136.263996 3.000924 1499.000000
3 3 387.626984 5.000000 31.000000

Step 4: Plot a choropleth, coloring neighborhoods by their cluster label¶

  • Part 1: Load the Philadelphia neighborhoods available in the data directory:
    • ./data/philly_neighborhoods.geojson
  • Part 2: Merge the Airbnb data (with labels) and the neighborhood polygons
  • Part 3: Use geopandas to plot the neighborhoods
    • The categorical=True and legend=True keywords will be useful here
In [10]:
hoods = gpd.read_file("./data/philly_neighborhoods.geojson")
hoods.head()
Out[10]:
Name geometry
0 LAWNDALE POLYGON ((-75.08616 40.05013, -75.08893 40.044...
1 ASTON_WOODBRIDGE POLYGON ((-75.00860 40.05369, -75.00861 40.053...
2 CARROLL_PARK POLYGON ((-75.22673 39.97720, -75.22022 39.974...
3 CHESTNUT_HILL POLYGON ((-75.21278 40.08637, -75.21272 40.086...
4 BURNHOLME POLYGON ((-75.08768 40.06861, -75.08758 40.068...
In [11]:
airbnb.head()
Out[11]:
neighborhood price_per_person overall_satisfaction N label
0 ALLEGHENY_WEST 120.791667 4.666667 23 2
1 BELLA_VISTA 87.407920 3.158333 204 2
2 BELMONT 69.425000 3.250000 11 2
3 BREWERYTOWN 71.788188 1.943182 142 1
4 BUSTLETON 55.833333 1.250000 19 1
In [12]:
# do the merge
airbnb2 = hoods.merge(airbnb, left_on='Name', right_on='neighborhood', how='left')

# assign -1 to the neighborhoods without any listings
airbnb2['label'] = airbnb2['label'].fillna(-1)
In [13]:
# plot the data
airbnb2 = airbnb2.to_crs(epsg=3857)

# setup the figure
f, ax = plt.subplots(figsize=(10, 8))

# plot, coloring by label column
# specify categorical data and add legend
airbnb2.plot(
    column="label",
    cmap="Dark2",
    categorical=True,
    legend=True,
    edgecolor="k",
    lw=0.5,
    ax=ax,
)


ax.set_axis_off()
plt.axis("equal");

Step 5: Plot an interactive map¶

Use altair to plot the clustering results with a tooltip for the neighborhood name and cluster label.

Hint: See week 3B's lecture on interactive choropleths with altair

In [14]:
# plot the map, where variables are nested within `properties`
alt.Chart(airbnb2.to_crs(epsg=4326)).mark_geoshape().properties(
    width=500, height=400,
).encode(
    tooltip=["Name:N", "label:N"],
    color=alt.Color("label:N", scale=alt.Scale(scheme="Dark2")),
)
Out[14]:

Based on these results, where would you want to stay?¶

Group 2!

Why 5 groups?¶

Use the "Elbow" method...

In [15]:
# Number of clusters to try out
n_clusters = list(range(2, 10))

# Run kmeans for each value of k
inertias = []
for k in n_clusters:
    
    # Initialize and run
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(scaled_airbnb_data)
    
    # Save the "inertia"
    inertias.append(kmeans.inertia_)
    
# Plot it!
plt.plot(n_clusters, inertias, marker='o', ms=10, mfc='white', lw=4, mew=3);

The kneed package¶

There is also a nice package called kneed that can determine the “knee” point quantitatively, using the kneedle algorithm.

In [16]:
n_clusters
Out[16]:
[2, 3, 4, 5, 6, 7, 8, 9]
In [17]:
from kneed import KneeLocator

# Initialize the knee algorithm
kn = KneeLocator(n_clusters, inertias, curve='convex', direction='decreasing')

# Print out the knee 
print(kn.knee)
5

Part 2: Spatial clustering¶

Now on to the more traditional view of "clustering"...

DBSCAN¶

"Density-Based Spatial Clustering of Applications with Noise"

  • Clusters are areas of high density separated by areas of low density.
  • Can identify clusters of any shape
  • Good at separating core samples in high-density regions from low-density noise samples
  • Best for spatial data

Two key parameters¶

  1. eps: The maximum distance between two samples for them to be considered as in the same neighborhood.
  2. min_samples: The number of samples in a neighborhood for a point to be considered as a core point (including the point itself).

Example Scenario¶

[Figure: DBSCAN-Illustration.svg, illustrating core points (red), edge points (B, C), and a noise point (N)]

Example Scenario¶

  • min_samples = 4.
  • Point A and the other red points are core points
    • There are at least min_samples (4) points (including the point itself) within a distance of eps from each of these points.
    • These points are all reachable from one another, so they form a cluster
  • Points B and C are edge points of the cluster
    • They are reachable (within a distance of eps) from the core points, so they are part of the cluster
    • They do not have at least min_samples points within a distance of eps, so they are not core points
  • Point N is a noise point — it is not within a distance of eps of any of the cluster points
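
A minimal sketch of this scenario with scikit-learn's dbscan function (the same one used later in this lecture), run on a handful of made-up points, helps make the core/edge/noise distinction concrete; the exact coordinates are just illustrative:

# A tiny made-up example (not the lecture data): a dense blob of 4 points,
# one point reachable from the blob, and one far-away point
import numpy as np
from sklearn.cluster import dbscan

points = np.array([
    [0.0, 0.0],
    [0.2, 0.0],
    [0.0, 0.2],
    [0.2, 0.2],   # these 4 points are dense enough to be core points
    [1.1, 0.0],   # within eps of a core point, but not dense itself -> edge point
    [5.0, 5.0],   # far from everything -> noise
])

# min_samples=4 (matching the scenario above), eps=1.0
cores, labels = dbscan(points, eps=1.0, min_samples=4)

print("core indices:", cores)   # should be [0, 1, 2, 3] (the dense blob)
print("labels:      ", labels)  # should be [0, 0, 0, 0, 0, -1]; noise gets -1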

Importance of parameter choices¶

A higher min_samples or a lower eps requires a higher density to form a cluster.
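
To see this effect quickly, here is a rough sketch on synthetic blobs (an arbitrary choice for illustration, not the lecture data) that reruns dbscan with a few different parameter values and counts the resulting clusters and noise points:

from sklearn.cluster import dbscan
from sklearn.datasets import make_blobs

# Three synthetic Gaussian blobs; the sizes and random_state are arbitrary
X, _ = make_blobs(n_samples=500, centers=3, cluster_std=0.5, random_state=42)

# Tightening eps or raising min_samples demands denser regions to form clusters
for eps, min_samples in [(0.5, 5), (0.5, 25), (0.2, 5)]:
    _, labels = dbscan(X, eps=eps, min_samples=min_samples)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = (labels == -1).sum()
    print(f"eps={eps}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise points")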

Example: OpenStreetMap GPS traces in Philadelphia¶

  • Data extracted from the set of 1 billion GPS traces from OSM.
  • The CRS is EPSG:3857 — x and y are in units of meters
In [18]:
coords = gpd.read_file('./data/osm_gps_philadelphia.geojson')
coords.head() 
Out[18]:
x y geometry
0 -8370750.5 4865303.0 POINT (-8370750.500 4865303.000)
1 -8368298.0 4859096.5 POINT (-8368298.000 4859096.500)
2 -8365991.0 4860380.0 POINT (-8365991.000 4860380.000)
3 -8372306.5 4868231.0 POINT (-8372306.500 4868231.000)
4 -8376768.5 4864341.0 POINT (-8376768.500 4864341.000)
In [19]:
num_points = len(coords)

print(f"Total number of points = {num_points}")
Total number of points = 52358

DBSCAN basics¶

In [20]:
from sklearn.cluster import dbscan 
In [21]:
dbscan?
Signature:
dbscan(
    X,
    eps=0.5,
    *,
    min_samples=5,
    metric='minkowski',
    metric_params=None,
    algorithm='auto',
    leaf_size=30,
    p=2,
    sample_weight=None,
    n_jobs=None,
)
Docstring:
Perform DBSCAN clustering from vector array or distance matrix.

Read more in the :ref:`User Guide <dbscan>`.

Parameters
----------
X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or             (n_samples, n_samples)
    A feature array, or array of distances between samples if
    ``metric='precomputed'``.

eps : float, default=0.5
    The maximum distance between two samples for one to be considered
    as in the neighborhood of the other. This is not a maximum bound
    on the distances of points within a cluster. This is the most
    important DBSCAN parameter to choose appropriately for your data set
    and distance function.

min_samples : int, default=5
    The number of samples (or total weight) in a neighborhood for a point
    to be considered as a core point. This includes the point itself.

metric : str or callable, default='minkowski'
    The metric to use when calculating distance between instances in a
    feature array. If metric is a string or callable, it must be one of
    the options allowed by :func:`sklearn.metrics.pairwise_distances` for
    its metric parameter.
    If metric is "precomputed", X is assumed to be a distance matrix and
    must be square during fit.
    X may be a :term:`sparse graph <sparse graph>`,
    in which case only "nonzero" elements may be considered neighbors.

metric_params : dict, default=None
    Additional keyword arguments for the metric function.

    .. versionadded:: 0.19

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
    The algorithm to be used by the NearestNeighbors module
    to compute pointwise distances and find nearest neighbors.
    See NearestNeighbors module documentation for details.

leaf_size : int, default=30
    Leaf size passed to BallTree or cKDTree. This can affect the speed
    of the construction and query, as well as the memory required
    to store the tree. The optimal value depends
    on the nature of the problem.

p : float, default=2
    The power of the Minkowski metric to be used to calculate distance
    between points.

sample_weight : array-like of shape (n_samples,), default=None
    Weight of each sample, such that a sample with a weight of at least
    ``min_samples`` is by itself a core sample; a sample with negative
    weight may inhibit its eps-neighbor from being core.
    Note that weights are absolute, and default to 1.

n_jobs : int, default=None
    The number of parallel jobs to run for neighbors search. ``None`` means
    1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
    using all processors. See :term:`Glossary <n_jobs>` for more details.
    If precomputed distance are used, parallel execution is not available
    and thus n_jobs will have no effect.

Returns
-------
core_samples : ndarray of shape (n_core_samples,)
    Indices of core samples.

labels : ndarray of shape (n_samples,)
    Cluster labels for each point.  Noisy samples are given the label -1.

See Also
--------
DBSCAN : An estimator interface for this clustering algorithm.
OPTICS : A similar estimator interface clustering at multiple values of
    eps. Our implementation is optimized for memory usage.

Notes
-----
For an example, see :ref:`examples/cluster/plot_dbscan.py
<sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d) where d is the average number of neighbors,
while original DBSCAN had memory complexity O(n). It may attract a higher
memory complexity when querying these nearest neighborhoods, depending
on the ``algorithm``.

One way to avoid the query complexity is to pre-compute sparse
neighborhoods in chunks using
:func:`NearestNeighbors.radius_neighbors_graph
<sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
``mode='distance'``, then using ``metric='precomputed'`` here.

Another way to reduce memory and computation time is to remove
(near-)duplicate points and use ``sample_weight`` instead.

:func:`cluster.optics <sklearn.cluster.optics>` provides a similar
clustering with lower memory usage.

References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
ACM Transactions on Database Systems (TODS), 42(3), 19.
File:      ~/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/sklearn/cluster/_dbscan.py
Type:      function
In [22]:
# some parameters to start with
eps = 1  # in meters
min_samples = 50

cores, labels = dbscan(coords[["x", "y"]], eps=eps, min_samples=min_samples)

The function returns two objects, which we call cores and labels. cores contains the indices of each point that is classified as a core sample.

In [23]:
# The first 5 elements
cores[:5]
Out[23]:
array([ 4,  6, 29, 35, 46])

The length of cores tells you how many core samples we have:

In [24]:
num_cores = len(cores)
print(f"Number of core samples = {num_cores}")
Number of core samples = 4624

The labels array tells you the cluster number each point belongs to. Points classified as noise receive a cluster number of -1:

In [25]:
# The first 5 elements
labels[:5]
Out[25]:
array([-1, -1, -1, -1,  0])

The labels array is the same length as our input data, so we can add it as a column in our original data frame

In [26]:
# Add our labels to the original data
coords['label'] = labels
In [27]:
coords.head()
Out[27]:
x y geometry label
0 -8370750.5 4865303.0 POINT (-8370750.500 4865303.000) -1
1 -8368298.0 4859096.5 POINT (-8368298.000 4859096.500) -1
2 -8365991.0 4860380.0 POINT (-8365991.000 4860380.000) -1
3 -8372306.5 4868231.0 POINT (-8372306.500 4868231.000) -1
4 -8376768.5 4864341.0 POINT (-8376768.500 4864341.000) 0

The number of clusters is the number of unique labels minus one (because noise has a label of -1)

In [28]:
num_clusters = coords['label'].nunique() - 1
print(f"number of clusters = {num_clusters}")
number of clusters = 18

We can group by the label column to get the size of each cluster:

In [29]:
cluster_sizes = coords.groupby('label', as_index=False).size()

cluster_sizes
Out[29]:
label size
0 -1 47253
1 0 2378
2 1 332
3 2 1145
4 3 113
5 4 65
6 5 242
7 6 54
8 7 51
9 8 95
10 9 72
11 10 137
12 11 51
13 12 82
14 13 52
15 14 76
16 15 51
17 16 58
18 17 51
In [30]:
# All points get assigned a cluster label (-1 reserved for noise)
cluster_sizes['size'].sum() == num_points
Out[30]:
True

The number of noise points is the size of the cluster with label "-1":

In [31]:
num_noise = cluster_sizes.iloc[0]['size']

print(f"number of noise points = {num_noise}")
number of noise points = 47253

If points aren't noise or core samples, they must be edge points:

In [32]:
num_edges = num_points - num_cores - num_noise
print(f"Number of edge points = {num_edges}")
Number of edge points = 481

Now let's plot the noise and clusters¶

  • Extract each cluster: select points with the same label number
  • Plot the cluster centers: the mean x and mean y value for each cluster
In [33]:
# Setup figure and axis
f, ax = plt.subplots(figsize=(10, 10), facecolor="black")

# Plot the noise samples in grey
noise = coords.loc[coords["label"] == -1]
ax.scatter(noise["x"], noise["y"], c="grey", s=5, linewidth=0)

# Loop over each cluster number
for label_num in range(0, num_clusters):

    # Extract the samples with this label number
    this_cluster = coords.loc[coords["label"] == label_num]

    # Calculate the mean (x,y) point for this cluster
    x_mean = this_cluster["x"].mean()
    y_mean = this_cluster["y"].mean()
    
    # Plot this centroid point in red
    ax.scatter(x_mean, y_mean, linewidth=0, color="red")

# Format
ax.set_axis_off()
ax.set_aspect("equal")

Extending DBSCAN beyond just spatial coordinates¶

DBSCAN can extract high-density clusters from more than just spatial coordinates, as long as the features are properly normalized.
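
As a minimal sketch of that idea (using a hypothetical DataFrame df with x, y, and hour columns, not the exercise data below), the pattern is: scale all the features to comparable units first, then pass the scaled array to dbscan:

import numpy as np
import pandas as pd
from sklearn.cluster import dbscan
from sklearn.preprocessing import StandardScaler

# Hypothetical data: projected coordinates (meters) plus an hour-of-day feature
df = pd.DataFrame({
    "x": np.random.normal(0, 1000, size=500),
    "y": np.random.normal(0, 1000, size=500),
    "hour": np.random.randint(0, 24, size=500),
})

# Put meters and hours on a comparable scale before clustering
scaled = StandardScaler().fit_transform(df[["x", "y", "hour"]])

# eps is now in scaled (standard deviation) units rather than meters;
# these parameter values are just placeholders to experiment with
cores, labels = dbscan(scaled, eps=0.3, min_samples=25)
df["label"] = labels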

Exercise: Extracting patterns from NYC taxi rides¶

I've extracted data for taxi pickups and dropoffs occurring in the Williamsburg neighborhood of NYC from the NYC taxi open data.

Includes data for:

  • Pickup/dropoff location
  • Fare amount
  • Trip distance
  • Pickup/dropoff hour

Goal: identify clusters of similar taxi rides that are not only clustered spatially, but also clustered in features like hour of day and trip distance

Inspired by this CARTO blog post

Step 1: Load the data¶

See the data/williamsburg_taxi_trips.csv file in this week's repository.

In [34]:
taxi = pd.read_csv("./data/williamsburg_taxi_trips.csv")
taxi.head()
Out[34]:
tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance pickup_x pickup_y dropoff_x dropoff_y fare_amount tip_amount dropoff_hour pickup_hour
0 2015-01-15 19:05:41 2015-01-15 19:20:22 2 7.13 -8223667.0 4979065.0 -8232341.0 4970922.0 21.5 4.50 19 19
1 2015-01-15 19:05:44 2015-01-15 19:17:44 1 2.92 -8237459.0 4971133.5 -8232725.0 4970482.5 12.5 2.70 19 19
2 2015-01-25 00:13:06 2015-01-25 00:34:32 1 3.05 -8236711.5 4972170.5 -8232267.0 4970362.0 16.5 5.34 0 0
3 2015-01-26 12:41:15 2015-01-26 12:59:22 1 8.10 -8222485.5 4978445.5 -8233442.5 4969903.5 24.5 5.05 12 12
4 2015-01-20 22:49:11 2015-01-20 22:58:46 1 3.50 -8236294.5 4970916.5 -8231820.5 4971722.0 12.5 2.00 22 22

Step 2: Extract and normalize several features¶

We will focus on the following columns:

  • pickup_x and pickup_y
  • dropoff_x and dropoff_y
  • trip_distance
  • pickup_hour

Use the StandardScaler to normalize these features.

In [35]:
feature_columns = [
    "pickup_x",
    "pickup_y",
    "dropoff_x",
    "dropoff_y",
    "trip_distance",
    "pickup_hour",
]
features = taxi[feature_columns].copy()
In [36]:
features.head()
Out[36]:
pickup_x pickup_y dropoff_x dropoff_y trip_distance pickup_hour
0 -8223667.0 4979065.0 -8232341.0 4970922.0 7.13 19
1 -8237459.0 4971133.5 -8232725.0 4970482.5 2.92 19
2 -8236711.5 4972170.5 -8232267.0 4970362.0 3.05 0
3 -8222485.5 4978445.5 -8233442.5 4969903.5 8.10 12
4 -8236294.5 4970916.5 -8231820.5 4971722.0 3.50 22
In [37]:
# Scale these features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

scaled_features
Out[37]:
array([[ 3.35254171e+00,  2.18196697e+00,  8.59345108e-02,
         1.89871932e-01, -2.58698769e-03,  8.16908067e-01],
       [-9.37728802e-01, -4.08167622e-01, -9.76176333e-02,
        -9.77141849e-03, -2.81690985e-03,  8.16908067e-01],
       [-7.05204353e-01, -6.95217705e-02,  1.21306539e-01,
        -6.45086739e-02, -2.80981012e-03, -1.32713022e+00],
       ...,
       [-1.32952083e+00, -1.14848599e+00, -3.37095821e-01,
        -1.09933782e-01, -2.76011198e-03,  7.04063946e-01],
       [-7.52953521e-01, -7.01094651e-01, -2.61571762e-01,
        -3.00037860e-01, -2.84530879e-03,  7.04063946e-01],
       [-3.97090015e-01, -1.71084059e-02, -1.11647543e+00,
         2.84810408e-01, -2.93269014e-03,  8.16908067e-01]])
In [38]:
print(scaled_features.shape)
print(features.shape)
(223722, 6)
(223722, 6)

Step 3: Run DBSCAN to extract high-density clusters¶

  • We want the highest density clusters, ideally no more than about 30-50 clusters.

  • Run the DBSCAN and experiment with different values of eps and min_samples

    • I started with eps of 0.25 and min_samples of 50
  • Add the labels to the original data frame and calculate the number of clusters. It should be less than 50 or so.

Hint: If the algorithm is taking a long time to run (more than a few minutes), the eps is probably too big!

In [39]:
# Run DBSCAN 
cores, labels = dbscan(scaled_features, eps=0.25, min_samples=50)

# Add the labels back to the original (unscaled) dataset
features['label'] = labels
In [40]:
# Extract the number of clusters 
num_clusters = features['label'].nunique() - 1
print(num_clusters)
27
In [41]:
# Same thing!
features["label"].max() + 1
Out[41]:
27

Step 4: Identify the 5 largest clusters¶

Group by the label, calculate and sort the sizes to find the label numbers of the top 5 largest clusters

In [42]:
N = features.groupby('label').size()
print(N)
label
-1     101292
 0       4481
 1      50673
 2      33277
 3      24360
 4        912
 5       2215
 6       2270
 7       1459
 8        254
 9        519
 10       183
 11       414
 12       224
 13       211
 14       116
 15        70
 16        85
 17       143
 18        76
 19        69
 20        86
 21        49
 22        51
 23        97
 24        52
 25        41
 26        43
dtype: int64
In [43]:
# sort from largest to smallest
N = N.sort_values(ascending=False)

N
Out[43]:
label
-1     101292
 1      50673
 2      33277
 3      24360
 0       4481
 6       2270
 5       2215
 7       1459
 4        912
 9        519
 11       414
 8        254
 12       224
 13       211
 10       183
 17       143
 14       116
 23        97
 20        86
 16        85
 18        76
 15        70
 19        69
 24        52
 22        51
 21        49
 26        43
 25        41
dtype: int64
In [44]:
# extract labels (ignoring label -1 for noise)
top5 = N.iloc[1:6]

top5
Out[44]:
label
1    50673
2    33277
3    24360
0     4481
6     2270
dtype: int64
In [45]:
top5_labels = top5.index.tolist()

top5_labels
Out[45]:
[1, 2, 3, 0, 6]

Step 5: Get mean statistics for the top 5 largest clusters¶

To better identify trends in the top 5 clusters, calculate the mean trip distance and pickup_hour for each of the clusters.

In [46]:
# get the features for the top 5 labels
selection = features['label'].isin(top5_labels)

# select top 5 and groupby by the label
grps = features.loc[selection].groupby('label')

# calculate average pickup hour and trip distance per cluster
avg_values = grps[['pickup_hour', 'trip_distance']].mean()

avg_values
Out[46]:
pickup_hour trip_distance
label
0 18.599643 7.508730
1 20.127405 4.025859
2 1.699943 3.915581
3 9.536905 1.175154
6 1.494714 2.620546

Step 6a: Visualize the top 5 largest clusters¶

Now visualize the top 5 largest clusters:

  • plot the dropoffs and pickups (same color) for the 5 largest clusters
  • include the "noise" samples, shown in gray

Hints:

  • For a given cluster, plot the dropoffs and pickups with the same color so we can visualize patterns in the taxi trips
  • A good color scheme for a black background is given below
In [47]:
# a good color scheme for a black background
colors = ['aqua', 'lime', 'red', 'fuchsia', 'yellow']
In [48]:
# EXAMPLE: enumerating a list
for i, label_num in enumerate([0, 1, 2, 3, 4]):
    print(f"i = {i}")
    print(f"label_num = {label_num}")
i = 0
label_num = 0
i = 1
label_num = 1
i = 2
label_num = 2
i = 3
label_num = 3
i = 4
label_num = 4
In [49]:
# Setup figure and axis
f, ax = plt.subplots(figsize=(10, 10), facecolor="black")

# Plot noise in grey
noise = features.loc[features["label"] == -1]
ax.scatter(noise["pickup_x"], noise["pickup_y"], c="grey", s=5, linewidth=0)

# specify colors for each of the top 5 clusters
colors = ["aqua", "lime", "red", "fuchsia", "yellow"]

# loop over top 5 largest clusters
for i, label_num in enumerate(top5_labels):
    print(f"Plotting cluster #{label_num}...")

    # select all the samples with label equals "label_num"
    this_cluster = features.loc[features["label"] == label_num]

    # plot pickups
    ax.scatter(
        this_cluster["pickup_x"],
        this_cluster["pickup_y"],
        linewidth=0,
        color=colors[i],
        s=5,
        alpha=0.3,
    )

    # plot dropoffs
    ax.scatter(
        this_cluster["dropoff_x"],
        this_cluster["dropoff_y"],
        linewidth=0,
        color=colors[i],
        s=5,
        alpha=0.3,
    )

# Display the figure
ax.set_axis_off()
Plotting cluster #1...
Plotting cluster #2...
Plotting cluster #3...
Plotting cluster #0...
Plotting cluster #6...

If you're feeling ambitious, and time-permitting...¶

Step 6b: Visualizing one cluster at a time¶

Another good way to visualize the results is to explore the other clusters one at a time, plotting both the pickups and dropoffs to identify the trends.

Use different colors for pickups/dropoffs to easily identify them.

Make it a function so we can repeat it easily:

In [50]:
def plot_taxi_cluster(label_num):
    """
    Plot the pickups and dropoffs for the input cluster label
    """
    # Setup figure and axis
    f, ax = plt.subplots(figsize=(10, 10), facecolor='black')

    # Plot noise in grey
    noise = features.loc[features['label']==-1]
    ax.scatter(noise['pickup_x'], noise['pickup_y'], c='grey', s=5, linewidth=0)

    # get this cluster
    this_cluster = features.loc[features['label']==label_num]

    # Plot pickups in fuchsia
    ax.scatter(this_cluster['pickup_x'], this_cluster['pickup_y'], 
               linewidth=0, color='fuchsia', s=5, alpha=0.3)

    # Plot dropoffs in aqua
    ax.scatter(this_cluster['dropoff_x'], this_cluster['dropoff_y'], 
               linewidth=0, color='aqua', s=5, alpha=0.3)

    # Display the figure
    ax.set_axis_off()
    plt.show()
In [51]:
# plot specific label
plot_taxi_cluster(label_num=2)

Step 7: An interactive map of clusters with hvplot + datashader¶

  • We'll plot the pickup/dropoff locations for the top 5 clusters
  • Use the .hvplot.scatter() function to plot the x/y points
  • Specify the c= keyword as the column holding the cluster label
  • Specify the aggregator as ds.count_cat("label") — this will color the data points using our categorical color map
  • Use the datashade=True keyword to tell hvplot to use datashader
  • Add a background tile using Geoviews
  • Combine the pickups, dropoffs, and background tile into a single interactive map
In [52]:
# map colors to the top 5 cluster labels
color_key = dict(zip(top5_labels, ['aqua', 'lime', 'red', 'fuchsia', 'yellow']))
print(color_key)
{1: 'aqua', 2: 'lime', 3: 'red', 0: 'fuchsia', 6: 'yellow'}
In [53]:
# extract the features for the top 5 clusters
top5_features = features.loc[features['label'].isin(top5_labels)]
In [54]:
import hvplot.pandas
import datashader as ds
import geoviews as gv
In [55]:
# Pickups using a categorical color map
pickups = top5_features.hvplot.scatter(
    x="pickup_x",
    y="pickup_y",
    c="label",
    aggregator=ds.count_cat("label"),
    datashade=True,
    cmap=color_key,
    width=800,
    height=600,
)

# Dropoffs using a categorical color map
dropoffs = top5_features.hvplot.scatter(
    x="dropoff_x",
    y="dropoff_y",
    c="label",
    aggregator=ds.count_cat("label"),
    datashade=True,
    cmap=color_key,
    width=800,
    height=600,
)


pickups * dropoffs * gv.tile_sources.CartoDark
Out[55]:

That's it! Next time, regressions with scikit-learn!¶

In [ ]: