from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
np.random.seed(42)
Nov 7, 2022
Example: A map of households without internet for US counties
Two ways:
folium.Choropleth
folium.GeoJson
import folium
Load the data, remove counties with no households and add our new column:
# Load CSV data from the data/ folder
census_data = pd.read_csv("./data/internet_avail_census.csv", dtype={"geoid": str})
# Remove counties with no households
valid = census_data['universe'] > 0
census_data = census_data.loc[valid]
# Calculate the percent without internet
census_data['percent_no_internet'] = census_data['no_internet'] / census_data['universe']
census_data.head()
NAME | universe | no_internet | state | county | geoid | percent_no_internet | |
---|---|---|---|---|---|---|---|
0 | Washington County, Mississippi | 18299.0 | 6166.0 | 28 | 151 | 28151 | 0.336958 |
1 | Perry County, Mississippi | 4563.0 | 1415.0 | 28 | 111 | 28111 | 0.310103 |
2 | Choctaw County, Mississippi | 3164.0 | 1167.0 | 28 | 19 | 28019 | 0.368837 |
3 | Itawamba County, Mississippi | 8706.0 | 1970.0 | 28 | 57 | 28057 | 0.226281 |
4 | Carroll County, Mississippi | 3658.0 | 1218.0 | 28 | 15 | 28015 | 0.332969 |
Load the counties geometry data too:
# Load counties GeoJSON from the data/ folder
counties = gpd.read_file("./data/us-counties-10m.geojson")
counties.head()
id | geometry | |
---|---|---|
0 | 53073 | MULTIPOLYGON (((-120.85361 49.00011, -120.7674... |
1 | 30105 | POLYGON ((-106.11238 48.99904, -106.15187 48.8... |
2 | 30029 | POLYGON ((-114.06985 48.99904, -114.05908 48.8... |
3 | 16021 | POLYGON ((-116.04755 49.00065, -116.04755 48.5... |
4 | 30071 | POLYGON ((-107.17840 49.00011, -107.20712 48.9... |
Note: this is different from using folium.Choropleth, where the data and the geometry features are stored in two separate data frames.
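For comparison, the folium.Choropleth approach keeps the two frames separate and joins them on a key column. Here is a minimal sketch (not run in this lecture) that assumes the counties and census_data frames loaded above, and that the county FIPS code ends up under properties.id in the GeoJSON:
m = folium.Map(location=[40, -98], zoom_start=4)
# Pass the geometries and the data separately; Choropleth joins them on the key columns
folium.Choropleth(
    geo_data=counties.to_json(),               # county geometries as a GeoJSON string
    data=census_data,                          # census values stay in their own DataFrame
    columns=["geoid", "percent_no_internet"],  # key column and value column in `data`
    key_on="feature.properties.id",            # matching key inside each GeoJSON feature (assumed path)
    fill_color="RdPu",
    fill_opacity=0.75,
    line_weight=0.5,
    legend_name="Percent Without Internet",
).add_to(m)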
# Merge the county geometries with census data
# Left column: "id"
# Right column: "geoid"
census_joined = counties.merge(census_data, left_on="id", right_on="geoid")
census_joined.head()
id | geometry | NAME | universe | no_internet | state | county | geoid | percent_no_internet | |
---|---|---|---|---|---|---|---|---|---|
0 | 53073 | MULTIPOLYGON (((-120.85361 49.00011, -120.7674... | Whatcom County, Washington | 85008.0 | 9189.0 | 53 | 73 | 53073 | 0.108096 |
1 | 30105 | POLYGON ((-106.11238 48.99904, -106.15187 48.8... | Valley County, Montana | 3436.0 | 672.0 | 30 | 105 | 30105 | 0.195576 |
2 | 30029 | POLYGON ((-114.06985 48.99904, -114.05908 48.8... | Flathead County, Montana | 38252.0 | 5662.0 | 30 | 29 | 30029 | 0.148018 |
3 | 16021 | POLYGON ((-116.04755 49.00065, -116.04755 48.5... | Boundary County, Idaho | 4605.0 | 1004.0 | 16 | 21 | 16021 | 0.218024 |
4 | 30071 | POLYGON ((-107.17840 49.00011, -107.20712 48.9... | Phillips County, Montana | 1770.0 | 484.0 | 30 | 71 | 30071 | 0.273446 |
# Minimum
min_val = census_joined['percent_no_internet'].min()
# Maximum
max_val = census_joined['percent_no_internet'].max()
# Calculate a normalized column
normalized = (census_joined['percent_no_internet'] - min_val) / (max_val - min_val)
# Add to the dataframe
census_joined['percent_no_internet_normalized'] = normalized
Use plt.get_cmap() to load a Matplotlib color map:
import matplotlib.colors as mcolors
# Use a red-purple colorbrewer color scheme
cmap = plt.get_cmap('RdPu')
# The minimum value of the color map as an RGBA tuple
cmap(0)
(1.0, 0.9686274509803922, 0.9529411764705882, 1.0)
# The minimum value of the color map as a hex string
mcolors.rgb2hex(cmap(0.0))
'#fff7f3'
# The maximum value of the color map as a hex string
mcolors.rgb2hex(cmap(1.0))
'#49006a'
def get_style(feature):
    """
    Given an input GeoJSON feature, return a style dict.

    Notes
    -----
    The color in the style dict is determined by the
    "percent_no_internet_normalized" column in the
    input "feature".
    """
    # Get the data value from the feature
    value = feature['properties']['percent_no_internet_normalized']

    # Evaluate the color map
    # NOTE: value must be between 0 and 1
    rgb_color = cmap(value)  # this is an RGB tuple

    # Convert to hex string
    color = mcolors.rgb2hex(rgb_color)

    # Return the style dictionary
    return {'weight': 0.5, 'color': color, 'fillColor': color, "fillOpacity": 0.75}
def get_highlighted_style(feature):
    """
    Return a style dict to use when the user highlights a
    feature with the mouse.
    """
    return {"weight": 3, "color": "black"}
Use the .to_json() function to convert to a GeoJSON string:
needed_cols = ['NAME', 'percent_no_internet', 'percent_no_internet_normalized', 'geometry']
census_json = census_joined[needed_cols].to_json()
# STEP 1: Initialize the map
m = folium.Map(location=[40, -98], zoom_start=4)
# STEP 2: Add the GeoJson to the map
folium.GeoJson(
census_json, # The geometry + data columns in GeoJSON format
style_function=get_style, # The style function to color counties differently
highlight_function=get_highlighted_style,
tooltip=folium.GeoJsonTooltip(['NAME', 'percent_no_internet'])
).add_to(m)
# avoid a rendering bug by saving as HTML and re-loading
m.save('percent_no_internet.html')
The hard way is harder, but we have a tooltip and highlight interactivity!
from IPython.display import IFrame
IFrame('percent_no_internet.html', width=800, height=500)
Try to replicate the above interactive map (minus the background tiles) in Altair, including the tooltip and the red-purple color scale.
Note: Altair's syntax is similar to the folium.Choropleth syntax: you should pass the counties GeoDataFrame to alt.Chart() and then use transform_lookup() to merge those geometries with the census data and pull in the column we need ("percent_no_internet").
Hints
import altair as alt
# Initialize the chart with the counties data
alt.Chart(counties).mark_geoshape(stroke="white", strokeWidth=0.25).encode(
# Encode the color
color=alt.Color(
"percent_no_internet:Q",
title="Percent Without Internet",
scale=alt.Scale(scheme="redpurple"),
legend=alt.Legend(format=".0%")
),
# Tooltip
tooltip=[
alt.Tooltip("NAME:N", title="Name"),
alt.Tooltip("percent_no_internet:Q", title="Percent Without Internet", format=".1%"),
],
).transform_lookup(
lookup="id", # The column name in the counties data to match on
from_=alt.LookupData(census_data, "geoid", ["percent_no_internet", "NAME"]), # Match census data on "geoid"
).project(
type="albersUsa"
).properties(
width=700, height=500
)
One of Leaflet's strengths: a rich set of open-source plugins
https://leafletjs.com/plugins.html
Many of these are available in Folium!
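A quick way to see which plugins ship with your installed Folium version is to list the contents of the folium.plugins module (a simple inspection sketch, not part of the original workflow):
import folium.plugins
# Print the public plugin classes bundled with the installed version of folium
print([name for name in dir(folium.plugins) if not name.startswith("_")])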
from folium.plugins import HeatMap
HeatMap?
Init signature:
HeatMap(
    data,
    name=None,
    min_opacity=0.5,
    max_zoom=18,
    radius=25,
    blur=15,
    gradient=None,
    overlay=True,
    control=True,
    show=True,
    **kwargs,
)
Docstring:
Create a Heatmap layer

Parameters
----------
data : list of points of the form [lat, lng] or [lat, lng, weight]
    The points you want to plot. You can also provide a numpy.array of
    shape (n,2) or (n,3).
name : string, default None
    The name of the Layer, as it will appear in LayerControls.
min_opacity : default 1.
    The minimum opacity the heat will start at.
max_zoom : default 18
    Zoom level where the points reach maximum intensity (as intensity
    scales with zoom), equals maxZoom of the map by default
radius : int, default 25
    Radius of each "point" of the heatmap
blur : int, default 15
    Amount of blur
gradient : dict, default None
    Color gradient config. e.g. {0.4: 'blue', 0.65: 'lime', 1: 'red'}
overlay : bool, default True
    Adds the layer as an optional overlay (True) or the base layer (False).
control : bool, default True
    Whether the Layer will be included in LayerControls.
show : bool, default True
    Whether the layer will be shown on opening (only for overlays).
File: ~/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/folium/plugins/heat_map.py
Type: type
Subclasses:
New construction in Philadelphia requires a building permit, which we can pull from Open Data Philly.
Use the carto2gpd package to get the data.
The "current_date" variable in SQL databases
You can use the pre-defined "current_date" variable to get the current date. For example, to get the permits from the past 30 days, we could do:
SELECT * FROM permits WHERE permitissuedate >= current_date - 30
Selecting only new construction permits
To select new construction permits, you can use the "permitdescription" field. There are two relevant categories: "RESIDENTIAL BUILDING PERMIT" and "COMMERCIAL BUILDING PERMIT".
We can use the SQL IN operator (documentation) to easily select rows that have these categories.
import carto2gpd
# API URL
url = "https://phl.carto.com/api/v2/sql"
# Table name on CARTO
table_name = "permits"
# The where clause, with two parts
DAYS = 30
where = f"permitissuedate >= current_date - {DAYS}"
where += " and permitdescription IN ('RESIDENTIAL BUILDING PERMIT', 'COMMERCIAL BUILDING PERMIT')"
where
"permitissuedate >= current_date - 30 and permitdescription IN ('RESIDENTIAL BUILDING PERMIT', 'COMMERCIAL BUILDING PERMIT')"
# Run the query
permits = carto2gpd.get(url, table_name, where=where)
len(permits)
493
permits.head()
geometry | cartodb_id | objectid | permitnumber | addressobjectid | parcel_id_num | permittype | permitdescription | commercialorresidential | typeofwork | ... | unit_type | unit_num | zip | censustract | council_district | opa_owner | systemofrecord | geocode_x | geocode_y | posse_jobid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POINT (-75.16551 40.04532) | 4461 | 4898 | CP-2022-002038 | 15732535 | 210522 | BUILDING | COMMERCIAL BUILDING PERMIT | COMMERCIAL | ADDITION AND/OR ALTERATION | ... | None | None | 19144-1206 | 248 | 8 | GAROFALO HELEN V | ECLIPSE | 2.691986e+06 | 269953.280616 | 465300244 |
1 | POINT (-75.00539 40.03600) | 4768 | 4904 | CP-2022-003337 | 156760706 | 440714 | BUILDING | COMMERCIAL BUILDING PERMIT | COMMERCIAL | NEW CONSTRUCTION | ... | None | None | 19136-1604 | 9891 | 6 | PHILA MUNICIPAL AUTH | ECLIPSE | 2.736901e+06 | 267914.438827 | 486707349 |
2 | POINT (-75.09335 40.05158) | 4789 | 4909 | CP-2022-004987 | 184460163 | 352755 | BUILDING | COMMERCIAL BUILDING PERMIT | COMMERCIAL | ADDITION AND/OR ALTERATION | ... | None | None | 19111-5246 | 306 | 9 | GRACE YANG INC | ECLIPSE | 2.712111e+06 | 272832.652136 | 514998496 |
3 | POINT (-75.16733 40.06235) | 5514 | 5680 | RP-2022-005876 | 265435068 | 331238 | RESIDENTIAL BUILDING | RESIDENTIAL BUILDING PERMIT | RESIDENTIAL | ADDITION AND/OR ALTERATION | ... | None | None | 19150-3300 | 264 | 9 | CMPJ LLC | ECLIPSE | 2.691293e+06 | 276139.761238 | 480078356 |
4 | POINT (-75.22938 39.93547) | 5537 | 5683 | RP-2022-007783 | 132249898 | 366914 | RESIDENTIAL BUILDING | RESIDENTIAL BUILDING PERMIT | RESIDENTIAL | ADDITION AND/OR ALTERATION | ... | None | None | 19143-5516 | 65 | 2 | DH1 FUND LP | ECLIPSE | 2.675252e+06 | 229438.672154 | 497420889 |
5 rows × 33 columns
Some permits don't have locations; use the .geometry.notnull() function to trim the data frame to the permits with valid geometries.
permits = permits.loc[permits.geometry.notnull()].copy()
Note: We need an array of (latitude, longitude) pairs. Be careful about the order!
# Extract the latitude and longitude from the geometry column
permits['lat'] = permits.geometry.y
permits['lng'] = permits.geometry.x
permits.head()
geometry | cartodb_id | objectid | permitnumber | addressobjectid | parcel_id_num | permittype | permitdescription | commercialorresidential | typeofwork | ... | zip | censustract | council_district | opa_owner | systemofrecord | geocode_x | geocode_y | posse_jobid | lat | lng | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POINT (-75.16551 40.04532) | 4461 | 4898 | CP-2022-002038 | 15732535 | 210522 | BUILDING | COMMERCIAL BUILDING PERMIT | COMMERCIAL | ADDITION AND/OR ALTERATION | ... | 19144-1206 | 248 | 8 | GAROFALO HELEN V | ECLIPSE | 2.691986e+06 | 269953.280616 | 465300244 | 40.045317 | -75.165507 |
1 | POINT (-75.00539 40.03600) | 4768 | 4904 | CP-2022-003337 | 156760706 | 440714 | BUILDING | COMMERCIAL BUILDING PERMIT | COMMERCIAL | NEW CONSTRUCTION | ... | 19136-1604 | 9891 | 6 | PHILA MUNICIPAL AUTH | ECLIPSE | 2.736901e+06 | 267914.438827 | 486707349 | 40.036004 | -75.005385 |
2 | POINT (-75.09335 40.05158) | 4789 | 4909 | CP-2022-004987 | 184460163 | 352755 | BUILDING | COMMERCIAL BUILDING PERMIT | COMMERCIAL | ADDITION AND/OR ALTERATION | ... | 19111-5246 | 306 | 9 | GRACE YANG INC | ECLIPSE | 2.712111e+06 | 272832.652136 | 514998496 | 40.051579 | -75.093348 |
3 | POINT (-75.16733 40.06235) | 5514 | 5680 | RP-2022-005876 | 265435068 | 331238 | RESIDENTIAL BUILDING | RESIDENTIAL BUILDING PERMIT | RESIDENTIAL | ADDITION AND/OR ALTERATION | ... | 19150-3300 | 264 | 9 | CMPJ LLC | ECLIPSE | 2.691293e+06 | 276139.761238 | 480078356 | 40.062348 | -75.167333 |
4 | POINT (-75.22938 39.93547) | 5537 | 5683 | RP-2022-007783 | 132249898 | 366914 | RESIDENTIAL BUILDING | RESIDENTIAL BUILDING PERMIT | RESIDENTIAL | ADDITION AND/OR ALTERATION | ... | 19143-5516 | 65 | 2 | DH1 FUND LP | ECLIPSE | 2.675252e+06 | 229438.672154 | 497420889 | 39.935474 | -75.229383 |
5 rows × 35 columns
# Split out the residential and commercial
residential = permits.query("permitdescription == 'RESIDENTIAL BUILDING PERMIT'")
commercial = permits.query("permitdescription == 'COMMERCIAL BUILDING PERMIT'")
# Make a NumPy array (use the "values" attribute)
residential_coords = residential[['lat', 'lng']].values
commercial_coords = commercial[['lat', 'lng']].values
commercial_coords[:5]
array([[ 40.045317, -75.165507], [ 40.036004, -75.005385], [ 40.051579, -75.093348], [ 39.951968, -75.156049], [ 39.969465, -75.2167 ]])
The HeatMap takes a list of coordinates: the first column is latitude and the second column is longitude.
# Initialize map
m = folium.Map(
location=[39.99, -75.13],
tiles='Cartodb Positron',
zoom_start=12
)
# Add heat map coordinates
HeatMap(commercial_coords).add_to(m)
m
# Initialize map
m = folium.Map(
location=[39.99, -75.13],
tiles='Cartodb Positron',
zoom_start=12
)
# Add heat map
HeatMap(residential_coords, radius=15).add_to(m)
m
Commercial construction is concentrated in the greater Center City area, while residential construction happens primarily outside of Center City...
Now on to clustering...
https://scikit-learn.org/stable/
Note: We will focus on clustering algorithms today and discuss a few other machine learning techniques in the next two weeks. If there is a specific scikit-learn use case we won't cover, I'm open to ideas for incorporating it as part of the final project.
The goal
Partition a dataset into groups that have a similar set of attributes, or features, within the group and a dissimilar set of features between groups.
Minimize the intra-cluster variance and maximize the inter-cluster variance of features.
https://scikit-learn.org/stable/modules/clustering.html#overview-of-clustering-methods
Minimizes the intra-cluster variance: the sum of the squared distances between all points in a cluster and the cluster centroid.
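As a small illustration of that objective, the sketch below (using made-up 2D points and labels, not the course data) computes the sum of squared distances from each point to its cluster centroid, which is the quantity K-means tries to minimize (scikit-learn calls it the inertia):
import numpy as np

# Two made-up clusters of 2D points and their cluster labels
points = np.array([[1.0, 2.0], [1.5, 1.8], [1.2, 2.2],
                   [8.0, 8.0], [8.3, 7.7], [7.8, 8.4]])
labels = np.array([0, 0, 0, 1, 1, 1])

# Sum of squared distances from each point to its cluster centroid
inertia = 0.0
for k in np.unique(labels):
    cluster = points[labels == k]
    centroid = cluster.mean(axis=0)            # the cluster center
    inertia += ((cluster - centroid) ** 2).sum()

print(inertia)  # the intra-cluster variance that K-means minimizes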
import altair as alt
from vega_datasets import data as vega_data
Read the data from a URL:
gapminder = pd.read_csv(vega_data.gapminder_health_income.url)
gapminder.head()
country | income | health | population | |
---|---|---|---|---|
0 | Afghanistan | 1925 | 57.63 | 32526562 |
1 | Albania | 10620 | 76.00 | 2896679 |
2 | Algeria | 13434 | 76.50 | 39666519 |
3 | Andorra | 46577 | 84.10 | 70473 |
4 | Angola | 7615 | 61.00 | 25021974 |
(
alt.Chart(gapminder)
.mark_circle()
.encode(
alt.X("income:Q", scale=alt.Scale(type="log")),
alt.Y("health:Q", scale=alt.Scale(zero=False)),
size="population:Q",
tooltip=list(gapminder.columns),
)
.properties(width=800, height=600)
.interactive()
)
from sklearn.cluster import KMeans
Let's start with 5 clusters
KMeans?
Init signature: KMeans( n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True, algorithm='lloyd', ) Docstring: K-Means clustering. Read more in the :ref:`User Guide <k_means>`. Parameters ---------- n_clusters : int, default=8 The number of clusters to form as well as the number of centroids to generate. init : {'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++' Method for initialization: 'k-means++' : selects initial cluster centroids using sampling based on an empirical probability distribution of the points' contribution to the overall inertia. This technique speeds up convergence, and is theoretically proven to be :math:`\mathcal{O}(\log k)`-optimal. See the description of `n_init` for more details. 'random': choose `n_clusters` observations (rows) at random from data for the initial centroids. If an array is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. If a callable is passed, it should take arguments X, n_clusters and a random state and return an initialization. n_init : int, default=10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. max_iter : int, default=300 Maximum number of iterations of the k-means algorithm for a single run. tol : float, default=1e-4 Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare convergence. verbose : int, default=0 Verbosity mode. random_state : int, RandomState instance or None, default=None Determines random number generation for centroid initialization. Use an int to make the randomness deterministic. See :term:`Glossary <random_state>`. copy_x : bool, default=True When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True (default), then the original data is not modified. If False, the original data is modified, and put back before the function returns, but small numerical differences may be introduced by subtracting and then adding the data mean. Note that if the original data is not C-contiguous, a copy will be made even if copy_x is False. If the original data is sparse, but not in CSR format, a copy will be made even if copy_x is False. algorithm : {"lloyd", "elkan", "auto", "full"}, default="lloyd" K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. The `"elkan"` variation can be more efficient on some datasets with well-defined clusters, by using the triangle inequality. However it's more memory intensive due to the allocation of an extra array of shape `(n_samples, n_clusters)`. `"auto"` and `"full"` are deprecated and they will be removed in Scikit-Learn 1.3. They are both aliases for `"lloyd"`. .. versionchanged:: 0.18 Added Elkan algorithm .. versionchanged:: 1.1 Renamed "full" to "lloyd", and deprecated "auto" and "full". Changed "auto" to use "lloyd" instead of "elkan". Attributes ---------- cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers. If the algorithm stops before fully converging (see ``tol`` and ``max_iter``), these will not be consistent with ``labels_``. labels_ : ndarray of shape (n_samples,) Labels of each point inertia_ : float Sum of squared distances of samples to their closest cluster center, weighted by the sample weights if provided. 
n_iter_ : int Number of iterations run. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- MiniBatchKMeans : Alternative online implementation that does incremental updates of the centers positions using mini-batches. For large scale learning (say n_samples > 10k) MiniBatchKMeans is probably much faster than the default batch implementation. Notes ----- The k-means problem is solved using either Lloyd's or Elkan's algorithm. The average complexity is given by O(k n T), where n is the number of samples and T is the number of iteration. The worst case complexity is given by O(n^(k+2/p)) with n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii, 'How slow is the k-means method?' SoCG2006) In practice, the k-means algorithm is very fast (one of the fastest clustering algorithms available), but it falls in local minima. That's why it can be useful to restart it several times. If the algorithm stops before fully converging (because of ``tol`` or ``max_iter``), ``labels_`` and ``cluster_centers_`` will not be consistent, i.e. the ``cluster_centers_`` will not be the means of the points in each cluster. Also, the estimator will reassign ``labels_`` after the last iteration to make ``labels_`` consistent with ``predict`` on the training set. Examples -------- >>> from sklearn.cluster import KMeans >>> import numpy as np >>> X = np.array([[1, 2], [1, 4], [1, 0], ... [10, 2], [10, 4], [10, 0]]) >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X) >>> kmeans.labels_ array([1, 1, 1, 0, 0, 0], dtype=int32) >>> kmeans.predict([[0, 0], [12, 3]]) array([1, 0], dtype=int32) >>> kmeans.cluster_centers_ array([[10., 2.], [ 1., 2.]]) File: ~/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py Type: ABCMeta Subclasses:
kmeans = KMeans(n_clusters=5)
Lots of optional parameters, but n_clusters is the most important.
Use the fit() function:
kmeans.fit(gapminder[['income']]);
Use the labels_ attribute:
gapminder['label'] = kmeans.labels_
gapminder.groupby('label').size()
label
0    106
1     24
2      6
3     50
4      1
dtype: int64
(
alt.Chart(gapminder)
.mark_circle()
.encode(
alt.X("income:Q", scale=alt.Scale(type="log")),
alt.Y("health:Q", scale=alt.Scale(zero=False)),
size="population:Q",
color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
tooltip=list(gapminder.columns),
)
.properties(width=800, height=600)
.interactive()
)
gapminder.groupby("label")['income'].mean().sort_values()
label
0      5279.830189
3     21040.820000
1     42835.500000
2     74966.166667
4    132877.000000
Name: income, dtype: float64
Data is nicely partitioned into income levels
# Fit all three columns
kmeans.fit(gapminder[['income', 'health', 'population']])
# Extract the labels
gapminder['label'] = kmeans.labels_
gapminder
country | income | health | population | label | |
---|---|---|---|---|---|
0 | Afghanistan | 1925 | 57.63 | 32526562 | 4 |
1 | Albania | 10620 | 76.00 | 2896679 | 4 |
2 | Algeria | 13434 | 76.50 | 39666519 | 0 |
3 | Andorra | 46577 | 84.10 | 70473 | 4 |
4 | Angola | 7615 | 61.00 | 25021974 | 4 |
... | ... | ... | ... | ... | ... |
182 | Vietnam | 5623 | 76.50 | 93447601 | 0 |
183 | West Bank and Gaza | 4319 | 75.20 | 4668466 | 4 |
184 | Yemen | 3887 | 67.60 | 26832215 | 4 |
185 | Zambia | 4034 | 58.96 | 16211767 | 4 |
186 | Zimbabwe | 1801 | 60.01 | 15602751 | 4 |
187 rows × 5 columns
(
alt.Chart(gapminder)
.mark_circle()
.encode(
alt.X("income:Q", scale=alt.Scale(type="log")),
alt.Y("health:Q", scale=alt.Scale(zero=False)),
size="population:Q",
color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
tooltip=list(gapminder.columns),
)
.properties(width=800, height=600)
.interactive()
)
What's wrong?
K-means is distance-based, but our features have wildly different distance scales
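A quick way to see the problem is to compare the spread of the raw features; population is several orders of magnitude larger than income or health, so it dominates the Euclidean distances that K-means uses:
# Compare the spread of each raw feature
gapminder[['income', 'health', 'population']].std()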
The solution: re-scale the features using the StandardScaler class.
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Use the fit_transform() function to scale your features:
gapminder_scaled = scaler.fit_transform(gapminder[['income', 'health', 'population']])
Important: the fit_transform() function converts the DataFrame to a numpy array:
# fit_transform() converts the data into a numpy array
gapminder_scaled[:5]
array([[-0.79481258, -1.8171424 , -0.04592039], [-0.34333373, 0.55986273, -0.25325837], [-0.1972197 , 0.62456075, 0.00404216], [ 1.52369617, 1.6079706 , -0.27303503], [-0.49936524, -1.38107777, -0.09843447]])
# mean of zero
gapminder_scaled.mean(axis=0)
array([ 8.07434927e-17, -1.70511258e-15, -1.89984689e-17])
# standard deviation of one
gapminder_scaled.std(axis=0)
array([1., 1., 1.])
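If you prefer to keep working with labeled columns, one optional convenience (the column names below simply mirror the ones we scaled) is to wrap the scaled array back into a DataFrame; the rest of the lecture continues with the raw array:
# Optional: wrap the scaled array back into a labeled DataFrame
scaled_df = pd.DataFrame(
    gapminder_scaled,
    columns=['income', 'health', 'population'],
    index=gapminder.index,
)
scaled_df.head()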
# Perform the fit
kmeans.fit(gapminder_scaled)
# Extract the labels
gapminder['label'] = kmeans.labels_
(
alt.Chart(gapminder)
.mark_circle()
.encode(
alt.X("income:Q", scale=alt.Scale(type="log")),
alt.Y("health:Q", scale=alt.Scale(zero=False)),
size="population:Q",
color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
tooltip=list(gapminder.columns),
)
.properties(width=800, height=600)
.interactive()
)
# Number of countries per cluster
gapminder.groupby("label").size()
label
0    86
1     2
2    32
3    61
4     6
dtype: int64
# Average population per cluster
gapminder.groupby("label")['population'].mean().sort_values() / 1e6
label
4       2.988746
3      21.441296
0      26.239937
2      32.501032
1    1343.549735
Name: population, dtype: float64
# Average life expectancy per cluster
gapminder.groupby("label")['health'].mean().sort_values()
label
3    62.232951
1    71.850000
0    74.313837
2    80.806250
4    81.033333
Name: health, dtype: float64
# Average income per cluster
gapminder.groupby("label")['income'].mean().sort_values() / 1e3
label
3     4.150623
1     9.618500
0    13.229907
2    40.322094
4    86.987500
Name: income, dtype: float64
gapminder.loc[gapminder['label']==4]
country | income | health | population | label | |
---|---|---|---|---|---|
24 | Brunei | 73003 | 78.7 | 423188 | 4 |
88 | Kuwait | 82633 | 80.7 | 3892115 | 4 |
97 | Luxembourg | 88314 | 81.1 | 567110 | 4 |
124 | Norway | 64304 | 81.6 | 5210967 | 4 |
134 | Qatar | 132877 | 82.0 | 2235355 | 4 |
145 | Singapore | 80794 | 82.1 | 5603740 | 4 |
kmeans.inertia_
80.84493050696736
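The inertia_ attribute is exactly the quantity described earlier: the sum of squared distances from each (scaled) point to its assigned cluster center. A quick sanity check, reusing the fitted labels and centers from above:
# Recompute the inertia by hand from the fitted labels and cluster centers
assigned_centers = kmeans.cluster_centers_[kmeans.labels_]
manual_inertia = ((gapminder_scaled - assigned_centers) ** 2).sum()
manual_inertia  # should match kmeans.inertia_ up to floating-point error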
I've extracted neighborhood Airbnb statistics for Philadelphia neighborhoods from Tom Slee's website.
The data includes average price per person, overall satisfaction, and number of listings.
Original research study: How Airbnb's Data Hid the Facts in New York City
The data is available in CSV format ("philly_airbnb_by_neighborhoods.csv") in the "data/" folder of the repository.
airbnb = pd.read_csv("data/philly_airbnb_by_neighborhoods.csv")
airbnb.head()
neighborhood | price_per_person | overall_satisfaction | N | |
---|---|---|---|---|
0 | ALLEGHENY_WEST | 120.791667 | 4.666667 | 23 |
1 | BELLA_VISTA | 87.407920 | 3.158333 | 204 |
2 | BELMONT | 69.425000 | 3.250000 | 11 |
3 | BREWERYTOWN | 71.788188 | 1.943182 | 142 |
4 | BUSTLETON | 55.833333 | 1.250000 | 19 |
We will cluster the neighborhoods using three features: price_per_person, overall_satisfaction, and N.
# Initialize the Kmeans object
kmeans = KMeans(n_clusters=5, random_state=42)
# Scale the data features we want
scaler = StandardScaler()
scaled_airbnb_data = scaler.fit_transform(airbnb[['price_per_person', 'overall_satisfaction', 'N']])
# Run the fit!
kmeans.fit(scaled_airbnb_data)
KMeans(n_clusters=5, random_state=42)
# Save the cluster labels
airbnb["label"] = kmeans.labels_
# New column "label"!
airbnb.head()
neighborhood | price_per_person | overall_satisfaction | N | label | |
---|---|---|---|---|---|
0 | ALLEGHENY_WEST | 120.791667 | 4.666667 | 23 | 2 |
1 | BELLA_VISTA | 87.407920 | 3.158333 | 204 | 2 |
2 | BELMONT | 69.425000 | 3.250000 | 11 | 2 |
3 | BREWERYTOWN | 71.788188 | 1.943182 | 142 | 1 |
4 | BUSTLETON | 55.833333 | 1.250000 | 19 | 1 |
To gain some insight into our clusters, after calculating the K-Means labels:
Group by the label column
Calculate the mean() of each of our features
airbnb.groupby('label', as_index=False).size()
label | size | |
---|---|---|
0 | 0 | 21 |
1 | 1 | 23 |
2 | 2 | 59 |
3 | 3 | 1 |
4 | 4 | 1 |
airbnb.groupby('label', as_index=False).mean().sort_values(by='price_per_person')
label | price_per_person | overall_satisfaction | N | |
---|---|---|---|---|
2 | 2 | 67.526755 | 3.232236 | 69.864407 |
1 | 1 | 78.935232 | 0.920238 | 38.347826 |
0 | 0 | 119.329173 | 2.947605 | 313.000000 |
4 | 4 | 136.263996 | 3.000924 | 1499.000000 |
3 | 3 | 387.626984 | 5.000000 | 31.000000 |
airbnb.loc[airbnb['label'] == 2]
neighborhood | price_per_person | overall_satisfaction | N | label | |
---|---|---|---|---|---|
0 | ALLEGHENY_WEST | 120.791667 | 4.666667 | 23 | 2 |
1 | BELLA_VISTA | 87.407920 | 3.158333 | 204 | 2 |
2 | BELMONT | 69.425000 | 3.250000 | 11 | 2 |
5 | CALLOWHILL | 94.733269 | 3.017241 | 143 | 2 |
6 | CEDAR_PARK | 39.053171 | 3.220000 | 261 | 2 |
8 | CHESTNUT_HILL | 69.123478 | 3.444444 | 74 | 2 |
9 | CHINATOWN | 48.476667 | 2.984848 | 101 | 2 |
11 | DICKINSON_NARROWS | 62.626393 | 3.227273 | 125 | 2 |
12 | EASTWICK | 63.566667 | 2.142857 | 18 | 2 |
13 | EAST_FALLS | 64.491228 | 2.522222 | 156 | 2 |
14 | EAST_KENSINGTON | 58.984428 | 2.602564 | 108 | 2 |
15 | EAST_OAK_LANE | 113.243590 | 4.800000 | 20 | 2 |
17 | EAST_PASSYUNK | 81.253676 | 3.716981 | 190 | 2 |
18 | EAST_POPLAR | 69.164368 | 3.350000 | 36 | 2 |
24 | FRANKFORD | 28.500000 | 3.583333 | 19 | 2 |
25 | FRANKLINVILLE | 74.968750 | 2.409091 | 12 | 2 |
26 | GARDEN_COURT | 45.954375 | 3.467391 | 93 | 2 |
27 | GERMANTOWN_EAST | 86.875000 | 3.750000 | 12 | 2 |
28 | GERMANTOWN_MORTON | 37.357143 | 3.750000 | 12 | 2 |
29 | GERMANTOWN_PENN_KNOX | 36.030702 | 4.666667 | 21 | 2 |
30 | GERMANTOWN_SOUTHWEST | 39.105263 | 3.083333 | 25 | 2 |
31 | GERMANTOWN_WESTSIDE | 49.830128 | 2.375000 | 27 | 2 |
32 | GERMANTOWN_WEST_CENT | 57.004884 | 2.891892 | 90 | 2 |
33 | GERMANY_HILL | 135.208333 | 5.000000 | 20 | 2 |
34 | GIRARD_ESTATES | 90.461111 | 2.875000 | 78 | 2 |
36 | GRAYS_FERRY | 66.833535 | 3.035714 | 65 | 2 |
37 | GREENWICH | 57.557576 | 2.500000 | 14 | 2 |
39 | HARROWGATE | 24.796429 | 3.458333 | 16 | 2 |
40 | HARTRANFT | 77.105300 | 2.800000 | 73 | 2 |
41 | HAVERFORD_NORTH | 102.025000 | 4.000000 | 11 | 2 |
44 | KINGSESSING | 41.005000 | 2.781250 | 44 | 2 |
46 | LOGAN | 63.582598 | 2.800000 | 18 | 2 |
48 | LOWER_MOYAMENSING | 72.605479 | 2.838710 | 100 | 2 |
49 | LUDLOW | 63.333333 | 4.400000 | 11 | 2 |
51 | MANTUA | 96.527778 | 2.968750 | 162 | 2 |
53 | MILL_CREEK | 53.020833 | 4.357143 | 37 | 2 |
54 | MOUNT_AIRY_EAST | 47.598086 | 3.435484 | 93 | 2 |
55 | MOUNT_AIRY_WEST | 52.799916 | 3.468750 | 115 | 2 |
56 | NEWBOLD | 82.836628 | 2.976190 | 121 | 2 |
59 | OGONTZ | 35.389194 | 2.250000 | 13 | 2 |
61 | OLD_KENSINGTON | 75.641209 | 2.945946 | 118 | 2 |
62 | OLNEY | 66.591479 | 2.416667 | 22 | 2 |
66 | PASCHALL | 21.722222 | 4.071429 | 12 | 2 |
67 | PASSYUNK_SQUARE | 86.239845 | 3.742647 | 232 | 2 |
68 | PENNSPORT | 88.773897 | 3.100000 | 100 | 2 |
71 | POWELTON | 77.697032 | 2.673077 | 79 | 2 |
74 | RICHMOND | 79.432611 | 2.613636 | 71 | 2 |
77 | ROXBOROUGH | 88.504646 | 3.270833 | 124 | 2 |
81 | SOUTHWEST_SCHUYLKILL | 83.098639 | 2.375000 | 26 | 2 |
84 | STADIUM_DISTRICT | 118.981061 | 2.875000 | 31 | 2 |
87 | TIOGA | 44.636364 | 3.833333 | 14 | 2 |
89 | UPPER_ROXBOROUGH | 96.724194 | 3.230769 | 43 | 2 |
90 | WALNUT_HILL | 55.033582 | 3.104167 | 82 | 2 |
92 | WEST_KENSINGTON | 39.422287 | 3.119048 | 54 | 2 |
94 | WEST_PASSYUNK | 48.846491 | 3.500000 | 48 | 2 |
96 | WEST_POWELTON | 59.056780 | 2.733333 | 139 | 2 |
97 | WHITMAN | 63.386458 | 3.409091 | 50 | 2 |
98 | WISSAHICKON | 62.733060 | 2.600000 | 83 | 2 |
101 | WOODLAND_TERRACE | 66.902778 | 3.062500 | 22 | 2 |
airbnb.loc[airbnb['label'] == 3]
neighborhood | price_per_person | overall_satisfaction | N | label | |
---|---|---|---|---|---|
78 | SHARSWOOD | 387.626984 | 5.0 | 31 | 3 |
airbnb.loc[airbnb['label'] == 4]
neighborhood | price_per_person | overall_satisfaction | N | label | |
---|---|---|---|---|---|
75 | RITTENHOUSE | 136.263996 | 3.000924 | 1499 | 4 |
Load the neighborhood geometries from ./data/philly_neighborhoods.geojson; the categorical=True and legend=True keywords will be useful here.
hoods = gpd.read_file("./data/philly_neighborhoods.geojson")
hoods.head()
Name | geometry | |
---|---|---|
0 | LAWNDALE | POLYGON ((-75.08616 40.05013, -75.08893 40.044... |
1 | ASTON_WOODBRIDGE | POLYGON ((-75.00860 40.05369, -75.00861 40.053... |
2 | CARROLL_PARK | POLYGON ((-75.22673 39.97720, -75.22022 39.974... |
3 | CHESTNUT_HILL | POLYGON ((-75.21278 40.08637, -75.21272 40.086... |
4 | BURNHOLME | POLYGON ((-75.08768 40.06861, -75.08758 40.068... |
airbnb.head()
neighborhood | price_per_person | overall_satisfaction | N | label | |
---|---|---|---|---|---|
0 | ALLEGHENY_WEST | 120.791667 | 4.666667 | 23 | 2 |
1 | BELLA_VISTA | 87.407920 | 3.158333 | 204 | 2 |
2 | BELMONT | 69.425000 | 3.250000 | 11 | 2 |
3 | BREWERYTOWN | 71.788188 | 1.943182 | 142 | 1 |
4 | BUSTLETON | 55.833333 | 1.250000 | 19 | 1 |
# do the merge
airbnb2 = hoods.merge(airbnb, left_on='Name', right_on='neighborhood', how='left')
# assign -1 to the neighborhoods without any listings
airbnb2['label'] = airbnb2['label'].fillna(-1)
# convert to the Web Mercator CRS for plotting
airbnb2 = airbnb2.to_crs(epsg=3857)
# setup the figure
f, ax = plt.subplots(figsize=(10, 8))
# plot, coloring by label column
# specify categorical data and add legend
airbnb2.plot(
column="label",
cmap="Dark2",
categorical=True,
legend=True,
edgecolor="k",
lw=0.5,
ax=ax,
)
ax.set_axis_off()
plt.axis("equal");
Use Altair to plot the clustering results, with a tooltip showing the neighborhood name and cluster label.
Hint: see week 3B's lecture on interactive choropleths with Altair.
# Plot the map; the variables are nested within `properties`
(
alt.Chart(airbnb2.to_crs(epsg=4326))
.mark_geoshape()
.properties(
width=500,
height=400,
)
.encode(
tooltip=["Name:N", "label:N"],
color=alt.Color("label:N", scale=alt.Scale(scheme="Dark2")),
)
)
Group 2!