In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
np.random.seed(42)

Week 10: Clustering Analysis in Python¶

Nov 7, 2022

Housekeeping¶

  • Assignment #5 (optional) due one week from today (11/14)
  • Assignment #6 (optional) will be assigned this Wednesday 11/9, due on 11/21
  • Remember, you have to do one of homeworks #4, #5, or #6

Recap¶

  • Last week: urban street networks + interactive web maps
  • New tools: OSMnx, Pandana, and Folium

Where we left off last week: choropleth maps in Folium¶

Example: A map of households without internet for US counties

Two ways:

  • The easy way: folium.Choropleth
  • The hard way: folium.GeoJson
In [2]:
import folium

Load the data, remove counties with no households and add our new column:

In [3]:
# Load CSV data from the data/ folder
census_data = pd.read_csv("./data/internet_avail_census.csv", dtype={"geoid": str})

# Remove counties with no households
valid = census_data['universe'] > 0
census_data = census_data.loc[valid]

# Calculate the percent without internet
census_data['percent_no_internet'] = census_data['no_internet'] / census_data['universe']
In [4]:
census_data.head()
Out[4]:
NAME universe no_internet state county geoid percent_no_internet
0 Washington County, Mississippi 18299.0 6166.0 28 151 28151 0.336958
1 Perry County, Mississippi 4563.0 1415.0 28 111 28111 0.310103
2 Choctaw County, Mississippi 3164.0 1167.0 28 19 28019 0.368837
3 Itawamba County, Mississippi 8706.0 1970.0 28 57 28057 0.226281
4 Carroll County, Mississippi 3658.0 1218.0 28 15 28015 0.332969

Load the counties geometry data too:

In [5]:
# Load the counties GeoJSON from the data/ folder
counties = gpd.read_file("./data/us-counties-10m.geojson")
In [6]:
counties.head()
Out[6]:
id geometry
0 53073 MULTIPOLYGON (((-120.85361 49.00011, -120.7674...
1 30105 POLYGON ((-106.11238 48.99904, -106.15187 48.8...
2 30029 POLYGON ((-114.06985 48.99904, -114.05908 48.8...
3 16021 POLYGON ((-116.04755 49.00065, -116.04755 48.5...
4 30071 POLYGON ((-107.17840 49.00011, -107.20712 48.9...

The hard way: use folium.GeoJson¶

  • The good:
    • More customizable, and can add user interaction
  • The bad:
    • Requires more work
    • No way to add a built-in legend, see this open issue on GitHub (a partial workaround is sketched below)
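A partial workaround, sketched below, is to attach a standalone colorbar from the branca package (which folium uses internally) to the map. It is not linked to the GeoJson layer, but it reads as a legend. The names m, min_val, and max_val are assumed to match the map object and normalization bounds defined later in this notebook.

# Sketch of a legend workaround: add a standalone branca colorbar to the map
# (assumes `m` is the folium.Map and `min_val`/`max_val` bound the data column)
import branca.colormap as cm

colorbar = cm.linear.RdPu_09.scale(min_val, max_val)
colorbar.caption = "Percent of households without internet"
colorbar.add_to(m)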

The steps involved¶

  1. Join the data and geometry features into a single GeoDataFrame
  2. Normalize the data column to be between 0 and 1
  3. Define functions to style features based on their data values
  4. Convert to GeoJSON, create the GeoJson layer, and add it to the map

Step 1: Join the census data and features¶

Note: this is different from folium.Choropleth, where the data and the geometry features are stored in two separate data frames (a minimal sketch of that approach is shown below for comparison).
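For comparison, a minimal sketch of the "easy way" with folium.Choropleth, which takes the geometries and the data as two separate arguments and joins them on a key column (column names here match the data loaded above):

# Sketch of the "easy way": folium.Choropleth joins geometries and data by key
m_easy = folium.Map(location=[40, -98], zoom_start=4)

folium.Choropleth(
    geo_data=counties,                         # GeoDataFrame (converted to GeoJSON internally)
    data=census_data,                          # DataFrame holding the values to plot
    columns=["geoid", "percent_no_internet"],  # [key column, value column] in `data`
    key_on="feature.properties.id",            # where the matching key lives in the GeoJSON
    fill_color="RdPu",
    fill_opacity=0.75,
    line_weight=0.5,
    legend_name="Percent of households without internet",
).add_to(m_easy)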

In [7]:
# Merge the county geometries with census data
# Left column: "id"
# Right column: "geoid"
census_joined = counties.merge(census_data, left_on="id", right_on="geoid")
In [8]:
census_joined.head()
Out[8]:
id geometry NAME universe no_internet state county geoid percent_no_internet
0 53073 MULTIPOLYGON (((-120.85361 49.00011, -120.7674... Whatcom County, Washington 85008.0 9189.0 53 73 53073 0.108096
1 30105 POLYGON ((-106.11238 48.99904, -106.15187 48.8... Valley County, Montana 3436.0 672.0 30 105 30105 0.195576
2 30029 POLYGON ((-114.06985 48.99904, -114.05908 48.8... Flathead County, Montana 38252.0 5662.0 30 29 30029 0.148018
3 16021 POLYGON ((-116.04755 49.00065, -116.04755 48.5... Boundary County, Idaho 4605.0 1004.0 16 21 16021 0.218024
4 30071 POLYGON ((-107.17840 49.00011, -107.20712 48.9... Phillips County, Montana 1770.0 484.0 30 71 30071 0.273446

Step 2: Normalize the data column to the range 0 to 1¶

  • We will use a matplotlib color map that requires data to be between 0 and 1
  • Normalize our "percent_no_internet" column to be between 0 and 1
In [9]:
# Minimum
min_val = census_joined['percent_no_internet'].min()

# Maximum
max_val = census_joined['percent_no_internet'].max()

# Calculate a normalized column
normalized = (census_joined['percent_no_internet'] - min_val) / (max_val - min_val)

# Add to the dataframe
census_joined['percent_no_internet_normalized'] = normalized

Step 3: Define our style functions¶

  • Create a matplotlib colormap object using plt.get_cmap()
  • Color map objects are functions: give the function a number between 0 and 1 and it will return a corresponding color from the color map
  • Based on the feature data, evaluate the color map and convert to a hex string
In [10]:
import matplotlib.colors as mcolors
In [11]:
# Use a red-purple colorbrewer color scheme
cmap = plt.get_cmap('RdPu')
In [12]:
# The minimum value of the color map as an RGB tuple
cmap(0)
Out[12]:
(1.0, 0.9686274509803922, 0.9529411764705882, 1.0)
In [13]:
# The minimum value of the color map as a hex string
mcolors.rgb2hex(cmap(0.0))
Out[13]:
'#fff7f3'
In [14]:
# The maximum value of the color map as a hex string
mcolors.rgb2hex(cmap(1.0))
Out[14]:
'#49006a'
In [15]:
def get_style(feature):
    """
    Given an input GeoJSON feature, return a style dict.
    
    Notes
    -----
    The color in the style dict is determined by the 
    "percent_no_internet_normalized" column in the 
    input "feature".
    """
    # Get the data value from the feature
    value = feature['properties']['percent_no_internet_normalized']
    
    # Evaluate the color map
    # NOTE: value must be between 0 and 1
    rgb_color = cmap(value) # this is an RGB tuple
    
    # Convert to hex string
    color = mcolors.rgb2hex(rgb_color)
    
    # Return the style dictionary
    return {'weight': 0.5, 'color': color, 'fillColor': color, "fillOpacity": 0.75}
In [16]:
def get_highlighted_style(feature):
    """
    Return a style dict to use when the user highlights a 
    feature with the mouse.
    """
    
    return {"weight": 3, "color": "black"}

Step 4: Convert our data to GeoJSON¶

  • Tip: To limit the amount of data Folium has to process, it's best to trim our GeoDataFrame to only the columns we'll need before converting to GeoJSON
  • You can use the .to_json() function to convert to a GeoJSON string
In [17]:
needed_cols = ['NAME', 'percent_no_internet', 'percent_no_internet_normalized', 'geometry']
census_json = census_joined[needed_cols].to_json()
In [18]:
# STEP 1: Initialize the map
m = folium.Map(location=[40, -98], zoom_start=4)

# STEP 2: Add the GeoJson to the map
folium.GeoJson(
    census_json, # The geometry + data columns in GeoJSON format
    style_function=get_style, # The style function to color counties differently
    highlight_function=get_highlighted_style, 
    tooltip=folium.GeoJsonTooltip(['NAME', 'percent_no_internet'])
).add_to(m)



# avoid a rendering bug by saving as HTML and re-loading
m.save('percent_no_internet.html')

And voilà!¶

The hard way is harder, but we have a tooltip and highlight interactivity!

In [19]:
from IPython.display import IFrame
In [20]:
IFrame('percent_no_internet.html', width=800, height=500)
Out[20]:

At-home exercise: Can we repeat this with altair?¶

Try to replicate the above interactive map exactly (minus the background tiles). This includes:

  • Using the red-purple colorbrewer scheme
  • Having a tooltip with the percentage and county name

Note: Altair's syntax is similar to the folium.Choropleth syntax — you should pass the counties GeoDataFrame to alt.Chart() and then use transform_lookup() to merge those geometries with the census data and pull in the column we need ("percent_no_internet").

Hints

  • The altair example gallery includes a good choropleth example: https://altair-viz.github.io/gallery/choropleth.html
  • See altair documentation on changing the color scheme and the Vega documentation for the names of the allowed color schemes in altair
  • You'll want to specify the projection type as "albersUsa"
In [21]:
import altair as alt
In [22]:
# Initialize the chart with the counties data
alt.Chart(counties).mark_geoshape(stroke="white", strokeWidth=0.25).encode(
    # Encode the color 
    color=alt.Color(
        "percent_no_internet:Q",
        title="Percent Without Internet",
        scale=alt.Scale(scheme="redpurple"),
        legend=alt.Legend(format=".0%")
    ),
    # Tooltip
    tooltip=[
        alt.Tooltip("NAME:N", title="Name"),
        alt.Tooltip("percent_no_internet:Q", title="Percent Without Internet", format=".1%"),
    ],
).transform_lookup(
    lookup="id", # The column name in the counties data to match on
    from_=alt.LookupData(census_data, "geoid", ["percent_no_internet", "NAME"]), # Match census data on "geoid"
).project(
    type="albersUsa"
).properties(
    width=700, height=500
)
Out[22]:

Leaflet/Folium plugins¶

One of leaflet's strengths: a rich set of open-source plugins

https://leafletjs.com/plugins.html

Many of these are available in Folium!

Example: Heatmap¶

In [23]:
from folium.plugins import HeatMap
In [24]:
HeatMap?
Init signature:
HeatMap(
    data,
    name=None,
    min_opacity=0.5,
    max_zoom=18,
    radius=25,
    blur=15,
    gradient=None,
    overlay=True,
    control=True,
    show=True,
    **kwargs,
)
Docstring:     
Create a Heatmap layer

Parameters
----------
data : list of points of the form [lat, lng] or [lat, lng, weight]
    The points you want to plot.
    You can also provide a numpy.array of shape (n,2) or (n,3).
name : string, default None
    The name of the Layer, as it will appear in LayerControls.
min_opacity  : default 1.
    The minimum opacity the heat will start at.
max_zoom : default 18
    Zoom level where the points reach maximum intensity (as intensity
    scales with zoom), equals maxZoom of the map by default
radius : int, default 25
    Radius of each "point" of the heatmap
blur : int, default 15
    Amount of blur
gradient : dict, default None
    Color gradient config. e.g. {0.4: 'blue', 0.65: 'lime', 1: 'red'}
overlay : bool, default True
    Adds the layer as an optional overlay (True) or the base layer (False).
control : bool, default True
    Whether the Layer will be included in LayerControls.
show: bool, default True
    Whether the layer will be shown on opening (only for overlays).
File:           ~/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/folium/plugins/heat_map.py
Type:           type
Subclasses:     

Example: A heatmap of new construction permits in Philadelphia in the last 30 days¶

New construction in Philadelphia requires a building permit, which we can pull from Open Data Philly.

  • Data available from OpenDataPhilly: https://www.opendataphilly.org/dataset/licenses-and-inspections-building-permits
  • Query the database API directly to get the GeoJSON
  • We can use the carto2gpd package to get the data
  • API documentation: https://cityofphiladelphia.github.io/carto-api-explorer/#permits

Step 1: Download the data from CARTO¶

The "current_date" variable in SQL databases

You can use the pre-defined "current_date" variable to get the current date. For example, to get the permits from the past 30 days, we could do:

SELECT * FROM permits WHERE permitissuedate >= current_date - 30

Selecting only new construction permits

To select new construction permits, you can use the "permitdescription" field. There are two relevant categories:

  • "RESIDENTIAL BUILDING PERMIT"
  • "COMMERCIAL BUILDING PERMIT"

We can use the SQL IN command (documentation) to easily select rows that have these categories.

In [25]:
import carto2gpd
In [26]:
# API URL
url = "https://phl.carto.com/api/v2/sql"

# Table name on CARTO
table_name = "permits"

# The where clause, with two parts
DAYS = 30
where = f"permitissuedate >= current_date - {DAYS}"
where += " and permitdescription IN ('RESIDENTIAL BUILDING PERMIT', 'COMMERCIAL BUILDING PERMIT')"

where
Out[26]:
"permitissuedate >= current_date - 30 and permitdescription IN ('RESIDENTIAL BUILDING PERMIT', 'COMMERCIAL BUILDING PERMIT')"
In [27]:
# Run the query
permits = carto2gpd.get(url, table_name, where=where)
In [28]:
len(permits)
Out[28]:
493
In [29]:
permits.head()
Out[29]:
geometry cartodb_id objectid permitnumber addressobjectid parcel_id_num permittype permitdescription commercialorresidential typeofwork ... unit_type unit_num zip censustract council_district opa_owner systemofrecord geocode_x geocode_y posse_jobid
0 POINT (-75.16551 40.04532) 4461 4898 CP-2022-002038 15732535 210522 BUILDING COMMERCIAL BUILDING PERMIT COMMERCIAL ADDITION AND/OR ALTERATION ... None None 19144-1206 248 8 GAROFALO HELEN V ECLIPSE 2.691986e+06 269953.280616 465300244
1 POINT (-75.00539 40.03600) 4768 4904 CP-2022-003337 156760706 440714 BUILDING COMMERCIAL BUILDING PERMIT COMMERCIAL NEW CONSTRUCTION ... None None 19136-1604 9891 6 PHILA MUNICIPAL AUTH ECLIPSE 2.736901e+06 267914.438827 486707349
2 POINT (-75.09335 40.05158) 4789 4909 CP-2022-004987 184460163 352755 BUILDING COMMERCIAL BUILDING PERMIT COMMERCIAL ADDITION AND/OR ALTERATION ... None None 19111-5246 306 9 GRACE YANG INC ECLIPSE 2.712111e+06 272832.652136 514998496
3 POINT (-75.16733 40.06235) 5514 5680 RP-2022-005876 265435068 331238 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... None None 19150-3300 264 9 CMPJ LLC ECLIPSE 2.691293e+06 276139.761238 480078356
4 POINT (-75.22938 39.93547) 5537 5683 RP-2022-007783 132249898 366914 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... None None 19143-5516 65 2 DH1 FUND LP ECLIPSE 2.675252e+06 229438.672154 497420889

5 rows × 33 columns

Step 2: Remove missing geometries¶

Some permits don't have locations — use the .geometry.notnull() function to trim the data frame to the permits with valid geometries.

In [30]:
permits = permits.loc[permits.geometry.notnull()].copy()

Step 3: Extract out the lat/lng coordinates¶

Note: We need an array of (latitude, longitude) pairs. Be careful about the order!

In [31]:
# Extract the latitude and longitude from the geometry column
permits['lat'] = permits.geometry.y
permits['lng'] = permits.geometry.x
In [32]:
permits.head()
Out[32]:
geometry cartodb_id objectid permitnumber addressobjectid parcel_id_num permittype permitdescription commercialorresidential typeofwork ... zip censustract council_district opa_owner systemofrecord geocode_x geocode_y posse_jobid lat lng
0 POINT (-75.16551 40.04532) 4461 4898 CP-2022-002038 15732535 210522 BUILDING COMMERCIAL BUILDING PERMIT COMMERCIAL ADDITION AND/OR ALTERATION ... 19144-1206 248 8 GAROFALO HELEN V ECLIPSE 2.691986e+06 269953.280616 465300244 40.045317 -75.165507
1 POINT (-75.00539 40.03600) 4768 4904 CP-2022-003337 156760706 440714 BUILDING COMMERCIAL BUILDING PERMIT COMMERCIAL NEW CONSTRUCTION ... 19136-1604 9891 6 PHILA MUNICIPAL AUTH ECLIPSE 2.736901e+06 267914.438827 486707349 40.036004 -75.005385
2 POINT (-75.09335 40.05158) 4789 4909 CP-2022-004987 184460163 352755 BUILDING COMMERCIAL BUILDING PERMIT COMMERCIAL ADDITION AND/OR ALTERATION ... 19111-5246 306 9 GRACE YANG INC ECLIPSE 2.712111e+06 272832.652136 514998496 40.051579 -75.093348
3 POINT (-75.16733 40.06235) 5514 5680 RP-2022-005876 265435068 331238 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... 19150-3300 264 9 CMPJ LLC ECLIPSE 2.691293e+06 276139.761238 480078356 40.062348 -75.167333
4 POINT (-75.22938 39.93547) 5537 5683 RP-2022-007783 132249898 366914 RESIDENTIAL BUILDING RESIDENTIAL BUILDING PERMIT RESIDENTIAL ADDITION AND/OR ALTERATION ... 19143-5516 65 2 DH1 FUND LP ECLIPSE 2.675252e+06 229438.672154 497420889 39.935474 -75.229383

5 rows × 35 columns

In [33]:
# Split out the residential and commercial
residential = permits.query("permitdescription == 'RESIDENTIAL BUILDING PERMIT'")
commercial = permits.query("permitdescription == 'COMMERCIAL BUILDING PERMIT'")
In [34]:
# Make a NumPy array (use the "values" attribute)
residential_coords = residential[['lat', 'lng']].values
commercial_coords = commercial[['lat', 'lng']].values
In [35]:
commercial_coords[:5]
Out[35]:
array([[ 40.045317, -75.165507],
       [ 40.036004, -75.005385],
       [ 40.051579, -75.093348],
       [ 39.951968, -75.156049],
       [ 39.969465, -75.2167  ]])

Step 4: Make a Folium map and add a HeatMap¶

The HeatMap takes a list of coordinates: the first column is latitude and the second is longitude

Commercial building permits¶

In [36]:
# Initialize map
m = folium.Map(
    location=[39.99, -75.13],
    tiles='Cartodb Positron',
    zoom_start=12
)


# Add heat map coordinates
HeatMap(commercial_coords).add_to(m)

m
Out[36]:

Residential building permits¶

In [37]:
# Initialize map
m = folium.Map(
    location=[39.99, -75.13],
    tiles='Cartodb Positron',
    zoom_start=12
)


# Add heat map
HeatMap(residential_coords, radius=15).add_to(m)

m
Out[37]:

Takeaways¶

Commercial construction is concentrated in the greater Center City area, while residential construction happens primarily outside of Center City...

That's it for interactive maps w/ Folium...¶

Now on to clustering...

Clustering in Python¶

  • Both spatial and non-spatial datasets
  • Two new techniques:
    • Non-spatial: K-means
    • Spatial: DBSCAN
  • Two labs/exercises this week:
    1. Grouping Philadelphia neighborhoods by Airbnb listings
    2. Identifying clusters in taxi rides in NYC

"Machine learning"¶

  • The computer learns patterns and properties of an input data set without the user specifying them beforehand
  • Can be both supervised and unsupervised

Supervised¶

  • Example: classification
  • Given a training set of labeled data, learn to assign labels to new data

Unsupervised¶

  • Example: clustering
  • Identify structure / clusters in data without any prior knowledge

Machine learning in Python: scikit-learn¶

  • State-of-the-art machine learning in Python
  • Easy to use, lots of functionality
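The estimator API looks nearly the same in both the supervised and unsupervised cases: every estimator is an object with a fit() method. A minimal sketch (the specific estimators below are just illustrative choices, not part of this week's labs):

# Minimal sketch of the scikit-learn estimator API
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

X = np.array([[1.0, 2.0], [1.5, 1.8], [8.0, 8.0], [9.0, 9.5]])
y = np.array([0, 0, 1, 1])

# Supervised: learn a mapping from features X to known labels y
clf = LogisticRegression().fit(X, y)
print(clf.predict([[8.5, 9.0]]))   # predicted label for a new point

# Unsupervised: find structure in X without any labels
km = KMeans(n_clusters=2, n_init=10).fit(X)
print(km.labels_)                  # cluster assignment for each row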

Clustering is just one (of many) features¶

https://scikit-learn.org/stable/

Note: We will focus on clustering algorithms today and discuss a few other machine learning techniques in the next two weeks. If there is a specific scikit-learn use case we won't cover, I'm open to ideas for incorporating it as part of the final project.

Part 1: Non-spatial clustering¶

The goal

Partition a dataset into groups that have a similar set of attributes, or features, within the group and a dissimilar set of features between groups.

Minimize the intra-cluster variance and maximize the inter-cluster variance of features.

Some intuition¶

K-Means clustering¶

  • Simple but robust clustering algorithm
  • Widely used
  • Important: user must specify the number of clusters
  • Cannot be used to find density-based clusters

This is just one of several clustering methods¶

https://scikit-learn.org/stable/modules/clustering.html#overview-of-clustering-methods

A good introduction¶

Andrew Ng's Coursera lecture

How does it work?¶

Minimizes the intra-cluster variance: minimizes the sum of the squared distances between all points in a cluster and the cluster centroid
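In equation form, K-means chooses cluster assignments $C_1, \dots, C_k$ and centroids $\mu_1, \dots, \mu_k$ to minimize the within-cluster sum of squares (scikit-learn calls this the inertia):

$$\min_{C_1, \dots, C_k} \; \sum_{i=1}^{k} \sum_{x \in C_i} \lVert x - \mu_i \rVert^2, \qquad \mu_i = \frac{1}{|C_i|} \sum_{x \in C_i} x$$

The algorithm alternates between assigning each point to its nearest centroid and recomputing each centroid as the mean of its assigned points, until the assignments stop changing.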

K-means in action¶

Example: Clustering countries by health and income¶

  • Life expectancy (in years) vs. GDP per capita and population for 187 countries (as of 2015)
  • Data from Gapminder
In [38]:
import altair as alt
from vega_datasets import data as vega_data

Read the data from a URL:

In [39]:
gapminder = pd.read_csv(vega_data.gapminder_health_income.url)
gapminder.head()
Out[39]:
country income health population
0 Afghanistan 1925 57.63 32526562
1 Albania 10620 76.00 2896679
2 Algeria 13434 76.50 39666519
3 Andorra 46577 84.10 70473
4 Angola 7615 61.00 25021974

Plot it with altair¶

In [40]:
(
    alt.Chart(gapminder)
    .mark_circle()
    .encode(
        alt.X("income:Q", scale=alt.Scale(type="log")),
        alt.Y("health:Q", scale=alt.Scale(zero=False)),
        size="population:Q",
        tooltip=list(gapminder.columns),
    )
    .properties(width=800, height=600)
    .interactive()
)
Out[40]:

K-Means with scikit-learn¶

In [41]:
from sklearn.cluster import KMeans

Let's start with 5 clusters

In [42]:
KMeans?
Init signature:
KMeans(
    n_clusters=8,
    *,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    verbose=0,
    random_state=None,
    copy_x=True,
    algorithm='lloyd',
)
Docstring:     
K-Means clustering.

Read more in the :ref:`User Guide <k_means>`.

Parameters
----------

n_clusters : int, default=8
    The number of clusters to form as well as the number of
    centroids to generate.

init : {'k-means++', 'random'}, callable or array-like of shape             (n_clusters, n_features), default='k-means++'
    Method for initialization:

    'k-means++' : selects initial cluster centroids using sampling based on
    an empirical probability distribution of the points' contribution to the
    overall inertia. This technique speeds up convergence, and is
    theoretically proven to be :math:`\mathcal{O}(\log k)`-optimal.
    See the description of `n_init` for more details.

    'random': choose `n_clusters` observations (rows) at random from data
    for the initial centroids.

    If an array is passed, it should be of shape (n_clusters, n_features)
    and gives the initial centers.

    If a callable is passed, it should take arguments X, n_clusters and a
    random state and return an initialization.

n_init : int, default=10
    Number of time the k-means algorithm will be run with different
    centroid seeds. The final results will be the best output of
    n_init consecutive runs in terms of inertia.

max_iter : int, default=300
    Maximum number of iterations of the k-means algorithm for a
    single run.

tol : float, default=1e-4
    Relative tolerance with regards to Frobenius norm of the difference
    in the cluster centers of two consecutive iterations to declare
    convergence.

verbose : int, default=0
    Verbosity mode.

random_state : int, RandomState instance or None, default=None
    Determines random number generation for centroid initialization. Use
    an int to make the randomness deterministic.
    See :term:`Glossary <random_state>`.

copy_x : bool, default=True
    When pre-computing distances it is more numerically accurate to center
    the data first. If copy_x is True (default), then the original data is
    not modified. If False, the original data is modified, and put back
    before the function returns, but small numerical differences may be
    introduced by subtracting and then adding the data mean. Note that if
    the original data is not C-contiguous, a copy will be made even if
    copy_x is False. If the original data is sparse, but not in CSR format,
    a copy will be made even if copy_x is False.

algorithm : {"lloyd", "elkan", "auto", "full"}, default="lloyd"
    K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`.
    The `"elkan"` variation can be more efficient on some datasets with
    well-defined clusters, by using the triangle inequality. However it's
    more memory intensive due to the allocation of an extra array of shape
    `(n_samples, n_clusters)`.

    `"auto"` and `"full"` are deprecated and they will be removed in
    Scikit-Learn 1.3. They are both aliases for `"lloyd"`.

    .. versionchanged:: 0.18
        Added Elkan algorithm

    .. versionchanged:: 1.1
        Renamed "full" to "lloyd", and deprecated "auto" and "full".
        Changed "auto" to use "lloyd" instead of "elkan".

Attributes
----------
cluster_centers_ : ndarray of shape (n_clusters, n_features)
    Coordinates of cluster centers. If the algorithm stops before fully
    converging (see ``tol`` and ``max_iter``), these will not be
    consistent with ``labels_``.

labels_ : ndarray of shape (n_samples,)
    Labels of each point

inertia_ : float
    Sum of squared distances of samples to their closest cluster center,
    weighted by the sample weights if provided.

n_iter_ : int
    Number of iterations run.

n_features_in_ : int
    Number of features seen during :term:`fit`.

    .. versionadded:: 0.24

feature_names_in_ : ndarray of shape (`n_features_in_`,)
    Names of features seen during :term:`fit`. Defined only when `X`
    has feature names that are all strings.

    .. versionadded:: 1.0

See Also
--------
MiniBatchKMeans : Alternative online implementation that does incremental
    updates of the centers positions using mini-batches.
    For large scale learning (say n_samples > 10k) MiniBatchKMeans is
    probably much faster than the default batch implementation.

Notes
-----
The k-means problem is solved using either Lloyd's or Elkan's algorithm.

The average complexity is given by O(k n T), where n is the number of
samples and T is the number of iteration.

The worst case complexity is given by O(n^(k+2/p)) with
n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii,
'How slow is the k-means method?' SoCG2006)

In practice, the k-means algorithm is very fast (one of the fastest
clustering algorithms available), but it falls in local minima. That's why
it can be useful to restart it several times.

If the algorithm stops before fully converging (because of ``tol`` or
``max_iter``), ``labels_`` and ``cluster_centers_`` will not be consistent,
i.e. the ``cluster_centers_`` will not be the means of the points in each
cluster. Also, the estimator will reassign ``labels_`` after the last
iteration to make ``labels_`` consistent with ``predict`` on the training
set.

Examples
--------

>>> from sklearn.cluster import KMeans
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
...               [10, 2], [10, 4], [10, 0]])
>>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
>>> kmeans.labels_
array([1, 1, 1, 0, 0, 0], dtype=int32)
>>> kmeans.predict([[0, 0], [12, 3]])
array([1, 0], dtype=int32)
>>> kmeans.cluster_centers_
array([[10.,  2.],
       [ 1.,  2.]])
File:           ~/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py
Type:           ABCMeta
Subclasses:     
In [43]:
kmeans = KMeans(n_clusters=5)

Lots of optional parameters, but n_clusters is the most important:

Let's fit just income first¶

Use the fit() function

In [44]:
kmeans.fit(gapminder[['income']]);

Extract the cluster labels¶

Use the labels_ attribute

In [45]:
gapminder['label'] = kmeans.labels_

How big are our clusters?¶

In [46]:
gapminder.groupby('label').size()
Out[46]:
label
0    106
1     24
2      6
3     50
4      1
dtype: int64

Plot it again, coloring by our labels¶

In [47]:
(
    alt.Chart(gapminder)
    .mark_circle()
    .encode(
        alt.X("income:Q", scale=alt.Scale(type="log")),
        alt.Y("health:Q", scale=alt.Scale(zero=False)),
        size="population:Q",
        color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
        tooltip=list(gapminder.columns),
    )
    .properties(width=800, height=600)
    .interactive()
)
Out[47]:

Calculate average income by group¶

In [48]:
gapminder.groupby("label")['income'].mean().sort_values()
Out[48]:
label
0      5279.830189
3     21040.820000
1     42835.500000
2     74966.166667
4    132877.000000
Name: income, dtype: float64

The data is nicely partitioned into income levels

How about health, income, and population?¶

In [49]:
# Fit all three columns
kmeans.fit(gapminder[['income', 'health', 'population']])

# Extract the labels
gapminder['label'] = kmeans.labels_
In [50]:
gapminder
Out[50]:
country income health population label
0 Afghanistan 1925 57.63 32526562 4
1 Albania 10620 76.00 2896679 4
2 Algeria 13434 76.50 39666519 0
3 Andorra 46577 84.10 70473 4
4 Angola 7615 61.00 25021974 4
... ... ... ... ... ...
182 Vietnam 5623 76.50 93447601 0
183 West Bank and Gaza 4319 75.20 4668466 4
184 Yemen 3887 67.60 26832215 4
185 Zambia 4034 58.96 16211767 4
186 Zimbabwe 1801 60.01 15602751 4

187 rows × 5 columns

In [51]:
(
    alt.Chart(gapminder)
    .mark_circle()
    .encode(
        alt.X("income:Q", scale=alt.Scale(type="log")),
        alt.Y("health:Q", scale=alt.Scale(zero=False)),
        size="population:Q",
        color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
        tooltip=list(gapminder.columns),
    )
    .properties(width=800, height=600)
    .interactive()
)
Out[51]:

It....didn't work that well¶

What's wrong?

K-means is distance-based, but our features have wildly different distance scales
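A quick way to see the problem is to compare the spread of each raw feature; population dominates any Euclidean distance computed on the unscaled data:

# The raw feature scales differ by orders of magnitude
gapminder[['income', 'health', 'population']].std()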

scikit-learn to the rescue: pre-processing¶

  • Scikit-learn has a utility to standardize features so that each has a mean of zero and a variance of one
  • Use the StandardScaler class (a hand-rolled equivalent is sketched below)
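For reference, a minimal sketch of the equivalent hand-rolled z-score transform (this is what StandardScaler.fit_transform does to each column):

# Sketch: standardize each column by hand
features = gapminder[['income', 'health', 'population']].values
scaled_by_hand = (features - features.mean(axis=0)) / features.std(axis=0)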
In [52]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler
In [53]:
from sklearn.preprocessing import StandardScaler
In [54]:
scaler = StandardScaler()

Use the fit_transform() function to scale your features¶

In [55]:
gapminder_scaled = scaler.fit_transform(gapminder[['income', 'health', 'population']])

Important: The fit_transform() function converts the DataFrame to a numpy array:

In [56]:
# fit_transform() converts the data into a numpy array
gapminder_scaled[:5]
Out[56]:
array([[-0.79481258, -1.8171424 , -0.04592039],
       [-0.34333373,  0.55986273, -0.25325837],
       [-0.1972197 ,  0.62456075,  0.00404216],
       [ 1.52369617,  1.6079706 , -0.27303503],
       [-0.49936524, -1.38107777, -0.09843447]])
In [57]:
# mean of zero
gapminder_scaled.mean(axis=0)
Out[57]:
array([ 8.07434927e-17, -1.70511258e-15, -1.89984689e-17])
In [58]:
# variance of one
gapminder_scaled.std(axis=0)
Out[58]:
array([1., 1., 1.])

Now fit the scaled features¶

In [59]:
# Perform the fit
kmeans.fit(gapminder_scaled)

# Extract the labels
gapminder['label'] = kmeans.labels_
In [60]:
(
    alt.Chart(gapminder)
    .mark_circle()
    .encode(
        alt.X("income:Q", scale=alt.Scale(type="log")),
        alt.Y("health:Q", scale=alt.Scale(zero=False)),
        size="population:Q",
        color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
        tooltip=list(gapminder.columns),
    )
    .properties(width=800, height=600)
    .interactive()
)
Out[60]:
In [61]:
# Number of countries per cluster
gapminder.groupby("label").size()
Out[61]:
label
0    86
1     2
2    32
3    61
4     6
dtype: int64
In [62]:
# Average population per cluster
gapminder.groupby("label")['population'].mean().sort_values() / 1e6
Out[62]:
label
4       2.988746
3      21.441296
0      26.239937
2      32.501032
1    1343.549735
Name: population, dtype: float64
In [63]:
# Average life expectancy per cluster
gapminder.groupby("label")['health'].mean().sort_values()
Out[63]:
label
3    62.232951
1    71.850000
0    74.313837
2    80.806250
4    81.033333
Name: health, dtype: float64
In [64]:
# Average income per cluster
gapminder.groupby("label")['income'].mean().sort_values() / 1e3
Out[64]:
label
3     4.150623
1     9.618500
0    13.229907
2    40.322094
4    86.987500
Name: income, dtype: float64
In [65]:
gapminder.loc[gapminder['label']==4]
Out[65]:
country income health population label
24 Brunei 73003 78.7 423188 4
88 Kuwait 82633 80.7 3892115 4
97 Luxembourg 88314 81.1 567110 4
124 Norway 64304 81.6 5210967 4
134 Qatar 132877 82.0 2235355 4
145 Singapore 80794 82.1 5603740 4
In [66]:
kmeans.inertia_
Out[66]:
80.84493050696736
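As a sanity check on what inertia_ means, a short sketch (using the scaled data and fitted labels from above) that recomputes it by hand:

# Sketch: inertia_ is the sum of squared distances from each (scaled) point
# to its assigned cluster centroid
centers = kmeans.cluster_centers_[kmeans.labels_]          # centroid assigned to each sample
manual_inertia = ((gapminder_scaled - centers) ** 2).sum() # should match kmeans.inertia_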

Exercise: Clustering neighborhoods by Airbnb stats¶

I've extracted Airbnb statistics for Philadelphia neighborhoods from Tom Slee's website.

The data includes average price per person, overall satisfaction, and number of listings.

Two good references for Airbnb data¶

  • Tom Slee's website
  • Inside Airbnb

Original research study: How Airbnb's Data Hid the Facts in New York City

Step 1: Load the data with pandas¶

The data is available in CSV format ("philly_airbnb_by_neighborhoods.csv") in the "data/" folder of the repository.

In [68]:
airbnb = pd.read_csv("data/philly_airbnb_by_neighborhoods.csv")
airbnb.head()
Out[68]:
neighborhood price_per_person overall_satisfaction N
0 ALLEGHENY_WEST 120.791667 4.666667 23
1 BELLA_VISTA 87.407920 3.158333 204
2 BELMONT 69.425000 3.250000 11
3 BREWERYTOWN 71.788188 1.943182 142
4 BUSTLETON 55.833333 1.250000 19

Step 2: Perform the K-Means fit¶

  • Use our three features: price_per_person, overall_satisfaction, N
  • I used 5 clusters, but you are welcome to experiment with different values! We will discuss the optimal number after we go through the solutions (a quick elbow-method sketch is also included below)
  • Scaling the features is recommended, but the scales aren't wildly different here, so it probably isn't strictly necessary in this case
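As a preview of the "how many clusters?" question, a common heuristic is the elbow method: fit K-means for a range of k values and look for the point where the inertia stops dropping quickly. A minimal sketch using the same three features (scaled):

# Sketch: "elbow" heuristic for choosing the number of clusters
elbow_features = StandardScaler().fit_transform(
    airbnb[['price_per_person', 'overall_satisfaction', 'N']]
)
inertias = [
    KMeans(n_clusters=k, random_state=42).fit(elbow_features).inertia_
    for k in range(1, 11)
]
plt.plot(range(1, 11), inertias, marker='o')
plt.xlabel("Number of clusters k")
plt.ylabel("Inertia");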
In [69]:
# Initialize the Kmeans object
kmeans = KMeans(n_clusters=5, random_state=42)
In [70]:
# Scale the data features we want
scaler = StandardScaler()
scaled_airbnb_data = scaler.fit_transform(airbnb[['price_per_person', 'overall_satisfaction', 'N']])
In [71]:
# Run the fit!
kmeans.fit(scaled_airbnb_data)
Out[71]:
KMeans(n_clusters=5, random_state=42)
In [72]:
# Save the cluster labels
airbnb["label"] = kmeans.labels_
In [73]:
# New column "label"!
airbnb.head()
Out[73]:
neighborhood price_per_person overall_satisfaction N label
0 ALLEGHENY_WEST 120.791667 4.666667 23 2
1 BELLA_VISTA 87.407920 3.158333 204 2
2 BELMONT 69.425000 3.250000 11 2
3 BREWERYTOWN 71.788188 1.943182 142 1
4 BUSTLETON 55.833333 1.250000 19 1

Step 3: Calculate average features per cluster¶

To gain some insight into our clusters, after calculating the K-Means labels:

  • group by the label column
  • calculate the mean() of each of our features
  • calculate the number of neighborhoods per cluster
In [74]:
airbnb.groupby('label', as_index=False).size()
Out[74]:
label size
0 0 21
1 1 23
2 2 59
3 3 1
4 4 1
In [75]:
airbnb.groupby('label', as_index=False).mean().sort_values(by='price_per_person')
Out[75]:
label price_per_person overall_satisfaction N
2 2 67.526755 3.232236 69.864407
1 1 78.935232 0.920238 38.347826
0 0 119.329173 2.947605 313.000000
4 4 136.263996 3.000924 1499.000000
3 3 387.626984 5.000000 31.000000
In [76]:
airbnb.loc[airbnb['label'] == 2]
Out[76]:
neighborhood price_per_person overall_satisfaction N label
0 ALLEGHENY_WEST 120.791667 4.666667 23 2
1 BELLA_VISTA 87.407920 3.158333 204 2
2 BELMONT 69.425000 3.250000 11 2
5 CALLOWHILL 94.733269 3.017241 143 2
6 CEDAR_PARK 39.053171 3.220000 261 2
8 CHESTNUT_HILL 69.123478 3.444444 74 2
9 CHINATOWN 48.476667 2.984848 101 2
11 DICKINSON_NARROWS 62.626393 3.227273 125 2
12 EASTWICK 63.566667 2.142857 18 2
13 EAST_FALLS 64.491228 2.522222 156 2
14 EAST_KENSINGTON 58.984428 2.602564 108 2
15 EAST_OAK_LANE 113.243590 4.800000 20 2
17 EAST_PASSYUNK 81.253676 3.716981 190 2
18 EAST_POPLAR 69.164368 3.350000 36 2
24 FRANKFORD 28.500000 3.583333 19 2
25 FRANKLINVILLE 74.968750 2.409091 12 2
26 GARDEN_COURT 45.954375 3.467391 93 2
27 GERMANTOWN_EAST 86.875000 3.750000 12 2
28 GERMANTOWN_MORTON 37.357143 3.750000 12 2
29 GERMANTOWN_PENN_KNOX 36.030702 4.666667 21 2
30 GERMANTOWN_SOUTHWEST 39.105263 3.083333 25 2
31 GERMANTOWN_WESTSIDE 49.830128 2.375000 27 2
32 GERMANTOWN_WEST_CENT 57.004884 2.891892 90 2
33 GERMANY_HILL 135.208333 5.000000 20 2
34 GIRARD_ESTATES 90.461111 2.875000 78 2
36 GRAYS_FERRY 66.833535 3.035714 65 2
37 GREENWICH 57.557576 2.500000 14 2
39 HARROWGATE 24.796429 3.458333 16 2
40 HARTRANFT 77.105300 2.800000 73 2
41 HAVERFORD_NORTH 102.025000 4.000000 11 2
44 KINGSESSING 41.005000 2.781250 44 2
46 LOGAN 63.582598 2.800000 18 2
48 LOWER_MOYAMENSING 72.605479 2.838710 100 2
49 LUDLOW 63.333333 4.400000 11 2
51 MANTUA 96.527778 2.968750 162 2
53 MILL_CREEK 53.020833 4.357143 37 2
54 MOUNT_AIRY_EAST 47.598086 3.435484 93 2
55 MOUNT_AIRY_WEST 52.799916 3.468750 115 2
56 NEWBOLD 82.836628 2.976190 121 2
59 OGONTZ 35.389194 2.250000 13 2
61 OLD_KENSINGTON 75.641209 2.945946 118 2
62 OLNEY 66.591479 2.416667 22 2
66 PASCHALL 21.722222 4.071429 12 2
67 PASSYUNK_SQUARE 86.239845 3.742647 232 2
68 PENNSPORT 88.773897 3.100000 100 2
71 POWELTON 77.697032 2.673077 79 2
74 RICHMOND 79.432611 2.613636 71 2
77 ROXBOROUGH 88.504646 3.270833 124 2
81 SOUTHWEST_SCHUYLKILL 83.098639 2.375000 26 2
84 STADIUM_DISTRICT 118.981061 2.875000 31 2
87 TIOGA 44.636364 3.833333 14 2
89 UPPER_ROXBOROUGH 96.724194 3.230769 43 2
90 WALNUT_HILL 55.033582 3.104167 82 2
92 WEST_KENSINGTON 39.422287 3.119048 54 2
94 WEST_PASSYUNK 48.846491 3.500000 48 2
96 WEST_POWELTON 59.056780 2.733333 139 2
97 WHITMAN 63.386458 3.409091 50 2
98 WISSAHICKON 62.733060 2.600000 83 2
101 WOODLAND_TERRACE 66.902778 3.062500 22 2
In [77]:
airbnb.loc[airbnb['label'] == 3]
Out[77]:
neighborhood price_per_person overall_satisfaction N label
78 SHARSWOOD 387.626984 5.0 31 3
In [78]:
airbnb.loc[airbnb['label'] == 4]
Out[78]:
neighborhood price_per_person overall_satisfaction N label
75 RITTENHOUSE 136.263996 3.000924 1499 4

Step 4: Plot a choropleth, coloring neighborhoods by their cluster label¶

  • Part 1: Load the Philadelphia neighborhoods available in the data directory:
    • ./data/philly_neighborhoods.geojson
  • Part 2: Merge the Airbnb data (with labels) and the neighborhood polygons
  • Part 3: Use geopandas to plot the neighborhoods
    • The categorical=True and legend=True keywords will be useful here
In [79]:
hoods = gpd.read_file("./data/philly_neighborhoods.geojson")
hoods.head()
Out[79]:
Name geometry
0 LAWNDALE POLYGON ((-75.08616 40.05013, -75.08893 40.044...
1 ASTON_WOODBRIDGE POLYGON ((-75.00860 40.05369, -75.00861 40.053...
2 CARROLL_PARK POLYGON ((-75.22673 39.97720, -75.22022 39.974...
3 CHESTNUT_HILL POLYGON ((-75.21278 40.08637, -75.21272 40.086...
4 BURNHOLME POLYGON ((-75.08768 40.06861, -75.08758 40.068...
In [80]:
airbnb.head()
Out[80]:
neighborhood price_per_person overall_satisfaction N label
0 ALLEGHENY_WEST 120.791667 4.666667 23 2
1 BELLA_VISTA 87.407920 3.158333 204 2
2 BELMONT 69.425000 3.250000 11 2
3 BREWERYTOWN 71.788188 1.943182 142 1
4 BUSTLETON 55.833333 1.250000 19 1
In [81]:
# do the merge
airbnb2 = hoods.merge(airbnb, left_on='Name', right_on='neighborhood', how='left')

# assign -1 to the neighborhoods without any listings
airbnb2['label'] = airbnb2['label'].fillna(-1)
In [82]:
# plot the data
airbnb2 = airbnb2.to_crs(epsg=3857)

# setup the figure
f, ax = plt.subplots(figsize=(10, 8))

# plot, coloring by label column
# specify categorical data and add legend
airbnb2.plot(
    column="label",
    cmap="Dark2",
    categorical=True,
    legend=True,
    edgecolor="k",
    lw=0.5,
    ax=ax,
)


ax.set_axis_off()
plt.axis("equal");

Step 5: Plot an interactive map¶

Use altair to plot the clustering results, with a tooltip showing the neighborhood name and cluster label.

Hint: See week 3B's lecture on interactive choropleths with altair

In [83]:
# Plot the map (the data columns are nested within `properties` in the GeoJSON)
(
    alt.Chart(airbnb2.to_crs(epsg=4326))
    .mark_geoshape()
    .properties(
        width=500,
        height=400,
    )
    .encode(
        tooltip=["Name:N", "label:N"],
        color=alt.Color("label:N", scale=alt.Scale(scheme="Dark2")),
    )
)
Out[83]:

Based on these results, where would you want to stay?¶

Group 2!

More clustering on Wednesday!¶

In [ ]: