from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas
np.random.seed(42)
pd.options.display.max_columns = 999
Nov 21, 2022
Focus: much more hands-on experience with feature engineering and adding spatially based features
First, let's set up all of the imports we'll need from scikit-learn:
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
Let's download data for single-family properties in Philadelphia that had their last sale during 2021.
Sources:
import carto2gpd
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"
# The table name
table_name = "opa_properties_public"
# Only pull 2021 sales for single family residential properties
where = "sale_date >= '2021-01-01' and sale_date <= '2021-12-31'"
where = where + " and category_code_description IN ('SINGLE FAMILY', 'Single Family')"
# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
# Optional: put it in a reproducible order for test/training splits later
salesRaw = salesRaw.sort_values("parcel_number")
salesRaw.head()
[Output: the first five rows of `salesRaw`, a wide GeoDataFrame with ~79 columns per sale, including geometry, sale_date, sale_price, market_value, total_livable_area, total_area, number_of_bathrooms, number_of_bedrooms, number_stories, exterior_condition, zip_code, and parcel_number.]
len(salesRaw)
25706
# The feature columns we want to use
cols = [
"sale_price",
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"exterior_condition",
"zip_code",
]
# Trim to these columns and remove NaNs
sales = salesRaw[cols + ["geometry"]].dropna()
# Trim zip code to only the first five digits
sales["zip_code"] = sales["zip_code"].astype(str).str.slice(0, 5)
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
len(sales)
19294
# Split the data 70/30
train_set, test_set = train_test_split(sales,
test_size=0.3,
random_state=42)
# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# The features
feature_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
Run a linear regression model as a baseline:
# Make a linear model pipeline
linear_pipeline = make_pipeline(StandardScaler(), LinearRegression())
# Fit on the training data
linear_pipeline.fit(X_train, y_train)
# What's the test score?
linear_pipeline.score(X_test, y_test)
0.23452440229126648
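Side note: `PolynomialFeatures` was imported above but not used yet. As a rough sketch (degree=2 is an illustrative assumption, not a tuned choice; scores not shown here), it could give the linear baseline a bit more flexibility:
# A sketch: add degree-2 polynomial features to the linear baseline
poly_pipeline = make_pipeline(
    StandardScaler(), PolynomialFeatures(degree=2), LinearRegression()
)
poly_pipeline.fit(X_train, y_train)
poly_pipeline.score(X_test, y_test)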
Run cross-validation on a random forest model:
# Make a random forest pipeline
forest_pipeline = make_pipeline(
StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)
# Run the 10-fold cross validation
scores = cross_val_score(
forest_pipeline,
X_train,
y_train,
cv=10,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores =  [0.39981407 0.33859844 0.35040232 0.35257309 0.32632646 0.38019832
 0.3111287  0.37412471 0.34844108 0.34621604]
Scores mean =  0.35278232253535474
Score std dev =  0.024744515194105168
# Fit on the training data
forest_pipeline.fit(X_train, y_train)
# What's the test score?
forest_pipeline.score(X_test, y_test)
0.34475795326957004
The model appears to generalize reasonably well
Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
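For example, a sketch of a grid search using the `GridSearchCV` object imported earlier (the grid values below are illustrative assumptions, and this can be slow to run):
# Sketch: grid search over a few random forest hyperparameters
param_grid = {
    "randomforestregressor__n_estimators": [50, 100],
    "randomforestregressor__max_depth": [None, 10, 20],
}
grid = GridSearchCV(forest_pipeline, param_grid, cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)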
# Extract the regressor from the pipeline
forest_model = forest_pipeline["randomforestregressor"]
# Create the data frame of importances
importance = pd.DataFrame(
{"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")
importance.hvplot.barh(x="Feature", y="Importance")
We can use a technique called one-hot encoding.
Steps:
- The `OneHotEncoder` object is a preprocessor that will perform the vectorization step
- The `ColumnTransformer` object will help us apply different transformers to numerical and categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
Let's try out the example data of colors:
# Example data of colors
colors = np.array(["red", "green", "blue", "red"])
colors = colors[:, np.newaxis]
colors.shape
(4, 1)
colors
array([['red'], ['green'], ['blue'], ['red']], dtype='<U5')
# Initialize the OHE transformer
ohe = OneHotEncoder()
# Fit the transformer and then transform the colors
ohe.fit_transform(colors).toarray()
array([[0., 0., 1.], [0., 1., 0.], [1., 0., 0.], [0., 0., 1.]])
# The corresponding category for each column
ohe.categories_
[array(['blue', 'green', 'red'], dtype='<U5')]
Let's apply separate transformers for our numerical and categorical columns:
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
# ----> Scale the numerical columns
# ----> One-hot encode the categorical columns
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
Note: the `handle_unknown='ignore'` parameter ensures that if categories show up in our test set that were not present in the training set, no error will be raised.
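A minimal illustration of this behavior (using a hypothetical `demo_ohe`, not part of our pipeline): a category unseen during fitting encodes to an all-zero row instead of raising an error.
# Unseen categories encode to all zeros with handle_unknown="ignore"
demo_ohe = OneHotEncoder(handle_unknown="ignore")
demo_ohe.fit(np.array([["red"], ["green"], ["blue"]]))
demo_ohe.transform(np.array([["purple"]])).toarray()
# array([[0., 0., 0.]])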
Initialize the pipeline object, using the column transformer and the random forest regressor
# Initialize the pipeline
# NOTE: only use 10 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=10,
random_state=42)
)
Now, let's fit the model.
Important: we pass the `train_set` and `test_set` DataFrames to the pipeline, rather than the `X_train` and `X_test` numpy arrays, because the ColumnTransformer needs to select the columns by name.
# Fit the training set
pipe.fit(train_set, y_train);
# What's the test score?
pipe.score(test_set, y_test)
0.5781569592950195
$R^2$ of ~0.35 improved to $R^2$ of ~0.58!
Takeaway: neighborhood based effects play a crucial role in determining housing prices.
Side Note: to fully validate the model we should run $k$-fold cross validation and optimize hyperparameters of the model as well...
This will be part of assignment #7
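As a sketch of that validation (not run here), `cross_val_score` works on the full pipeline too, as long as we pass the DataFrame so the ColumnTransformer can select columns by name:
# Sketch: 5-fold cross-validation on the full pipeline
scores = cross_val_score(pipe, train_set, y_train, cv=5)
print("Scores mean = ", scores.mean())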
Next, let's look at the feature importances. But first, we need to know the column names! The one-hot encoder created a column for each category type...
# The column transformer...
transformer
ColumnTransformer(transformers=[('num', StandardScaler(), ['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories']), ('cat', OneHotEncoder(handle_unknown='ignore'), ['exterior_condition', 'zip_code'])])
# The steps in the column transformer
transformer.named_transformers_
{'num': StandardScaler(), 'cat': OneHotEncoder(handle_unknown='ignore'), 'remainder': 'drop'}
# The one-hot step
ohe = transformer.named_transformers_['cat']
ohe
OneHotEncoder(handle_unknown='ignore')
# One column for each category type!
ohe_cols = ohe.get_feature_names_out()
ohe_cols
array(['exterior_condition_0', 'exterior_condition_1', 'exterior_condition_2', 'exterior_condition_3', 'exterior_condition_4', 'exterior_condition_5', 'exterior_condition_6', 'exterior_condition_7', 'zip_code_19102', 'zip_code_19103', 'zip_code_19104', 'zip_code_19106', 'zip_code_19107', 'zip_code_19111', 'zip_code_19114', 'zip_code_19115', 'zip_code_19116', 'zip_code_19118', 'zip_code_19119', 'zip_code_19120', 'zip_code_19121', 'zip_code_19122', 'zip_code_19123', 'zip_code_19124', 'zip_code_19125', 'zip_code_19126', 'zip_code_19127', 'zip_code_19128', 'zip_code_19129', 'zip_code_19130', 'zip_code_19131', 'zip_code_19132', 'zip_code_19133', 'zip_code_19134', 'zip_code_19135', 'zip_code_19136', 'zip_code_19137', 'zip_code_19138', 'zip_code_19139', 'zip_code_19140', 'zip_code_19141', 'zip_code_19142', 'zip_code_19143', 'zip_code_19144', 'zip_code_19145', 'zip_code_19146', 'zip_code_19147', 'zip_code_19148', 'zip_code_19149', 'zip_code_19150', 'zip_code_19151', 'zip_code_19152', 'zip_code_19153', 'zip_code_19154'], dtype=object)
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
features
['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories', 'exterior_condition_0', 'exterior_condition_1', 'exterior_condition_2', 'exterior_condition_3', 'exterior_condition_4', 'exterior_condition_5', 'exterior_condition_6', 'exterior_condition_7', 'zip_code_19102', 'zip_code_19103', 'zip_code_19104', 'zip_code_19106', 'zip_code_19107', 'zip_code_19111', 'zip_code_19114', 'zip_code_19115', 'zip_code_19116', 'zip_code_19118', 'zip_code_19119', 'zip_code_19120', 'zip_code_19121', 'zip_code_19122', 'zip_code_19123', 'zip_code_19124', 'zip_code_19125', 'zip_code_19126', 'zip_code_19127', 'zip_code_19128', 'zip_code_19129', 'zip_code_19130', 'zip_code_19131', 'zip_code_19132', 'zip_code_19133', 'zip_code_19134', 'zip_code_19135', 'zip_code_19136', 'zip_code_19137', 'zip_code_19138', 'zip_code_19139', 'zip_code_19140', 'zip_code_19141', 'zip_code_19142', 'zip_code_19143', 'zip_code_19144', 'zip_code_19145', 'zip_code_19146', 'zip_code_19147', 'zip_code_19148', 'zip_code_19149', 'zip_code_19150', 'zip_code_19151', 'zip_code_19152', 'zip_code_19153', 'zip_code_19154']
random_forest = pipe["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": random_forest.feature_importances_}
)
importance.head(n=20)
 | Feature | Importance
---|---|---
0 | total_livable_area | 0.192483 |
1 | total_area | 0.194151 |
2 | garage_spaces | 0.010269 |
3 | fireplaces | 0.001761 |
4 | number_of_bathrooms | 0.132411 |
5 | number_of_bedrooms | 0.038099 |
6 | number_stories | 0.020997 |
7 | exterior_condition_0 | 0.000226 |
8 | exterior_condition_1 | 0.004962 |
9 | exterior_condition_2 | 0.005188 |
10 | exterior_condition_3 | 0.012277 |
11 | exterior_condition_4 | 0.010224 |
12 | exterior_condition_5 | 0.012266 |
13 | exterior_condition_6 | 0.005376 |
14 | exterior_condition_7 | 0.016392 |
15 | zip_code_19102 | 0.000262 |
16 | zip_code_19103 | 0.002595 |
17 | zip_code_19104 | 0.003707 |
18 | zip_code_19106 | 0.000726 |
19 | zip_code_19107 | 0.000678 |
# Sort by importance and get the top 30
# SORT IN DESCENDING ORDER
importance = importance.sort_values("Importance", ascending=False).iloc[:30]
# Plot
importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)
Interpretation
These North Philadelphia ZIP codes have some of the lowest-valued homes in the city, which are inherently the most difficult to model accurately. It makes sense that, once we include ZIP code information, these areas improve the most.
Garbage in, garbage out
Takeaway: If your input features are poorly designed (for example, completely unrelated to the thing you want to predict), then no matter how good your machine learning model is or how well you "train" it, the model will never be able to translate the features into accurate predictions.
Yes, let's add distance-based features
The strategy
Distance from each sale to:
Source: https://www.opendataphilly.org/dataset/311-service-and-information-requests
We'll only pull data from 2021.
# the 311 table
table_name = "public_cases_fc"
# Peek at the first row of data
carto2gpd.get(carto_url, table_name, limit=1)
[Output: the first row of the 311 requests table, with columns including geometry, service_request_id, subject, status, service_name, agency_responsible, requested_datetime, closed_datetime, address, zipcode, lat, and lon.]
# Select only the graffiti requests from 2021
where_2021 = "requested_datetime >= '01-01-2021' and requested_datetime < '01-01-2022'"
where_graffiti = "service_name = 'Graffiti Removal'"
where = f"{where_2021} and {where_graffiti}"
# Pull the subset we want
graffiti = carto2gpd.get(carto_url, table_name, where=where)
# Remove rows with missing geometries
graffiti = graffiti.loc[graffiti.geometry.notnull()]
len(graffiti)
19472
graffiti.head()
[Output: the first five rows of `graffiti`, closed Graffiti Removal requests from 2021 with the same columns as above.]
We will need to: (1) convert both datasets to the same projected CRS (EPSG:3857, with units of meters) and (2) extract the x/y coordinates from the geometry column.
# Do the CRS conversion
sales_3857 = sales.to_crs(epsg=3857)
graffiti_3857 = graffiti.to_crs(epsg=3857)
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
"""
x = df.geometry.x
y = df.geometry.y
return np.column_stack((x, y)) # stack as columns
# Extract x/y for sales
salesXY = get_xy_from_geometry(sales_3857)
# Extract x/y for graffiti calls
graffitiXY = get_xy_from_geometry(graffiti_3857)
salesXY.shape
(19294, 2)
graffitiXY.shape
(19472, 2)
For this, we will use the $k$ nearest neighbors algorithm from scikit-learn.
For each sale, we will find the distances to the $k$ nearest graffiti calls.
from sklearn.neighbors import NearestNeighbors
# STEP 1: Initialize the algorithm
k = 5
nbrs = NearestNeighbors(n_neighbors=k)
# STEP 2: Fit the algorithm on the "neighbors" dataset
nbrs.fit(graffitiXY)
# STEP 3: Get distances for sale to neighbors
grafDists, grafIndices = nbrs.kneighbors(salesXY)
Note: I am using `k=5` here without any real justification. In practice, you would want to try a few different values of `k` to identify the best one to use (a sketch of one way to compare them appears below, after we add the feature).
- `grafDists`: For each sale, the distances to the 5 nearest graffiti calls
- `grafIndices`: For each sale, the index of each of the 5 neighbors in the original dataset
print("length of sales = ", len(salesXY))
print("shape of grafDists = ", grafDists.shape)
print("shape of grafIndices = ", grafIndices.shape)
length of sales =  19294
shape of grafDists =  (19294, 5)
shape of grafIndices =  (19294, 5)
# The distances from the first sale to the 5 nearest neighbors
grafDists[0]
array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])
salesXY[0]
array([-8365316.98040236, 4855961.96674844])
# The coordinates for the first sale
x0, y0 = salesXY[0]
x0, y0
(-8365316.980402357, 4855961.966748442)
# The indices for the 5 nearest graffiti calls
grafIndices[0]
array([ 1182, 6051, 11120, 13899, 19127])
# the graffiti neighbors
sale0_neighbors = graffitiXY[grafIndices[0]]
sale0_neighbors
array([[-8365291.26559998, 4855909.56005081], [-8365313.86345662, 4855886.18762381], [-8365280.91288734, 4855893.44613563], [-8365280.91288734, 4855893.44613563], [-8365280.91288734, 4855893.30096534]])
# Access the first and second column for x/y values
neighbors_x = sale0_neighbors[:,0]
neighbors_y = sale0_neighbors[:,1]
# The x/y differences between neighbors and first sale coordinates
dx = (neighbors_x - x0)
dy = (neighbors_y - y0)
# The Euclidean dist
manual_dists = (dx**2 + dy**2) ** 0.5
manual_dists
array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])
grafDists[0]
array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])
We'll average over the column axis: axis=1
grafDists.mean(axis=1)
array([ 73.32953543, 72.14817954, 222.62189541, ..., 610.60262287, 610.60262287, 34.21559157])
# Average distance to neighbors
avgGrafDist = grafDists.mean(axis=1)
# Set zero distances to be small, but nonzero
# IMPORTANT: THIS WILL AVOID INF DISTANCES WHEN DOING THE LOG
avgGrafDist[avgGrafDist==0] = 1e-5
# Calculate log of distances
sales['logDistGraffiti'] = np.log10(avgGrafDist)
sales.head()
| | sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | zip_code | geometry | logDistGraffiti |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18473 | 430000.0 | 1203.0 | 779.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 2 | 19147 | POINT (-75.14692 39.93129) | 1.865279 |
| 20663 | 520000.0 | 2050.0 | 1433.0 | 0.0 | 0.0 | 2.0 | 4.0 | 2.0 | 4 | 19147 | POINT (-75.14703 39.93123) | 1.858225 |
| 2059 | 600000.0 | 1625.0 | 1623.0 | 1.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | 19147 | POINT (-75.14854 39.93144) | 2.347568 |
| 18555 | 765000.0 | 2558.0 | 1400.0 | 1.0 | 0.0 | 0.0 | 3.0 | 2.0 | 1 | 19147 | POINT (-75.14798 39.93012) | 2.151381 |
| 4024 | 255000.0 | 1008.0 | 546.0 | 0.0 | 0.0 | 1.0 | 3.0 | 2.0 | 4 | 19147 | POINT (-75.14886 39.93012) | 2.285802 |
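As promised above, here is a sketch of how one might compare different values of `k`. It uses a quick heuristic (the assumption being that a stronger correlation with log sale price signals a more useful feature); a fuller comparison would cross-validate the model for each candidate `k`.
# Sketch: compare a few k values by checking how strongly the
# resulting log-distance feature correlates with log sale price
for k_try in [1, 3, 5, 10, 25]:
    nbrs_k = NearestNeighbors(n_neighbors=k_try).fit(graffitiXY)
    dists_k, _ = nbrs_k.kneighbors(salesXY)
    avg_k = dists_k.mean(axis=1)
    avg_k[avg_k == 0] = 1e-5  # avoid log of zero
    corr = np.corrcoef(np.log10(avg_k), np.log(sales["sale_price"]))[0, 1]
    print(f"k = {k_try}: correlation = {corr:.3f}")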
# Load the City Limits to plot too
import esri2gpd
# From OpenDataPhilly's page
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(url).to_crs(epsg=3857)
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
# Plot the log of the Graffiti distance
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=sales["logDistGraffiti"].values, gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
Use the `osmnx` package to get subway stops in Philly; we can use the `ox.geometries_from_polygon()` function.
- `station=subway`: see the OSM Wiki for details on this tag
import osmnx as ox
# Get the geometry from the city limits
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry
city_limits_outline
# Get the subway stops within the city limits
subway = ox.geometries_from_polygon(city_limits_outline, tags={"station": "subway"})
# Convert to 3857 (meters)
subway = subway.to_crs(epsg=3857)
subway.head(n=20)
[Output: the first 20 rows of `subway`, a GeoDataFrame indexed by (element_type, osmid) with columns including name, network, operator, public_transport, railway, station, wheelchair, wikidata, and geometry. The stations include the PATCO Locust Street stops and SEPTA Broad Street Line stops such as Lombard-South, Ellsworth-Federal, Tasker-Morris, Snyder, NRG, Olney Transportation Center, Erie, Cecil B. Moore, and Girard.]
fig, ax = plt.subplots(figsize=(6,6))
# Plot the subway locations
subway.plot(ax=ax, markersize=10, color='crimson')
# City limits, too
city_limits.plot(ax=ax, facecolor='none', edgecolor='black', linewidth=4)
ax.set_axis_off()
The stops on the Market-Frankford and Broad St. subway lines!
We'll use $k=1$ to get the distance to the nearest stop.
# STEP 1: x/y coordinates of subway stops (in EPSG:3857)
subwayXY = get_xy_from_geometry(subway.to_crs(epsg=3857))
# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=1)
# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(subwayXY)
# STEP 4: Get distances for sale to neighbors
subwayDists, subwayIndices = nbrs.kneighbors(salesXY)
# STEP 5: add back to the original dataset
sales["logDistSubway"] = np.log10(subwayDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))
# Plot the log of the subway distance
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=sales['logDistSubway'].values, gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
Looks like it worked!
Let's have a look at the correlations of numerical columns:
import seaborn as sns
cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway", # NEW
"sale_price"
]
sns.heatmap(sales[cols].corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1);
# Numerical columns
num_cols = [
"total_livable_area",
"total_area",
"garage_spaces",
"fireplaces",
"number_of_bathrooms",
"number_of_bedrooms",
"number_stories",
"logDistGraffiti", # NEW
"logDistSubway" # NEW
]
# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
# Initialize the pipeline
# NOTE: only use 20 estimators here so it will run in a reasonable time
pipe = make_pipeline(
transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
# Fit the training set
# REMINDER: use the training dataframe objects here rather than numpy array
pipe.fit(train_set, y_train);
# What's the test score?
# REMINDER: use the test dataframe rather than numpy array
pipe.score(test_set, y_test)
0.6542810337083442
$R^2$ of ~0.58 improved to $R^2$ of ~0.65
def plot_feature_importances(pipeline, num_cols, transformer, top=20, **kwargs):
"""
Utility function to plot the feature importances from the input
random forest regressor.
Parameters
----------
pipeline :
the pipeline object
num_cols :
list of the numerical columns
transformer :
the transformer preprocessing step
top : optional
the number of importances to plot
**kwargs : optional
extra keywords passed to the hvplot function
"""
# The one-hot step
ohe = transformer.named_transformers_["cat"]
# One column for each category type!
ohe_cols = ohe.get_feature_names_out()
# Full list of columns is numerical + one-hot
features = num_cols + list(ohe_cols)
# The regressor
regressor = pipeline["randomforestregressor"]
# Create the dataframe with importances
importance = pd.DataFrame(
{"Feature": features, "Importance": regressor.feature_importances_}
)
# Sort importance in descending order and get the top
importance = importance.sort_values("Importance", ascending=False).iloc[:top]
# Plot
return importance.hvplot.barh(
x="Feature", y="Importance", flip_yaxis=True, **kwargs
)
plot_feature_importances(pipe, num_cols, transformer, top=30, height=500)
Modify the `get_xy_from_geometry()` function to use the "centroid" of the geometry column.
Note: you can take the centroid of a Point() or Polygon() object. For a Point(), you just get the x/y coordinates back.
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
"""
# NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries
x = df.geometry.centroid.x
y = df.geometry.centroid.y
return np.column_stack((x, y)) # stack as columns
New feature: Distance to the nearest university/college
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Universities_Colleges/FeatureServer/0"
univs = esri2gpd.get(url)
# Get the X/Y
univXY = get_xy_from_geometry(univs.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(univXY)
univDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales['logDistUniv'] = np.log10(univDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=np.log10(univDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest University/College", fontsize=16, color='white');
New feature: Distance to the nearest park centroid
Notes
- We'll use the `x` and `y` coordinates of the park centroids and calculate the distance from each sale to these centroids.
- Use the `geometry.centroid.x` and `geometry.centroid.y` values to access these coordinates.
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0"
parks = esri2gpd.get(url)
# Get the X/Y
parksXY = get_xy_from_geometry(parks.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(parksXY)
parksDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistParks"] = np.log10(parksDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(parksDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest Park", fontsize=16, color="white");
New feature: Distance to City Hall.
Notes
- Select City Hall from the city landmarks dataset by querying for NAME = 'City Hall' and FEAT_TYPE = 'Municipal Building' (see the where clause below)
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/CITY_LANDMARKS/FeatureServer/0"
cityHall = esri2gpd.get(
url, where="NAME = 'City Hall' AND FEAT_TYPE = 'Municipal Building'"
)
# Get the X/Y
cityHallXY = get_xy_from_geometry(cityHall.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(cityHallXY)
cityHallDist, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistCityHall"] = np.log10(cityHallDist.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(cityHallDist.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to City Hall", fontsize=16, color="white");
New feature: Distance to the 5 nearest residential construction permits from 2021
Notes
- Select permits where `permitdescription` equals 'RESIDENTIAL BUILDING PERMIT'
- Select permits from 2021 using the `permitissuedate` column
# Table name
table_name = "permits"
# Where clause
where = "permitissuedate >= '2021-01-01' AND permitissuedate < '2022-01-01'"
where = where + " AND permitdescription='RESIDENTIAL BUILDING PERMIT'"
# Query
permits = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
permits = permits.loc[permits.geometry.notnull()]
# Get the X/Y
permitsXY = get_xy_from_geometry(permits.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(permitsXY)
permitsDist, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales["logDistPermits"] = np.log10(permitsDist.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(permitsDist.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Building Permits", fontsize=16, color="white");
New feature: Distance to the 5 nearest aggravated assaults in 2021
Notes
- Select crimes where `Text_General_Code` equals 'Aggravated Assault No Firearm' or 'Aggravated Assault Firearm'
- Select incidents from 2021 using the `dispatch_date` column
# Table name
table_name = "incidents_part1_part2"
# Where selection
where = "dispatch_date >= '2021-01-01' AND dispatch_date < '2022-01-01'"
where = where + " AND Text_General_Code IN ('Aggravated Assault No Firearm', 'Aggravated Assault Firearm')"
# Query
assaults = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
assaults = assaults.loc[assaults.geometry.notnull()]
# Get the X/Y
assaultsXY = get_xy_from_geometry(assaults.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(assaultsXY)
assaultDists, _ = nbrs.kneighbors(salesXY)
# Add the new feature
sales['logDistAssaults'] = np.log10(assaultDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(assaultDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Assaults", fontsize=16, color="white");
New feature: Distance to the 5 nearest abandoned vehicle 311 calls in 2021
Notes
- Select 311 calls where `service_name` equals 'Abandoned Vehicle'
- Select calls from 2021 using the `requested_datetime` column
# Table name
table_name = "public_cases_fc"
# Where selection
where = "requested_datetime >= '2021-01-01' AND requested_datetime < '2022-01-01'"
where = where + " AND service_name = 'Abandoned Vehicle'"
# Query
cars = carto2gpd.get(carto_url, table_name, where=where)
# Remove missing
cars = cars.loc[cars.geometry.notnull()]
# Get the X/Y
carsXY = get_xy_from_geometry(cars.to_crs(epsg=3857))
# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(carsXY)
carDists, _ = nbrs.kneighbors(salesXY)
# Handle any sales that have 0 distances
carDists[carDists == 0] = 1e-5 # a small, arbitrary value
# Add the new feature
sales["logDistCars"] = np.log10(carDists.mean(axis=1))
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(carDists.mean(axis=1)), gridsize=60)
# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)
ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title(
"Distance to 5 Closest Abandoned Vehichle 311 Calls", fontsize=16, color="white"
);
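Putting it all together: here is a sketch (the fit is not run here, so no score is shown) of re-fitting the random forest with all of the new distance-based features, following the exact same pattern as before:
# Sketch: re-fit the model with all of the new distance features
new_cols = [
    "logDistUniv",
    "logDistParks",
    "logDistCityHall",
    "logDistPermits",
    "logDistAssaults",
    "logDistCars",
]
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols + new_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)
pipe = make_pipeline(transformer, RandomForestRegressor(n_estimators=20, random_state=42))
# Re-split so the train/test sets include the new columns
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
pipe.fit(train_set, y_train)
pipe.score(test_set, y_test)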