In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas

np.random.seed(42)
In [2]:
pd.options.display.max_columns = 999

Lecture 12A: Predictive Modeling Part 2¶

Nov 21, 2022

Housekeeping¶

  • HW #6 (last optional HW) due on Wednesday
  • Important: No class on Wednesday due to Thanksgiving holiday
  • HW #7 (required) will be posted this Wednesday, due on Wednesday 12/07
    • Includes final project proposal
    • Predictive modeling of housing prices in Philadelphia
  • Final project due on Tuesday December 20

This week: Predictive modeling continued¶

Focus: much more hands-on experience with feature engineering and adding spatially based features

  • Part 1: Housing price modeling continued
  • Part 2: Predicting bikeshare demand in Philadelphia

Last time¶

  • An introduction to supervised learning and regression with scikit learn
  • Example: modeling housing prices in Philadelphia
  • Key concepts:
    • Linear regression
    • Ridge regression with regularization
    • Test/train split and $k$-fold cross validation
    • Feature engineering
      • Scaling input features
      • Adding polynomial features
      • One-hot encoding for categorical variables
    • Decision trees and random forests

First, let's set up all of the imports we'll need from scikit learn:

In [3]:
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Pipelines
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

Review: Predicting housing prices in Philadelphia¶

Load data from the Office of Property Assessment¶

Let's download data for single-family properties in Philadelphia that had their last sale during 2021.

Sources:

  • OpenDataPhilly
  • Metadata
In [4]:
import carto2gpd
In [5]:
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"

# The table name
table_name = "opa_properties_public"

# Only pull 2021 sales for single family residential properties
where = "sale_date >= '2021-01-01' and sale_date <= '2021-12-31'"
where = where + " and category_code_description IN ('SINGLE FAMILY', 'Single Family')"

# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)

# Optional: put it in a reproducible order for test/training splits later
salesRaw = salesRaw.sort_values("parcel_number")
In [6]:
salesRaw.head()
Out[6]:
[Output: the first five rows of salesRaw — a GeoDataFrame with 70+ columns per property, including geometry, sale_price, sale_date, market_value, total_livable_area, exterior_condition, and zip_code]
In [7]:
len(salesRaw)
Out[7]:
25706
In [8]:
# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "zip_code",
]

# Trim to these columns and remove NaNs
sales = salesRaw[cols + ["geometry"]].dropna()

# Trim zip code to only the first five digits
sales["zip_code"] = sales["zip_code"].astype(str).str.slice(0, 5)
In [9]:
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
In [10]:
len(sales)
Out[10]:
19294

Let's focus on numerical features only first¶

In [11]:
# Split the data 70/30
train_set, test_set = train_test_split(sales, 
                                       test_size=0.3, 
                                       random_state=42)

# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])

# The features
feature_cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values

Run a linear regression model as a baseline:

In [12]:
# Make a linear model pipeline
linear_pipeline = make_pipeline(StandardScaler(), LinearRegression())

# Fit on the training data
linear_pipeline.fit(X_train, y_train)

# What's the test score?
linear_pipeline.score(X_test, y_test)
Out[12]:
0.23452440229126648

Run cross-validation on a random forest model:

In [13]:
# Make a random forest pipeline
forest_pipeline = make_pipeline(
    StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)

# Run the 10-fold cross validation
scores = cross_val_score(
    forest_pipeline,
    X_train,
    y_train,
    cv=10,
)

# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores =  [0.39981407 0.33859844 0.35040232 0.35257309 0.32632646 0.38019832
 0.3111287  0.37412471 0.34844108 0.34621604]
Scores mean =  0.35278232253535474
Score std dev =  0.024744515194105168
In [14]:
# Fit on the training data
forest_pipeline.fit(X_train, y_train)

# What's the test score?
forest_pipeline.score(X_test, y_test)
Out[14]:
0.34475795326957004

Test score improved!¶

The model appears to generalize reasonably well

Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
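As a quick illustration, here is a minimal sketch of tuning the forest with the GridSearchCV object we imported earlier (the grid values are illustrative choices, not tuned recommendations):

In [ ]:
# A minimal sketch: grid search over random forest hyperparameters
# NOTE: make_pipeline names each step after its lowercased class name,
# so pipeline parameters are prefixed with "randomforestregressor__"
param_grid = {
    "randomforestregressor__n_estimators": [50, 100, 200],
    "randomforestregressor__max_depth": [None, 10, 20],
}

# 3-fold CV over the grid, fit on the training data
grid = GridSearchCV(forest_pipeline, param_grid, cv=3)
grid.fit(X_train, y_train)

print("best params = ", grid.best_params_)
print("best CV score = ", grid.best_score_)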

Which variables were most important?¶

In [15]:
# Extract the regressor from the pipeline
forest_model = forest_pipeline["randomforestregressor"]
In [16]:
# Create the data frame of importances
importance = pd.DataFrame(
    {"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")


importance.hvplot.barh(x="Feature", y="Importance")
Out[16]:

On to new material...¶

How to handle categorical data?¶

We can use a technique called one-hot encoding

Steps:

  • Create a new column for each category
  • Represent each category as a vector of 1s and 0s

One-hot encoding in scikit learn¶

  • The OneHotEncoder object is a preprocessor that will perform the vectorization step
  • The ColumnTransformer object will help us apply different transformers to numerical and categorical columns
In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

Let's try it out on some example color data:

In [18]:
# Example data of colors
colors = np.array(["red", "green", "blue", "red"])
colors = colors[:, np.newaxis]
In [19]:
colors.shape
Out[19]:
(4, 1)
In [20]:
colors
Out[20]:
array([['red'],
       ['green'],
       ['blue'],
       ['red']], dtype='<U5')
In [21]:
# Initialize the OHE transformer
ohe = OneHotEncoder()

# Fit the transformer and then transform the colors 
ohe.fit_transform(colors).toarray()
Out[21]:
array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])
In [22]:
# The corresponding category for each column
ohe.categories_
Out[22]:
[array(['blue', 'green', 'red'], dtype='<U5')]

Let's apply separate transformers for our numerical and categorical columns:

In [23]:
# Numerical columns
num_cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
]

# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
In [24]:
# Set up the column transformer with two transformers
# ----> Scale the numerical columns
# ----> One-hot encode the categorical columns

transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

Note: the handle_unknown='ignore' parameter ensures that if categories show up in our test set, but not our training set, no error will be raised; the unseen category is simply encoded as all zeros.
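To see what "ignore" does in practice, here is a minimal sketch using the color data from above (the "purple" value is a made-up unseen category):

In [ ]:
# A minimal sketch: an unknown category encodes to a row of all zeros
demo_ohe = OneHotEncoder(handle_unknown="ignore")
demo_ohe.fit(np.array([["red"], ["green"], ["blue"]]))

# "purple" was never seen during fit(), so no error is raised and
# the encoded row is [0., 0., 0.]
demo_ohe.transform(np.array([["purple"]])).toarray()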

Initialize the pipeline object, using the column transformer and the random forest regressor

In [25]:
# Initialize the pipeline
# NOTE: only use 10 estimators here so it will run in a reasonable time
pipe = make_pipeline(
    transformer, RandomForestRegressor(n_estimators=10, 
                                       random_state=42)
)

Now, let's fit the model.

Important!¶

  • You must pass in the full training set and test set DataFrames: train_set and test_set
  • No need to create the X_train and X_test numpy arrays.
  • We told scikit learn which column strings to extract in the ColumnTransformer, so it needs the DataFrame with named columns.
In [26]:
# Fit the training set
pipe.fit(train_set, y_train);
In [27]:
# What's the test score?
pipe.score(test_set, y_test)
Out[27]:
0.5781569592950195

Substantial improvement on test set when including ZIP codes¶

$R^2$ of ~0.34 improved to $R^2$ of ~0.58!

Takeaway: neighborhood-based effects play a crucial role in determining housing prices.

Side Note: to fully validate the model we should run $k$-fold cross validation and optimize hyperparameters of the model as well...

This will be part of assignment #7
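As a preview, the cross-validation step looks just like before, except we pass the full training DataFrame so the ColumnTransformer can select columns by name. A minimal sketch (5 folds chosen to keep the runtime reasonable):

In [ ]:
# A minimal sketch: k-fold cross-validation of the full pipeline
# NOTE: pass the DataFrame (not a numpy array) so the ColumnTransformer
# can extract the named columns
scores = cross_val_score(pipe, train_set, y_train, cv=5)

print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())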

But how crucial? Let's plot the importances¶

But first, we need to know the column names! The one-hot encoder created a column for each category type...

In [28]:
# The column transformer...
transformer
Out[28]:
ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['total_livable_area', 'total_area',
                                  'garage_spaces', 'fireplaces',
                                  'number_of_bathrooms', 'number_of_bedrooms',
                                  'number_stories']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['exterior_condition', 'zip_code'])])
In [29]:
# The steps in the column transformer
transformer.named_transformers_
Out[29]:
{'num': StandardScaler(),
 'cat': OneHotEncoder(handle_unknown='ignore'),
 'remainder': 'drop'}
In [30]:
# The one-hot step
ohe = transformer.named_transformers_['cat']

ohe
Out[30]:
OneHotEncoder(handle_unknown='ignore')
In [31]:
# One column for each category type!
ohe_cols = ohe.get_feature_names_out()

ohe_cols
Out[31]:
array(['exterior_condition_0', 'exterior_condition_1',
       'exterior_condition_2', 'exterior_condition_3',
       'exterior_condition_4', 'exterior_condition_5',
       'exterior_condition_6', 'exterior_condition_7', 'zip_code_19102',
       'zip_code_19103', 'zip_code_19104', 'zip_code_19106',
       'zip_code_19107', 'zip_code_19111', 'zip_code_19114',
       'zip_code_19115', 'zip_code_19116', 'zip_code_19118',
       'zip_code_19119', 'zip_code_19120', 'zip_code_19121',
       'zip_code_19122', 'zip_code_19123', 'zip_code_19124',
       'zip_code_19125', 'zip_code_19126', 'zip_code_19127',
       'zip_code_19128', 'zip_code_19129', 'zip_code_19130',
       'zip_code_19131', 'zip_code_19132', 'zip_code_19133',
       'zip_code_19134', 'zip_code_19135', 'zip_code_19136',
       'zip_code_19137', 'zip_code_19138', 'zip_code_19139',
       'zip_code_19140', 'zip_code_19141', 'zip_code_19142',
       'zip_code_19143', 'zip_code_19144', 'zip_code_19145',
       'zip_code_19146', 'zip_code_19147', 'zip_code_19148',
       'zip_code_19149', 'zip_code_19150', 'zip_code_19151',
       'zip_code_19152', 'zip_code_19153', 'zip_code_19154'], dtype=object)
In [32]:
# Full list of columns is numerical + one-hot 
features = num_cols + list(ohe_cols)

features
Out[32]:
['total_livable_area',
 'total_area',
 'garage_spaces',
 'fireplaces',
 'number_of_bathrooms',
 'number_of_bedrooms',
 'number_stories',
 'exterior_condition_0',
 'exterior_condition_1',
 'exterior_condition_2',
 'exterior_condition_3',
 'exterior_condition_4',
 'exterior_condition_5',
 'exterior_condition_6',
 'exterior_condition_7',
 'zip_code_19102',
 'zip_code_19103',
 'zip_code_19104',
 'zip_code_19106',
 'zip_code_19107',
 'zip_code_19111',
 'zip_code_19114',
 'zip_code_19115',
 'zip_code_19116',
 'zip_code_19118',
 'zip_code_19119',
 'zip_code_19120',
 'zip_code_19121',
 'zip_code_19122',
 'zip_code_19123',
 'zip_code_19124',
 'zip_code_19125',
 'zip_code_19126',
 'zip_code_19127',
 'zip_code_19128',
 'zip_code_19129',
 'zip_code_19130',
 'zip_code_19131',
 'zip_code_19132',
 'zip_code_19133',
 'zip_code_19134',
 'zip_code_19135',
 'zip_code_19136',
 'zip_code_19137',
 'zip_code_19138',
 'zip_code_19139',
 'zip_code_19140',
 'zip_code_19141',
 'zip_code_19142',
 'zip_code_19143',
 'zip_code_19144',
 'zip_code_19145',
 'zip_code_19146',
 'zip_code_19147',
 'zip_code_19148',
 'zip_code_19149',
 'zip_code_19150',
 'zip_code_19151',
 'zip_code_19152',
 'zip_code_19153',
 'zip_code_19154']
In [33]:
random_forest = pipe["randomforestregressor"]

# Create the dataframe with importances
importance = pd.DataFrame(
    {"Feature": features, "Importance": random_forest.feature_importances_}
)
In [34]:
importance.head(n=20)
Out[34]:
Feature Importance
0 total_livable_area 0.192483
1 total_area 0.194151
2 garage_spaces 0.010269
3 fireplaces 0.001761
4 number_of_bathrooms 0.132411
5 number_of_bedrooms 0.038099
6 number_stories 0.020997
7 exterior_condition_0 0.000226
8 exterior_condition_1 0.004962
9 exterior_condition_2 0.005188
10 exterior_condition_3 0.012277
11 exterior_condition_4 0.010224
12 exterior_condition_5 0.012266
13 exterior_condition_6 0.005376
14 exterior_condition_7 0.016392
15 zip_code_19102 0.000262
16 zip_code_19103 0.002595
17 zip_code_19104 0.003707
18 zip_code_19106 0.000726
19 zip_code_19107 0.000678
In [35]:
# Sort by importance and get the top 30
# SORT IN DESCENDING ORDER
importance = importance.sort_values("Importance", ascending=False).iloc[:30]

# Plot
importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)
Out[35]:

Takeaways¶

  • Number of bathrooms and area-based features still important
  • ZIP codes in North Philadelphia also important: 19140, 19132, 19134

Interpretation

These North Philadelphia ZIP codes have some of the lowest-valued homes in the city, which are inherently the most difficult to model accurately. It makes sense that, once ZIP code information is included, these areas would improve the most.

Why is feature engineering so important?¶

Garbage in, garbage out

  • What we're trying to do is build the best possible model for a particular thing we care about, e.g., housing price, bikeshare trips, etc
  • Our machine learning models try to translate from some set of input features to the thing we care about
  • You should think of the input features as having all of the same information as the predicted quantity — they are just a different representation

Takeaway: If your input features are poorly designed (for example, completely unrelated to the thing you want to predict), then no matter how good your machine learning model is or how well you "train" it, the model will never be able to do the translation from features to predicted value.

Adding spatial features to the housing price model¶

  • Adding in ZIP code information captures a lot of the neighborhood-based amenity/disamenity properties
  • Can we explicitly add new features that try to capture some of those effects?

Yes, let's add distance-based features

Spatial amenity/disamenity features¶

The strategy

  • Get the data for a certain type of amenity (e.g., restaurants, bars) or disamenity (e.g., crimes)
    • Data sources: 311 requests, crime incidents, OpenStreetMap
  • Use scikit learn's nearest neighbors algorithm to calculate the distance from each sale to its nearest neighbors in the amenity/disamenity datasets

Examples of new possible features...¶

Distance from each sale to:

  • Universities
  • Parks
  • City Hall
  • Subway Stops
  • New Construction Permits
  • Aggravated Assaults
  • Graffiti 311 Calls
  • Abandoned Vehicle 311 Calls

Example #1: 311 Graffiti Calls¶

Source: https://www.opendataphilly.org/dataset/311-service-and-information-requests

Step 1: Download the data from the CARTO database¶

We'll only pull data from 2021.

In [36]:
# the 311 table
table_name = "public_cases_fc"

# Peek at the first row of data
carto2gpd.get(carto_url, table_name, limit=1)
Out[36]:
geometry cartodb_id objectid service_request_id subject status status_notes service_name service_code agency_responsible service_notice requested_datetime updated_datetime expected_datetime closed_datetime address zipcode media_url lat lon
0 POINT (-75.16006 39.94560) 1 13593686 14807730 Graffiti Removal Closed Issue Resolved Graffiti Removal SR-CL01 Community Life Improvement Program 7 Business Days 2022-03-19T17:31:41Z 2022-03-28T07:30:33Z 2022-03-29T20:00:00Z None 316 S 11TH ST None https://d17aqltn7cihbm.cloudfront.net/uploads/... 39.945602 -75.160064
In [37]:
# Select only those for graffiti and in 2021
where_2021 = "requested_datetime >= '01-01-2021' and requested_datetime < '01-01-2022'"
where_graffiti = "service_name = 'Graffiti Removal'"
where = f"{where_2021} and {where_graffiti}"
 
# Pull the subset we want
graffiti = carto2gpd.get(carto_url, table_name, where=where)
In [38]:
# Remove rows with missing geometries
graffiti = graffiti.loc[graffiti.geometry.notnull()]
In [39]:
len(graffiti)
Out[39]:
19472
In [40]:
graffiti.head()
Out[40]:
[Output: the first five rows of graffiti — 311 Graffiti Removal requests with geometry, status, requested_datetime, address, and related columns]

Step 2: Get the x/y coordinates of both datasets¶

We will need to:

  • Convert the CRS to EPSG=3857, since we'll want distances in meters (rather than degrees)
  • Extract the x/y coordinates from the geometry column of each dataset (sales and graffiti calls)
In [41]:
# Do the CRS conversion
sales_3857 = sales.to_crs(epsg=3857)
graffiti_3857 = graffiti.to_crs(epsg=3857)
In [42]:
def get_xy_from_geometry(df):
    """
    Return a numpy array with two columns, where the 
    first holds the `x` geometry coordinate and the second 
    column holds the `y` geometry coordinate
    """
    x = df.geometry.x
    y = df.geometry.y
    
    return np.column_stack((x, y)) # stack as columns
In [43]:
# Extract x/y for sales
salesXY = get_xy_from_geometry(sales_3857)

# Extract x/y for graffiti calls
graffitiXY = get_xy_from_geometry(graffiti_3857)
In [44]:
salesXY.shape
Out[44]:
(19294, 2)
In [45]:
graffitiXY.shape
Out[45]:
(19472, 2)

Step 3: Calculate the nearest neighbor distances¶

For this, we will use the $k$ nearest neighbors algorithm from scikit learn.

For each sale:

  • Find the $k$ nearest neighbors in the second dataset (graffiti calls, crimes, etc)
  • Calculate the average distance from the sale to those $k$ neighbors
In [46]:
from sklearn.neighbors import NearestNeighbors
In [47]:
# STEP 1: Initialize the algorithm
k = 5
nbrs = NearestNeighbors(n_neighbors=k)

# STEP 2: Fit the algorithm on the "neighbors" dataset
nbrs.fit(graffitiXY)

# STEP 3: Get distances for sale to neighbors
grafDists, grafIndices = nbrs.kneighbors(salesXY) 

Note: I am using k=5 here without any real justification. In practice, you would want to try a few different values of k to identify the one that works best; one quick way to explore this is sketched below.
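For example, here is an illustrative sketch that compares a few candidate k values by checking how strongly the resulting feature correlates with the log of sale price (a rough heuristic; the candidate values are arbitrary):

In [ ]:
# An illustrative sketch: compare candidate k values via the correlation
# between the average-distance feature and the target
log_price = np.log(sales["sale_price"].values)

for k_test in [1, 5, 10, 20]:
    nbrs_k = NearestNeighbors(n_neighbors=k_test).fit(graffitiXY)
    dists_k, _ = nbrs_k.kneighbors(salesXY)

    # Log of the average distance (small offset avoids log of zero)
    feature = np.log10(dists_k.mean(axis=1) + 1e-5)

    corr = np.corrcoef(feature, log_price)[0, 1]
    print(f"k = {k_test}: corr with log(sale_price) = {corr:.3f}")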

What did we just calculate?¶

  • grafDists: For each sale, the distances to the 5 nearest graffiti calls
    • This should have 5 columns and the same length as the sales dataset
  • grafIndices: For each sale, the index of each of the 5 neighbors in the original dataset
    • This allows you to access the original 311 graffiti data
In [48]:
print("length of sales = ", len(salesXY))
print("shape of grafDists = ", grafDists.shape)
print("shape of grafIndices = ", grafIndices.shape)
length of sales =  19294
shape of grafDists =  (19294, 5)
shape of grafIndices =  (19294, 5)
In [49]:
# The distances from the first sale to the 5 nearest neighbors
grafDists[0]
Out[49]:
array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])

Can we reproduce these distances?¶

In [50]:
salesXY[0]
Out[50]:
array([-8365316.98040236,  4855961.96674844])
In [51]:
# The coordinates for the first sale
x0, y0 = salesXY[0]
x0, y0
Out[51]:
(-8365316.980402357, 4855961.966748442)
In [52]:
# The indices for the 5 nearest graffiti calls
grafIndices[0]
Out[52]:
array([ 1182,  6051, 11120, 13899, 19127])
In [53]:
# the graffiti neighbors
sale0_neighbors = graffitiXY[grafIndices[0]]
sale0_neighbors
Out[53]:
array([[-8365291.26559998,  4855909.56005081],
       [-8365313.86345662,  4855886.18762381],
       [-8365280.91288734,  4855893.44613563],
       [-8365280.91288734,  4855893.44613563],
       [-8365280.91288734,  4855893.30096534]])
In [54]:
# Access the first and second column for x/y values
neighbors_x = sale0_neighbors[:,0]
neighbors_y = sale0_neighbors[:,1]

# The x/y differences between neighbors and first sale coordinates
dx = (neighbors_x - x0)
dy = (neighbors_y - y0)

# The Euclidean dist
manual_dists = (dx**2 + dy**2) ** 0.5
In [55]:
manual_dists
Out[55]:
array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])
In [56]:
grafDists[0]
Out[56]:
array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])

Use the log of the average distance as the new feature¶

We'll average over the column axis: axis=1

In [57]:
grafDists.mean(axis=1)
Out[57]:
array([ 73.32953543,  72.14817954, 222.62189541, ..., 610.60262287,
       610.60262287,  34.21559157])
In [58]:
# Average distance to neighbors
avgGrafDist = grafDists.mean(axis=1)

# Set zero distances to be small, but nonzero
# IMPORTANT: THIS AVOIDS -INF VALUES WHEN TAKING THE LOG
avgGrafDist[avgGrafDist==0] = 1e-5

# Calculate log of distances
sales['logDistGraffiti'] = np.log10(avgGrafDist)
In [59]:
sales.head()
Out[59]:
sale_price total_livable_area total_area garage_spaces fireplaces number_of_bathrooms number_of_bedrooms number_stories exterior_condition zip_code geometry logDistGraffiti
18473 430000.0 1203.0 779.0 0.0 0.0 1.0 3.0 3.0 2 19147 POINT (-75.14692 39.93129) 1.865279
20663 520000.0 2050.0 1433.0 0.0 0.0 2.0 4.0 2.0 4 19147 POINT (-75.14703 39.93123) 1.858225
2059 600000.0 1625.0 1623.0 1.0 0.0 2.0 3.0 2.0 4 19147 POINT (-75.14854 39.93144) 2.347568
18555 765000.0 2558.0 1400.0 1.0 0.0 0.0 3.0 2.0 1 19147 POINT (-75.14798 39.93012) 2.151381
4024 255000.0 1008.0 546.0 0.0 0.0 1.0 3.0 2.0 4 19147 POINT (-75.14886 39.93012) 2.285802

Let's plot a hex map of the new feature!¶

In [60]:
# Load the City Limits to plot too
import esri2gpd

# From OpenDataPhilly's page
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(url).to_crs(epsg=3857)
In [61]:
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

# Plot the log of the Graffiti distance
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=sales["logDistGraffiti"].values, gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")

Example #2: Subway stops¶

Use the osmnx package to get subway stops in Philly — we can use the ox.geometries_from_polygon() function.

  • To select subway stations, we can use the tag station=subway: see the OSM Wiki
  • See Lecture 9A for a reminder on osmnx!
In [62]:
import osmnx as ox
In [64]:
# Get the geometry from the city limits
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry

city_limits_outline
Out[64]:
(shapely Polygon outlining the Philadelphia city limits)
In [65]:
# Get the subway stops within the city limits
subway = ox.geometries_from_polygon(city_limits_outline, tags={"station": "subway"})

# Convert to 3857 (meters)
subway = subway.to_crs(epsg=3857)

subway.head(n=20)
Out[65]:
[Output: the first 20 rows of subway — OSM subway stations (PATCO and SEPTA) with name, network, operator, station tags, and Point geometry columns]
In [66]:
fig, ax = plt.subplots(figsize=(6,6))

# Plot the subway locations
subway.plot(ax=ax, markersize=10, color='crimson')

# City limits, too
city_limits.plot(ax=ax, facecolor='none', edgecolor='black', linewidth=4)

ax.set_axis_off()

The stops on the Market-Frankford and Broad St. subway lines!

Now, get the distances to the nearest subway stop¶

We'll use $k=1$ to get the distance to the nearest stop.

In [67]:
# STEP 1: x/y coordinates of subway stops (in EPGS=3857)
subwayXY = get_xy_from_geometry(subway.to_crs(epsg=3857))

# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=1)

# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(subwayXY)

# STEP 4: Get distances for sale to neighbors
subwayDists, subwayIndices = nbrs.kneighbors(salesXY)

# STEP 5: add back to the original dataset
sales["logDistSubway"] = np.log10(subwayDists.mean(axis=1))

Let's plot a hex map again!¶

In [68]:
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))

# Plot the log of the subway distance
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=sales['logDistSubway'].values, gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")

Looks like it worked!

What about correlations?¶

Let's have a look at the correlations of numerical columns:

In [69]:
import seaborn as sns
In [70]:
cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "logDistGraffiti", # NEW
    "logDistSubway",  # NEW
    "sale_price"
]
sns.heatmap(sales[cols].corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1);

Now, let's re-run our model...did it help?¶

In [71]:
# Numerical columns
num_cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "logDistGraffiti", # NEW
    "logDistSubway" # NEW
]

# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]
In [72]:
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)
In [73]:
# Initialize the pipeline
# NOTE: only use 20 estimators here so it will run in a reasonable time
pipe = make_pipeline(
    transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)
In [74]:
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)

# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])
In [75]:
# Fit the training set
# REMINDER: use the training dataframe objects here rather than numpy array
pipe.fit(train_set, y_train);
In [76]:
# What's the test score?
# REMINDER: use the test dataframe rather than numpy array
pipe.score(test_set, y_test)
Out[76]:
0.6542810337083442

A small improvement!¶

$R^2$ of ~0.58 improved to $R^2$ of ~0.65

How about the top 30 feature importances now?¶

In [77]:
def plot_feature_importances(pipeline, num_cols, transformer, top=20, **kwargs):
    """
    Utility function to plot the feature importances from the input
    random forest regressor.

    Parameters
    ----------
    pipeline :
        the pipeline object
    num_cols :
        list of the numerical columns
    transformer :
        the transformer preprocessing step
    top : optional
        the number of importances to plot
    **kwargs : optional
        extra keywords passed to the hvplot function
    """
    # The one-hot step
    ohe = transformer.named_transformers_["cat"]

    # One column for each category type!
    ohe_cols = ohe.get_feature_names_out()

    # Full list of columns is numerical + one-hot
    features = num_cols + list(ohe_cols)

    # The regressor
    regressor = pipeline["randomforestregressor"]

    # Create the dataframe with importances
    importance = pd.DataFrame(
        {"Feature": features, "Importance": regressor.feature_importances_}
    )

    # Sort importance in descending order and get the top
    importance = importance.sort_values("Importance", ascending=False).iloc[:top]

    # Plot
    return importance.hvplot.barh(
        x="Feature", y="Importance", flip_yaxis=True, **kwargs
    )
In [78]:
plot_feature_importances(pipe, num_cols, transformer, top=30, height=500)
Out[78]:

Both new spatial features are in the top 5 in terms of importance!¶

Exercise: How about other spatial features?¶

  • I've listed several other potential sources of new distance-based features from OpenDataPhilly below
  • Choose a few and add new features
  • Re-fit the model and evaluate the performance on the test set and the feature importances

Modify the get_xy_from_geometry() function to use the "centroid" of the geometry column.

Note: you can take the centroid of a Point() or Polygon() object. For a Point(), you just get the x/y coordinates back.

In [79]:
def get_xy_from_geometry(df):
    """
    Return a numpy array with two columns, where the 
    first holds the `x` geometry coordinate and the second 
    column holds the `y` geometry coordinate
    """
    # NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries 
    x = df.geometry.centroid.x
    y = df.geometry.centroid.y
    
    return np.column_stack((x, y)) # stack as columns

Example 1: Universities¶

New feature: Distance to the nearest university/college

  • Source: OpenDataPhilly
  • GeoService URL: https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Universities_Colleges/FeatureServer/0
In [80]:
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Universities_Colleges/FeatureServer/0"
univs = esri2gpd.get(url)

# Get the X/Y
univXY = get_xy_from_geometry(univs.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(univXY)
univDists, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales['logDistUniv'] = np.log10(univDists.mean(axis=1))
In [81]:
fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))

x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=np.log10(univDists.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest University/College", fontsize=16, color='white');

Example 2: Parks¶

New feature: Distance to the nearest park centroid

  • Source: OpenDataPhilly
  • GeoService URL: https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0

Notes

  • The park geometries are polygons, so you'll need to get the x and y coordinates of the park centroids and calculate the distance to these centroids.
  • You can use the geometry.centroid.x and geometry.centroid.y values to access these coordinates.
In [82]:
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0"
parks = esri2gpd.get(url)

# Get the X/Y
parksXY = get_xy_from_geometry(parks.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(parksXY)
parksDists, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales["logDistParks"] = np.log10(parksDists.mean(axis=1))
In [83]:
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(parksDists.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest Park", fontsize=16, color="white");

City Hall¶

New feature: Distance to City Hall.

  • Source: OpenDataPhilly
  • GeoService URL: https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/CITY_LANDMARKS/FeatureServer/0

Notes

  • To identify City Hall, you'll need to pull data where "NAME='City Hall'" and "FEAT_TYPE='Municipal Building'"
  • As with the parks, the geometry will be a polygon, so you should calculate the distance to the centroid of the City Hall polygon
In [84]:
# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/CITY_LANDMARKS/FeatureServer/0"
cityHall = esri2gpd.get(
    url, where="NAME = 'City Hall' AND FEAT_TYPE = 'Municipal Building'"
)

# Get the X/Y
cityHallXY = get_xy_from_geometry(cityHall.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(cityHallXY)
cityHallDist, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales["logDistCityHall"] = np.log10(cityHallDist.mean(axis=1))
In [85]:
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(cityHallDist.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to City Hall", fontsize=16, color="white");

Residential Construction Permits¶

New feature: Distance to the 5 nearest residential construction permits from 2021

  • Source: OpenDataPhilly
  • CARTO table name: "permits"

Notes

  • You can pull residential building permits only by selecting where permitdescription equals 'RESIDENTIAL BUILDING PERMIT'
  • You can select permits from only 2021 using the permitissuedate column
In [86]:
# Table name
table_name = "permits"

# Where clause
where = "permitissuedate >= '2021-01-01' AND permitissuedate < '2022-01-01'"
where = where + " AND permitdescription='RESIDENTIAL BUILDING PERMIT'"

# Query
permits = carto2gpd.get(carto_url, table_name, where=where)

# Remove missing
permits = permits.loc[permits.geometry.notnull()]

# Get the X/Y
permitsXY = get_xy_from_geometry(permits.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(permitsXY)
permitsDist, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales["logDistPermits"] = np.log10(permitsDist.mean(axis=1))
In [87]:
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(permitsDist.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Building Permits", fontsize=16, color="white");

Aggravated Assaults¶

New feature: Distance to the 5 nearest aggravated assaults in 2021

  • Source: OpenDataPhilly
  • CARTO table name: "incidents_part1_part2"

Notes

  • You can pull aggravated assaults only by selecting where Text_General_Code equals 'Aggravated Assault No Firearm' or 'Aggravated Assault Firearm'
  • You can select crimes from only 2021 using the dispatch_date column
In [88]:
# Table name
table_name = "incidents_part1_part2"

# Where selection
where = "dispatch_date >= '2021-01-01' AND dispatch_date < '2022-01-01'"
where = where + " AND Text_General_Code IN ('Aggravated Assault No Firearm', 'Aggravated Assault Firearm')"

# Query
assaults = carto2gpd.get(carto_url, table_name, where=where)

# Remove missing 
assaults = assaults.loc[assaults.geometry.notnull()]
    
# Get the X/Y
assaultsXY = get_xy_from_geometry(assaults.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(assaultsXY)
assaultDists, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales['logDistAssaults'] = np.log10(assaultDists.mean(axis=1))
In [89]:
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(assaultDists.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Assaults", fontsize=16, color="white");

Abandoned Vehicle 311 Calls¶

New feature: Distance to the 5 nearest abandoned vehicle 311 calls in 2021

  • Source: OpenDataPhilly
  • CARTO table name: "public_cases_fc"

Notes

  • You can pull abandoned vehicle calls only by selecting where service_name equals 'Abandoned Vehicle'
  • You can select calls from only 2021 using the requested_datetime column
In [90]:
# Table name
table_name = "public_cases_fc"

# Where selection
where = "requested_datetime >= '2021-01-01' AND requested_datetime < '2022-01-01'"
where = where + " AND service_name = 'Abandoned Vehicle'"

# Query
cars = carto2gpd.get(carto_url, table_name, where=where)

# Remove missing
cars = cars.loc[cars.geometry.notnull()]

# Get the X/Y
carsXY = get_xy_from_geometry(cars.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(carsXY)
carDists, _ = nbrs.kneighbors(salesXY)

# Handle any sales that have 0 distances
carDists[carDists == 0] = 1e-5  # a small, arbitrary value

# Add the new feature
sales["logDistCars"] = np.log10(carDists.mean(axis=1))
In [91]:
fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(carDists.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title(
    "Distance to 5 Closest Abandoned Vehichle 311 Calls", fontsize=16, color="white"
);