from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas

np.random.seed(42)


pd.options.display.max_columns = 999


# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Pipelines
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures


import carto2gpd


# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"

# The table name
table_name = "opa_properties_public"

# Only pull 2021 sales for single family residential properties
where = "sale_date >= '2021-01-01' and sale_date <= '2021-12-31'"
where = where + " and category_code_description IN ('SINGLE FAMILY', 'Single Family')"

# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)

# Optional: put it a reproducible order for test/training splits later
salesRaw = salesRaw.sort_values("parcel_number")


salesRaw.head()


len(salesRaw)

25706


# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "zip_code",
]

# Trim to these columns and remove NaNs
sales = salesRaw[cols + ["geometry"]].dropna()

# Trim zip code to only the first five digits
sales["zip_code"] = sales["zip_code"].astype(str).str.slice(0, 5)


# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]


len(sales)

19294


# Split the data 70/30
train_set, test_set = train_test_split(sales, 
                                       test_size=0.3, 
                                       random_state=42)

# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])

# The features
feature_cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values


# Make a linear model pipeline
linear_pipeline = make_pipeline(StandardScaler(), LinearRegression())

# Fit on the training data
linear_pipeline.fit(X_train, y_train)

# What's the test score?
linear_pipeline.score(X_test, y_test)

0.23452440229126648


# Make a random forest pipeline
forest_pipeline = make_pipeline(
    StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)

# Run the 10-fold cross validation
scores = cross_val_score(
    forest_pipeline,
    X_train,
    y_train,
    cv=10,
)

# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())

R^2 scores =  [0.39981407 0.33859844 0.35040232 0.35257309 0.32632646 0.38019832
 0.3111287  0.37412471 0.34844108 0.34621604]
Scores mean =  0.35278232253535474
Score std dev =  0.024744515194105168


# Fit on the training data
forest_pipeline.fit(X_train, y_train)

# What's the test score?
forest_pipeline.score(X_test, y_test)

0.34475795326957004


# Extract the regressor from the pipeline
forest_model = forest_pipeline["randomforestregressor"]


# Create the data frame of importances
importance = pd.DataFrame(
    {"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance")


importance.hvplot.barh(x="Feature", y="Importance")


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Example data of colors
colors = np.array(["red", "green", "blue", "red"])
colors = colors[:, np.newaxis]


colors.shape

(4, 1)


colors

array([['red'],
       ['green'],
       ['blue'],
       ['red']], dtype='<U5')


# Initialize the OHE transformer
ohe = OneHotEncoder()

# Fit the transformer and then transform the colors 
ohe.fit_transform(colors).toarray()

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])


# The corresponding category for each column
ohe.categories_

[array(['blue', 'green', 'red'], dtype='<U5')]


# Numerical columns
num_cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
]

# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]


# Set up the column transformer with two transformers
# ----> Scale the numerical columns
# ----> One-hot encode the categorical columns

transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)


# Initialize the pipeline
# NOTE: only use 10 estimators here so it will run in a reasonable time
pipe = make_pipeline(
    transformer, RandomForestRegressor(n_estimators=10, 
                                       random_state=42)
)


# Fit the training set
pipe.fit(train_set, y_train);


# What's the test score?
pipe.score(test_set, y_test)

0.5781569592950195


# The column transformer...
transformer

ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['total_livable_area', 'total_area',
                                  'garage_spaces', 'fireplaces',
                                  'number_of_bathrooms', 'number_of_bedrooms',
                                  'number_stories']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['exterior_condition', 'zip_code'])])

ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['total_livable_area', 'total_area',
                                  'garage_spaces', 'fireplaces',
                                  'number_of_bathrooms', 'number_of_bedrooms',
                                  'number_stories']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['exterior_condition', 'zip_code'])])

['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories']

StandardScaler()

['exterior_condition', 'zip_code']

OneHotEncoder(handle_unknown='ignore')


# The steps in the column transformer
transformer.named_transformers_

{'num': StandardScaler(),
 'cat': OneHotEncoder(handle_unknown='ignore'),
 'remainder': 'drop'}


# The one-hot step
ohe = transformer.named_transformers_['cat']

ohe

OneHotEncoder(handle_unknown='ignore')

OneHotEncoder(handle_unknown='ignore')


# One column for each category type!
ohe_cols = ohe.get_feature_names_out()

ohe_cols

array(['exterior_condition_0', 'exterior_condition_1',
       'exterior_condition_2', 'exterior_condition_3',
       'exterior_condition_4', 'exterior_condition_5',
       'exterior_condition_6', 'exterior_condition_7', 'zip_code_19102',
       'zip_code_19103', 'zip_code_19104', 'zip_code_19106',
       'zip_code_19107', 'zip_code_19111', 'zip_code_19114',
       'zip_code_19115', 'zip_code_19116', 'zip_code_19118',
       'zip_code_19119', 'zip_code_19120', 'zip_code_19121',
       'zip_code_19122', 'zip_code_19123', 'zip_code_19124',
       'zip_code_19125', 'zip_code_19126', 'zip_code_19127',
       'zip_code_19128', 'zip_code_19129', 'zip_code_19130',
       'zip_code_19131', 'zip_code_19132', 'zip_code_19133',
       'zip_code_19134', 'zip_code_19135', 'zip_code_19136',
       'zip_code_19137', 'zip_code_19138', 'zip_code_19139',
       'zip_code_19140', 'zip_code_19141', 'zip_code_19142',
       'zip_code_19143', 'zip_code_19144', 'zip_code_19145',
       'zip_code_19146', 'zip_code_19147', 'zip_code_19148',
       'zip_code_19149', 'zip_code_19150', 'zip_code_19151',
       'zip_code_19152', 'zip_code_19153', 'zip_code_19154'], dtype=object)


# Full list of columns is numerical + one-hot 
features = num_cols + list(ohe_cols)

features

['total_livable_area',
 'total_area',
 'garage_spaces',
 'fireplaces',
 'number_of_bathrooms',
 'number_of_bedrooms',
 'number_stories',
 'exterior_condition_0',
 'exterior_condition_1',
 'exterior_condition_2',
 'exterior_condition_3',
 'exterior_condition_4',
 'exterior_condition_5',
 'exterior_condition_6',
 'exterior_condition_7',
 'zip_code_19102',
 'zip_code_19103',
 'zip_code_19104',
 'zip_code_19106',
 'zip_code_19107',
 'zip_code_19111',
 'zip_code_19114',
 'zip_code_19115',
 'zip_code_19116',
 'zip_code_19118',
 'zip_code_19119',
 'zip_code_19120',
 'zip_code_19121',
 'zip_code_19122',
 'zip_code_19123',
 'zip_code_19124',
 'zip_code_19125',
 'zip_code_19126',
 'zip_code_19127',
 'zip_code_19128',
 'zip_code_19129',
 'zip_code_19130',
 'zip_code_19131',
 'zip_code_19132',
 'zip_code_19133',
 'zip_code_19134',
 'zip_code_19135',
 'zip_code_19136',
 'zip_code_19137',
 'zip_code_19138',
 'zip_code_19139',
 'zip_code_19140',
 'zip_code_19141',
 'zip_code_19142',
 'zip_code_19143',
 'zip_code_19144',
 'zip_code_19145',
 'zip_code_19146',
 'zip_code_19147',
 'zip_code_19148',
 'zip_code_19149',
 'zip_code_19150',
 'zip_code_19151',
 'zip_code_19152',
 'zip_code_19153',
 'zip_code_19154']


random_forest = pipe["randomforestregressor"]

# Create the dataframe with importances
importance = pd.DataFrame(
    {"Feature": features, "Importance": random_forest.feature_importances_}
)


importance.head(n=20)


# Sort by importance and get the top 30
# SORT IN DESCENDING ORDER
importance = importance.sort_values("Importance", ascending=False).iloc[:30]

# Plot
importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)


# the 311 table
table_name = "public_cases_fc"

# Peak at the first row of data
carto2gpd.get(carto_url, table_name, limit=1)


# Select only those for grafitti and in 2021
where_2019 = "requested_datetime >= '01-01-2021' and requested_datetime < '01-01-2022'"
where_grafitti = "service_name = 'Graffiti Removal'"
where = f"{where_2019} and {where_grafitti}"
 
# Pull the subset we want
graffiti = carto2gpd.get(carto_url, table_name, where=where)


# Remove rows with missing geometries
graffiti = graffiti.loc[graffiti.geometry.notnull()]


len(graffiti)

19472


graffiti.head()


# Do the CRS conversion
sales_3857 = sales.to_crs(epsg=3857)
graffiti_3857 = graffiti.to_crs(epsg=3857)


def get_xy_from_geometry(df):
    """
    Return a numpy array with two columns, where the 
    first holds the `x` geometry coordinate and the second 
    column holds the `y` geometry coordinate
    """
    x = df.geometry.x
    y = df.geometry.y
    
    return np.column_stack((x, y)) # stack as columns


# Extract x/y for sales
salesXY = get_xy_from_geometry(sales_3857)

# Extract x/y for grafitti calls
graffitiXY = get_xy_from_geometry(graffiti_3857)


salesXY.shape

(19294, 2)


graffitiXY.shape

(19472, 2)


from sklearn.neighbors import NearestNeighbors


# STEP 1: Initialize the algorithm
k = 5
nbrs = NearestNeighbors(n_neighbors=k)

# STEP 2: Fit the algorithm on the "neighbors" dataset
nbrs.fit(graffitiXY)

# STEP 3: Get distances for sale to neighbors
grafDists, grafIndices = nbrs.kneighbors(salesXY)


print("length of sales = ", len(salesXY))
print("shape of grafDists = ", grafDists.shape)
print("shape of grafIndices = ", grafIndices.shape)

length of sales =  19294
shape of grafDists =  (19294, 5)
shape of grafIndices =  (19294, 5)


# The distances from the first sale to the 5 nearest neighbors
grafDists[0]

array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])


salesXY[0]

array([-8365316.98040236,  4855961.96674844])


# The coordinates for the first sale
x0, y0 = salesXY[0]
x0, y0

(-8365316.980402357, 4855961.966748442)


# The indices for the 5 nearest graffiti calls
grafIndices[0]

array([ 1182,  6051, 11120, 13899, 19127])


# the graffiti neighbors
sale0_neighbors = graffitiXY[grafIndices[0]]
sale0_neighbors

array([[-8365291.26559998,  4855909.56005081],
       [-8365313.86345662,  4855886.18762381],
       [-8365280.91288734,  4855893.44613563],
       [-8365280.91288734,  4855893.44613563],
       [-8365280.91288734,  4855893.30096534]])


# Access the first and second column for x/y values
neighbors_x = sale0_neighbors[:,0]
neighbors_y = sale0_neighbors[:,1]

# The x/y differences between neighbors and first sale coordinates
dx = (neighbors_x - x0)
dy = (neighbors_y - y0)

# The Euclidean dist
manual_dists = (dx**2 + dy**2) ** 0.5


manual_dists

array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])


grafDists[0]

array([58.37562006, 75.84320062, 77.43345543, 77.43345543, 77.56194562])


grafDists.mean(axis=1)

array([ 73.32953543,  72.14817954, 222.62189541, ..., 610.60262287,
       610.60262287,  34.21559157])


# Average distance to neighbors
avgGrafDist = grafDists.mean(axis=1)

# Set zero distances to be small, but nonzero
# IMPORTANT: THIS WILL AVOID INF DISTANCES WHEN DOING THE LOG
avgGrafDist[avgGrafDist==0] = 1e-5

# Calculate log of distances
sales['logDistGraffiti'] = np.log10(avgGrafDist)


sales.head()


# Load the City Limits to plot too
import esri2gpd

# From OpenDataPhilly's page
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/City_Limits/FeatureServer/0"
city_limits = esri2gpd.get(url).to_crs(epsg=3857)


fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

# Plot the log of the Graffiti distance
x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=sales["logDistGraffiti"].values, gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")


import osmnx as ox


 city_limits.to_crs(epsg=4326).squeeze().geometry


# Get the geometry from the city limits
city_limits_outline = city_limits.to_crs(epsg=4326).squeeze().geometry

city_limits_outline


# Get the subway stops within the city limits
subway = ox.geometries_from_polygon(city_limits_outline, tags={"station": "subway"})

# Convert to 3857 (meters)
subway = subway.to_crs(epsg=3857)

subway.head(n=20)


fig, ax = plt.subplots(figsize=(6,6))

# Plot the subway locations
subway.plot(ax=ax, markersize=10, color='crimson')

# City limits, too
city_limits.plot(ax=ax, facecolor='none', edgecolor='black', linewidth=4)

ax.set_axis_off()


# STEP 1: x/y coordinates of subway stops (in EPGS=3857)
subwayXY = get_xy_from_geometry(subway.to_crs(epsg=3857))

# STEP 2: Initialize the algorithm
nbrs = NearestNeighbors(n_neighbors=1)

# STEP 3: Fit the algorithm on the "neighbors" dataset
nbrs.fit(subwayXY)

# STEP 4: Get distances for sale to neighbors
subwayDists, subwayIndices = nbrs.kneighbors(salesXY)

# STEP 5: add back to the original dataset
sales["logDistSubway"] = np.log10(subwayDists.mean(axis=1))


fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))

# Plot the log of the subway distance
x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=sales['logDistSubway'].values, gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")


import seaborn as sns


cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "logDistGraffiti", # NEW
    "logDistSubway",  # NEW
    "sale_price"
]
sns.heatmap(sales[cols].corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1);


# Numerical columns
num_cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "logDistGraffiti", # NEW
    "logDistSubway" # NEW
]

# Categorical columns
cat_cols = ["exterior_condition", "zip_code"]


# Set up the column transformer with two transformers
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)


# Initialize the pipeline
# NOTE: only use 20 estimators here so it will run in a reasonable time
pipe = make_pipeline(
    transformer, RandomForestRegressor(n_estimators=20, random_state=42)
)


# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)

# the target labels
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])


# Fit the training set
# REMINDER: use the training dataframe objects here rather than numpy array
pipe.fit(train_set, y_train);


# What's the test score?
# REMINDER: use the test dataframe rather than numpy array
pipe.score(test_set, y_test)

0.6542810337083442


def plot_feature_importances(pipeline, num_cols, transformer, top=20, **kwargs):
    """
    Utility function to plot the feature importances from the input
    random forest regressor.

    Parameters
    ----------
    pipeline :
        the pipeline object
    num_cols :
        list of the numerical columns
    transformer :
        the transformer preprocessing step
    top : optional
        the number of importances to plot
    **kwargs : optional
        extra keywords passed to the hvplot function
    """
    # The one-hot step
    ohe = transformer.named_transformers_["cat"]

    # One column for each category type!
    ohe_cols = ohe.get_feature_names_out()

    # Full list of columns is numerical + one-hot
    features = num_cols + list(ohe_cols)

    # The regressor
    regressor = pipeline["randomforestregressor"]

    # Create the dataframe with importances
    importance = pd.DataFrame(
        {"Feature": features, "Importance": regressor.feature_importances_}
    )

    # Sort importance in descending order and get the top
    importance = importance.sort_values("Importance", ascending=False).iloc[:top]

    # Plot
    return importance.hvplot.barh(
        x="Feature", y="Importance", flip_yaxis=True, **kwargs
    )


plot_feature_importances(pipe, num_cols, transformer, top=30, height=500)


def get_xy_from_geometry(df):
    """
    Return a numpy array with two columns, where the 
    first holds the `x` geometry coordinate and the second 
    column holds the `y` geometry coordinate
    """
    # NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries 
    x = df.geometry.centroid.x
    y = df.geometry.centroid.y
    
    return np.column_stack((x, y)) # stack as columns


# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/Universities_Colleges/FeatureServer/0"
univs = esri2gpd.get(url)

# Get the X/Y
univXY = get_xy_from_geometry(univs.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(univXY)
univDists, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales['logDistUniv'] = np.log10(univDists.mean(axis=1))


fig, ax = plt.subplots(figsize=(10,10), facecolor=plt.get_cmap('viridis')(0))

x = salesXY[:,0]
y = salesXY[:,1]
ax.hexbin(x, y, C=np.log10(univDists.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor='none', edgecolor='white', linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest University/College", fontsize=16, color='white');


# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0"
parks = esri2gpd.get(url)

# Get the X/Y
parksXY = get_xy_from_geometry(parks.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(parksXY)
parksDists, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales["logDistParks"] = np.log10(parksDists.mean(axis=1))


fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(parksDists.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to Nearest Park", fontsize=16, color="white");


# Get the data
url = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/CITY_LANDMARKS/FeatureServer/0"
cityHall = esri2gpd.get(
    url, where="NAME = 'City Hall' AND FEAT_TYPE = 'Municipal Building'"
)

# Get the X/Y
cityHallXY = get_xy_from_geometry(cityHall.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(cityHallXY)
cityHallDist, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales["logDistCityHall"] = np.log10(cityHallDist.mean(axis=1))


fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(cityHallDist.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to City Hall", fontsize=16, color="white");


# Table name
table_name = "permits"

# Where clause
where = "permitissuedate >= '2021-01-01' AND permitissuedate < '2022-01-01'"
where = where + " AND permitdescription='RESIDENTIAL BUILDING PERMIT'"

# Query
permits = carto2gpd.get(carto_url, table_name, where=where)

# Remove missing
permits = permits.loc[permits.geometry.notnull()]

# Get the X/Y
permitsXY = get_xy_from_geometry(permits.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(permitsXY)
permitsDist, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales["logDistPermits"] = np.log10(permitsDist.mean(axis=1))


fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(permitsDist.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Building Permits", fontsize=16, color="white");


# Table name
table_name = "incidents_part1_part2"

# Where selection
where = "dispatch_date >= '2021-01-01' AND dispatch_date < '2022-01-01'"
where = where + " AND Text_General_Code IN ('Aggravated Assault No Firearm', 'Aggravated Assault Firearm')"

# Query
assaults = carto2gpd.get(carto_url, table_name, where=where)

# Remove missing 
assaults = assaults.loc[assaults.geometry.notnull()]
    
# Get the X/Y
assaultsXY = get_xy_from_geometry(assaults.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(assaultsXY)
assaultDists, _ = nbrs.kneighbors(salesXY)

# Add the new feature
sales['logDistAssaults'] = np.log10(assaultDists.mean(axis=1))


fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(assaultDists.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title("Distance to 5 Closest Assaults", fontsize=16, color="white");


# Table name
table_name = "public_cases_fc"

# Where selection
where = "requested_datetime >= '2021-01-01' AND requested_datetime < '2022-01-01'"
where = where + " AND service_name = 'Abandoned Vehicle'"

# Query
cars = carto2gpd.get(carto_url, table_name, where=where)

# Remove missing
cars = cars.loc[cars.geometry.notnull()]

# Get the X/Y
carsXY = get_xy_from_geometry(cars.to_crs(epsg=3857))

# Run the k nearest algorithm
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(carsXY)
carDists, _ = nbrs.kneighbors(salesXY)

# Handle any sales that have 0 distances
carDists[carDists == 0] = 1e-5  # a small, arbitrary value

# Add the new feature
sales["logDistCars"] = np.log10(carDists.mean(axis=1))


fig, ax = plt.subplots(figsize=(10, 10), facecolor=plt.get_cmap("viridis")(0))

x = salesXY[:, 0]
y = salesXY[:, 1]
ax.hexbin(x, y, C=np.log10(carDists.mean(axis=1)), gridsize=60)

# Plot the city limits
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=4)

ax.set_axis_off()
ax.set_aspect("equal")
ax.set_title(
    "Distance to 5 Closest Abandoned Vehichle 311 Calls", fontsize=16, color="white"
);

sys:1: ResourceWarning: Unclosed socket <zmq.Socket(zmq.PUSH) at 0x1068d8a60>

	geometry	cartodb_id	assessment_date	basements	beginning_point	book_and_page	building_code	building_code_description	category_code	category_code_description	census_tract	central_air	cross_reference	date_exterior_condition	depth	exempt_building	exterior_condition	frontage	fuel	garage_spaces	garage_type	general_construction	geographic_ward	homestead_exemption	house_extension	house_number	interior_condition	location	mailing_address_1	mailing_address_2	mailing_care_of	mailing_city_state	mailing_street	mailing_zip	market_value	market_value_date	number_of_bathrooms	number_of_bedrooms	number_of_rooms	number_stories	off_street_open	other_building	owner_1	owner_2	parcel_number	parcel_shape	quality_grade	recording_date	registry_number	sale_date	sale_price	separate_utilities	sewer	site_type	state_code	street_code	street_designation	street_direction	street_name	suffix	taxable_building	taxable_land	topography	total_area	total_livable_area	type_heater	unfinished	unit	utility	view_type	year_built	year_built_estimate	zip_code	zoning	pin	objectid
18473	POINT (-75.14692 39.93129)	41645	2021-07-16T00:00:00Z	D	15D94 W HOWARD ST	53843064	O50	ROW 3 STY MASONRY	1	SINGLE FAMILY	27	Y	None	None	49.0	80000	2	16.0	None	0.0	None	A	1	80000.0	None	110	2	110 WHARTON ST	None	None	None	PHILADELPHIA PA	110 WHARTON ST	19147-5425	379600	None	1.0	3.0	NaN	3.0	770.0	None	FINLEY BRIAN	None	011000700	E	C	2021-06-05T00:00:00Z	009S170046	2021-04-02T00:00:00Z	430000.0	None	None	None	PA	82740	ST	None	WHARTON	None	223680	75920	F	779.0	1203.0	H	None	None	None	I	1920	Y	19147	RSA5	1001563065	226380222
20663	POINT (-75.14703 39.93123)	44294	2021-07-16T00:00:00Z	C	45'2" W HOWARD ST	53834792	O50	ROW 3 STY MASONRY	1	SINGLE FAMILY	27	Y	None	None	100.0	0	4	14.0	A	0.0	0	A	1	0.0	None	114	3	114 WHARTON ST	SIMPLIFILE LC E-RECORDING	None	None	PHILADELPHIA PA	114 WHARTON ST	19147-5425	383000	None	2.0	4.0	NaN	2.0	770.0	None	NICOLO KATIE M	NICOLO THOMAS E	011000900	E	C+	2021-05-19T00:00:00Z	009S170129	2021-03-01T00:00:00Z	520000.0	None	0	None	PA	82740	ST	None	WHARTON	None	306400	76600	F	1433.0	2050.0	A	None	None	None	I	1920	Y	19147	RSA5	1001563070	226381406
2059	POINT (-75.14854 39.93144)	21596	2021-07-16T00:00:00Z	0	54'7" E OF AMERICAN	53958642	R30	ROW B/GAR 2 STY MASONRY	1	SINGLE FAMILY	27	Y	None	None	90.0	80000	4	18.0	None	1.0	None	A	1	80000.0	None	222	4	222 WHARTON ST	SIMPLIFILE LC E-RECORDING	None	None	PHILADELPHIA PA	222 WHARTON ST	19147-5336	365600	None	2.0	3.0	NaN	2.0	711.0	None	CELLMER TROY	SCHWARTZ KATHERINE	011001660	E	C	2022-01-28T00:00:00Z	009S170309	2021-11-19T00:00:00Z	600000.0	None	None	None	PA	82740	ST	None	WHARTON	None	212480	73120	F	1623.0	1625.0	A	None	None	None	I	1960	Y	19147	RSA5	1001563091	226361243
3142	POINT (-75.14849 39.93119)	22960	2022-04-27T00:00:00Z	D	20' W PHILIP ST	53969048	P50	ROW W/GAR 3 STY MASONRY	1	SINGLE FAMILY	27	Y	None	None	45.0	493120	1	20.0	A	1.0	None	B	1	0.0	None	210	1	210 SEARS ST	SIMPLIFILE LC E-RECORDING	None	None	PHILADELPHIA PA	210 SEARS ST	19147-6007	616400	None	3.0	3.0	NaN	3.0	601.0	None	COTTER ALLISON SHIRLEY	COTTER BENJAMIN CHARTIER	011004110	E	C+	2022-02-09T00:00:00Z	009S170376	2021-09-10T00:00:00Z	1.0	None	None	None	PA	71440	ST	None	SEARS	None	0	123280	F	909.0	2163.0	A	None	None	None	I	2016	None	19147	RSA5	1001475221	226359605
18555	POINT (-75.14798 39.93012)	41737	2022-04-27T00:00:00Z	A	ES O S.2ND ST	53845333	P51	ROW W/GAR 3 STY MAS+OTHER	1	SINGLE FAMILY	27	Y	None	None	84.0	578720	1	17.0	A	1.0	None	C	1	0.0	None	142	1	142 REED ST	SIMPLIFILE LC E-RECORDING	None	None	PHILADELPHIA PA	142 REED ST	19147-6117	723400	None	0.0	3.0	NaN	2.0	296.0	None	JADHAV GAURAV P	JADHAV PRITI PRADEEP	011011425	E	C+	2021-06-10T00:00:00Z	010S110345	2021-04-09T00:00:00Z	765000.0	None	Y	None	PA	67780	ST	None	REED	None	0	144680	F	1400.0	2558.0	A	None	None	None	I	2014	None	19147	ICMX	1001442224	226379883

	geometry	cartodb_id	objectid	service_request_id	subject	status	status_notes	service_name	service_code	agency_responsible	service_notice	requested_datetime	updated_datetime	expected_datetime	closed_datetime	address	zipcode	media_url	lat	lon
0	POINT (-75.15211 40.01255)	3248076	13486482	14635630	Graffiti Removal	Closed	Issue Resolved	Graffiti Removal	SR-CL01	Community Life Improvement Program	7 Business Days	2021-12-24T11:44:34Z	2022-03-16T10:31:02Z	2022-01-05T19:00:00Z	None	3901 GERMANTOWN AVE - BUCKET	None	None	40.012552	-75.152108
1	POINT (-75.24306 40.04726)	2236325	13882242	14269755	Graffiti Removal	Closed	Issue Resolved	Graffiti Removal	SR-CL01	Community Life Improvement Program	7 Business Days	2021-07-13T11:59:34Z	2022-05-23T14:57:54Z	2021-07-22T20:00:00Z	None	401 DEARNLEY ST	None	None	40.047261	-75.243063
2	POINT (-75.14392 39.95225)	2304145	13894509	14570255	Graffiti Removal	Closed	Other	Graffiti Removal	SR-CL01	Community Life Improvement Program	7 Business Days	2021-11-17T16:12:50Z	2022-05-26T08:31:50Z	2021-11-28T19:00:00Z	None	219 ARCH ST - BUCKET	None	https://d17aqltn7cihbm.cloudfront.net/uploads/...	39.952248	-75.143922
3	POINT (-75.17966 39.95066)	3234163	13485307	14558564	Graffiti Removal	Closed	Information Provided	Graffiti Removal	SR-CL01	Community Life Improvement Program	7 Business Days	2021-11-11T18:07:12Z	2022-03-16T06:43:07Z	2021-11-22T19:00:00Z	None	212-24 S 24TH ST # P22	None	https://d17aqltn7cihbm.cloudfront.net/uploads/...	39.950656	-75.179662
4	POINT (-75.18026 39.95073)	3234165	13485308	14558534	Graffiti Removal	Closed	Other	Graffiti Removal	SR-CL01	Community Life Improvement Program	7 Business Days	2021-11-11T17:28:38Z	2022-03-16T06:44:15Z	2021-11-22T19:00:00Z	None	220 S 24TH ST	None	https://d17aqltn7cihbm.cloudfront.net/uploads/...	39.950733	-75.180258

	sale_price	total_livable_area	total_area	garage_spaces	number_of_bathrooms	number_of_bedrooms	number_stories	exterior_condition	zip_code	geometry	logDistGraffiti
18473	430000.0	1203.0	779.0	0.0	1.0	3.0	3.0	2	19147	POINT (-75.14692 39.93129)	1.865279
20663	520000.0	2050.0	1433.0	0.0	2.0	4.0	2.0	4	19147	POINT (-75.14703 39.93123)	1.858225
2059	600000.0	1625.0	1623.0	1.0	2.0	3.0	2.0	4	19147	POINT (-75.14854 39.93144)	2.347568
18555	765000.0	2558.0	1400.0	1.0	0.0	3.0	2.0	1	19147	POINT (-75.14798 39.93012)	2.151381
4024	255000.0	1008.0	546.0	0.0	1.0	3.0	2.0	4	19147	POINT (-75.14886 39.93012)	2.285802

	Feature	Importance
0	total_livable_area	0.192483
1	total_area	0.194151
2	garage_spaces	0.010269
3	fireplaces	0.001761
4	number_of_bathrooms	0.132411
5	number_of_bedrooms	0.038099
6	number_stories	0.020997
7	exterior_condition_0	0.000226
8	exterior_condition_1	0.004962
9	exterior_condition_2	0.005188
10	exterior_condition_3	0.012277
11	exterior_condition_4	0.010224
12	exterior_condition_5	0.012266
13	exterior_condition_6	0.005376
14	exterior_condition_7	0.016392
15	zip_code_19102	0.000262
16	zip_code_19103	0.002595
17	zip_code_19104	0.003707
18	zip_code_19106	0.000726
19	zip_code_19107	0.000678

		addr:city	name	network	operator	platforms	public_transport	railway	station	subway	wheelchair	wikidata	wikipedia	geometry	addr:postcode	operator_1	addr:housenumber	addr:street	railway:position	internet_access	old_name	addr:state	short_name	elevator	tram
element_type	osmid
node	469917297	Philadelphia	15th-16th & Locust	PATCO	PATCO	1	station	station	subway	yes	yes	Q4551078	en:15–16th & Locust (PATCO station)	POINT (-8367552.610 4858465.747)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	469917298	Philadelphia	9th-10th & Locust	PATCO	PATCO	1	station	station	subway	yes	yes	Q4646737	en:9–10th & Locust (PATCO station)	POINT (-8366424.042 4858281.683)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	471026103	Philadelphia	12th-13th & Locust	PATCO	PATCO	1	station	station	subway	yes	no	Q4548965	en:12–13th & Locust (PATCO station)	POINT (-8366949.703 4858366.817)	19107	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	650938316	NaN	63rd Street	SEPTA	SEPTA	NaN	station	station	subway	yes	NaN	NaN	NaN	POINT (-8376424.717 4860524.238)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	650959043	NaN	56th Street	SEPTA	SEPTA	NaN	station	station	subway	yes	NaN	Q4640769	NaN	POINT (-8374883.844 4860274.795)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	650960111	NaN	46th Street	NaN	SEPTA	NaN	station	station	subway	yes	yes	NaN	NaN	POINT (-8372784.826 4859935.838)	NaN	SEPTA	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
	775915931	Philadelphia	Lombard-South	SEPTA	SEPTA	1	station	station	subway	yes	no	Q6669414	en:Lombard–South station	POINT (-8367373.508 4857829.684)	NaN	NaN	500	South Broad Street	mi:7.40	NaN	NaN	NaN	NaN	NaN	NaN
	775915932	NaN	Ellsworth-Federal	SEPTA	SEPTA	1	station	station	subway	yes	no	Q11681426	en:Ellsworth–Federal station	POINT (-8367565.011 4856679.778)	NaN	NaN	1200	South Broad Street	mi:7.95	NaN	NaN	NaN	NaN	NaN	NaN
	775915933	Philadelphia	Tasker-Morris	SEPTA	SEPTA	1	station	station	subway	yes	no	Q7687362	en:Tasker–Morris station	POINT (-8367718.220 4855760.268)	NaN	NaN	NaN	NaN	mi:8.40	NaN	NaN	NaN	NaN	NaN	NaN
	775915935	Philadelphia	Snyder	SEPTA	SEPTA	1	station	station	subway	yes	NaN	Q7548885	en:Snyder station	POINT (-8367839.558 4854864.750)	NaN	NaN	NaN	NaN	mi:8.78	NaN	NaN	NaN	NaN	NaN	NaN
	775915939	Philadelphia	NRG	SEPTA	SEPTA	2	station	station	subway	yes	NaN	Q4654508	en:NRG station	POINT (-8368232.627 4852290.639)	NaN	NaN	3600	South Broad Street	NaN	no	Pattison	NaN	NaN	NaN	NaN
	775922743	Philadelphia	Olney Transportation Center	NaN	SEPTA	2	station	station	subway	yes	yes	Q7088461	en:Olney Transportation Center	POINT (-8365074.616 4871581.655)	NaN	NaN	5600	North Broad Street	mi:0.76	NaN	NaN	PA	Olney T.C.	NaN	NaN
	775922744	Philadelphia	Logan	SEPTA	SEPTA	NaN	station	station	subway	yes	NaN	Q6666919	en:Logan station	POINT (-8365293.013 4870275.497)	NaN	NaN	5100	North Broad Street	mi:1.38	NaN	NaN	PA	NaN	NaN	NaN
	775922745	Philadelphia	Wyoming	SEPTA	SEPTA	2	station	station	subway	yes	NaN	Q5859777	en:Wyoming station (SEPTA)	POINT (-8365420.875 4869513.586)	NaN	NaN	4700	North Broad Street	mi:1.75	NaN	NaN	PA	NaN	NaN	NaN
	775922746	NaN	Hunting Park	SEPTA	SEPTA	2	station	station	subway	yes	NaN	Q389072	en:Hunting Park station	POINT (-8365591.617 4868497.068)	NaN	NaN	4200	North Broad Street	mi:2.20	NaN	NaN	NaN	NaN	NaN	NaN
	775922747	Philadelphia	Erie	SEPTA	SEPTA	NaN	station	station	subway	yes	NaN	Q2885954	en:Erie station (SEPTA)	POINT (-8365794.240 4867285.291)	19140	NaN	3700	North Broad Street	mi:2.82	NaN	NaN	PA	NaN	NaN	NaN
	775922748	NaN	Allegheny	NaN	septa	NaN	station	station	subway	yes	NaN	NaN	NaN	POINT (-8365979.788 4866173.656)	NaN	NaN	NaN	NaN	mi:3.34	NaN	NaN	NaN	NaN	NaN	NaN
	775922749	Philadelphia	North Philadelphia	SEPTA	SEPTA	2	station	station	subway	yes	yes	Q7056322	en:North Philadelphia station (Broad Street Line)	POINT (-8366148.637 4865165.705)	NaN	NaN	2700	North Broad Street	mi:3.79	NaN	NaN	PA	NaN	NaN	NaN
	775922751	Philadelphia	Cecil B. Moore	SEPTA	SEPTA	NaN	station	station	subway	yes	yes	Q5055946	en:Cecil B. Moore station	POINT (-8366539.324 4862828.662)	NaN	NaN	1700	North Broad Street	mi:4.96	NaN	NaN	PA	NaN	NaN	NaN
	775922752	Philadelphia	Girard	SEPTA	SEPTA	NaN	station	station	subway	yes	yes	Q5564139	en:Girard station (Broad Street Line)	POINT (-8366712.793 4861801.427)	NaN	NaN	NaN	North Broad Street	mi:5.47	NaN	NaN	NaN	NaN	NaN	NaN

Lecture 12A: Predictive Modeling Part 2¶

Housekeeping¶

This week: Predictive modeling continued¶

Last time¶

Review: Predicting housing prices in Philadelphia¶

Load data from the Office of Property Assessment¶

Let's focus on numerical features only first¶

Test score improved!¶

Which variables were most important?¶

On to new material...¶

How to handle categorical data?¶

One-hot encoding in scikit learn¶

Important!¶

Substantial improvement on test set when including ZIP codes¶

But how crucial? Let's plot the importances¶

Takeaways¶

Why is feature engineering so important?¶

Adding spatial features to the housing price model¶

Spatial amenity/disamenity features¶

Examples of new possible features...¶

Example #1: 311 Graffiti Calls¶

Step 1: Download the data from the CARTO database¶

Step 2: Get the x/y coordinates of both datasets¶

Step 3: Calculate the nearest neighbor distances¶

What did we just calculate?¶

Can we reproduce these distances?¶

Use the log of the average distance as the new feature¶

Let's plot a hex map of the new feature!¶

Example #2: Subway stops¶

Now, get the distances to the nearest subway stop¶

Let's plot a hex map again!¶

What about correlations?¶

Now, let's re-run our model...did it help?¶

A small improvement!¶

How about the top 30 feature importances now?¶

Both new spatial features are in the top 5 in terms of importance!¶

Exercise: How about other spatial features?¶

Example 1: Universities¶

Example 2: Parks¶

City Hall¶

Residential Construction Permits¶

Aggravated Assaults¶

Abandonded Vehicle 311 Calls¶