In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import hvplot.pandas

np.random.seed(42)
In [2]:
pd.options.display.max_columns = 999
In [3]:
%matplotlib inline

Week 11B: Supervised Learning with Scikit-Learn¶

Nov 16, 2022

The plan for today¶

  • Wrap up our money vs. happiness example
  • Introduce decision trees and random forests
  • Case study: Modeling housing prices in Philadelphia

Supervised learning with scikit-learn¶

Example: does money make people happier?¶

We'll load data compiled from two data sources:

  • The Better Life Index from the OECD's website
  • GDP per capita from the IMF's website
In [4]:
# Load the data 
data = pd.read_csv("./data/gdp_vs_satisfaction.csv")
data.head()
Out[4]:
Country life_satisfaction gdp_per_capita
0 Australia 7.3 50961.87
1 Austria 7.1 43724.03
2 Belgium 6.9 40106.63
3 Brazil 6.4 8670.00
4 Canada 7.4 43331.96
In [5]:
# Linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

# Model selection
from sklearn.model_selection import train_test_split

# Pipelines
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

First step: set up the test/train split of input data:

In [6]:
# Do a 70/30 train/test split
train_set, test_set = train_test_split(data, test_size=0.3, random_state=42)

# Features
X_train = train_set['gdp_per_capita'].values
X_train = X_train[:, np.newaxis]

X_test = test_set['gdp_per_capita'].values
X_test = X_test[:, np.newaxis]

# Labels
y_train = train_set['life_satisfaction'].values
y_test = test_set['life_satisfaction'].values

Where we left off: overfitting¶

As we increase the polynomial degree (add more and more polynomial features) two things happen:

  1. Training accuracy goes way up
  2. Test accuracy goes way down

This is the classic case of overfitting — our model does not generalize well at all.
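To see this concretely, here is a minimal sketch (reusing the imports and the train/test split above; not part of the original lecture code) that loops over polynomial degrees and compares training and test scores:

# A sketch: compare training vs. test R^2 as the polynomial degree grows
for degree in [1, 3, 5, 10]:
    model = make_pipeline(
        StandardScaler(), PolynomialFeatures(degree=degree), LinearRegression()
    )
    model.fit(X_train, y_train)
    print(f"degree = {degree}")
    print(f"  Training Score = {model.score(X_train, y_train):.3f}")
    print(f"  Test Score = {model.score(X_test, y_test):.3f}")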

Regularization to the rescue?¶

  • The Ridge model adds regularization to the least-squares linear regression model
  • Parameter $\alpha$ determines the level of regularization
  • Larger values of $\alpha$ mean stronger regularization — results in a "simpler" best fit

Remember, regularization penalizes large parameter values and complex fits

Let's gain some intuition:

  • Fix the polynomial degree to 3
  • Try out alpha values of 0, 10, 100, and $10^5$
  • Compare to linear fit (no polynomial features)

Important

  • Baseline: linear model
    • This uses LinearRegression and scales input features with StandardScaler
  • Ridge regression: try multiple regularization strength values
    • Use a pipeline object to apply both a StandardScaler and PolynomialFeatures(degree=3) pre-processing to features

Set up a grid of GDP per capita points to make predictions for:

In [7]:
# The values we want to predict (ranging from our min to max GDP per capita)
gdp_pred = np.linspace(1e3, 1.1e5, 100)

# Scikit-learn expects a 2D array of features, so add a second axis
X_pred = gdp_pred[:, np.newaxis]
In [8]:
# Create a pre-processing pipeline
# This scales and adds polynomial features up to degree = 3
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(degree=3))

# BASELINE: Setup and fit a linear model (with scaled features)
linear = LinearRegression()
scaler = StandardScaler()
linear.fit(scaler.fit_transform(X_train), y_train)


with plt.style.context("fivethirtyeight"):

    fig, ax = plt.subplots(figsize=(10, 6))

    ## Plot the data
    ax.scatter(
        data["gdp_per_capita"] / 1e5,
        data["life_satisfaction"],
        label="Data",
        s=100,
        zorder=10,
        color="#666666",
    )

    ## Evaluate the linear fit
    print("Linear fit")
    training_score = linear.score(scaler.fit_transform(X_train), y_train)
    print(f"Training Score = {training_score}")

    test_score = linear.score(scaler.fit_transform(X_test), y_test)
    print(f"Test Score = {test_score}")
    print()


    ## Plot the linear fit
    ax.plot(
        X_pred / 1e5,
        linear.predict(scaler.fit_transform(X_pred)),
        color="k",
        label="Linear fit",
    )

    ## Ridge regression: linear model with regularization 
    # Plot the predicted values for each alpha
    for alpha in [0, 10, 100, 1e5]:
        print(f"alpha = {alpha}")

        # Create our Ridge model with this alpha
        ridge = Ridge(alpha=alpha)

        # Fit the model on the training set
        # NOTE: Use the pipeline that includes polynomial features
        ridge.fit(pipe.fit_transform(X_train), y_train)

        # Evaluate on the training set
        training_score = ridge.score(pipe.fit_transform(X_train), y_train)
        print(f"Training Score = {training_score}")

        # Evaluate on the test set
        test_score = ridge.score(pipe.fit_transform(X_test), y_test)
        print(f"Test Score = {test_score}")

        # Plot the ridge results
        y_pred = ridge.predict(pipe.fit_transform(X_pred))
        ax.plot(X_pred / 1e5, y_pred, label=f"alpha = {alpha}")

        print()

    # Plot formatting
    ax.legend(ncol=2, loc=0)
    ax.set_ylim(4, 8)
    ax.set_xlabel("GDP Per Capita ($\\times$ $10^5$)")
    ax.set_ylabel("Life Satisfaction")
Linear fit
Training Score = 0.4638100579740343
Test Score = 0.35959585147159556

alpha = 0
Training Score = 0.6458898101593082
Test Score = 0.5597457659851048

alpha = 10
Training Score = 0.5120282691427858
Test Score = 0.38335642103788325

alpha = 100
Training Score = 0.1815398751108913
Test Score = -0.05242399995626967

alpha = 100000.0
Training Score = 0.0020235571180508005
Test Score = -0.26129559971586125

Takeaways¶

  • As we increase alpha, the fits become "simpler" and coefficients get closer and closer to zero — a straight line!
  • When alpha = 0 (no regularization), we get the same result as when we ran LinearRegression() with the polynomial features
  • In this case, regularization doesn't improve the fit, and the base polynomial regression (degree=3) provides the best fit

Recap: what we learned so far¶

  • The LinearRegression model
  • The test/train split and evaluation
  • Feature engineering: scaling and creating polynomial features
  • The Ridge model and regularization
  • Creating Pipeline() objects

How can we improve?¶

More feature engineering!

In this case, I've done the hard work for you and pulled additional country properties from the OECD's website.

In [9]:
data2 = pd.read_csv("./data/gdp_vs_satisfaction_more_features.csv")
In [10]:
data2.head()
Out[10]:
Country life_satisfaction GDP per capita Air pollution Employment rate Feeling safe walking alone at night Homicide rate Life expectancy Quality of support network Voter turnout Water quality
0 Australia 7.3 50961.87 5.0 73.0 63.5 1.1 82.5 95.0 91.0 93.0
1 Austria 7.1 43724.03 16.0 72.0 80.6 0.5 81.7 92.0 80.0 92.0
2 Belgium 6.9 40106.63 15.0 63.0 70.1 1.0 81.5 91.0 89.0 84.0
3 Brazil 6.4 8670.00 10.0 61.0 35.6 26.7 74.8 90.0 79.0 73.0
4 Canada 7.4 43331.96 7.0 73.0 82.2 1.3 81.9 93.0 68.0 91.0

Decision trees: a more sophisticated modeling algorithm¶

We'll move beyond simple linear regression and see if we can get a better fit.

A decision tree learns decision rules from the input features:

A decision tree classifier for the Iris data set¶

More info on the iris data set

Regression with decision trees is similar¶

For a specific corner of the input feature space, the decision tree predicts an output target value

Decision trees suffer from overfitting¶

Decision trees can be very deep with many nodes — this will lead to overfitting your dataset!
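As a quick illustration (a sketch using the GDP-only train/test split from earlier, not part of the lecture code): an unconstrained tree can fit the training data nearly perfectly while scoring much worse on the test set, and limiting max_depth reins in the complexity.

from sklearn.tree import DecisionTreeRegressor

# A sketch: an unconstrained tree vs. a depth-limited tree
for depth in [None, 3]:
    tree = DecisionTreeRegressor(max_depth=depth, random_state=42)
    tree.fit(X_train, y_train)
    print(f"max_depth = {depth}")
    print(f"  Training Score = {tree.score(X_train, y_train):.3f}")
    print(f"  Test Score = {tree.score(X_test, y_test):.3f}")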

Random forests: an ensemble solution to overfitting¶

  • Introduces randomization into the fitting process to avoid overfitting
  • Fits multiple decision trees on random subsets of the input data
  • Avoids overfitting and leads to better overall fits

This is an example of ensemble learning: combining multiple estimators to achieve better overall accuracy than any one model could achieve

In [11]:
from sklearn.ensemble import RandomForestRegressor

Let's split our data into training and test sets again:

In [12]:
# Split the data 70/30
train_set, test_set = train_test_split(data2, test_size=0.3, random_state=42)

# the target labels
y_train = train_set["life_satisfaction"].values
y_test = test_set["life_satisfaction"].values

# The features
feature_cols = [col for col in data2.columns if col not in ["life_satisfaction", "Country"]]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
In [13]:
feature_cols
Out[13]:
['GDP per capita',
 'Air pollution',
 'Employment rate',
 'Feeling safe walking alone at night',
 'Homicide rate',
 'Life expectancy',
 'Quality of support network',
 'Voter turnout',
 'Water quality']

Let's check for correlations in our input data¶

  • Highly correlated input variables can skew the model fits and lead to worse accuracy
  • Best to remove features with high correlations (rule of thumb: correlation coefficients above 0.7 or 0.8, typically); see the programmatic check after the heatmap below
In [14]:
import seaborn as sns
In [15]:
sns.heatmap(
    train_set[feature_cols].corr(), 
    cmap="coolwarm", 
    annot=True, 
    vmin=-1, 
    vmax=1
);
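To complement the visual check, here is a small programmatic sketch (not part of the original analysis) that lists the feature pairs whose absolute correlation exceeds the rule-of-thumb threshold:

# A sketch: find feature pairs with |correlation| above a threshold
corr = train_set[feature_cols].corr().abs()

# Keep only the upper triangle so each pair appears once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))

# Stack into (feature, feature) -> correlation pairs and filter
high_corr = upper.stack().sort_values(ascending=False)
print(high_corr[high_corr > 0.7])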

Let's do some fitting...¶

New: Pipelines support models as the last step!

  • Very useful for setting up reproducible machine learning analyses!
  • The Pipeline behaves just like a model, but it runs the transformations beforehand
  • Simplifies the analysis: now we can just call the .fit() function of the pipeline instead of the model

Establish a baseline with a linear model:

In [16]:
# Linear model pipeline with two steps
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())

# Fit the pipeline
# NEW: This applies all of the transformations, and then fits the model
print("Linear regression")
linear_pipe.fit(X_train, y_train)

# NEW: Print the training score
training_score = linear_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")

# NEW: Print the test score
test_score = linear_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Linear regression
Training Score = 0.755333265746168
Test Score = 0.6478865590446832

Now fit a random forest:

In [17]:
# Random forest model pipeline with two steps
forest_pipe = make_pipeline(
    StandardScaler(), RandomForestRegressor(n_estimators=100, max_depth=2, random_state=42)
)

# Fit a random forest
print("Random forest")
forest_pipe.fit(X_train, y_train)

# Print the training score
training_score = forest_pipe.score(X_train, y_train)
print(f"Training Score = {training_score}")

# Print the test score
test_score = forest_pipe.score(X_test, y_test)
print(f"Test Score = {test_score}")
Random forest
Training Score = 0.8460559576380231
Test Score = 0.862756366860489

Which variables matter the most?¶

Because random forests are an ensemble method with multiple estimators, the algorithm can learn which features help improve the fit the most.

  • The feature importances are stored as the feature_importances_ attribute
  • Only available after calling fit()!
In [18]:
# What are the named steps?
forest_pipe.named_steps
Out[18]:
{'standardscaler': StandardScaler(),
 'randomforestregressor': RandomForestRegressor(max_depth=2, random_state=42)}
In [19]:
# Get the forest model
forest_model = forest_pipe['randomforestregressor']
In [20]:
forest_model.feature_importances_
Out[20]:
array([0.67746013, 0.0038663 , 0.13108609, 0.06579352, 0.00985913,
       0.01767323, 0.02635605, 0.00601776, 0.06188779])
In [21]:
# Make the dataframe
importance = pd.DataFrame(
    {"Feature": feature_cols, "Importance": forest_model.feature_importances_}
).sort_values("Importance", ascending=False)
In [22]:
importance
Out[22]:
Feature Importance
0 GDP per capita 0.677460
2 Employment rate 0.131086
3 Feeling safe walking alone at night 0.065794
8 Water quality 0.061888
6 Quality of support network 0.026356
5 Life expectancy 0.017673
4 Homicide rate 0.009859
7 Voter turnout 0.006018
1 Air pollution 0.003866
In [23]:
# Plot
importance.sort_values("Importance", ascending=True).hvplot.barh(
    x="Feature", y="Importance", title="Does Money Make You Happier?"
)
Out[23]:

Let's improve our fitting with k-fold cross validation¶

  1. Break the data into a training set and test set
  2. Split the training set into k subsets (or folds), holding out one subset as the validation set
  3. Run the learning algorithm on each combination of folds (training on the rest and scoring on the held-out fold), using the average score across all of the runs to find the best-fitting model parameters

For more information, see the scikit-learn docs

The cross_val_score() function will automatically partition the training set into k folds, fit the model on each combination of folds, and return the score for each held-out partition.

It takes a Pipeline object, the training features, and the training labels as arguments
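Under the hood, cross_val_score() is doing something like the following sketch (written out with KFold for intuition; the variable names are illustrative):

from sklearn.model_selection import KFold

# A sketch of what cross_val_score does internally:
# fit on k-1 folds, score on the held-out fold, repeat for each fold
kfold = KFold(n_splits=3)
fold_scores = []
for train_idx, val_idx in kfold.split(X_train):
    model = make_pipeline(StandardScaler(), LinearRegression())
    model.fit(X_train[train_idx], y_train[train_idx])
    fold_scores.append(model.score(X_train[val_idx], y_train[val_idx]))

print(fold_scores)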

In [24]:
from sklearn.model_selection import cross_val_score

Let's do 3-fold cross validation¶

Linear pipeline (baseline)¶

In [25]:
model = linear_pipe['linearregression']
In [26]:
# Make a linear pipeline
linear_pipe = make_pipeline(StandardScaler(), LinearRegression())

# Run the 3-fold cross validation
scores = cross_val_score(
    linear_pipe,
    X_train,
    y_train,
    cv=3,
)

# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores =  [ 0.02064625 -0.84773581 -0.53652985]
Scores mean =  -0.45453980429946145
Score std dev =  0.35922474493059114

Random forest model¶

In [27]:
# Make a random forest pipeline
forest_pipe = make_pipeline(
    StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)

# Run the 3-fold cross validation
scores = cross_val_score(
    forest_pipe,
    X_train,
    y_train,
    cv=3,
)

# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores =  [0.51950656 0.78033403 0.66526384]
Scores mean =  0.655034812029183
Score std dev =  0.10672774411781198

Takeaway: the random forest model is clearly more accurate¶

Question: Why did I choose to use 100 estimators in the RF model?¶

  • In this case, I didn't have a good reason
  • n_estimators is a model hyperparameter
  • In practice, it's best to optimize the hyperparameters and the model parameters (via the fit() method)

This is when cross validation becomes very important

  • Optimizing hyperparameters with a single train/test split means you are really optimizing based on your test set.
  • If you use cross validation, a final test set will always be held in reserve to do a final evaluation.

Enter GridSearchCV¶

A utility function that will:

  • Iterate over a grid of hyperparameters
  • Perform k-fold cross validation
  • Return the parameter combination with the best overall score

More info

In [28]:
from sklearn.model_selection import GridSearchCV

Let's do a search over the n_estimators parameter and the max_depth parameter:

In [29]:
# Create our regression pipeline
pipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))
pipe
Out[29]:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

Make the grid of parameters to search¶

  • NOTE: you must prepend the name of the pipeline step
  • The syntax for parameter names is:

"[step name]__[parameter name]"

In [30]:
pipe.named_steps
Out[30]:
{'standardscaler': StandardScaler(),
 'randomforestregressor': RandomForestRegressor(random_state=42)}
In [31]:
model_step = "randomforestregressor"
param_grid = {
    f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
    f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}

param_grid
Out[31]:
{'randomforestregressor__n_estimators': [5, 10, 15, 20, 30, 50, 100, 200],
 'randomforestregressor__max_depth': [2, 5, 7, 9, 13, 21, 33, 51]}
In [32]:
# Create the grid and use 3-fold CV
grid = GridSearchCV(pipe, param_grid, cv=3, verbose=1)

# Run the search
grid.fit(X_train, y_train)
Fitting 3 folds for each of 64 candidates, totalling 192 fits
Out[32]:
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor(random_state=42))]),
             param_grid={'randomforestregressor__max_depth': [2, 5, 7, 9, 13,
                                                              21, 33, 51],
                         'randomforestregressor__n_estimators': [5, 10, 15, 20,
                                                                 30, 50, 100,
                                                                 200]},
             verbose=1)
In [33]:
# The best estimator
grid.best_estimator_
Out[33]:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor',
                 RandomForestRegressor(max_depth=7, random_state=42))])
In [34]:
# The best hyper parameters
grid.best_params_
Out[34]:
{'randomforestregressor__max_depth': 7,
 'randomforestregressor__n_estimators': 100}

Now let's evaluate!¶

We'll define a helper utility function to calculate the accuracy in terms of the mean absolute percent error

In [35]:
def evaluate_mape(model, X_test, y_test):
    """
    Given a model and test features/targets, print out the 
    mean absolute error and accuracy
    """
    # Make the predictions
    predictions = model.predict(X_test)

    # Absolute error
    errors = abs(predictions - y_test)
    avg_error = np.mean(errors)

    # Mean absolute percentage error
    mape = 100 * np.mean(errors / y_test)

    # Accuracy
    accuracy = 100 - mape

    print("Model Performance")
    print(f"Average Absolute Error: {avg_error:0.4f}")
    print(f"Accuracy = {accuracy:0.2f}%.")

    return accuracy

Linear model results¶

In [36]:
# Setup the pipeline
linear = make_pipeline(StandardScaler(), LinearRegression())

# Fit on train set
linear.fit(X_train, y_train)

# Evaluate on test set
evaluate_mape(linear, X_test, y_test)
Model Performance
Average Absolute Error: 0.3281
Accuracy = 94.93%.
Out[36]:
94.92864894582036

Random forest results with default parameters¶

In [37]:
# Initialize the pipeline
base_model = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))

# Fit the training set
base_model.fit(X_train, y_train)

# Evaluate on the test set
base_accuracy = evaluate_mape(base_model, X_test, y_test)
Model Performance
Average Absolute Error: 0.2291
Accuracy = 96.47%.

The random forest model with the optimal hyperparameters¶

Small improvement!

In [38]:
grid.best_estimator_
Out[38]:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor',
                 RandomForestRegressor(max_depth=7, random_state=42))])
In [39]:
# Evaluate the best random forest model
best_random = grid.best_estimator_
random_accuracy = evaluate_mape(best_random, X_test, y_test)

# What's the improvement?
improvement = 100 * (random_accuracy - base_accuracy) / base_accuracy
print(f'Improvement of {improvement:0.4f}%.')
Model Performance
Average Absolute Error: 0.2288
Accuracy = 96.47%.
Improvement of 0.0025%.

Recap¶

  • Decision trees and random forests
  • Cross validation with cross_val_score
  • Optimizing hyperparameters with GridSearchCV
  • Feature importances from random forests

Part 2: Modeling residential sales in Philadelphia¶

In this part, we'll use a random forest model and housing data from the Office of Property Assessment to predict residential sale prices in Philadelphia

Machine learning models are increasingly common in the real estate industry¶

  • Airbnb recommends pricing to hosts
  • Trulia converts house photos to house features
  • Zillow's Zestimate

The hedonic approach to housing prices¶

  • An econometric approach
  • Deconstruct housing price to the value of each of its parts
  • Captures the "price premium" consumers are willing to pay for an extra bedroom or garage
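In its common log-linear form (a generic sketch of the approach, not a formula from the lecture), a hedonic regression decomposes the log of the sale price into the implicit prices of each characteristic:

$\ln P_i = \beta_0 + \sum_k \beta_k x_{ik} + \varepsilon_i$

where $x_{ik}$ is the $k$-th characteristic of house $i$ (bedrooms, lot size, neighborhood amenities, etc.) and $\beta_k$ is the price premium associated with that characteristic.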

What contributes to the price of a house?¶

  • Property characteristics, e.g., size of the lot and the number of bedrooms
  • Neighborhood features based on amenities or disamenities, e.g., access to transit or exposure to crime
  • Spatial component that captures the tendency of housing prices to depend on the prices of neighboring homes

Note: We'll focus on the first two components in this analysis (and in assignment #7)

Why are these kinds of models important?¶

  • They are used widely by cities to perform property assessment valuation
    • Train a model on recent residential sales
    • Apply the model to the entire residential housing stock to produce assessments
  • Biases in the algorithmic models have important consequences for city residents

Too often, these models perpetuate inequality: low-value homes are over-assessed and high-value homes are under-assessed

Philadelphia's assessments are...not good¶

  • City Controller analysis of property assessments: Part 1
  • City Controller analysis of assessments: Part 2
  • Urban Spatial paper on algorithmic fairness with a case study on modeling Philadelphia's home values

Data from the Office of Property Assessment¶

Let's download data for properties in Philadelphia that had their last sale during 2021 (the last full calendar year)

Sources:

  • OpenDataPhilly
  • Metadata
In [40]:
import carto2gpd
In [41]:
# the CARTO API url
carto_url = "https://phl.carto.com/api/v2/sql"

# The table name
table_name = "opa_properties_public"

# Only pull 2021 sales for single family residential properties
where = "sale_date >= '2021-01-01' and sale_date <= '2021-12-31'"
where = where + " and category_code_description IN ('SINGLE FAMILY', 'Single Family')"

# Run the query
salesRaw = carto2gpd.get(carto_url, table_name, where=where)
In [42]:
salesRaw.head()
Out[42]:
geometry cartodb_id assessment_date basements beginning_point book_and_page building_code building_code_description category_code category_code_description census_tract central_air cross_reference date_exterior_condition depth exempt_building exempt_land exterior_condition fireplaces frontage fuel garage_spaces garage_type general_construction geographic_ward homestead_exemption house_extension house_number interior_condition location mailing_address_1 mailing_address_2 mailing_care_of mailing_city_state mailing_street mailing_zip market_value market_value_date number_of_bathrooms number_of_bedrooms number_of_rooms number_stories off_street_open other_building owner_1 owner_2 parcel_number parcel_shape quality_grade recording_date registry_number sale_date sale_price separate_utilities sewer site_type state_code street_code street_designation street_direction street_name suffix taxable_building taxable_land topography total_area total_livable_area type_heater unfinished unit utility view_type year_built year_built_estimate zip_code zoning pin objectid
0 POINT (-75.19667 39.93733) 981 2021-07-16T00:00:00Z None 164' S WHARTON ST 54096200 O30 ROW 2 STY MASONRY 1 Single Family 33 None None None 62.0 0 0 4 0.0 16.0 None 0.0 None A 36 0.0 None 1319 4 1319 S 32ND ST None None None PHILADELPHIA PA 1319 S 32ND ST 19146-3409 134100 None 1.0 3.0 NaN 2.0 761.0 None HAMMOND FREDERICK HORACE HAMMOND TERRY R 362299500 E C 2022-09-12T00:00:00Z 010S080153 2021-12-23T00:00:00Z 1.0 None None None PA 88430 ST S 32ND None 107280 26820 F 992.0 1350.0 None None None None I 1925 Y 19146 RSA5 1001649631 223429799
1 POINT (-75.16623 39.94166) 1046 2022-03-09T00:00:00Z A SWC BROAD & FITZWATER STS 53950012 SC VACANT LAND COMMER < ACRE 1 Single Family 14 Y None None 48.0 1200800 0 1 NaN 18.0 None 2.0 1 B 30 0.0 None 742 1 742 S BROAD ST UNIT#9 None None PHILADELPHIA PA 742 S BROAD STREET 19146 1501000 None 3.0 4.0 NaN 3.0 NaN None MARSHALL ARIELA LUCY KAPA SURAJ 301262704 E None 2022-11-19T00:00:00Z 006S010392 2021-11-10T00:00:00Z 1695000.0 None None None PA 19160 ST S BROAD None 0 300200 F 864.0 3529.0 None None None None I 2021 None 19146 CMX3 1001681202 223429772
2 POINT (-75.02846 40.11039) 1070 2021-07-16T00:00:00Z F 89'11 1/2" NE SEL459 54095903 K30 S/D W/B GAR 2 STY MASONRY 1 Single Family 357 None None None 97.0 0 0 4 0.0 25.0 None 1.0 None A 58 0.0 None 10229 4 10229 SELMER PLZ None None None PHILADELPHIA PA 10229 SELMER PLZ 19116-3629 258900 None 0.0 0.0 NaN 2.0 6289.0 None ALHAJ SAID None 582460600 E C 2022-09-09T00:00:00Z 153N210213 2021-10-07T00:00:00Z 1.0 None None None PA 71672 PLZ None SELMER None 207120 51780 F 2377.0 1354.0 None None None None I 1968 Y 19116 RSA3 1001476645 223429916
3 POINT (-75.14689 40.05641) 1219 2021-07-16T00:00:00Z H 152'6" N 67TH AVE 54095296 R30 ROW B/GAR 2 STY MASONRY 1 Single Family 267 N None None 76.0 80000 0 4 0.0 16.0 None 1.0 None A 10 80000.0 None 6720 4 6720 N BOUVIER ST None None None PHILADELPHIA PA 6720 N BOUVIER ST 19126-2610 151200 None 1.0 3.0 NaN 2.0 2218.0 None BETHAY ANTOINETTE None 101076300 E C+ 2022-09-08T00:00:00Z 138N210282 2021-01-16T00:00:00Z 1.0 None None None PA 18500 ST N BOUVIER None 40960 30240 F 1216.0 1260.0 H None None None I 1925 Y 19126 RSA5 1001097499 223430033
4 POINT (-75.24216 39.95278) 1225 2021-07-16T00:00:00Z None 106'S OF HAZEL AVE 54094927 O30 ROW 2 STY MASONRY 1 Single Family 83 None None None 62.0 0 0 4 0.0 14.0 None 0.0 None A 3 0.0 None 544 4 544 S SALFORD ST SFR3-000 LLC None None NEW YORK NY 228 PARK AVE STE 73833 10003 76500 None 1.0 3.0 NaN 2.0 2988.0 None SFR3-000 LLC None 032213800 E C 2022-09-08T00:00:00Z 023S240050 2021-12-16T00:00:00Z 87000.0 None None None NY 70740 ST S SALFORD None 61200 15300 F 888.0 922.0 None None None None I 1925 Y 19143 RM1 1001469933 223430040
In [43]:
len(salesRaw)
Out[43]:
25695

The OPA is messy¶

Lots of missing data.

We can use the missingno package to visualize the missing data easily.

In [44]:
import missingno as msno
In [45]:
# We'll visualize the first half of columns
# and then the second half
ncol = len(salesRaw.columns)

fields1 = salesRaw.columns[:ncol//2]
fields2 = salesRaw.columns[ncol//2:]
In [46]:
ncol
Out[46]:
78
In [47]:
# The first half of columns
msno.bar(salesRaw[fields1]);
In [48]:
# The second half of columns
msno.bar(salesRaw[fields2]);
In [49]:
# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "zip_code",
]

# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()

# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
In [50]:
len(sales)
Out[50]:
24777
In [51]:
# Trim very low and very high sales
valid = (sales['sale_price'] > 3000) & (sales['sale_price'] < 1e6)
sales = sales.loc[valid]
In [52]:
len(sales)
Out[52]:
19301

Let's focus on numerical features only first¶

In [53]:
# Split the data 70/30
train_set, test_set = train_test_split(sales, test_size=0.3, random_state=42)

# the target labels: log of sale price
y_train = np.log(train_set["sale_price"])
y_test = np.log(test_set["sale_price"])

# The features
feature_cols = [
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
]
X_train = train_set[feature_cols].values
X_test = test_set[feature_cols].values
In [54]:
# Make a random forest pipeline
forest = make_pipeline(
    StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42)
)

# Run the 10-fold cross validation
scores = cross_val_score(
    forest,
    X_train,
    y_train,
    cv=10,
)

# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores =  [0.35220166 0.3154972  0.35396205 0.33760708 0.32022955 0.37991694
 0.33484225 0.28405482 0.37658985 0.37023379]
Scores mean =  0.3425135181916731
Score std dev =  0.028775602976589713
In [55]:
# Fit on the training data
forest.fit(X_train, y_train)

# What's the test score?
forest.score(X_test, y_test)
Out[55]:
0.3497856044657076

Test score is slightly better than the cross-validation mean¶

Model appears to generalize reasonably well!

Note: we should also be optimizing hyperparameters to see if we can find additional improvements!
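For example, one could reuse the GridSearchCV pattern from Part 1 on this pipeline (a sketch; the parameter values below are illustrative, not tuned choices):

# A sketch: tune the housing-price forest with the same GridSearchCV pattern
param_grid = {
    "randomforestregressor__n_estimators": [50, 100, 200],
    "randomforestregressor__max_depth": [None, 10, 20],
}
grid = GridSearchCV(forest, param_grid, cv=3, verbose=1)
grid.fit(X_train, y_train)

print(grid.best_params_)
print("Test score = ", grid.best_estimator_.score(X_test, y_test))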

Which variables were most important?¶

In [56]:
# Extract the regressor from the pipeline
regressor = forest["randomforestregressor"]
In [57]:
# Create the data frame of importances
importance = pd.DataFrame(
    {
     "Feature": feature_cols, 
     "Importance": regressor.feature_importances_
    }
).sort_values(by="Importance")

importance.hvplot.barh(x="Feature", y="Importance")
Out[57]:

Takeaways¶

  • Number of bathrooms and area-based features still important
  • ZIP codes for 19132, 19133, 19134, and 19140 (all in North Philadelphia) are most important

To be continued next lecture...¶

That's it!¶

  • Assignment #6 due a week from today (11/23)
  • Next week: more predictive modeling with scikit-learn looking at bikeshare data
  • Last two weeks: transitioning to the web with web-based visualization
In [ ]: