Week 2: Data Visualization Fundamentals¶

Sep 12, 2022

Housekeeping¶

  • Piazza website: https://piazza.com/upenn/fall2022/musa550
  • HW #1 due one week from today (9/19)
  • Office hours:
    • Nick: 10am-12pm on Saturday
    • Kristin: Tues/Thurs from 11am-12pm
    • Sign up for time slots on the Canvas calendar

Questions / concerns?

  • Email: nhand@design.upenn.edu
  • Post questions on Piazza

Guides¶

Guides to installing Python, using conda to manage packages, and working with Jupyter notebooks are available on the course website:

  • Zero to Python
  • Conda User Guide
  • Common Conda Issues
  • Jupyter User Guide

Git & GitHub Resources¶

  • Setting up git
  • Managing files on GitHub
  • Managing files via the command line

File paths and working directories¶

A Piazza post walking through some tips for managing the folder structure on your laptop:

https://piazza.com/class/l7ewjjtnsdw53o/post/18

Reminder: following along with lectures¶

Easiest option: Binder¶


Harder option: downloading Github repository contents¶


Today's agenda¶

Part 1

  • Wrapping up last week's pandas introduction

Part 2

  • A brief overview of data visualization
  • Practical tips on color in data visualization
  • The Python landscape:
    • matplotlib
    • seaborn
    • altair

Continuing with pandas: Zillow rental and home value data¶

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
In [2]:
%matplotlib inline

Load citywide Zillow Rent Index (ZRI) and Zillow Home Value Index (ZHVI) data.

Files were downloaded from https://www.zillow.com/research/data/

In [3]:
home_values = pd.read_csv("data/zillow/Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
rent_values = pd.read_csv("data/zillow/Metro_ZORI_AllHomesPlusMultifamily_SSA.csv")

Peek at the first few rows of the ZRI data:

In [4]:
rent_values.head()
Out[4]:
RegionID RegionName SizeRank 2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 ... 2020-10 2020-11 2020-12 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07
0 102001 United States 0 1356.0 1361 1367.0 1373 1378 1384 1390 ... 1712 1721 1729 1738 1747 1757 1766 1776.0 1786 1796
1 394913 New York, NY 1 2205.0 2214 2224.0 2234 2244 2254 2264 ... 2437 2433 2428 2424 2421 2418 2415 2414.0 2413 2413
2 753899 Los Angeles-Long Beach-Anaheim, CA 2 1868.0 1879 1890.0 1902 1913 1924 1935 ... 2529 2538 2546 2554 2563 2572 2581 2591.0 2601 2611
3 394463 Chicago, IL 3 1437.0 1441 1445.0 1449 1453 1456 1460 ... 1651 1653 1655 1657 1659 1662 1664 1667.0 1670 1674
4 394514 Dallas-Fort Worth, TX 4 1179.0 1182 1186.0 1190 1194 1198 1202 ... 1519 1529 1540 1551 1562 1573 1585 1597.0 1608 1620

5 rows × 94 columns

And do the same for the ZHVI data:

In [5]:
home_values.head()
Out[5]:
RegionID SizeRank RegionName RegionType StateName 2000-01-31 2000-02-29 2000-03-31 2000-04-30 2000-05-31 ... 2021-10-31 2021-11-30 2021-12-31 2022-01-31 2022-02-28 2022-03-31 2022-04-30 2022-05-31 2022-06-30 2022-07-31
0 102001 0 United States Country NaN 128193.0 128540.0 128904.0 129647.0 130396.0 ... 315164.0 319079.0 323445.0 328180.0 333400.0 338673.0 344230.0 349265.0 353446.0 355852.0
1 394913 1 New York, NY Msa NY 225281.0 226627.0 227838.0 230222.0 232442.0 ... 571576.0 575145.0 578396.0 582369.0 587022.0 593470.0 600621.0 608159.0 614185.0 618517.0
2 753899 2 Los Angeles-Long Beach-Anaheim, CA Msa CA 228935.0 229732.0 230953.0 233274.0 235763.0 ... 842992.0 850126.0 859335.0 870012.0 882230.0 899552.0 918626.0 935074.0 936406.0 933282.0
3 394463 3 Chicago, IL Msa IL 170843.0 171246.0 171768.0 172812.0 173918.0 ... 287525.0 290233.0 292966.0 295614.0 298193.0 301504.0 304988.0 308971.0 311970.0 313828.0
4 394514 4 Dallas-Fort Worth, TX Msa TX 130922.0 131027.0 131114.0 131327.0 131550.0 ... 334386.0 339874.0 345774.0 353216.0 361748.0 369869.0 379389.0 388117.0 395694.0 397905.0

5 rows × 276 columns

Selecting the cities we want¶

In [6]:
valid_cities = [
    "New York, NY",
    "Chicago, IL",
    "Los Angeles-Long Beach-Anaheim, CA",
    "Philadelphia, PA",
    "Houston, TX",
    "Phoenix, AZ",
]
In [7]:
selection = home_values['RegionName'].isin(valid_cities)
home_values_trimmed = home_values.loc[selection]
In [8]:
selection = rent_values['RegionName'].isin(valid_cities)
rent_values_trimmed = rent_values.loc[selection]
In [9]:
rent_values_trimmed
Out[9]:
RegionID RegionName SizeRank 2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 ... 2020-10 2020-11 2020-12 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07
1 394913 New York, NY 1 2205.0 2214 2224.0 2234 2244 2254 2264 ... 2437 2433 2428 2424 2421 2418 2415 2414.0 2413 2413
2 753899 Los Angeles-Long Beach-Anaheim, CA 2 1868.0 1879 1890.0 1902 1913 1924 1935 ... 2529 2538 2546 2554 2563 2572 2581 2591.0 2601 2611
3 394463 Chicago, IL 3 1437.0 1441 1445.0 1449 1453 1456 1460 ... 1651 1653 1655 1657 1659 1662 1664 1667.0 1670 1674
5 394974 Philadelphia, PA 5 1456.0 1458 1459.0 1461 1463 1465 1467 ... 1723 1729 1735 1741 1748 1754 1761 1768.0 1774 1781
6 394692 Houston, TX 6 1135.0 1142 1149.0 1155 1161 1168 1174 ... 1319 1325 1331 1336 1342 1348 1354 1361.0 1368 1374
14 394976 Phoenix, AZ 14 997.0 1001 1005.0 1009 1013 1017 1021 ... 1530 1551 1573 1595 1617 1640 1662 1686.0 1709 1732

6 rows × 94 columns

Removing unwanted columns¶

Unwanted columns can be dropped from the data frame using the drop() function.

Note that the column axis is the second axis (axis=1), and if you wanted to remove rows, you could use the first axis (axis=0).
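Side note: recent versions of pandas also accept a columns= keyword, which avoids remembering the axis number. A quick sketch on a made-up data frame:

# A small example frame (hypothetical values, for illustration only)
df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})

df.drop(["B"], axis=1)   # axis-based syntax, as used below
df.drop(columns=["B"])   # equivalent keyword syntax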

In [10]:
columns = ['SizeRank', 'RegionID', 'RegionType', 'StateName']
home_values_final = home_values_trimmed.drop(columns, axis=1)
In [11]:
columns = ['SizeRank', 'RegionID']
rent_values_final = rent_values_trimmed.drop(columns, axis=1)
In [12]:
rent_values_final
Out[12]:
RegionName 2014-01 2014-02 2014-03 2014-04 2014-05 2014-06 2014-07 2014-08 2014-09 ... 2020-10 2020-11 2020-12 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07
1 New York, NY 2205.0 2214 2224.0 2234 2244 2254 2264 2273 2283 ... 2437 2433 2428 2424 2421 2418 2415 2414.0 2413 2413
2 Los Angeles-Long Beach-Anaheim, CA 1868.0 1879 1890.0 1902 1913 1924 1935 1947 1958 ... 2529 2538 2546 2554 2563 2572 2581 2591.0 2601 2611
3 Chicago, IL 1437.0 1441 1445.0 1449 1453 1456 1460 1463 1467 ... 1651 1653 1655 1657 1659 1662 1664 1667.0 1670 1674
5 Philadelphia, PA 1456.0 1458 1459.0 1461 1463 1465 1467 1469 1471 ... 1723 1729 1735 1741 1748 1754 1761 1768.0 1774 1781
6 Houston, TX 1135.0 1142 1149.0 1155 1161 1168 1174 1180 1186 ... 1319 1325 1331 1336 1342 1348 1354 1361.0 1368 1374
14 Phoenix, AZ 997.0 1001 1005.0 1009 1013 1017 1021 1025 1030 ... 1530 1551 1573 1595 1617 1640 1662 1686.0 1709 1732

6 rows × 92 columns

Wide vs long format for datasets¶

Currently, our data is in wide format $\rightarrow$ each monthly observation has its own column. This usually results in many columns but few rows.

In [13]:
home_values_final
Out[13]:
RegionName 2000-01-31 2000-02-29 2000-03-31 2000-04-30 2000-05-31 2000-06-30 2000-07-31 2000-08-31 2000-09-30 ... 2021-10-31 2021-11-30 2021-12-31 2022-01-31 2022-02-28 2022-03-31 2022-04-30 2022-05-31 2022-06-30 2022-07-31
1 New York, NY 225281.0 226627.0 227838.0 230222.0 232442.0 234805.0 237288.0 239859.0 242437.0 ... 571576.0 575145.0 578396.0 582369.0 587022.0 593470.0 600621.0 608159.0 614185.0 618517.0
2 Los Angeles-Long Beach-Anaheim, CA 228935.0 229732.0 230953.0 233274.0 235763.0 238114.0 240521.0 243047.0 245453.0 ... 842992.0 850126.0 859335.0 870012.0 882230.0 899552.0 918626.0 935074.0 936406.0 933282.0
3 Chicago, IL 170843.0 171246.0 171768.0 172812.0 173918.0 175015.0 176228.0 177589.0 178995.0 ... 287525.0 290233.0 292966.0 295614.0 298193.0 301504.0 304988.0 308971.0 311970.0 313828.0
5 Philadelphia, PA 130359.0 130796.0 131058.0 131710.0 132238.0 132835.0 133383.0 133967.0 134615.0 ... 310909.0 313401.0 315690.0 318248.0 320804.0 324066.0 327757.0 332095.0 335593.0 338231.0
6 Houston, TX 126394.0 126472.0 126353.0 126472.0 126479.0 126681.0 126811.0 127090.0 127446.0 ... 274426.0 277596.0 281019.0 284479.0 288795.0 294155.0 300160.0 305742.0 309933.0 312533.0
14 Phoenix, AZ 142839.0 143215.0 143669.0 144561.0 145451.0 146126.0 146849.0 147535.0 148362.0 ... 418635.0 424834.0 430947.0 436727.0 445371.0 454535.0 465962.0 474655.0 480985.0 478985.0

6 rows × 272 columns

Usually it's better to have data in tidy (also known as long) format.

Tidy datasets are arranged such that each variable is a column and each observation is a row.

In our case, we want to have a column called ZRI and one called ZHVI and a row for each month that the indices were measured.

pandas provides the melt() function for converting from wide formats to tidy formats.

melt() doesn’t aggregate or summarize the data. It transforms it into a different shape, but it contains the exact same information as before.

Imagine you have 6 rows of data (each row is a unique city) with 10 columns of home values (each column is a different month). That is wide data and is the format usually seen in spreadsheets or tables in a report.

If you melt() that wide data, you would get a table with 60 rows and 3 columns. Each row would contain the city name, the month, and the home value for that city and month. This tidy-formatted data contains the same info as the wide data, but in a different form.
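As a minimal sketch of the same idea (with made-up rent values, not the Zillow data):

# A tiny wide table: one row per city, one column per month (made-up values)
wide = pd.DataFrame({
    "RegionName": ["Philadelphia, PA", "Chicago, IL"],
    "2021-01": [1700, 1650],
    "2021-02": [1710, 1655],
})

# Melt to tidy format: one row per (city, month) pair
tidy = pd.melt(wide, id_vars=["RegionName"], var_name="Date", value_name="ZRI")

# tidy now has 4 rows and 3 columns: RegionName, Date, ZRI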

(Animation: converting between wide and long formats with gather() and spread())

This animation shows the transformation from wide to long / long to wide. You can ignore gather() and spread() - those are the R versions of the pandas functions.

In [14]:
pd.melt?

Now, let's melt our datasets:

In [15]:
ZHVI = pd.melt(
    home_values_final, 
    id_vars=["RegionName"], 
    value_name="ZHVI", 
    var_name="Date"
)
ZRI = pd.melt(
    rent_values_final, 
    id_vars=["RegionName"], 
    value_name="ZRI", 
    var_name="Date"
)

and take a look:

In [16]:
ZRI.tail()
Out[16]:
RegionName Date ZRI
541 Los Angeles-Long Beach-Anaheim, CA 2021-07 2611.0
542 Chicago, IL 2021-07 1674.0
543 Philadelphia, PA 2021-07 1781.0
544 Houston, TX 2021-07 1374.0
545 Phoenix, AZ 2021-07 1732.0
In [17]:
ZHVI.head()
Out[17]:
RegionName Date ZHVI
0 New York, NY 2000-01-31 225281.0
1 Los Angeles-Long Beach-Anaheim, CA 2000-01-31 228935.0
2 Chicago, IL 2000-01-31 170843.0
3 Philadelphia, PA 2000-01-31 130359.0
4 Houston, TX 2000-01-31 126394.0

Merging data frames¶

Another common operation is merging, also known as joining, two datasets.

We can use the merge() function to merge observations that have the same Date and RegionName values.

But first! Our date string formats don't match!¶

  • ZHVI has the Date column in the format of YYYY-MM-DD
  • ZRI has the Date column in the format of YYYY-MM

We need to put them into the same format before merging the data!

We can fix this by creating Datetime objects and formatting the dates into the same format.

Datetime objects¶

Currently our Date column is stored as a string.

pandas includes additional functionality for dates, but first we must convert the strings using the to_datetime() function.

In [18]:
# Convert the Date column to Datetime objects
ZHVI["Date"] = pd.to_datetime(ZHVI["Date"])

The strftime function¶

We can use the ".dt" property of the Date column to access datetime functions of the new Datetime column.

For converting to strings in a certain format, we can use the "strftime" function (docs). This uses a special syntax to convert the date object to a string with a specific format.

Important reference: Use this strftime guide to look up the syntax!
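For example, a few common directives applied to a single timestamp (a quick sketch, separate from the lecture code):

# Hypothetical timestamp, for illustration only
ts = pd.Timestamp("2022-09-12")

ts.strftime("%Y-%m")      # '2022-09'
ts.strftime("%Y-%m-%d")   # '2022-09-12'
ts.strftime("%B %d, %Y")  # 'September 12, 2022'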

In [19]:
# Extract YYYY-MM string
date_strings = ZHVI["Date"].dt.strftime("%Y-%m")
In [20]:
# First entry is a string!
date_strings.iloc[0]
Out[20]:
'2000-01'
In [21]:
# Add the strings back as a column
ZHVI["Date"] = date_strings
In [22]:
ZHVI.head()
Out[22]:
RegionName Date ZHVI
0 New York, NY 2000-01 225281.0
1 Los Angeles-Long Beach-Anaheim, CA 2000-01 228935.0
2 Chicago, IL 2000-01 170843.0
3 Philadelphia, PA 2000-01 130359.0
4 Houston, TX 2000-01 126394.0

Now we can merge!¶

In [23]:
# Left dataframe is ZRI
# Right dataframe is ZHVI

zillow_data = pd.merge(ZRI, ZHVI, on=['Date', 'RegionName'], how='outer')
In [24]:
# Let's sort the data by Date
zillow_data = zillow_data.sort_values("Date", ascending=True)

zillow_data
Out[24]:
RegionName Date ZRI ZHVI
547 Los Angeles-Long Beach-Anaheim, CA 2000-01 NaN 228935.0
548 Chicago, IL 2000-01 NaN 170843.0
549 Philadelphia, PA 2000-01 NaN 130359.0
550 Houston, TX 2000-01 NaN 126394.0
551 Phoenix, AZ 2000-01 NaN 142839.0
... ... ... ... ...
1622 Chicago, IL 2022-07 NaN 313828.0
1624 Houston, TX 2022-07 NaN 312533.0
1620 New York, NY 2022-07 NaN 618517.0
1621 Los Angeles-Long Beach-Anaheim, CA 2022-07 NaN 933282.0
1625 Phoenix, AZ 2022-07 NaN 478985.0

1626 rows × 4 columns

Merging is very powerful and can be done in a number of ways. In this case, we did an outer merge in order to keep all parts of each dataframe. By contrast, an inner merge only keeps the overlapping intersection of the merge.

See the infographic on joining in this repository.
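A minimal sketch of the difference between the two, using two tiny made-up frames:

# Toy example: the frames share only the Date "2014-02"
left = pd.DataFrame({"Date": ["2014-01", "2014-02"], "ZRI": [1456, 1458]})
right = pd.DataFrame({"Date": ["2014-02", "2014-03"], "ZHVI": [200000, 201000]})

# Inner merge: only Dates present in BOTH frames (1 row)
pd.merge(left, right, on="Date", how="inner")

# Outer merge: all Dates from either frame, with NaN filling the gaps (3 rows)
pd.merge(left, right, on="Date", how="outer")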

In [25]:
# Convert the Date column back to a Datetime
zillow_data["Date"] = pd.to_datetime(zillow_data["Date"])

Quick trick: Series that hold Datetime objects have a dt attribute that lets you grab parts of the date easily.

For example, we can easily add new columns for the month and year using:

In [26]:
# Note that the dtype is now datetime64[ns]
zillow_data['Date'].head()
Out[26]:
547   2000-01-01
548   2000-01-01
549   2000-01-01
550   2000-01-01
551   2000-01-01
Name: Date, dtype: datetime64[ns]
In [27]:
# Extract out the month and year of each date
# Add them to the data frame as new columns!
zillow_data['Month'] = zillow_data['Date'].dt.month
zillow_data['Year'] = zillow_data['Date'].dt.year 
In [28]:
zillow_data.head()
Out[28]:
RegionName Date ZRI ZHVI Month Year
547 Los Angeles-Long Beach-Anaheim, CA 2000-01-01 NaN 228935.0 1 2000
548 Chicago, IL 2000-01-01 NaN 170843.0 1 2000
549 Philadelphia, PA 2000-01-01 NaN 130359.0 1 2000
550 Houston, TX 2000-01-01 NaN 126394.0 1 2000
551 Phoenix, AZ 2000-01-01 NaN 142839.0 1 2000

Annual trends: grouping by Year¶

pandas is especially useful for grouping and aggregating data via the groupby() function.

From the pandas documentation, groupby means:

  • Splitting the data into groups based on some criteria.
  • Applying a function to each group independently.
  • Combining the results into a data structure.

The documentation is available here.

We can calculate annual averages for each year by grouping by the RegionName and Year columns and taking the mean of our desired column. For example:

In [29]:
# calculate mean values for each Year and City (RegionName)
annual_ZHVI = zillow_data.groupby(['RegionName', 'Year'])['ZHVI'].mean() 
annual_ZRI = zillow_data.groupby(['RegionName', 'Year'])['ZRI'].mean()
In [30]:
print(type(annual_ZHVI))
<class 'pandas.core.series.Series'>
In [31]:
annual_ZHVI.head()
Out[31]:
RegionName   Year
Chicago, IL  2000    176107.833333
             2001    190745.250000
             2002    205350.500000
             2003    220379.916667
             2004    237042.833333
Name: ZHVI, dtype: float64

Important: The result of the groupby operation is always indexed by the group keys!

In this case, the result is indexed by the columns we grouped by (RegionName and Year).

We can reset the index so that the index values are listed as columns in the data frame again.

In [32]:
annual_ZHVI = annual_ZHVI.reset_index()
annual_ZRI = annual_ZRI.reset_index()
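Side note: you can skip the reset_index() step entirely by passing as_index=False to groupby(). A sketch of the equivalent call for the ZHVI averages:

# Keep the group keys as regular columns from the start
annual_ZHVI = (
    zillow_data
    .groupby(["RegionName", "Year"], as_index=False)["ZHVI"]
    .mean()
)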
In [33]:
annual_ZHVI.head(n=50)
Out[33]:
RegionName Year ZHVI
0 Chicago, IL 2000 176107.833333
1 Chicago, IL 2001 190745.250000
2 Chicago, IL 2002 205350.500000
3 Chicago, IL 2003 220379.916667
4 Chicago, IL 2004 237042.833333
5 Chicago, IL 2005 257666.000000
6 Chicago, IL 2006 274795.083333
7 Chicago, IL 2007 276928.083333
8 Chicago, IL 2008 257053.444444
9 Chicago, IL 2009 227865.000000
10 Chicago, IL 2010 211839.916667
11 Chicago, IL 2011 192180.166667
12 Chicago, IL 2012 179569.166667
13 Chicago, IL 2013 187374.583333
14 Chicago, IL 2014 201120.666667
15 Chicago, IL 2015 207530.833333
16 Chicago, IL 2016 215485.250000
17 Chicago, IL 2017 227410.000000
18 Chicago, IL 2018 237385.750000
19 Chicago, IL 2019 242427.250000
20 Chicago, IL 2020 247735.833333
21 Chicago, IL 2021 275805.416667
22 Chicago, IL 2022 305009.714286
23 Houston, TX 2000 127062.916667
24 Houston, TX 2001 128605.916667
25 Houston, TX 2002 131883.500000
26 Houston, TX 2003 136163.416667
27 Houston, TX 2004 144030.636364
28 Houston, TX 2005 150617.750000
29 Houston, TX 2006 153528.666667
30 Houston, TX 2007 158566.000000
31 Houston, TX 2008 157503.916667
32 Houston, TX 2009 153664.083333
33 Houston, TX 2010 153315.583333
34 Houston, TX 2011 146821.000000
35 Houston, TX 2012 146639.583333
36 Houston, TX 2013 157032.166667
37 Houston, TX 2014 172792.250000
38 Houston, TX 2015 188334.416667
39 Houston, TX 2016 197249.750000
40 Houston, TX 2017 204009.333333
41 Houston, TX 2018 211769.333333
42 Houston, TX 2019 219758.083333
43 Houston, TX 2020 227084.250000
44 Houston, TX 2021 257929.500000
45 Houston, TX 2022 299399.571429
46 Los Angeles-Long Beach-Anaheim, CA 2000 239580.833333
47 Los Angeles-Long Beach-Anaheim, CA 2001 266247.333333
48 Los Angeles-Long Beach-Anaheim, CA 2002 303152.750000
49 Los Angeles-Long Beach-Anaheim, CA 2003 361282.083333

Plotting our results: ZHVI¶

In [34]:
with plt.style.context("ggplot"):

    # Create figure and axes
    fig, ax = plt.subplots(figsize=(10, 6))
    
    # Plot for each unique city
    for city in annual_ZHVI["RegionName"].unique():
        
        # select the data for this city
        selection = annual_ZHVI["RegionName"] == city
        df = annual_ZHVI.loc[selection]

        # plot
        ax.plot(df["Year"], df["ZHVI"] / 1e3, label=city, linewidth=4)

    
    # Format the axes
    ax.set_ylim(50, 800)
    ax.legend(loc=0, ncol=2, fontsize=12)
    ax.set_ylabel("Zillow Home Value Index\n(in thousands of dollars)")

Home values in Philadelphia have only recently recovered to pre-2008 levels¶

Plotting the results: Zillow Rent Index¶

In [35]:
with plt.style.context('ggplot'):
    
    # Create the figure and axes
    fig, ax = plt.subplots(figsize=(10,6))
    
    # Loop over the cities to plot each one
    for city in annual_ZRI['RegionName'].unique():
        
        # Select the city data
        selection = annual_ZRI['RegionName'] == city
        df = annual_ZRI.loc[selection]
        
        # Plot
        ax.plot(df['Year'], df['ZRI'], label=city, linewidth=4)
    
    # Format
    ax.set_ylim(1000, 3300)
    ax.legend(loc=0, ncol=2)
    ax.set_ylabel('Zillow Rent Index (in dollars)')

Rent prices in Philadelphia have remained relatively flat, relative to other large cities.¶

Week #2: Data Visualization Fundamentals¶

  • Week #2 repository: https://github.com/MUSA-550-Fall-2022/week-2
  • Recommended readings for the week listed here

A brief history¶

Starting with two of my favorite historical examples, and their modern renditions...

Example 1: the pioneering work of W. E. B. Du Bois¶


Re-making the Du Bois Spiral with census data¶

The demographics of whites in seven states¶

Green is urban, blue suburban, yellow small town, red rural. Source

Additional references¶

  • A 6-part blog series on W. E. B. Du Bois' Data Visualization
  • An article on historical data visualization
  • W. E. B. Du Bois’s Data Portraits: Visualizing Black America

Example 2: the Statistical Atlas of the United States¶

  • First census: 1790
  • First map for the census: 1850
  • First Statistical Atlas: 1870
  • Largely discontinued after 1890, except for the 2000 Census Atlas

Using modern data¶

See http://projects.flowingdata.com/atlas, by Nathan Yau

Industry and Earnings by Sex¶

Source: American Community Survey, 5-Year, 2009-2013

Median Household Income¶

Many more examples...¶

More recently...¶

Two main movements:¶

  • 1st wave: clarity
  • 2nd wave: the grammar of visualization

Wave 1: Clarity¶

  • Pioneered by Edward Tufte and his release of The Visual Display of Quantitative Information in 1983
  • Focuses on clarity, simplicity, and plain color schemes
  • Charts should be immediately accessible and readable

The idea of "Chartjunk"¶

  • Coined by Tufte in Visual Display
  • Any unnecessary information on a chart

An extreme example¶

Wave 2: the grammar of visualization¶

  • Influenced by The Grammar of Graphics by Leland Wilkinson in 1999
  • Focuses on encoding data via channels onto geometry
  • Mapping data attributes onto graphical channels, e.g., length, angle, color, or position (or any other graphical channel)
  • Less focus on clarity, more on the encoding system
  • Leads to many, many (perhaps confusing) ways of visualizing data
  • ggplot2 provides an R implementation of The Grammar of Graphics
  • A few different Python libraries available

Where are we now?¶

  • Both movements converging together
  • More visualization libraries available now than ever

A survey of common tools¶

  • From a 2017 survey by Elijah Meeks
  • Meeks is a data visualization engineer (Apple, Netflix) and an excellent data viz resource
  • Find him on Twitter or Medium: @Elijah_Meeks
  • Executive director of the Data Visualization Society:
    • A community-based data viz organization
    • Great resources for beginners
    • Check out Nightingale: the Data Visualization Society's blog

The 7 kinds of data viz people¶

  • From this blog post
  • Illustrations by Susie Lu

See, e.g. Data Sketches

Data visualization as communication¶

  • Data visualization is primarily a communication and design problem, not a technical one
  • Two main modes:
    • Fast: quickly understood or quickly made (or both!)
    • Slow: more advanced, focus on design, takes longer to understand and/or longer to make

Fast visualization¶

  • Classic trope: a report for busy executives created by subject experts $\rightarrow$ as clear and simplified as possible
  • Leads readers to think that if the chart is not immediately understood then it must be a failure
  • The dominant method of data visualization
    • E.g., Create a Tableau Dashboard in Just 10 Minutes

Moving beyond fast visualizations¶

  • Thinking about what charts say, beyond what is immediately clear
  • Focusing on colors, design choices

Example: Fatalities in the Iraq War¶

by Simon Scarr in 2011

What design choices drive home the implicit message?¶

Data Visualization as Storytelling¶

The same data, but different design choices...

A negative portrayal¶

A positive portrayal¶

Design choices matter & data viz has never been more important¶

Some recent examples...


  • Data Viz's Breakthrough Moment in the COVID-19 Crisis
  • Interview with John Burn-Murdoch About his COVID Data Viz
  • John Burn-Murdoch's Twitter
  • COVID-19 Data Viz from the Financial Times
    • Coronavirus Tracker
    • Global Economic Recovery Tracker

Data Viz Style Guides¶

Lots of companies, cities, institutions, etc. have started publishing design guidelines to improve and standardize their data visualizations.

One I particularly like: City of London Data Design Guidelines

First few pages are listed in the "Recommended Reading" portion of this week's README.

London's style guide covers some basic data viz principles that everyone should know, including the following example:

(Example pages from the City of London Data Design Guidelines)

Good rules¶

  • Less is more — minimize "chartjunk"
  • Don't use legends if you can label directly
  • Use color / line weight to focus the reader on the data you want to emphasize
  • Don't make the viewer tilt their head

  • Use titles/subtitles to explain what is being plotted

Now onto colors...¶

Choose your colors carefully (a short matplotlib sketch follows this list):

  • Sequential schemes: for continuous data that progresses from low to high
  • Diverging schemes: for continuous data that emphasizes positive or negative deviations from a central value
  • Qualitative schemes: for data that has no inherent ordering, where color is used only to distinguish categories
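matplotlib ships with built-in colormaps for each of these scheme types. A quick sketch (the colormap names are real; the scatter call is just a hypothetical usage):

sequential = plt.get_cmap("Blues")   # low -> high
diverging = plt.get_cmap("RdBu")     # deviations from a midpoint
qualitative = plt.get_cmap("tab10")  # unordered categories

# Typically you just pass the name when plotting, e.g.:
# df.plot.scatter(x="Year", y="ZRI", c="ZHVI", cmap="viridis")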

ColorBrewer 2.0¶

  • The classic tool for color selection
  • Handles all three types of color schemes and provides a map-based visualization
  • Provides explanations from Cynthia Brewer's published works on color theory
  • Tests whether colors are colorblind safe, printer friendly, and photocopy safe
  • ColorBrewer palettes are included by default in matplotlib

See: http://colorbrewer2.org

Perceptually uniform color maps¶

  • Created for matplotlib and available by default
  • perceptually uniform: equal steps in data are perceived as equal steps in the color space
  • robust to color blindness
  • colorful and beautiful

For quantitative data, these color maps are very strong options

Need more colors?¶

Almost too many tools available...

Some of my favorites¶

  • Adobe Color CC: allows you to explore other people's color palettes and create new ones
  • Paletton: similar to Adobe Color, but slightly more advanced
  • Chroma.js Color Scale Helper: create color palettes by interpolating between named HTML colors
  • Colorpicker for data: automatically generate new color palettes, but they aren't always useful

Making sure your colors work: Viz Palette¶

Wrapping up: some good rules to live by¶

  • Optimize your color map for your dataset
  • Think about who your audience is
  • Avoid palettes with too many colors: ColorBrewer stops at ~9 for a reason
  • Maintain a theme and make it pretty
  • Think about how color interacts with the other parts of the visualization


Disclaimer¶

  • In my experience, the pandas plotting capabilities are good for quick and unpolished plots during the data exploration phase (see the sketch below)
  • Most of the pandas plotting functions serve as shortcuts, removing some boilerplate matplotlib code
  • If I'm trying to make a polished, clean data visualization, I'll usually opt to use matplotlib from the beginning
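For instance, a one-line exploratory chart (a minimal sketch with made-up numbers, not the Zillow data):

# Hypothetical annual values, for illustration only
df = pd.DataFrame(
    {"Year": [2019, 2020, 2021], "ZHVI": [242427, 247736, 275805]}
).set_index("Year")

# Quick and unpolished: pandas wraps the matplotlib boilerplate for you
df.plot(ylabel="ZHVI (dollars)")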

That's it!¶

  • See you on Wednesday when we continue Data Viz Fundamentals