import pandas as pd


# city names and population
city_names = pd.Series(["San Francisco", "San Jose", "Sacramento"])
population = pd.Series([852469, 1015785, 485199])


city_names

0    San Francisco
1         San Jose
2       Sacramento
dtype: object


cities_df = pd.DataFrame({"City Name": city_names, "Population": population})
cities_df


# access columns with a dict-like syntax
cities_df["Population"]

0     852469
1    1015785
2     485199
Name: Population, dtype: int64


# list slicing: get the elements with indices 1 and 2 (but NOT 3)
cities_list = ["San Francisco", "San Jose", "Sacramento"]
cities_list[:2]

['San Francisco', 'San Jose']


# slicing data frame rows is very similar!
cities_df[:2]


cities_df["Population"].median()

852469.0


import numpy as np

# calculate the median population value
np.median(cities_df["Population"])

852469.0


# define our function
def get_large_cities(population):
    return population > 1e6


large_cities_sel = cities_df["Population"].apply(get_large_cities)


large_cities_sel

0    False
1     True
2    False
Name: Population, dtype: bool


# add the new computed column to our original data frame
cities_df["Large Cities"] = large_cities_sel

small_cities_func = lambda population: population < 1e6

# we can also use lambda (unnamed, inline) functions
cities_df["Small Cities"] = cities_df["Population"].apply(small_cities_func)

# print out
cities_df


cities_df["Large Cities"]

0    False
1     True
2    False
Name: Large Cities, dtype: bool


cities_df.loc[cities_df["Large Cities"]]


cities_df["Population"] > 1e6

0    False
1     True
2    False
Name: Population, dtype: bool


# this is equivalent to doing
cities_df.loc[cities_df["Population"] < 1e6]


# reverse the large cities boolean selection
cities_df.loc[~large_cities_sel]


# or equivalently:
# NOTE: you need to put the whole expression in () and then apply the tilde!
cities_df.loc[~(cities_df["Population"] > 1e6)]


cities_df.query("Population < 1e6")


# use the question mark
pd.DataFrame.loc?


census_df = pd.read_csv("./data/census/ACS_19_1YR_B25003.csv")


census_df.head(n=5)


## FIRST STEP: let's trim to the columns we want

# columns we want
columns = ["GEO.display-label", "HD01_VD01", "HD01_VD03"]
census_df = census_df[columns]


# Peak at the dataframe with trimmed columns
census_df.head()


## STEP 2: Let's rename the columns!

# rename columns
census_df.columns = ["City", "Total", "Rentals"]

# Return the head (first 5 rows) from the cell
census_df.head(n=5)


# Step 3: Add the rental rate as a new column
census_df['Rental Rate'] = census_df['Rentals'] / census_df['Total'] * 100

census_df.head()


# How many rows are in the dataframe? --> use the len() operator

len(census_df)

634


# Define the list of city names we want
top_cities_list = [
    "Philadelphia city, Pennsylvania",
    "New York city, New York",
    "Los Angeles city, California",
    "Houston city, Texas",
    "Chicago city, Illinois",
    "Phoenix city, Arizona",
]


# Use the isin() to test whether a row value is in a city
census_df["City"].isin(top_cities_list)

0      False
1      False
2      False
3      False
4      False
       ...  
629    False
630    False
631    False
632    False
633    False
Name: City, Length: 634, dtype: bool


# Now do the selection!
census_df.loc[census_df["City"].isin(top_cities_list)]  # select the valid rows


# Get the rows where the City column is in "top_cities_list"
census_df.query("City in @top_cities_list")


year_str = "19"
path = f"./data/census/ACS_{year_str}_1YR_B25003.csv"
print(path)

./data/census/ACS_19_1YR_B25003.csv


all_rental_data = []
start_year = 2005
stop_year = 2019
for year in range(start_year, stop_year + 1):
    
    #---------------------------------------------
    # Step 0: Print out year 
    # The variable year changes for each iteration of the loop
    #----------------------------------------------
    print(f"Loading data for year = {year}...")
    
    #---------------------------------
    # Step 1: Read data for this year
    #---------------------------------
    year_str = str(year) # convert integer value of "year" to a string
    year_str = year_str[2:] # extract the last two digits of the year string
    rental_df = pd.read_csv(f"./data/census/ACS_{year_str}_1YR_B25003.csv")
     
    #---------------------------------
    # Step 2: Columns we want
    #---------------------------------
    columns = ['GEO.display-label', 'HD01_VD01', 'HD01_VD03']
    rental_df = rental_df[columns]

    #---------------------------------
    # Step 3: Rename columns
    #---------------------------------
    rental_df.columns = ['City', 'Total', 'Rentals']
    
    #----------------------------------
    # Step 4: Calculate the rental rate
    #----------------------------------
    rental_df['Rental Rate'] = rental_df['Rentals'] / rental_df['Total'] * 100
    
    #----------------------------------
    # Step 5: Select the cities we want
    #----------------------------------
    selection = rental_df['City'].isin(top_cities_list)
    top_cities_df = rental_df.loc[selection].copy()
    
    #----------------------------------
    # Step 6: Add a column for the year
    #----------------------------------
    top_cities_df['Year'] = year
    
    #-----------------------------------
    # Step 7: Save this dataframe in the list
    #-----------------------------------
    all_rental_data.append(top_cities_df)

#---------------------------------------------------
# Step 8: After the for loop, combine all of the 
# data frames into one along the row axis
#---------------------------------------------------
all_rental_data = pd.concat(all_rental_data, axis=0)

Loading data for year = 2005...
Loading data for year = 2006...
Loading data for year = 2007...
Loading data for year = 2008...
Loading data for year = 2009...
Loading data for year = 2010...
Loading data for year = 2011...
Loading data for year = 2012...
Loading data for year = 2013...
Loading data for year = 2014...
Loading data for year = 2015...
Loading data for year = 2016...
Loading data for year = 2017...
Loading data for year = 2018...
Loading data for year = 2019...


# Note: the pd.concat() took a list of dataframes and converted them 
# into a single dataframe!
type(all_rental_data)

pandas.core.frame.DataFrame


all_rental_data.head()


# What about the tail?
all_rental_data.tail()


# How many rows? Hopefully (6 cities x 15 years = 90!)
len(all_rental_data)

90


from matplotlib import pyplot as plt


all_rental_data['City'].unique()

array(['Phoenix city, Arizona', 'Los Angeles city, California',
       'Chicago city, Illinois', 'New York city, New York',
       'Philadelphia city, Pennsylvania', 'Houston city, Texas'],
      dtype=object)


with plt.style.context('ggplot'):
    
    # Step 1: Create a figure and axes to plot on
    fig, ax = plt.subplots(figsize=(10,6))
    
    # Step 2: Loop over each city and plot
    for city in all_rental_data['City'].unique():
        
        # Print out the city in each loop iteration
        print(city)
        
        # Select data for this city
        this_city = all_rental_data['City'] == city
        city_df = all_rental_data.loc[this_city]
        
        # Plot Year vs Rental Rate
        ax.plot(city_df['Year'], city_df['Rental Rate'], label=city, linewidth=4)

    # Add a legend
    ax.legend(loc=0, ncol=3, fontsize=12)

    # Set the y limit
    ax.set_ylim(30, 72)

    # Add a y label
    ax.set_ylabel("Rental Rate in Percent")
    
    # Show
    plt.show()

Phoenix city, Arizona
Los Angeles city, California
Chicago city, Illinois
New York city, New York
Philadelphia city, Pennsylvania
Houston city, Texas


# trim to just Philadelphia first, using this boolean selection index
all_rental_data["City"] == "Philadelphia city, Pennsylvania"

13     False
75     False
218    False
349    False
393     True
       ...  
94     False
284    False
439    False
487     True
538    False
Name: City, Length: 90, dtype: bool


# select Philadelphia only
philly = all_rental_data.loc[
    all_rental_data["City"] == "Philadelphia city, Pennsylvania"
]

philly


philly.loc[393]

City           Philadelphia city, Pennsylvania
Total                                   565433
Rentals                                 244792
Rental Rate                          43.292839
Year                                      2005
Name: 393, dtype: object


philly.iloc[0]

City           Philadelphia city, Pennsylvania
Total                                   565433
Rentals                                 244792
Rental Rate                          43.292839
Year                                      2005
Name: 393, dtype: object


all_rental_data.loc[393]

City           Philadelphia city, Pennsylvania
Total                                   565433
Rentals                                 244792
Rental Rate                          43.292839
Year                                      2005
Name: 393, dtype: object


philly.reset_index(drop=True)


# select 20
philly_2005 = philly.loc[philly['Year']==2005]


philly_2005


# Notice b/c we have a dataframe with length of 1, we don't get back just a number
philly_2005["Rental Rate"]

393    43.292839
Name: Rental Rate, dtype: float64


philly_2005['Rental Rate'].iloc[0]

43.292839293072745


len(philly_2005["Rental Rate"])

1


philly_2005 = philly.loc[philly['Year']==2005].squeeze()

philly_2005

City           Philadelphia city, Pennsylvania
Total                                   565433
Rentals                                 244792
Rental Rate                          43.292839
Year                                      2005
Name: 393, dtype: object


value_2005 = philly_2005["Rental Rate"]

value_2005

43.292839293072745


# Do the same thing for 2019 — make sure you squeeze!
philly_2019 = philly.loc[philly['Year']==2019].squeeze()

# Get the value
value_2019 = philly_2019['Rental Rate']

value_2019

47.74505451933398


# calculate percent change
percent_change = (value_2019 - value_2005)/ value_2005 * 100


print(f"change in rental rate in Philadelphia from 2005 to 2019 = {percent_change}")

change in rental rate in Philadelphia from 2005 to 2019 = 10.283952956103837

	GEO.id	GEO.display-label	HD01_VD01	HD02_VD01	HD01_VD02	HD02_VD02	HD01_VD03	HD02_VD03
0	1600000US0103076	Auburn city, Alabama	26156	2687	13095	1829	13061	2108
1	1600000US0107000	Birmingham city, Alabama	93300	3618	37788	2635	55512	2788
2	1600000US0121184	Dothan city, Alabama	27103	1044	15891	824	11212	817
3	1600000US0135896	Hoover city, Alabama	32461	1606	22391	1515	10070	1496
4	1600000US0137000	Huntsville city, Alabama	88930	2823	48728	2758	40202	2550

	GEO.display-label	HD01_VD01	HD01_VD03
0	Auburn city, Alabama	26156	13061
1	Birmingham city, Alabama	93300	55512
2	Dothan city, Alabama	27103	11212
3	Hoover city, Alabama	32461	10070
4	Huntsville city, Alabama	88930	40202

	City	Total	Rentals
0	Auburn city, Alabama	26156	13061
1	Birmingham city, Alabama	93300	55512
2	Dothan city, Alabama	27103	11212
3	Hoover city, Alabama	32461	10070
4	Huntsville city, Alabama	88930	40202

	City	Total	Rentals	Rental Rate
0	Auburn city, Alabama	26156	13061	49.935005
1	Birmingham city, Alabama	93300	55512	59.498392
2	Dothan city, Alabama	27103	11212	41.368114
3	Hoover city, Alabama	32461	10070	31.021842
4	Huntsville city, Alabama	88930	40202	45.206342

	City	Total	Rentals	Rental Rate
19	Phoenix city, Arizona	586878	267225	45.533314
94	Los Angeles city, California	1398900	887785	63.463078
284	Chicago city, Illinois	1080345	603352	55.848086
439	New York city, New York	3211033	2188214	68.146730
487	Philadelphia city, Pennsylvania	619505	295783	47.745055
538	Houston city, Texas	876504	522942	59.662249

Week 1B: Exploratory Data Science in Python¶

Today¶

Reminder: The weekly workflow¶

Python Data Analysis: the pandas package¶

The new, less fun logo¶

Basic pandas concepts¶

Accessing data¶

Rows can be accessed using Python's syntax for slicing¶

Manipulating data¶

Data selection¶

There is a faster, more concise, to achieve this!¶

What about getting rows where "Large Cities" is False?¶

An even faster way!¶

How to remember the specifics of all of these functions?¶

Now onto a more interesting example...¶

Analyzing trends in rentals rates over time¶

Python f-strings¶

Now let's load data from 2005 to 2019¶

Basic plotting: `matplotlib`¶

The strategy¶

Note¶

From 2005 to 2019, Philadelphia had the third largest percent increase¶

Exercise: Can you calculate what the increase was for Philadelphia?¶

1. Trim the full data set to just Philadelphia¶

An aside about labels and the index in Pandas¶

2. Select the rental rate for 2005¶

An aside: the squeeze() function¶

3. Select the rental rate for 2019¶

4. Calculate the percent change¶

To be continued next time!¶

First Homework Assignment¶

That's it!¶

	City Name	Population	Large Cities	Small Cities
0	San Francisco	852469	False	True
1	San Jose	1015785	True	False
2	Sacramento	485199	False	True

	City	Total	Rentals	Rental Rate	Year
13	Phoenix city, Arizona	503753	204914	40.677475	2005
75	Los Angeles city, California	1284124	771325	60.066240	2005
218	Chicago city, Illinois	1020605	525620	51.500825	2005
349	New York city, New York	3026196	2023924	66.880136	2005
393	Philadelphia city, Pennsylvania	565433	244792	43.292839	2005

	City	Total	Rentals	Rental Rate	Year
393	Philadelphia city, Pennsylvania	565433	244792	43.292839	2005
400	Philadelphia city, Pennsylvania	554048	231323	41.751437	2006
408	Philadelphia city, Pennsylvania	562384	239363	42.562199	2007
415	Philadelphia city, Pennsylvania	578263	253084	43.766245	2008
419	Philadelphia city, Pennsylvania	569835	258459	45.356814	2009
427	Philadelphia city, Pennsylvania	575413	264129	45.902508	2010
434	Philadelphia city, Pennsylvania	576429	265126	45.994563	2011
437	Philadelphia city, Pennsylvania	579874	277323	47.824700	2012
449	Philadelphia city, Pennsylvania	582528	285430	48.998503	2013
454	Philadelphia city, Pennsylvania	577862	277301	47.987409	2014
458	Philadelphia city, Pennsylvania	581604	275397	47.351291	2015
465	Philadelphia city, Pennsylvania	580205	277707	47.863600	2016
474	Philadelphia city, Pennsylvania	606142	309279	51.024182	2017
486	Philadelphia city, Pennsylvania	608233	287543	47.275140	2018
487	Philadelphia city, Pennsylvania	619505	295783	47.745055	2019

Week 1B: Exploratory Data Science in Python¶

Today¶

Reminder: The weekly workflow¶

Python Data Analysis: the pandas package¶

The new, less fun logo¶

Basic pandas concepts¶

Accessing data¶

Rows can be accessed using Python's syntax for slicing¶

Manipulating data¶

Data selection¶

There is a faster, more concise, to achieve this!¶

What about getting rows where "Large Cities" is False?¶

An even faster way!¶

How to remember the specifics of all of these functions?¶

Now onto a more interesting example...¶

Analyzing trends in rentals rates over time¶

Python f-strings¶

Now let's load data from 2005 to 2019¶

Basic plotting: matplotlib¶

The strategy¶

Note¶

From 2005 to 2019, Philadelphia had the third largest percent increase¶

Exercise: Can you calculate what the increase was for Philadelphia?¶

1. Trim the full data set to just Philadelphia¶

An aside about labels and the index in Pandas¶

2. Select the rental rate for 2005¶

An aside: the squeeze() function¶

3. Select the rental rate for 2019¶

4. Calculate the percent change¶

To be continued next time!¶

First Homework Assignment¶

That's it!¶

Basic plotting: `matplotlib`¶