# Let's setup the imports we'll need first
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import altair as alt


# Print out the current working directory
%pwd

'/Users/nhand/Teaching/PennMUSA/Fall2022/week-3'


# List all of the current working directories 
%ls

README.md
altair-data-bab009db68fd5baeb0b3f2a47a0e775a.json
altair-data-c88ae29f5e52292f57ec56d169bfc4e9.json
altair-data-d676c5169a80f978f8cf008b5d336e79.json
data/
environment.yml
lecture-3A.ipynb
lecture-3B.ipynb
outline.md


path = './data/measles_incidence.csv' # this is a relative path 
data = pd.read_csv(path, skiprows=2, na_values='-')
data.head()


# drop week first
annual = data.drop('WEEK', axis=1)


grped = annual.groupby('YEAR')
print(grped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x17c01e1c0>


annual = grped.sum()


annual.head()


measles = annual.reset_index()
measles.head()


measles = measles.melt(id_vars='YEAR', value_name="incidence", var_name="state")
measles.head(n=10)


# Define a custom colormape using Hex codes & HTML color names
colormap = alt.Scale(
    domain=[0, 100, 200, 300, 1000, 3000],
    range=[
        "#F0F8FF",
        "cornflowerblue",
        "mediumseagreen",
        "#FFEE00",
        "darkorange",
        "firebrick",
    ],
    type="sqrt",
)


alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')


measles


# Heatmap of YEAR vs state, colored by incidence
chart = (
   alt.Chart(measles)
    .mark_rect()
   .encode(
       x=alt.X("YEAR:O", axis=alt.Axis(title=None, ticks=False)),
       y=alt.Y("state:N", axis=alt.Axis(title=None, ticks=False)),
       color=alt.Color("incidence:Q", sort="ascending", scale=colormap),
       tooltip=["state", "YEAR", "incidence"],
   )
   .properties(width=700, height=500)
)

chart


threshold = pd.DataFrame([{"threshold": 1963}])
threshold


# Vertical line for vaccination year
rule = alt.Chart(threshold).mark_rule(strokeWidth=4).encode(x="threshold:O")
chart + rule


# Heatmap of YEAR vs state, colored by incidence
chart = (
    alt.Chart(measles)
    .mark_rect()
    .encode(
        x=alt.X("YEAR:O", axis=alt.Axis(title=None, ticks=False)),
        y=alt.Y("state:N", axis=alt.Axis(title=None, ticks=False)),
        color=alt.Color(
            "incidence:Q",
            sort="ascending",
            scale=alt.Scale(scheme="viridis"),
            legend=None,
        ),
        tooltip=["state", "YEAR", "incidence"],
    )
    .properties(width=700, height=450)
)

# Vertical line for vaccination year
rule = (
    alt.Chart(threshold).mark_rule(strokeWidth=4, color="white").encode(x="threshold:O")
)

chart + rule


# The heatmap
chart = (
    alt.Chart(measles)
    .mark_rect()
    .encode(
        x=alt.X("YEAR:O", axis=alt.Axis(title=None, ticks=False)),
        y=alt.Y("state:N", axis=alt.Axis(title=None, ticks=False)),
        color=alt.Color(
            "incidence:Q",
            sort="ascending",
            scale=alt.Scale(scheme="viridis"),
            legend=None,
        ),
        tooltip=["state", "YEAR", "incidence"],
    )
    .properties(width=700, height=400)
)

# The annual average
annual_avg = (
    alt.Chart(measles)
    .mark_line()
    .encode(
        x=alt.X("YEAR:O", axis=alt.Axis(title=None, ticks=False)),
        y=alt.Y("mean(incidence):Q", axis=alt.Axis(title=None, ticks=False)),
    )
    .properties(width=700, height=200)
)

# Add the vertical line 
rule = (
    alt.Chart(threshold).mark_rule(strokeWidth=4, color="white").encode(x="threshold:O")
)

# Combine everything
alt.vconcat(annual_avg, chart + rule)


%ls "data/ne_110m_admin_0_countries/"

ne_110m_admin_0_countries.cpg  ne_110m_admin_0_countries.shp
ne_110m_admin_0_countries.dbf  ne_110m_admin_0_countries.shx
ne_110m_admin_0_countries.prj


import geopandas as gpd


# Read the shape file, giving the name of the directory
countries = gpd.read_file("./data/ne_110m_admin_0_countries")


countries.head()


type(countries)

geopandas.geodataframe.GeoDataFrame


# Print out the first 10 entires of the "geometry" column
countries['geometry'].head(n=10)

0    POLYGON ((61.21082 35.65007, 62.23065 35.27066...
1    MULTIPOLYGON (((23.90415 -11.72228, 24.07991 -...
2    POLYGON ((21.02004 40.84273, 20.99999 40.58000...
3    POLYGON ((51.57952 24.24550, 51.75744 24.29407...
4    MULTIPOLYGON (((-66.95992 -54.89681, -67.56244...
5    POLYGON ((43.58275 41.09214, 44.97248 41.24813...
6    MULTIPOLYGON (((-59.57209 -80.04018, -59.86585...
7    POLYGON ((68.93500 -48.62500, 69.58000 -48.940...
8    MULTIPOLYGON (((145.39798 -40.79255, 146.36412...
9    POLYGON ((16.97967 48.12350, 16.90375 47.71487...
Name: geometry, dtype: geometry


countries['pop_est'].sum()/1e9 # In billions

7.383089462


grouped = countries.groupby('continent')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x28ada5bb0>


# Sum population on each continent
pop_by_continent = grouped['pop_est'].sum()

# Sort values
pop_by_continent.sort_values(ascending=False, inplace=True)

# Output sorted values from cell
pop_by_continent/1e9

continent
Asia                       4.389145e+00
Africa                     1.219176e+00
Europe                     7.463985e-01
North America              5.730421e-01
South America              4.185407e-01
Oceania                    3.678284e-02
Antarctica                 4.050000e-06
Seven seas (open ocean)    1.400000e-07
Name: pop_est, dtype: float64


# Is the country name USA?
is_USA = countries['name']=='United States of America'

# Get the row with USA
USA = countries.loc[is_USA]

USA


USA.squeeze().geometry


# Squeeze
USA = USA.squeeze()

# Print out the type
print("The type of USA is: ", type(USA))

# Output
USA

The type of USA is:  <class 'pandas.core.series.Series'>

/Users/nhand/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/pandas/core/dtypes/inference.py:383: ShapelyDeprecationWarning: Iteration over multi-part geometries is deprecated and will be removed in Shapely 2.0. Use the `geoms` property to access the constituent parts of a multi-part geometry.
  iter(obj)  # Can iterate over it.
/Users/nhand/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/pandas/core/dtypes/inference.py:384: ShapelyDeprecationWarning: __len__ for multi-part geometries is deprecated and will be removed in Shapely 2.0. Check the length of the `geoms` property instead to get the  number of parts of a multi-part geometry.
  len(obj)  # Has a length associated with it.
/Users/nhand/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/pandas/io/formats/printing.py:117: ShapelyDeprecationWarning: Iteration over multi-part geometries is deprecated and will be removed in Shapely 2.0. Use the `geoms` property to access the constituent parts of a multi-part geometry.
  s = iter(seq)
/Users/nhand/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/pandas/io/formats/printing.py:121: ShapelyDeprecationWarning: __len__ for multi-part geometries is deprecated and will be removed in Shapely 2.0. Check the length of the `geoms` property instead to get the  number of parts of a multi-part geometry.
  for i in range(min(nitems, len(seq)))
/Users/nhand/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/pandas/io/formats/printing.py:125: ShapelyDeprecationWarning: __len__ for multi-part geometries is deprecated and will be removed in Shapely 2.0. Check the length of the `geoms` property instead to get the  number of parts of a multi-part geometry.
  if nitems < len(seq):

iso_a3                                                      USA
name                                   United States of America
continent                                         North America
pop_est                                             326625791.0
gdp_md_est                                           18560000.0
geometry      (POLYGON ((-122.84000000000005 49.000000000000...
Name: 168, dtype: object


type(USA.geometry)

shapely.geometry.multipolygon.MultiPolygon


# a mini USA
USA.geometry


countries.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich


# Create a figure and axes
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the countries on our axes
ax = countries.plot(ax=ax, facecolor="none", edgecolor="black")

# Add a title
ax.set_title("Equirectangular Projection")

Text(0.5, 1.0, 'Equirectangular Projection')


# Remove Antartica, as the Mercator projection 
# cannot deal with the poles
no_antartica = countries.loc[(countries['name'] != "Antarctica")]


# Two ways to specify the EPSG code
countries_mercator = no_antartica.to_crs(epsg=3395)

# Alternatively:
#countries_mercator = no_antartica.to_crs("EPSG:3395")


countries_mercator.head()


# Initialize the figure and axes
fig, ax = plt.subplots(figsize=(10, 6))

# Use built-in plot() of the GeoDataFrame
ax = countries_mercator.plot(ax=ax, 
                             facecolor="none", 
                             edgecolor="black")

# Add a title
ax.set_title("Mercator Projection");


city_limits = gpd.read_file('./data/City_Limits')

city_limits


city_limits.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich


# Create our figure and axes
fig, ax = plt.subplots(figsize=(5, 5))

# Plot
city_limits.plot(ax=ax, facecolor="none", edgecolor="black")

# Format
ax.set_title("Equirectangular")
ax.set_axis_off() # This will remove the axes completely
ax.set_aspect("equal") # This forces an equal aspect ratio


# Create the figure
fig, ax = plt.subplots(figsize=(5, 5))

# Convert to EPSG:3857
city_limits_3857 = city_limits.to_crs(epsg=3857)

# Plot and format
city_limits_3857.plot(ax=ax, facecolor="none", edgecolor="black")

ax.set_title("Web Mercator")
ax.set_axis_off()
ax.set_aspect("equal");


# ESRI shape file
city_limits_3857.to_file("./data/city_limits_3857", 
                         driver='ESRI Shapefile')


# GeoJSON is also an option
city_limits_3857.to_file("./data/city_limits_3857.geojson", 
                         driver='GeoJSON')


# save a csv file
city_limits_3857.to_csv("./data/city_limits_3857.csv", index=False)


df = pd.read_csv("./data/city_limits_3857.csv")
df.head()


type(df.geometry)

pandas.core.series.Series


from shapely import wkt

# wkt.loads will convert from string to Polygon object
df['geometry'] = df['geometry'].apply(wkt.loads)


df.geometry.iloc[0]


# Make specifying the name of the geometry column and CRS
gdf= gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:3857")

# Now plot
fig, ax = plt.subplots(figsize=(5,5))
ax = gdf.plot(ax=ax, facecolor='none', edgecolor='black')
ax.set_axis_off()
ax.set_aspect("equal")


ax = (
    gdf.to_crs(epsg=4326)
      .plot(facecolor='none', edgecolor='black')
)
 
ax.set_axis_off()
ax.set_aspect("equal")


# Load some cities data
cities = gpd.read_file("./data/ne_110m_populated_places")


cities


# Select the Point representing New York City
new_york = cities.loc[cities['name'] == 'New York'].geometry.squeeze()
new_york


type(new_york)

shapely.geometry.point.Point


countries.contains(new_york)

0      False
1      False
2      False
3      False
4      False
       ...  
172    False
173    False
174    False
175    False
176    False
Length: 177, dtype: bool


# Find the country that contains New York
countries.loc[countries.contains(new_york)]


# Get the geometry column of the country containing NYC
USA = countries.loc[countries.contains(new_york)].squeeze().geometry
USA


type(USA)

shapely.geometry.multipolygon.MultiPolygon


# Is New York within the USA?
new_york.within(USA)

True

USA

	YEAR	WEEK	ALABAMA	ALASKA	ARIZONA	ARKANSAS	CALIFORNIA	COLORADO	CONNECTICUT	DELAWARE	...	SOUTH DAKOTA	TENNESSEE	TEXAS	UTAH	VERMONT	VIRGINIA	WASHINGTON	WEST VIRGINIA	WISCONSIN	WYOMING
0	1928	1	3.67	NaN	1.90	4.11	1.38	8.38	4.50	8.58	...	5.69	22.03	1.18	0.4	0.28	NaN	14.83	3.36	1.54	0.91
1	1928	2	6.25	NaN	6.40	9.91	1.80	6.02	9.00	7.30	...	6.57	16.96	0.63	NaN	0.56	NaN	17.34	4.19	0.96	NaN
2	1928	3	7.95	NaN	4.50	11.15	1.31	2.86	8.81	15.88	...	2.04	24.66	0.62	0.2	1.12	NaN	15.67	4.19	4.79	1.36
3	1928	4	12.58	NaN	1.90	13.75	1.87	13.71	10.40	4.29	...	2.19	18.86	0.37	0.2	6.70	NaN	12.77	4.66	1.64	3.64
4	1928	5	8.03	NaN	0.47	20.79	2.38	5.13	16.80	5.58	...	3.94	20.05	1.57	0.4	6.70	NaN	18.83	7.37	2.91	0.91

	ALABAMA	ALASKA	ARIZONA	ARKANSAS	CALIFORNIA	COLORADO	CONNECTICUT	DELAWARE	DISTRICT OF COLUMBIA	FLORIDA	...	SOUTH DAKOTA	TENNESSEE	TEXAS	UTAH	VERMONT	VIRGINIA	WASHINGTON	WEST VIRGINIA	WISCONSIN	WYOMING
YEAR
1928	334.99	0.0	200.75	481.77	69.22	206.98	634.95	256.02	535.63	119.58	...	160.16	315.43	97.35	16.83	334.80	0.0	344.82	195.98	124.61	227.00
1929	111.93	0.0	54.88	67.22	72.80	74.24	614.82	239.82	94.20	78.01	...	167.77	33.04	71.28	68.90	105.31	0.0	248.60	380.14	1016.54	312.16
1930	157.00	0.0	466.31	53.44	760.24	1132.76	112.23	109.25	182.10	356.59	...	346.31	179.91	73.12	1044.79	236.69	0.0	631.64	157.70	748.58	341.55
1931	337.29	0.0	497.69	45.91	477.48	453.27	790.46	1003.28	832.99	260.79	...	212.36	134.79	39.56	29.72	318.40	0.0	197.43	291.38	506.57	60.69
1932	10.21	0.0	20.11	5.33	214.08	222.90	348.27	15.98	53.14	13.63	...	96.37	68.99	76.58	13.91	1146.08	53.4	631.93	599.65	935.31	242.10

	YEAR	ALABAMA	ARIZONA	ARKANSAS	CALIFORNIA	COLORADO	CONNECTICUT	DELAWARE	DISTRICT OF COLUMBIA	...	SOUTH DAKOTA	TENNESSEE	TEXAS	UTAH	VERMONT	VIRGINIA	WASHINGTON	WEST VIRGINIA	WISCONSIN	WYOMING
0	1928	334.99	200.75	481.77	69.22	206.98	634.95	256.02	535.63	...	160.16	315.43	97.35	16.83	334.80	0.0	344.82	195.98	124.61	227.00
1	1929	111.93	54.88	67.22	72.80	74.24	614.82	239.82	94.20	...	167.77	33.04	71.28	68.90	105.31	0.0	248.60	380.14	1016.54	312.16
2	1930	157.00	466.31	53.44	760.24	1132.76	112.23	109.25	182.10	...	346.31	179.91	73.12	1044.79	236.69	0.0	631.64	157.70	748.58	341.55
3	1931	337.29	497.69	45.91	477.48	453.27	790.46	1003.28	832.99	...	212.36	134.79	39.56	29.72	318.40	0.0	197.43	291.38	506.57	60.69
4	1932	10.21	20.11	5.33	214.08	222.90	348.27	15.98	53.14	...	96.37	68.99	76.58	13.91	1146.08	53.4	631.93	599.65	935.31	242.10

	YEAR	state	incidence
0	1928	ALABAMA	334.99
1	1929	ALABAMA	111.93
2	1930	ALABAMA	157.00
3	1931	ALABAMA	337.29
4	1932	ALABAMA	10.21
5	1933	ALABAMA	65.22
6	1934	ALABAMA	590.27
7	1935	ALABAMA	265.34
8	1936	ALABAMA	20.78
9	1937	ALABAMA	22.46

	YEAR	state	incidence
0	1928	ALABAMA	334.99
1	1929	ALABAMA	111.93
2	1930	ALABAMA	157.00
3	1931	ALABAMA	337.29
4	1932	ALABAMA	10.21
...	...	...	...
3871	1999	WYOMING	0.00
3872	2000	WYOMING	0.00
3873	2001	WYOMING	0.00
3874	2002	WYOMING	0.00
3875	2003	WYOMING	0.00

	iso_a3	name	continent	pop_est	gdp_md_est	geometry
0	AFG	Afghanistan	Asia	34124811.0	64080.0	POLYGON ((61.21082 35.65007, 62.23065 35.27066...
1	AGO	Angola	Africa	29310273.0	189000.0	MULTIPOLYGON (((23.90415 -11.72228, 24.07991 -...
2	ALB	Albania	Europe	3047987.0	33900.0	POLYGON ((21.02004 40.84273, 20.99999 40.58000...
3	ARE	United Arab Emirates	Asia	6072475.0	667200.0	POLYGON ((51.57952 24.24550, 51.75744 24.29407...
4	ARG	Argentina	South America	44293293.0	879400.0	MULTIPOLYGON (((-66.95992 -54.89681, -67.56244...

	name	geometry
0	Vatican City	POINT (12.45339 41.90328)
1	San Marino	POINT (12.44177 43.93610)
2	Vaduz	POINT (9.51667 47.13372)
3	Lobamba	POINT (31.20000 -26.46667)
4	Luxembourg	POINT (6.13000 49.61166)
...	...	...
238	Rio de Janeiro	POINT (-43.22697 -22.92308)
239	São Paulo	POINT (-46.62697 -23.55673)
240	Sydney	POINT (151.18323 -33.91807)
241	Singapore	POINT (103.85387 1.29498)
242	Hong Kong	POINT (114.18306 22.30693)

Week 3: Geospatial Data Analysis and GeoPandas¶

Housekeeping¶

Agenda for Week #3¶

Exercise: Measles incidence data viz with altair¶

Step 1: Load the data¶

First confirm the local path of the Jupyter notebook¶

Step 2: Calculate the total incidents in a given year per state¶

Step 3: Transform to tidy format¶

Step 4: Make the plot¶

Avoiding large data error¶

Bonus: Add the vaccination line!¶

Challenges¶

Exploring other color maps¶

Add the annual average chart on top¶

Now, on to geospatial analysis...¶

Vector Data¶

A couple of terminology notes¶

Common formats for vector datasets¶

A shapefile¶

The GeoJSON file¶

Additional GeoJSON resources and tools:¶

Working with vector data in Python: GeoPandas¶

What's a GeoDataFrame?¶

We can still leverage the power of pandas...¶

An aside: the squeeze() function¶

How does geopandas handle coordinate systems and map projections?¶

Coordinate Reference Systems¶

How to plot all of the geometries at once?¶

Can we convert to other geometries?¶

Let's convert to the Mercator projection¶

Now let's plot it¶

So which CRS is best?¶

Let's load the city limits for Philadelphia¶

What's the CRS?¶

Saving GeoDataFrames¶

How about as a CSV file?¶

Looks similar...¶

Use shapely to parse the string version of the polygons¶

Converting from a DataFrame to a GeoDataFrame¶

Let's convert back to 4326 and plot¶

Note¶

So, when should you use GeoPandas?¶

When it may not be the best tool:¶

Spatial Relationships and Joins¶

A quick example¶

Note¶

Reference¶

More next time...¶

That's it!¶

What's a `GeoDataFrame`?¶

We can still leverage the power of `pandas`...¶

An aside: the `squeeze()` function¶

How does `geopandas` handle coordinate systems and map projections?¶

Use `shapely` to parse the string version of the polygons¶