Oct 17, 2022
By example:
# Initial imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
3 new packages:
Out-of-core (larger-than-memory) operations in Python
Extends the maximum file size from the size of memory to the size of your hard drive
Dask does not evaluate expressions immediately; instead, it stores a task graph of the necessary calculations.
# Create an array of normally-distributed random numbers
a = np.random.randn(1000)
a[:10]
array([-0.92904591, 1.2667361 , -0.71450318, -1.59469798, 1.03601616, -0.42282844, 0.11812523, -0.23338086, -2.17108618, 2.40594822])
# Multiply this array by a factor
b = a * 4
# Find the minimum value
b_min = b.min()
b_min
-11.989284650894874
Dask arrays mirror the numpy array syntax...but don't perform the calculation right away.
import dask.array as da
# Create a dask array from the above array
a_dask = da.from_array(a, chunks=200)
# Multiply this array by a factor
b_dask = a_dask * 4
# Find the minimum value
b_min_dask = b_dask.min()
print(b_min_dask)
dask.array<amin-aggregate, shape=(), dtype=float64, chunksize=(), chunktype=numpy.ndarray>
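Nothing has actually been computed yet; the object above is just a placeholder for the stored task graph. As an optional aside (not part of this week's code, and it requires the graphviz package to be installed), you can render that graph:
# Optional: render the stored task graph as a diagram (needs graphviz)
b_min_dask.visualize()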
We need to explicitly call the compute() function to evaluate the expression. We get the same result as the non-lazy numpy version.
b_min_dask.compute()
-11.989284650894874
Dask DataFrames mirror pandas DataFrames but don't need to fit into memory...when evaluating an expression, Dask will load the data into memory only when necessary.
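For instance, a minimal sketch of the general pattern (the file and column names below are hypothetical, not part of this week's data):
import dask.dataframe as dd
# Lazily point to a (hypothetical) large CSV file; no data is read yet
ddf = dd.read_csv("very_large_file.csv")
# Build up expressions lazily, then trigger the actual work with compute()
mean_value = ddf["some_column"].mean()
mean_value.compute()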
See datasets.yml in this week's repository.
import intake
datasets = intake.open_catalog("./datasets.yml")
list(datasets)
['nyc_taxi_wide', 'census', 'osm']
Use the to_dask() function to load a catalog source as a dask DataFrame.
type(datasets.osm)
intake_parquet.source.ParquetSource
osm_ddf = datasets.osm.to_dask()
osm_ddf
| | x | y |
|---|---|---|
| npartitions=119 | | |
| | float32 | float32 |
| ... | ... | ... |
| ... | ... | ... |
| ... | ... | ... |
| ... | ... | ... |
Only the data necessary to see the head of the file will be loaded.
# we can still get the head of the file quickly
osm_ddf.head(n=10)
| | x | y |
|---|---|---|
| 0 | -16274360.0 | -17538778.0 |
| 1 | -16408889.0 | -16618700.0 |
| 2 | -16246231.0 | -16106805.0 |
| 3 | -19098164.0 | -14783157.0 |
| 4 | -17811662.0 | -13948767.0 |
| 5 | -17751736.0 | -13926740.0 |
| 6 | -17711376.0 | -13921245.0 |
| 7 | -17532738.0 | -13348323.0 |
| 8 | -19093358.0 | -10380358.0 |
| 9 | -19077458.0 | -10445329.0 |
All data partitions must be loaded for this calculation...it will take longer!
# getting the length means all of the data must be loaded though
nrows = len(osm_ddf)
print(f"number of rows = {nrows}")
number of rows = 1000050363
# mean x/y coordinates
mean_x = osm_ddf['x'].mean()
mean_y = osm_ddf['y'].mean()
print(mean_x, mean_y)
dd.Scalar<series-..., dtype=float32> dd.Scalar<series-..., dtype=float32>
# evaluate the expressions
print("mean x = ", mean_x.compute())
mean x = 2731828.836864097
# evaluate the expressions
print("mean y = ", mean_y.compute())
mean y = 5801830.125332437
Matplotlib struggles once you get beyond a few hundred thousand points, let alone a billion.
"Turns even the largest data into images, accurately"
Datashader is a library that produces a "rasterized" image of large datasets, such that the visual color mapping is a fair representation of the underlying data.
Recommended reading: Understanding the datashader algorithm
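Conceptually, the aggregation step is similar to building a 2D histogram: the canvas is divided into pixels, and each pixel stores an aggregate (e.g., a count) of the points falling inside it. A rough numpy sketch of that idea (not datashader's actual implementation):
# Rough sketch: counting points per pixel is akin to a 2D histogram
xs = np.random.randn(100_000)
ys = np.random.randn(100_000)
counts, x_edges, y_edges = np.histogram2d(xs, ys, bins=(900, 450))
counts.shape  # number of points falling in each of the 900 x 450 bins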
# Datashader imports
import datashader as ds
import datashader.transfer_functions as tf
# Color-related imports
from datashader.colors import Greys9, viridis, inferno
from colorcet import fire
Set up the global datashader canvas:
# Web Mercator bounds
bound = 20026376.39
global_x_range = (-bound, bound)
global_y_range = (int(-bound*0.4), int(bound*0.6))
# Default width and height
global_plot_width = 900
global_plot_height = int(global_plot_width*0.5)
# Step 1: Setup the canvas
canvas = ds.Canvas(
plot_width=global_plot_width,
plot_height=global_plot_height,
x_range=global_x_range,
y_range=global_y_range,
)
# Step 2: Aggregate the points into pixels
# NOTE: Use the "count()" function — count number of points per pixel
agg = canvas.points(osm_ddf, "x", "y", agg=ds.count())
agg
<xarray.DataArray (y: 450, x: 900)>
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint32)
Coordinates:
  * x        (x) float64 -2e+07 -1.996e+07 -1.992e+07 ... 1.996e+07 2e+07
  * y        (y) float64 -7.988e+06 -7.944e+06 ... 1.195e+07 1.199e+07
# Step 3: Perform the shade operation
img = tf.shade(agg, cmap=fire)
# Format: set the background of the image to black so it looks better
img = tf.set_background(img, "black")
img
Remember: our agg variable is an xarray DataArray, so we can leverage xarray's builtin where() function to select a subsample of the pixels based on the pixel counts.
selected = agg.where(agg > 15)
selected
<xarray.DataArray (y: 450, x: 900)>
array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])
Coordinates:
  * x        (x) float64 -2e+07 -1.996e+07 -1.992e+07 ... 1.996e+07 2e+07
  * y        (y) float64 -7.988e+06 -7.944e+06 ... 1.195e+07 1.199e+07
Pixels that fail the where condition are masked and set to NaN.
# Plot the masked data
tf.set_background(tf.shade(selected, cmap=fire), "black")
We can use datashader to visualize all 300 million census dots produced as part of the Cooper Center racial dot map.
# Load the data
# REMEMBER: this will take some time to download the first time
census_ddf = datasets.census.to_dask()
census_ddf
| | easting | northing | race |
|---|---|---|---|
| npartitions=36 | | | |
| | float32 | float32 | category[unknown] |
| ... | ... | ... | ... |
| ... | ... | ... | ... |
| ... | ... | ... | ... |
| ... | ... | ... | ... |
census_ddf.head()
| | easting | northing | race |
|---|---|---|---|
| 0 | -12418767.0 | 3697425.00 | h |
| 1 | -12418512.0 | 3697143.50 | h |
| 2 | -12418245.0 | 3697584.50 | h |
| 3 | -12417703.0 | 3697636.75 | w |
| 4 | -12418120.0 | 3697129.25 | h |
print("number of rows =", len(census_ddf))
number of rows = 306675004
Important: datashader has a utility function to convert from latitude/longitude (EPSG=4326) to Web Mercator (EPSG=3857)
See: lnglat_to_meters()
from datashader.utils import lnglat_to_meters
# Sensible lat/lng coordinates for U.S. cities
# NOTE: these are in lat/lng so EPSG=4326
USA = [(-124.72, -66.95), (23.55, 50.06)]
Chicago = [( -88.29, -87.30), (41.57, 42.00)]
NewYorkCity = [( -74.39, -73.44), (40.51, 40.91)]
LosAngeles = [(-118.53, -117.81), (33.63, 33.96)]
Houston = [( -96.05, -94.68), (29.45, 30.11)]
Austin = [( -97.91, -97.52), (30.17, 30.37)]
NewOrleans = [( -90.37, -89.89), (29.82, 30.05)]
Atlanta = [( -84.88, -84.04), (33.45, 33.84)]
Philly = [( -75.28, -74.96), (39.86, 40.14)]
# Get USA xlim and ylim in meters (EPSG=3857)
USA_xlim_meters, USA_ylim_meters = [list(r) for r in lnglat_to_meters(USA[0], USA[1])]
# Define a default plot width & height
plot_width = int(900)
plot_height = int(plot_width*7.0/12)
# Step 1: Setup the canvas
cvs = ds.Canvas(
plot_width, plot_height, x_range=USA_xlim_meters, y_range=USA_ylim_meters
)
# Step 2: Aggregate the x/y points
agg = cvs.points(census_ddf, "easting", "northing")
# Step 3: Shade with a "Grey" colormap and "linear" colormapping
img = tf.shade(agg, cmap=Greys9, how="linear")
# Format: Set the background
tf.set_background(img, "black")
# Step 3: Shade with a "Grey" colormap and "log" colormapping
img = tf.shade(agg, cmap=Greys9, how='log')
# Format: add a black background
img = tf.set_background(img, 'black')
img
"A collection of perceptually accurate colormaps"
See: https://colorcet.holoviz.org/
# Step 3: Shade with "fire" color scale and "log" colormapping
img = tf.shade(agg, cmap=fire, how='log')
tf.set_background(img, 'black')
# Step 3: Shade with fire colormap and equalized histogram mapping
img = tf.shade(agg, cmap=fire, how='eq_hist')
tf.set_background(img, 'black')
img = tf.shade(agg, cmap=viridis, how='eq_hist')
img = tf.set_background(img, 'black')
img
Use the export_image() function.
from datashader.utils import export_image
export_image(img, 'usa_census_viridis')
Datashader can use different colors for different categories of data.
census_ddf.head()
| | easting | northing | race |
|---|---|---|---|
| 0 | -12418767.0 | 3697425.00 | h |
| 1 | -12418512.0 | 3697143.50 | h |
| 2 | -12418245.0 | 3697584.50 | h |
| 3 | -12417703.0 | 3697636.75 | w |
| 4 | -12418120.0 | 3697129.25 | h |
census_ddf['race'].value_counts().compute()
w    196052887
h     50317503
b     37643995
a     13914371
o      8746248
Name: race, dtype: int64
color_key = {"w": "aqua", "b": "lime", "a": "red", "h": "fuchsia", "o": "yellow"}
def create_census_image(longitude_range, latitude_range, w=plot_width, h=plot_height):
"""
A function for plotting the Census data, coloring pixel by race values.
"""
# Step 1: Calculate x and y range from lng/lat ranges
x_range, y_range = lnglat_to_meters(longitude_range, latitude_range)
# Step 2: Setup the canvas
canvas = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
# Step 3: Aggregate, but this time count the "race" category
# NEW: specify the aggregation method to count the "race" values in each pixel
agg = canvas.points(census_ddf, "easting", "northing", agg=ds.count_cat("race"))
# Step 4: Shade, using our custom color map
img = tf.shade(agg, color_key=color_key, how="eq_hist")
# Return image with black background
return tf.set_background(img, "black")
Color pixel values according to the demographics data in each pixel.
create_census_image(USA[0], USA[1])
create_census_image(Philly[0], Philly[1], w=600, h=600)
Hint: use the bounding boxes provided earlier to explore racial patterns across various major cities
create_census_image(NewYorkCity[0], NewYorkCity[1])
create_census_image(Atlanta[0], Atlanta[1])
create_census_image(LosAngeles[0], LosAngeles[1])
create_census_image(Houston[0], Houston[1])
create_census_image(Chicago[0], Chicago[1])
create_census_image(NewOrleans[0], NewOrleans[1])
We can use xarray to slice the array of aggregated pixel values to examine specific aspects of the data.
Use the sel() function of the xarray DataArray.
# Step 1: Setup canvas
cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height)
# Step 2: Aggregate and count race category
aggc = cvs.points(census_ddf, "easting", "northing", agg=ds.count_cat("race"))
# NEW: Select only African Americans (where "race" column is equal to "b")
agg_b = aggc.sel(race="b")
agg_b
<xarray.DataArray (northing: 525, easting: 900)>
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint32)
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
    race      <U1 'b'
# Step 3: Shade and set background
img = tf.shade(agg_b, cmap=fire, how="eq_hist")
img = tf.set_background(img, "black")
img
Goal: Select pixels where each race has a non-zero count.
bool_sel = aggc.sel(race=['w', 'b', 'a', 'h']) > 0
bool_sel
<xarray.DataArray (northing: 525, easting: 900, race: 4)>
array([[[False, False, False, False],
        [False, False, False, False],
        ...,
        [False, False, False, False]],
       ...,
       [[False, False, False, False],
        [False, False, False, False],
        ...,
        [False, False, False, False]]])
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
  * race      (race) <U1 'w' 'b' 'a' 'h'
# Do a "logical and" operation across the "race" dimension
# Pixels will be "True" if the pixel has a positive count for each race
diverse_selection = bool_sel.all(dim='race')
diverse_selection
<xarray.DataArray (northing: 525, easting: 900)>
array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
# Select the pixel values where our diverse selection criterion is True
agg2 = aggc.where(diverse_selection).fillna(0)
# and shade using our color key
img = tf.shade(agg2, color_key=color_key, how='eq_hist')
img = tf.set_background(img,"black")
img
# Select where the "b" race dimension is greater than the "w" race dimension
selection = aggc.sel(race='b') > aggc.sel(race='w')
selection
<xarray.DataArray (northing: 525, easting: 900)>
array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
# Select based on the selection criteria
agg3 = aggc.where(selection).fillna(0)
img = tf.shade(agg3, color_key=color_key, how="eq_hist")
img = tf.set_background(img, "black")
img