Week 6
Web Scraping

Oct 12, 2022

Housekeeping¶

  • Homework #3 (required) due on Monday (10/17)
  • Homework #4 (optional) assigned 10/17, due in two weeks
  • You must complete one of homeworks #4, #5, or #6
  • Final project due at the end of the finals period...more details coming soon

Week 6 agenda: web scraping¶

Last time:

  • Why web scraping?
  • Getting familiar with the Web
  • Web scraping: extracting data from static sites

Today:

  • Practice with web scraping
  • How to deal with dynamic content
In [1]:
# Start with the usual imports
# We'll use these throughout
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from bs4 import BeautifulSoup
import requests

Scraping: Adding the User-Agent Header¶

Many websites will reject a requests.get() function call if you do not specify the User-Agent header as part of your GET request. This lets the website identify who is making the GET request. You can find your browser's User-Agent value in the "Network" tab of your browser's developer tools. If you click on any request listed on this tab, and go to the "Headers" tab, you should see the "user-agent" value listed:

[Screenshot: the "user-agent" value shown in the Headers tab of the browser's developer tools]
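For example, the same page will often return a 403 Forbidden status for a bare request but a 200 OK once the header is set. A minimal sketch (whether the bare request is actually rejected depends on the site):

# Compare the status codes with and without the User-Agent header
import requests

url = "https://www.phila.gov/programs/coronavirus-disease-2019-covid-19/updates/"
user_agent = "Mozilla/5.0"  # any realistic browser string; copy yours from the Network tab

bare = requests.get(url)
with_header = requests.get(url, headers={"User-Agent": user_agent})

print(bare.status_code, with_header.status_code)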

Example: Let's get COVID-19 stats in Philadelphia¶

In [2]:
url = "https://www.phila.gov/programs/coronavirus-disease-2019-covid-19/updates/"

Get this from the browser:

In [3]:
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.37"
In [4]:
result = requests.get(url, headers={"User-Agent": user_agent}) # NEW: Specify the "User-Agent" header
soup = BeautifulSoup(result.content, "html.parser")

Get the average case count¶

Use the web inspector to identify the correct CSS selector (right click -> Inspect and then right click -> Copy -> Copy Selector)

In [5]:
selector = "#post-263624 > div.one-quarter-layout > div:nth-child(1) > div.medium-18.columns.pbxl > ul > li:nth-child(1)"

Select the element using the CSS selector and get the text:

In [6]:
avg = soup.select_one(selector).text
In [7]:
avg
Out[7]:
'Average new cases per day: 177'

Split the string into words:

In [8]:
words = avg.split()

words
Out[8]:
['Average', 'new', 'cases', 'per', 'day:', '177']

Get the last element and convert to an integer:

In [9]:
int(words[-1])
Out[9]:
177
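Putting the pieces together, a small helper function makes this reusable (a sketch; the function name is ours, and the selector is the same one copied from the Web Inspector, so it will break if the page layout changes):

def get_avg_case_count(url, selector, user_agent):
    """Scrape the page and return the average daily case count as an integer."""
    # Request the page with the User-Agent header and parse the HTML
    r = requests.get(url, headers={"User-Agent": user_agent})
    page = BeautifulSoup(r.content, "html.parser")

    # Grab the text of the matching element and keep the trailing number
    text = page.select_one(selector).text
    return int(text.split()[-1])

# Usage, with the url, selector, and user_agent defined above
get_avg_case_count(url, selector, user_agent)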

Get the last updated date¶

In [10]:
selector = "#post-263624 > div.one-quarter-layout > div:nth-child(1) > div.medium-18.columns.pbxl > p:nth-child(3) > em"
In [11]:
last_updated = soup.select_one(selector).text

last_updated
Out[11]:
'Cases last updated: October 4, 2022\nHospitalizations last updated: September 28, 2022'
In [12]:
print(last_updated)
Cases last updated: October 4, 2022
Hospitalizations last updated: September 28, 2022

Break into lines:

In [13]:
lines = last_updated.splitlines()

lines
Out[13]:
['Cases last updated: October 4, 2022',
 'Hospitalizations last updated: September 28, 2022']

Split by the colon:

In [14]:
lines[0].split(":")
Out[14]:
['Cases last updated', ' October 4, 2022']
In [15]:
last_updated_date = lines[0].split(":")[-1]

last_updated_date
Out[15]:
' October 4, 2022'

Convert to a timestamp:

In [16]:
timestamp = pd.to_datetime(last_updated_date)

timestamp
Out[16]:
Timestamp('2022-10-04 00:00:00')
In [17]:
timestamp.strftime("%B %-d, %Y")
Out[17]:
'October 4, 2022'
In [18]:
timestamp.strftime("%m/%d/%y")
Out[18]:
'10/04/22'

Part 1: Web scraping exercises¶

Even more: 101 Web Scraping Exercises

For each of the exercises, use the Web Inspector to inspect the structure of the relevant web page, and identify the HTML content you will need to scrape with Python.

1. The number of days until the General Election¶

  • Relevant URL: https://vote.phila.gov/

Hints:

  • Select the element that holds the number of days
  • You will need to specify the "User-Agent" header, otherwise you will get a 403 Forbidden error
In [52]:
# Initialize the soup for this page
url = "https://vote.phila.gov"
r = requests.get(
    url,
    headers={"User-Agent": user_agent},
)  # Add the user-agent

soup2 = BeautifulSoup(r.content, "html.parser")
In [54]:
# Select the h1 element holding the number of days
# You can get this via the Web Inspector
selector = ".day-count"
In [55]:
# Select the element (use select_one to get only the first match)
element = soup2.select_one(selector)
In [56]:
# Get the text of the element
days = element.text
print("Raw days value = ", days)
Raw days value =  26
In [57]:
# Convert to float
days = float(days)
print(f"Number of days until General Election = {days}")
Number of days until General Election = 26.0
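If the element's text ever contains more than just the number (extra labels or whitespace), a regular expression is a more defensive way to pull out the digits. A sketch, assuming the count is the first run of digits in the element's text:

import re

# Find the first run of digits in the element's text and convert to an integer
match = re.search(r"\d+", element.text)
if match is not None:
    days = int(match.group())
    print(f"Number of days until General Election = {days}")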

2. Philadelphia City Council¶

A number of councilmembers have resigned in order to run for mayor in the spring. Let's find out how many seats are on Council and how many are currently vacant!

Determine two things:

  • The total number of City Council seats
  • The total number of vacant City Council seats

Relevant URL: https://phlcouncil.com/council-members/

Hints:

  • You will need to specify the "User-Agent" header, otherwise you will get a 403 Forbidden error
  • The cards on the page flip, and the Councilmember names are listed on the front AND the back. The front and back content are separate; you should see "div" elements with "front" and "back" classes.
  • When creating your CSS selector, use a nested selector that first selects the front content and then selects the name displayed on the card
In [58]:
# Make the request
url = "https://phlcouncil.com/council-members/"
r = requests.get(
    url, headers={"User-Agent": user_agent}
)  # NOTE: include the user agent! Otherwise you get a 403 Forbidden error


# Parse the html
soup3 = BeautifulSoup(r.content, "html.parser")

If you select just .x-face-title, you get duplicates from the front and the back of each card!

In [61]:
soup3.select(".x-face-title")
Out[61]:
[<h4 class="x-face-title"><strong>Darrell L. Clarke</strong></h4>,
 <h4 class="x-face-title">Council President Darrell L. Clarke</h4>,
 <h4 class="x-face-title"><strong>Mark Squilla</strong></h4>,
 <h4 class="x-face-title">Mark Squilla</h4>,
 <h4 class="x-face-title"><strong>Kenyatta Johnson</strong></h4>,
 <h4 class="x-face-title">Kenyatta Johnson</h4>,
 <h4 class="x-face-title"><strong>Jamie Gauthier</strong></h4>,
 <h4 class="x-face-title">Jamie Gauthier</h4>,
 <h4 class="x-face-title"><strong>Curtis Jones, Jr.</strong></h4>,
 <h4 class="x-face-title">Curtis Jones, Jr.</h4>,
 <h4 class="x-face-title"><strong>Michael Driscoll</strong></h4>,
 <h4 class="x-face-title">MICHAEL DRISCOLL</h4>,
 <h4 class="x-face-title"><strong>Vacant</strong></h4>,
 <h4 class="x-face-title">Vacant</h4>,
 <h4 class="x-face-title"><strong>Cindy Bass</strong></h4>,
 <h4 class="x-face-title">Cindy Bass</h4>,
 <h4 class="x-face-title"><strong>vacant</strong></h4>,
 <h4 class="x-face-title">vacant</h4>,
 <h4 class="x-face-title"><strong>Brian J. O’Neill</strong></h4>,
 <h4 class="x-face-title">Brian J. O’Neill</h4>,
 <h4 class="x-face-title"><strong>Kendra Brooks</strong></h4>,
 <h4 class="x-face-title">Kendra Brooks</h4>,
 <h4 class="x-face-title"><strong>VACANt</strong></h4>,
 <h4 class="x-face-title">Vacant</h4>,
 <h4 class="x-face-title"><strong>Vacant</strong></h4>,
 <h4 class="x-face-title">Vacant</h4>,
 <h4 class="x-face-title"><strong>Katherine Gilmore Richardson</strong></h4>,
 <h4 class="x-face-title">Katherine Gilmore Richardson</h4>,
 <h4 class="x-face-title"><strong>Helen Gym</strong></h4>,
 <h4 class="x-face-title">Helen Gym</h4>,
 <h4 class="x-face-title"><strong>David Oh</strong></h4>,
 <h4 class="x-face-title">David Oh</h4>,
 <h4 class="x-face-title"><strong>Isaiah Thomas</strong></h4>,
 <h4 class="x-face-title">Isaiah Thomas</h4>]

Add the ".x-face-outer.front" classes to select just the x-face-title elements on the front of the card!

In [62]:
selector = '.x-face-outer.front .x-face-title'
In [63]:
name_elements = soup3.select(selector)
In [64]:
name_elements
Out[64]:
[<h4 class="x-face-title"><strong>Darrell L. Clarke</strong></h4>,
 <h4 class="x-face-title"><strong>Mark Squilla</strong></h4>,
 <h4 class="x-face-title"><strong>Kenyatta Johnson</strong></h4>,
 <h4 class="x-face-title"><strong>Jamie Gauthier</strong></h4>,
 <h4 class="x-face-title"><strong>Curtis Jones, Jr.</strong></h4>,
 <h4 class="x-face-title"><strong>Michael Driscoll</strong></h4>,
 <h4 class="x-face-title"><strong>Vacant</strong></h4>,
 <h4 class="x-face-title"><strong>Cindy Bass</strong></h4>,
 <h4 class="x-face-title"><strong>vacant</strong></h4>,
 <h4 class="x-face-title"><strong>Brian J. O’Neill</strong></h4>,
 <h4 class="x-face-title"><strong>Kendra Brooks</strong></h4>,
 <h4 class="x-face-title"><strong>VACANt</strong></h4>,
 <h4 class="x-face-title"><strong>Vacant</strong></h4>,
 <h4 class="x-face-title"><strong>Katherine Gilmore Richardson</strong></h4>,
 <h4 class="x-face-title"><strong>Helen Gym</strong></h4>,
 <h4 class="x-face-title"><strong>David Oh</strong></h4>,
 <h4 class="x-face-title"><strong>Isaiah Thomas</strong></h4>]
In [65]:
print(f"Total number of city councilmembers is {len(name_elements)}")
Total number of city councilmembers is 17
In [66]:
names = [el.text.strip().lower() for el in name_elements]
In [67]:
names
Out[67]:
['darrell l. clarke',
 'mark squilla',
 'kenyatta johnson',
 'jamie gauthier',
 'curtis jones, jr.',
 'michael driscoll',
 'vacant',
 'cindy bass',
 'vacant',
 'brian j. o’neill',
 'kendra brooks',
 'vacant',
 'vacant',
 'katherine gilmore richardson',
 'helen gym',
 'david oh',
 'isaiah thomas']

Find which names equal "vacant":

In [68]:
[name == 'vacant' for name in names]
Out[68]:
[False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False]

Count the number of vacants!

In [69]:
sum([name == 'vacant' for name in names])
Out[69]:
4
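Equivalently, names.count("vacant") returns the same count, since the names were already lower-cased above.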

Derek Green, Maria Quiñones-Sánchez, Allan Domb, and Cherelle Parker have all resigned with the expectation that they will run for mayor in the spring primary.

More: https://www.inquirer.com/politics/philadelphia/philadelphia-city-council-fills-open-seats-allan-domb-20220816.html

3. Food inspections in Philadelphia¶

Extract the following:

  • the names and number of violations per inspection for food-borne risk factors (as a DataFrame)
  • the total number of violations

Note: we are looking for food-borne violations only; not all of the restaurants on the page will have them listed

Relevant URL: http://data.inquirer.com/inspections

In [70]:
# Parse the HTML
url = "http://data.inquirer.com/inspections/"
soup4 = BeautifulSoup(requests.get(url).content, 'html.parser')
In [71]:
# This will select all rows of the table
rows = soup4.select(".inspectionUnitInner")

len(rows)
Out[71]:
50
In [73]:
# The first row
rows[0]
Out[73]:
<div class="inspectionUnitInner"><div class="inspectionNameWrapper"><div class="inspectionUnitName transitionAll">7-Eleven #2408-35275J</div><div class="inspectionUnitDate"><span class="inspectionUnitDateTitle">Inspection date:</span> Oct 14, 2022</div><div class="clearAll"></div></div><div class="inspectionUnitInfoWrapper"><div class="inspectionUnitAddress">1084 N DELAWARE AVE 19125</div><div class="inspectionUnitNeigborhood"></div><div class="clearAll"></div></div><div class="inspectionUnitCountWrapper"><span class="inspectionCountLabel">Violations</span><li class="inspectionUnitCount inspectionUnitCountFoodborne inspectionUnitCountFirst"><span class="inspectionCountNumber">4</span><span class="inspectionUnitInfoItemTitle"><span class="inspectionUnitInfoItemTitleLabel">Foodborne Illness Risk Factors</span></span></li><li class="inspectionUnitCount inspectionUnitCountRetail"><span class="inspectionCountNumber">7</span><span class="inspectionUnitInfoItemTitle"><span class="inspectionUnitInfoItemTitleLabel">Lack of Good Retail Practices</span></span></li><div class="clearAll"></div></div><div class="clearAll"></div></div>
In [74]:
# Keep track of the restaurant names and violations
names = []
violations = []

# Loop over each row
for row in rows:
    
    # The name of the restaurant
    name_tag = row.select_one(".inspectionUnitName")
    name = name_tag.text
    
    # The number of foodborne violations
    count = row.select_one(".inspectionUnitCountFoodborne .inspectionCountNumber")
    
    # Only save it if count was listed (0 violations will show up as None)
    if count is not None:
        names.append(name)
        violations.append(int(count.text))

df = pd.DataFrame({"name" : names, "violations" : violations})

df.sort_values("violations", ascending=False)
Out[74]:
name violations
7 Cerda Grocery Inc. 7
8 Fairmount Pizzeria 6
0 7-Eleven #2408-35275J 4
1 B&R Grocery 4
10 Care to Learn Child Development Center 3
15 Girard Neighborhood Food Market 3
13 Delicias Meat & Produce 3
12 Delianny Mini Market 3
11 Cousin's Fresh Market 3 3
14 G & J 1526 Tasker Grocery 3
9 Aid For Friends 3
2 Great Valu 2
18 Cabrera,Javier/Tacos La Charreada Inc/V07250 2
17 Brunch N 2
16 Abi's Bargain Outlet 2
19 Charles Audenreid Charter Private School 2
3 52 Kings Food Market Inc 1
4 Gilbert Spruance School 1
5 Haydee Mini Market 1
6 James J. Sullivan Elementary School 1
20 3J's Food Market 1
21 A S Jenks School 1
22 AM Deli Grocery II Inc. 1
23 Burger Fi 1
24 Discount Store 2 1
25 Germantown Home 1
26 Holy Cross Parish School 1
27 Julia DeBurgos Bilingual School 1
28 Knorr Street Shoprite Inc 1
In [76]:
print("total number of foodborne violations = ", df['violations'].sum())
total number of foodborne violations =  65
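The same pattern works for the second violation category on each card. A sketch that also pulls the "Lack of Good Retail Practices" counts, using the .inspectionUnitCountRetail class visible in the row HTML above:

# Keep track of the restaurant names and retail-practice violations
retail_names = []
retail_violations = []

for row in rows:
    # The name of the restaurant
    name = row.select_one(".inspectionUnitName").text

    # The number of retail-practice violations (None if not listed)
    count = row.select_one(".inspectionUnitCountRetail .inspectionCountNumber")

    if count is not None:
        retail_names.append(name)
        retail_violations.append(int(count.text))

retail_df = pd.DataFrame({"name": retail_names, "violations": retail_violations})
retail_df.sort_values("violations", ascending=False)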

Part 2: What about dynamic content?¶

How do you scrape data that only appears after user interaction?

Selenium¶

Note: web browser needed¶

You'll need a web browser installed to use selenium, e.g., Firefox, Google Chrome, Microsoft Edge, etc.

Selenium¶

  • Designed as a framework for testing webpages during development
  • Provides an interface to interact with webpages just as a user would
  • Becoming increasingly popular for web scraping dynamic content from pages

Best by example: Scraping the Philadelphia Municipal Courts portal¶

  • URL: https://ujsportal.pacourts.us/CaseSearch
  • Given a Police incident number, we'll see if there is an associated court case with the incident

Selenium will open a web browser and load the page; the browser will then respond to the commands issued by selenium

In [77]:
# Import the webdriver from selenium
from selenium import webdriver

Initialize the driver¶

The initialization steps will depend on which browser you want to use!

Important: Working on Binder¶

If you are working on Binder, you'll need to use Firefox in "headless" mode, which prevents a browser window from opening.

If you are working locally, it's better to run with the default options — you'll be able to see the browser window open and change as we perform the web scraping.

Using Google Chrome¶

In [90]:
# UNCOMMENT BELOW TO USE CHROME

#from webdriver_manager.chrome import ChromeDriverManager
#from selenium.webdriver.chrome.service import Service


#driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
[WDM] - Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7.52M/7.52M [00:00<00:00, 24.0MB/s]
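If you are working locally with Chrome but don't want a browser window to open, Chrome also supports a "headless" mode. A sketch (kept commented out like the cells above; it assumes Chrome and webdriver_manager are installed):

# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.chrome.service import Service

# options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # no browser window is opened

# driver = webdriver.Chrome(
#     service=Service(ChromeDriverManager().install()), options=options
# )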

Using Firefox¶

If you are working on Binder, use the below code!

In [53]:
# UNCOMMENT BELOW IF ON BINDER

# from webdriver_manager.firefox import GeckoDriverManager
# from selenium.webdriver.firefox.service import Service

# options = webdriver.FirefoxOptions()

# IF ON BINDER, RUN IN "HEADLESS" MODE (NO BROWSER WINDOW IS OPENED)
# COMMENT THIS LINE IF WORKING LOCALLY
# options.add_argument("--headless")

# Initialize
# driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)

Using Microsoft Edge¶

In [51]:
# UNCOMMENT BELOW TO USE MICROSOFT EDGE

# from webdriver_manager.microsoft import EdgeChromiumDriverManager
# from selenium.webdriver.edge.service import Service

# driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))

Run the scraping analysis¶

Strategy:

  • Rely on the Web Inspector to identify specific elements of the webpage
  • Use Selenium to interact with the webpage
    • Change dropdown elements
    • Click buttons

1. Open the URL¶

In [91]:
# Open the URL
url = "https://ujsportal.pacourts.us/CaseSearch"
driver.get(url)

2. Create a dropdown "Select" element¶

We'll need to:

  • Select the dropdown element on the main page by its ID
  • Initialize a selenium Select() object
In [92]:
# Use the Web Inspector to get the css selector of the dropdown select element
dropdown_selector = "#SearchBy-Control > select"
In [93]:
from selenium.webdriver.common.by import By

# Select the dropdown by the element's CSS selector
dropdown = driver.find_element(By.CSS_SELECTOR, dropdown_selector)
In [94]:
from selenium.webdriver.support.ui import Select

# Initialize a Select object
dropdown_select = Select(dropdown)

3. Change the selected text in the dropdown¶

Change the selected element: "Police Incident/Complaint Number"

In [95]:
# Set the selected text in the dropdown element
dropdown_select.select_by_visible_text("Incident Number")

4. Set the incident number¶

In [96]:
# Get the input element for the DC number
incident_input_selector = "#IncidentNumber-Control > input"
incident_input = driver.find_element(By.CSS_SELECTOR, incident_input_selector)
In [97]:
# Clear any existing entry
incident_input.clear()

# Input our example incident number
incident_input.send_keys("1725088232")

5. Click the search button!¶

In [98]:
# Submit the search
search_button_id = "btnSearch"
driver.find_element(By.ID, search_button_id).click()
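Because the results are loaded dynamically, the table can take a moment to appear after the click. If you run into timing issues, you can tell Selenium to wait explicitly for the results element before reading the page (a sketch; it assumes the results table keeps the "caseSearchResultGrid" ID we use below):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the results table to be present on the page
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "caseSearchResultGrid"))
)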

6. Use BeautifulSoup to parse the results¶

  • Use the page_source attribute to get the current HTML displayed on the page
  • Initialize a "soup" object with the HTML
In [101]:
courtsSoup = BeautifulSoup(driver.page_source, "html.parser")
  • Identify the element holding all of the results
  • Within this container, find the <table> element and each <tr> element within the table
In [102]:
# Select the results container by its ID 
results_table = courtsSoup.select_one("#caseSearchResultGrid")
In [103]:
# Get all of the <tr> rows inside the tbody element 
# NOTE: we are using nested selections here!
results_rows = results_table.select("tbody > tr")

Example: The number of court cases

In [104]:
# Number of court cases
number_of_cases = len(results_rows)
print(f"Number of courts cases: {number_of_cases}")
Number of courts cases: 2

Example: Extract the text elements from the first row of the results

In [105]:
first_row = results_rows[0]
In [106]:
print(first_row.prettify())
<tr class="slide-active">
 <td class="display-none">
  1
 </td>
 <td class="display-none">
  0
 </td>
 <td>
  MC-51-CR-0030672-2017
 </td>
 <td>
  Common Pleas
 </td>
 <td>
  Comm. v. Velquez, Victor
 </td>
 <td>
  Closed
 </td>
 <td>
  10/13/2017
 </td>
 <td>
  Velquez, Victor
 </td>
 <td>
  09/05/1974
 </td>
 <td>
  Philadelphia
 </td>
 <td>
  MC-01-51-Crim
 </td>
 <td>
  U0981035
 </td>
 <td>
  1725088232-0030672
 </td>
 <td>
  1725088232
 </td>
 <td class="display-none">
 </td>
 <td class="display-none">
 </td>
 <td class="display-none">
 </td>
 <td class="display-none">
 </td>
 <td>
  <div class="grid inline-block">
   <div>
    <div class="inline-block">
     <a class="icon-wrapper" href="/Report/CpDocketSheet?docketNumber=MC-51-CR-0030672-2017&amp;dnh=%2FGgePQykMpAymRENgxLBzg%3D%3D" target="_blank">
      <img alt="Docket Sheet" class="icon-size" src="https://ujsportal.pacourts.us/resource/Images/svg-defs.svg?v=3-Me4WMBYQPCgs0IdgGyzeTEx_qd5uveL0qyDZoiHPM#icon-document-letter-D" title="Docket Sheet"/>
      <label class="link-text">
       Docket Sheet
      </label>
     </a>
    </div>
   </div>
  </div>
  <div class="grid inline-block">
   <div>
    <div class="inline-block">
     <a class="icon-wrapper" href="/Report/CpCourtSummary?docketNumber=MC-51-CR-0030672-2017&amp;dnh=%2FGgePQykMpAymRENgxLBzg%3D%3D" target="_blank">
      <img alt="Court Summary" class="icon-size" src="https://ujsportal.pacourts.us/resource/Images/svg-defs.svg?v=3-Me4WMBYQPCgs0IdgGyzeTEx_qd5uveL0qyDZoiHPM#icon-court-summary" title="Court Summary"/>
      <label class="link-text">
       Court Summary
      </label>
     </a>
    </div>
   </div>
  </div>
 </td>
</tr>

In [107]:
# Extract out all of the "<td>" cells from the first row
td_cells = first_row.select("td")

# Loop over each <td> cell
for cell in td_cells:
    
    # Extract out the text from the <td> element
    text = cell.text
    
    # Print out text
    if text != "":
        print(text)
1
0
MC-51-CR-0030672-2017
Common Pleas
Comm. v. Velquez, Victor
Closed
10/13/2017
Velquez, Victor
09/05/1974
Philadelphia
MC-01-51-Crim
U0981035
1725088232-0030672
1725088232
Docket SheetCourt Summary
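To work with all of the results at once, you can collect the cell text from every row into a DataFrame (a sketch; the columns are left unnamed here because the header cells live in the table's separate <thead> element):

# Collect the text of every <td> cell in every result row
data = []
for row in results_rows:
    data.append([td.text for td in row.select("td")])

cases = pd.DataFrame(data)
cases.head()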

7. Close the driver!¶

In [108]:
driver.close()
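Note: driver.close() closes the current browser window; if you want to shut down the entire browser session (all windows plus the driver process), use driver.quit() instead.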

Part 3: Automated "git scraping"¶


Coined by Simon Willison in this blog post

Example: @PHLHomicides¶

The current YTD homicide total is updated daily on the Police Department's website

[Screenshot: the current YTD homicide total on the Police Department's website]

Data is scraped daily, saved to a CSV file, and added to a Github repository

[Screenshot: the scraped CSV file committed to the Github repository]

Data is then tweeted daily, providing an easily accessible record of homicides over time

[Screenshot: an example daily tweet from @PHLHomicides]

Source code is available on Github at nickhand/phl-homicide-bot

Example: Building a Twitter bot for COVID-19 stats 🤖¶

Key features:

  • Web scraping
  • Twitter API
  • Automation

Example repo available at: https://github.com/MUSA-550-Fall-2022/covid-stats-bot

What it does¶

  1. Scrape the COVID case count from phila.gov using the same code as the first example today
  2. Check if data is newer than the latest saved data
  3. If it is, send a tweet with the info and update the saved CSV file

Use Github Actions to run this workflow once a day
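A minimal sketch of steps 2 and 3, assuming a hypothetical data.csv with "date" and "cases" columns, a hypothetical scrape_covid_stats() helper that wraps the scraping code from the first example, and a hypothetical send_tweet() helper that wraps the Twitter API (the bot's actual code is in the repo linked above):

# Scrape the latest values (hypothetical helper wrapping the scraping code above)
new_date, new_cases = scrape_covid_stats()

# Load the previously saved data (hypothetical CSV with "date" and "cases" columns)
saved = pd.read_csv("data.csv", parse_dates=["date"])

# Only save and tweet if the scraped date is newer than the latest saved date
if new_date > saved["date"].max():
    new_row = pd.DataFrame({"date": [new_date], "cases": [new_cases]})
    pd.concat([saved, new_row]).to_csv("data.csv", index=False)

    # Hypothetical helper that posts the update via the Twitter API
    send_tweet(f"Average new cases per day: {new_cases}")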

Github Actions¶

  • Lots of good documentation to get you up and running quickly: https://docs.github.com/en/actions
  • Allows you to run a pre-defined set of steps (including a Python script) on a set schedule (daily, weekly, etc.)
  • Generous time/CPU limits as long as your repo is public

Github Actions¶

  • Scheduled tasks are run via a workflow '.yml' file — these are the instructions!
  • See the example file in the repo's .github/workflows folder

[Screenshot: the workflow .yml file in the .github/workflows folder]

Github secrets¶

  • If you have API credentials (such as those for Twitter) you should never commit them to Github directly
  • Instead, store them as secrets in the repository
  • Go to settings -> secrets -> new repository secret

[Screenshot: adding a new repository secret in the Github settings]

This will allow you to pass your Twitter API credentials to tweepy without compromising security or storing them in plaintext on Github!
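In the workflow file, each secret is typically exposed to the Python script as an environment variable, which the script then reads with os.environ. A sketch using tweepy's Client (the variable names are hypothetical; use whatever you named your secrets):

import os
import tweepy

# Read the credentials from environment variables set by the Github Actions workflow
# (the names below are hypothetical)
client = tweepy.Client(
    consumer_key=os.environ["TWITTER_API_KEY"],
    consumer_secret=os.environ["TWITTER_API_SECRET"],
    access_token=os.environ["TWITTER_ACCESS_TOKEN"],
    access_token_secret=os.environ["TWITTER_ACCESS_TOKEN_SECRET"],
)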

The final bot¶

Data is tracked and updated over time in data.csv

[Screenshot: the data.csv file tracked in the repository]

Info is also tweeted each time it is updated!

[Screenshot: an example tweet from the COVID stats bot]

That's it!¶

  • Next week: working with "big" data
  • See you on Monday!