Week 5B
Getting Data Part 1: Working with APIs

Oct 5, 2022

Week #5 Agenda

Last time:

  • Introduction to APIs
  • Pulling census data and shape files using Python

Today:

  • Using the Twitter API
    • Plotting geocoded tweets
    • Word frequencies
    • Sentiment analysis
In [135]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point

from matplotlib import pyplot as plt
import seaborn as sns

import time 
import hvplot.pandas
import holoviews as hv
In [2]:
import esri2gpd
import carto2gpd
import cenpy
In [3]:
pd.options.display.max_columns = 999

API Example #2: the Twitter API¶

Twitter provides a rich source of information, but the challenge is how to extract the information from semi-structured data.

Semi-structured data¶

Data that contains some elements that cannot be easily consumed by computers

Examples: human-readable text, audio, images, etc.

Key challenges¶

  • Text mining: analyzing blocks of text to extract the relevant pieces of information
  • Natural language processing (NLP): programming computers to process and analyze human languages
  • Sentiment analysis: analyzing blocks of text to derive the attitude or emotional state of the person

Getting a developer account¶

Twitter recently updated to v2 of its API, introducing new features and access levels. We'll need to apply for "developer" access and create a new app to interface with the API.

Step 1: Make a Twitter account¶

Step 2: Apply for Developer access¶

Sign up at: https://developer.twitter.com/en/portal/petition/essential/basic-info

You will need to apply for a Developer Account, answer a few questions, and then confirm your email address.

Step 3: Create a new project and app¶

https://developer.twitter.com/en/portal/projects/new

You will need to describe your project use case. I used something like this:

I'm using Twitter's API to perform a sentiment analysis as part of a class teaching Python. I will be interacting with the API using the Python package tweepy.

Step 4: Edit your "User Authentication Settings"¶

  • If you'd like to be able to post tweets through the API, specify the read and write permissions
  • For the questions regarding app websites, I used "https://twitter.com/home"

Step 5: Create your API keys¶

In the "Keys and Tokens" section of your app, generate new access tokens and save them somewhere. You will need the following:

  • API key and API key secret
  • Access token and access token secret
  • Bearer token
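
Tip: keep these tokens out of your notebooks and any code you share. One common pattern (a sketch only; the environment variable names below are just a suggestion) is to export them as environment variables and read them with os.environ:

import os

# Read the tokens from environment variables set in your shell
# (or a local file that you never commit to version control)
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "")
API_KEY = os.environ.get("TWITTER_API_KEY", "")
API_KEY_SECRET = os.environ.get("TWITTER_API_KEY_SECRET", "")
ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN", "")
ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")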

Twitter v2 Access Levels¶

The standard, free API ("Essential") allows you to search tweets from the past seven days, while academic research access allows you to search the full archive of tweets.

More info about v2 access levels

A note about academic access¶

As listed on the Twitter docs, you should qualify under the following use case: "Affiliated with an academic institution AND have a clearly defined project".

However, last year, we had mixed success getting students approved for academic access to the Twitter API, with seemingly arbitrary denials from Twitter for some students.

A couple of issues we ran into:

  1. Twitter requires a well-defined research project description to apply.
  2. You also need a link to a public student directory website or a Google Scholar page, so that Twitter can verify your identity.

My recommendation is that if you'd like to use the Twitter API for the final project, then you should wait to apply for academic access until your project proposal is finished.

You can apply for academic access using this page. More information about the application process is available at this link.

Tweepy: a Python interface to Twitter¶

https://tweepy.readthedocs.io

In [5]:
import tweepy as tw

Define your API keys¶

In [49]:
# INPUT YOUR API TOKENS HERE
BEARER_TOKEN = ""
API_KEY = ""
API_KEY_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""

Initialize an API object¶

We need to:

  • set up authentication
  • initialize a tweepy.Client object
In [50]:
# Initialize the API client and tell tweepy to wait if rate limit is met
client = tw.Client(
    bearer_token=BEARER_TOKEN,
    consumer_key=API_KEY,
    consumer_secret=API_KEY_SECRET,
    access_token=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET,
    wait_on_rate_limit=True,
)

Rate Limits¶

Be careful: With the free API, you are allowed 180 API requests every 15 minutes.

See the v2 Rate Limits documentation for more details.

What does wait_on_rate_limit do?¶

If you run into a rate limit while pulling tweets, this tells tweepy to pause until the 15-minute rate-limit window resets and then continue.

Unfortunately, you need to sign up (and pay) for the premium API to avoid these rate limits.
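
Under the hood, wait_on_rate_limit is roughly equivalent to catching the rate-limit error yourself and sleeping until the window resets. A rough sketch of the manual version (assuming the tweepy.TooManyRequests exception available in recent tweepy 4.x releases):

import time
import tweepy as tw

try:
    tweets = client.search_recent_tweets(query="#phillies", max_results=100)
except tw.TooManyRequests:
    # Hit the rate limit: sleep out the 15-minute window, then retry once
    time.sleep(15 * 60)
    tweets = client.search_recent_tweets(query="#phillies", max_results=100)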

Several different APIs available¶

Including user tweets, mentions, searching keywords, favoriting, direct messages, and more...

See the Tweepy API documentation for tweepy.Client.

You can also stream tweets¶

You can set up a listener to listen for new tweets and download them in real time (subject to rate limits).

We won't focus on this, but there is a nice tutorial on the Tweepy documentation.
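
For reference, a rough sketch of what a streaming listener can look like, based on the tweepy.StreamingClient interface in recent 4.x releases (treat this as a starting point, not a tested recipe):

import tweepy as tw

class PhilliesListener(tw.StreamingClient):
    """Print each matching tweet as it arrives."""

    def on_tweet(self, tweet):
        print(tweet.text)

# stream = PhilliesListener(BEARER_TOKEN)
# stream.add_rules(tw.StreamRule("#phillies"))
# stream.filter()  # runs until interrupted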

You can also tweet (if you want!)¶

You can post tweets using the create_tweet() function. For example:

tweet = 'Hello World!'
client.create_tweet(text=tweet)
In [52]:
#tweet = 'Hello World!'
#client.create_tweet(text=tweet)

The Twitter Search API¶

There are two search API endpoints:

  1. client.search_recent_tweets() docs: Search recent tweets within the last 7 days,
  2. client.search_all_tweets() docs: Search all historical tweets going back to 2006.

For this lecture, we'll use the client.search_recent_tweets() function since we are limited to the free API. If you have the academic access level, you can use the client.search_all_tweets() endpoint instead. In that case, you can re-run this whole lecture and just substitute the function and everything will still work!

1. Getting tweets from the past 7 days¶

You will have to pass it a search query using the query keyword to specify the data that you are looking for. In the example below, we are searching for tweets with the "#phillies" hashtag.

By default, only the text and tweet id are returned by Tweepy. You can request more fields using the tweet_fields keyword. Available attributes are listed on the documentation for the v2 data dictionary. In the example below, we add the "created_at" and "public_metrics" fields.

By default, a request returns 10 tweets. If you want more than 10 tweets per request, you can specify that using the max_results parameter. The maximum Tweets per request is 100.

In [92]:
client.search_recent_tweets?
Signature: client.search_recent_tweets(query, *, user_auth=False, **params)
Docstring:
search_recent_tweets(
    query, *, end_time=None, expansions=None, max_results=None,
    media_fields=None, next_token=None, place_fields=None,
    poll_fields=None, since_id=None, sort_order=None,
    start_time=None, tweet_fields=None, until_id=None,
    user_fields=None, user_auth=False
)

The recent search endpoint returns Tweets from the last seven days that
match a search query.

The Tweets returned by this endpoint count towards the Project-level
`Tweet cap`_.

.. versionchanged:: 4.6
    Added ``sort_order`` parameter

Parameters
----------
query : str
    One rule for matching Tweets. If you are using a
    `Standard Project`_ at the Basic `access level`_, you can use the
    basic set of `operators`_ and can make queries up to 512 characters
    long. If you are using an `Academic Research Project`_ at the Basic
    access level, you can use all available operators and can make
    queries up to 1,024 characters long.
end_time : datetime.datetime | str | None
    YYYY-MM-DDTHH:mm:ssZ (ISO 8601/RFC 3339). The newest, most recent
    UTC timestamp to which the Tweets will be provided. Timestamp is in
    second granularity and is exclusive (for example, 12:00:01 excludes
    the first second of the minute). By default, a request will return
    Tweets from as recent as 30 seconds ago if you do not include this
    parameter.
expansions : list[str] | str | None
    :ref:`expansions_parameter`
max_results : int | None
    The maximum number of search results to be returned by a request. A
    number between 10 and 100. By default, a request response will
    return 10 results.
media_fields : list[str] | str | None
    :ref:`media_fields_parameter`
next_token : str | None
    This parameter is used to get the next 'page' of results. The value
    used with the parameter is pulled directly from the response
    provided by the API, and should not be modified.
place_fields : list[str] | str | None
    :ref:`place_fields_parameter`
poll_fields : list[str] | str | None
    :ref:`poll_fields_parameter`
since_id : int | str | None
    Returns results with a Tweet ID greater than (that is, more recent
    than) the specified ID. The ID specified is exclusive and responses
    will not include it. If included with the same request as a
    ``start_time`` parameter, only ``since_id`` will be used.
sort_order : str | None
    This parameter is used to specify the order in which you want the
    Tweets returned. By default, a request will return the most recent
    Tweets first (sorted by recency).
start_time : datetime.datetime | str | None
    YYYY-MM-DDTHH:mm:ssZ (ISO 8601/RFC 3339). The oldest UTC timestamp
    (from most recent seven days) from which the Tweets will be
    provided. Timestamp is in second granularity and is inclusive (for
    example, 12:00:01 includes the first second of the minute). If
    included with the same request as a ``since_id`` parameter, only
    ``since_id`` will be used. By default, a request will return Tweets
    from up to seven days ago if you do not include this parameter.
tweet_fields : list[str] | str | None
    :ref:`tweet_fields_parameter`
until_id : int | str | None
    Returns results with a Tweet ID less than (that is, older than) the
    specified ID. The ID specified is exclusive and responses will not
    include it.
user_fields : list[str] | str | None
    :ref:`user_fields_parameter`
user_auth : bool
    Whether or not to use OAuth 1.0a User Context to authenticate

Returns
-------
dict | requests.Response | Response

References
----------
https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent

.. _Tweet cap: https://developer.twitter.com/en/docs/projects/overview#tweet-cap
.. _Standard Project: https://developer.twitter.com/en/docs/projects
.. _access level: https://developer.twitter.com/en/products/twitter-api/early-access/guide.html#na_1
.. _operators: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
.. _Academic Research Project: https://developer.twitter.com/en/docs/projects
File:      ~/mambaforge/envs/musa-550-fall-2022/lib/python3.9/site-packages/tweepy/client.py
Type:      method
In [63]:
# Collect tweets related to the phillies
search_words = "#phillies"

# Get tweets from the past 7 days
tweets = client.search_recent_tweets(
    query=search_words, tweet_fields=["created_at", "public_metrics"], max_results=100
)

The "data" attribute contains the tweet information.

In [64]:
len(tweets.data)
Out[64]:
100

Let's get the first tweet.

In [73]:
first_tweet = tweets.data[0]
In [74]:
first_tweet
Out[74]:
<Tweet id=1577736998764396549 text='RT @ScottLauber: #Phillies-Cardinals start times, all times Eastern:\nGame 1: Friday, 2:07 p.m.\nGame 2: Saturday, 8:37 p.m.\nGame 3, if neces…'>

Lots of data is available, but you have to specify it in the "tweet_fields" keyword!

In [ ]:
# Tab complete to see available attributes
# first_tweet.
In [127]:
dict(first_tweet)
Out[127]:
{'id': 1577736998764396549,
 'text': 'RT @ScottLauber: #Phillies-Cardinals start times, all times Eastern:\nGame 1: Friday, 2:07 p.m.\nGame 2: Saturday, 8:37 p.m.\nGame 3, if neces…',
 'edit_history_tweet_ids': ['1577736998764396549'],
 'public_metrics': {'retweet_count': 8,
  'reply_count': 0,
  'like_count': 0,
  'quote_count': 0},
 'created_at': datetime.datetime(2022, 10, 5, 19, 6, 28, tzinfo=datetime.timezone.utc)}

The text:

In [84]:
first_tweet.text
Out[84]:
'RT @ScottLauber: #Phillies-Cardinals start times, all times Eastern:\nGame 1: Friday, 2:07 p.m.\nGame 2: Saturday, 8:37 p.m.\nGame 3, if neces…'

The tweet time:

In [87]:
first_tweet.created_at
Out[87]:
datetime.datetime(2022, 10, 5, 19, 6, 28, tzinfo=datetime.timezone.utc)

Public metrics:

In [76]:
first_tweet.public_metrics
Out[76]:
{'retweet_count': 8, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}
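
Once you know which fields you requested, it's often convenient to flatten the tweets into a DataFrame for analysis. A minimal sketch using the fields requested above:

# Flatten the requested fields into a pandas DataFrame
tweets_df = pd.DataFrame(
    [
        {
            "id": tweet.id,
            "text": tweet.text,
            "created_at": tweet.created_at,
            "retweets": tweet.public_metrics["retweet_count"],
            "likes": tweet.public_metrics["like_count"],
        }
        for tweet in tweets.data
    ]
)

tweets_df.head()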

2. Getting geographic data¶

We need to tell tweepy to give us more info about each geographic "place" using the place_fields and expansions keywords.

In [102]:
client = tw.Client(
    bearer_token=BEARER_TOKEN,
    consumer_key=API_KEY,
    consumer_secret=API_KEY_SECRET,
    access_token=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET,
    wait_on_rate_limit=True,
)
In [103]:
tweets = client.search_recent_tweets(
    query=search_words,
    tweet_fields=["created_at", "geo"],
    place_fields=["place_type", "geo"],
    expansions="geo.place_id",
    max_results=100,
)

In addition to the "data" attribute, we have a list of all of the 'places' associated with the tweets:

In [117]:
tweets.includes['places']
Out[117]:
[<Place id=01a9a39529b27f36 full_name=Manhattan, NY>]

Create a dictionary where keys are the place IDs and values are the place objects:

In [118]:
places = {p["id"]: p for p in tweets.includes['places']}

Let's see if any tweets have geo info that isn't None

In [128]:
tweets_with_geo = [tweet for tweet in tweets.data if tweet.geo is not None]
In [129]:
len(tweets_with_geo)
Out[129]:
1

Note: only a small fraction of tweets have geo information, meaning that you will need a large volume of tweets to do a geospatial analysis.

In [130]:
t = tweets_with_geo[0]
In [131]:
t
Out[131]:
<Tweet id=1577736973665636353 text='@MLB #OMG 🤣🥳😎🤓🎃❣️❣️❣️#phillies https://t.co/JwMEURMlWT'>
In [132]:
t.geo
Out[132]:
{'place_id': '01a9a39529b27f36'}
In [123]:
place = places[t.geo['place_id']]

place
Out[123]:
<Place id=01a9a39529b27f36 full_name=Manhattan, NY>
In [133]:
dict(place)
Out[133]:
{'full_name': 'Manhattan, NY',
 'geo': {'type': 'Feature',
  'bbox': [-74.026675, 40.683935, -73.910408, 40.877483],
  'properties': {}},
 'id': '01a9a39529b27f36',
 'place_type': 'city'}

Geographic features will vary! Some will be places with bounding boxes, some with latitude/longitude, but most with no geographic info at all.
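
When a tweet does include a place, one way to get something plottable is to take the centroid of the place's bounding box. A sketch using shapely and geopandas (one reasonable convention, not the only one; the bbox is ordered west, south, east, north):

from shapely.geometry import box

# Represent each geo-tagged tweet by the centroid of its place's bounding box
rows = []
for tweet in tweets_with_geo:
    place = places[tweet.geo["place_id"]]
    west, south, east, north = place["geo"]["bbox"]
    rows.append({"text": tweet.text, "geometry": box(west, south, east, north).centroid})

geo_tweets = gpd.GeoDataFrame(rows, geometry="geometry", crs="EPSG:4326")
geo_tweets.head()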

3. Getting more than 100 tweets¶

We will use the Paginator() object to iterate over "pages" of tweets.

You can also optionally specify the end_time keyword to limit the time frame from which tweets are returned. By default, this is about 30 seconds in the past (the most recent tweets available). Note that with the free API you can only get tweets going back 7 days.

In [137]:
client = tw.Client(
    bearer_token=BEARER_TOKEN,
    consumer_key=API_KEY,
    consumer_secret=API_KEY_SECRET,
    access_token=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET,
    wait_on_rate_limit=True,
)
In [147]:
# Store tweets from all pages
all_tweets = []

# Track how many requests
requests_made = 0

# Use the paginator to loop over all available tweets in batches of 100
# This pulls 100 tweets per page for 5 pages to get 500 total
for tweets in tw.Paginator(
    client.search_recent_tweets,    # The API endpoint
    query="#phillies",              # The search query
    tweet_fields=["created_at"],    # Additional fields
    max_results=100,  # How many tweets per page
    limit=5,  # NEW: How many pages to retrieve
):

    # Saves all of the tweets
    all_tweets += tweets.data

    # Sleep for 1 second to avoid rate limit
    time.sleep(1)

    # Log each time a new request is made
    requests_made = requests_made + 1
    print(f"After request #{requests_made}, total # tweets = {len(all_tweets)}")
After request #1, total # tweets = 100
After request #2, total # tweets = 200
After request #3, total # tweets = 300
After request #4, total # tweets = 400
After request #5, total # tweets = 500
In [148]:
len(all_tweets)
Out[148]:
500

4. Customizing our search query¶

The documentation has examples of different query-string use cases for filtering the results. The examples that can be used with the search API have a dark blue "SEARCH" tag.

You can also filter tweets by location; the Twitter documentation has a tutorial with a number of examples.
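
A few illustrative query strings (the operators come from the v2 search docs; the specific combinations here are just examples, and the location operators generally require academic access):

query1 = "#phillies -is:retweet lang:en"    # English-language tweets, excluding retweets
query2 = '"ring the bell" OR #ringthebell'  # an exact phrase or a hashtag
query3 = "#phillies has:geo"                # only geo-tagged tweets (academic access)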

Let's remove retweets¶

We will use the "is:retweet" operator to match retweets and prefix it with "-" to exclude them.

In [157]:
search_words = "#phillies"
new_search = search_words + " -is:retweet"

Get a new set of tweets using our new search query:

In [158]:
client = tw.Client(
    bearer_token=BEARER_TOKEN,
    consumer_key=API_KEY,
    consumer_secret=API_KEY_SECRET,
    access_token=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET,
    wait_on_rate_limit=True,
)
In [159]:
tweets = client.search_recent_tweets(
    query=new_search, tweet_fields=["created_at", "public_metrics"], max_results=100
)

Did it work?¶

In [162]:
for tweet in tweets.data[:10]:
    print(tweet.text)
@Phillies retired in Top 3rd.. Valdez collects 2 more strike outs
Kyle Schwarber should've walked, "strike one" was not a strike
Rhys Hoskins 6-3 ground out, the umpire lost track of the count, pitch #4 was a swing and a miss at strike three?
#Phillies #RingTheBell
Here's a quick look at betting odds for the Wild Card round:

#Padres-#Mets: https://t.co/cpUvAdfNc6

#Phillies-#STLCards: https://t.co/t8O3cmEB2H

#BlueJays-#Mariners: https://t.co/nyXwsCuEfZ

#Rays-#Guardians: https://t.co/Etmp66on00
With his K in the 3rd, Kyle Schwarber becomes the first Phillie to strike out 200 times in a season.

He surpasses Ryan Howard's #Phillies record of 199 in 2007 and '08.

It's the 16th 200+ K season in the majors.
https://t.co/sHFigLvUeD https://t.co/Drbq1aSrR6
@kneerecon #Pitt #PennState #Phillies #Sixers #Eagles #SteelersNation #Steelers #Penguins #PiratesStrong #IndependenceHall #LibertyBell #Flyers #Pennsylvania #Penn #Temple #Drexel #LehighUniversity #ShippensburgUniversity #Bucknell #IUP #KutztownUniversity #WCU #WestChesterUniversity #ESU
@kneerecon #VoteBlue2022 #Diamondbacks #Braves #Orioles #RedSox #Cubs #WhiteSox #Reds #Guardians #Rockies #Tigers #Astros #Royals #Angels #Dodgers #Marlins #Brewers #Twins #Mets #Yankees #Athletics #Phillies #Pirates #Padres #Giants #Mariners #Cardinals #Rays #Rangers #BlueJays #Nationals
#Phillies 0 @ #Astros 0 [T3-2o]:

Kyle Schwarber (swinging; 1)

LHP Framber Valdez (7)
Seq (6): fc si FC fc SI CU×
s3: 79.5mph Curveball
One too many transplants plugged in their #Tesla at #MinuteMaidPark #Astros #Phillies in the dark. https://t.co/K1l8tQ8qtD
Leaving for a golf trip tomorrow… anyone know how to watch on my phone? #Phillies
I think the huge elephant in the room that no one is really talking about is that it’s not really a “Red October” until you get a chance to play a game at home. 🐘 #phillies #RedOctober
It’s about time the MLB forces sanction upon the Astros! #Phillies #Astros https://t.co/nmZNtfgPjk

Use case #1: calculating word frequencies¶

An example of text mining

Load the most recent 1,000 tweets¶

Save the text of 1,000 tweets pulled with the Paginator.

In [ ]:
client = tw.Client(
    bearer_token=BEARER_TOKEN,
    consumer_key=API_KEY,
    consumer_secret=API_KEY_SECRET,
    access_token=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET,
    wait_on_rate_limit=True,
)
In [163]:
# Store all tweet data
tweet_data = []

for tweets in tw.Paginator(
    client.search_recent_tweets,
    query="#phillies -is:retweet",
    tweet_fields=["created_at"],
    max_results=100,
    limit=10, # 10 pages, each with 100 tweets
):

    # Saves all of the tweets
    tweet_data += tweets.data

    # Sleep for 1 second to avoid rate limit
    time.sleep(1)
In [164]:
len(tweet_data)
Out[164]:
1000
In [165]:
# Get the text of the tweets
tweets_text = [tweet.text for tweet in tweet_data]
In [166]:
# the first five tweets
tweets_text[:5]
Out[166]:
['Hey look, more Pooh this afternoon. \n#phillies #mlb https://t.co/jlW09pHi8U',
 'Dan Bellino: 6.98% BCR - 4 Stars\n#Astros 10 vs. #Phillies 0\n#LevelUp vs. #RingTheBell https://t.co/znG6I8HxH8',
 'Can\'t say I\'m suprised #Phillies have the longest odds in NL to win pennant. Good pitching beats an "all or nothing" line up. (Bad teams they beat up on, couldn\'t pitch), all the NL Playoff teams can.',
 '@Phillies retired in Top 3rd.. Valdez collects 2 more strike outs\nKyle Schwarber should\'ve walked, "strike one" was not a strike\nRhys Hoskins 6-3 ground out, the umpire lost track of the count, pitch #4 was a swing and a miss at strike three?\n#Phillies #RingTheBell',
 "Here's a quick look at betting odds for the Wild Card round:\n\n#Padres-#Mets: https://t.co/cpUvAdfNc6\n\n#Phillies-#STLCards: https://t.co/t8O3cmEB2H\n\n#BlueJays-#Mariners: https://t.co/nyXwsCuEfZ\n\n#Rays-#Guardians: https://t.co/Etmp66on00"]

Text mining and dealing with messy data¶

  1. Remove URLs $\rightarrow$ regular expressions
  2. Remove stop words
  3. Remove the search terms

Step 1: Removing URLs¶

Regular expressions

The regular expression in the function below matches Twitter's "https://t.co/..." shortened links so they can be stripped from the tweet text.

Good Tutorial on Regular Expressions: See this post on Piazza.

https://regexone.com/

Don't worry about mastering regular expression syntax...

StackOverflow is your friend

In [167]:
def remove_url(txt):
    """
    Replace URLs found in a text string with nothing 
    (i.e. it will remove the URL from the string).

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove urls.

    Returns
    -------
    The same txt string with url's removed.
    """
    import re
    return " ".join(re.sub("https://t.co/[A-Za-z\\d]+|&amp", "", txt).split())

Remove any URLs

In [168]:
tweets_no_urls = [remove_url(tweet) for tweet in tweets_text]
tweets_no_urls[:5]
Out[168]:
['Hey look, more Pooh this afternoon. #phillies #mlb',
 'Dan Bellino: 6.98% BCR - 4 Stars #Astros 10 vs. #Phillies 0 #LevelUp vs. #RingTheBell',
 'Can\'t say I\'m suprised #Phillies have the longest odds in NL to win pennant. Good pitching beats an "all or nothing" line up. (Bad teams they beat up on, couldn\'t pitch), all the NL Playoff teams can.',
 '@Phillies retired in Top 3rd.. Valdez collects 2 more strike outs Kyle Schwarber should\'ve walked, "strike one" was not a strike Rhys Hoskins 6-3 ground out, the umpire lost track of the count, pitch #4 was a swing and a miss at strike three? #Phillies #RingTheBell',
 "Here's a quick look at betting odds for the Wild Card round: #Padres-#Mets: #Phillies-#STLCards: #BlueJays-#Mariners: #Rays-#Guardians:"]

Extract a list of lower-cased words in a tweet

  • .lower() makes all words lower cased
  • .split() splits a string into the individual words
In [173]:
example_string = "This is an Example"

example_string.lower()
Out[173]:
'this is an example'
In [174]:
"This is an Example".lower().split()
Out[174]:
['this', 'is', 'an', 'example']

Apply these functions to all tweets:

In [175]:
words_in_tweet = [tweet.lower().split() for tweet in tweets_no_urls]
words_in_tweet[1]
Out[175]:
['dan',
 'bellino:',
 '6.98%',
 'bcr',
 '-',
 '4',
 'stars',
 '#astros',
 '10',
 'vs.',
 '#phillies',
 '0',
 '#levelup',
 'vs.',
 '#ringthebell']

Count word frequencies

We'll define a helper function to calculate word frequencies from our lists of words.

In [177]:
def count_word_frequencies(words_in_tweet, top=15):
    """
    Given a list of all words for every tweet, count
    word frequencies across all tweets.

    By default, this returns the top 15 words, but you
    can specify a different value for `top`.
    """
    import itertools, collections

    # List of all words across tweets
    all_words = list(itertools.chain(*words_in_tweet))

    # Create counter
    counter = collections.Counter(all_words)

    return pd.DataFrame(counter.most_common(top), columns=["words", "count"])
In [178]:
counts_no_urls = count_word_frequencies(words_in_tweet, top=15)
counts_no_urls.head(n=15)
Out[178]:
words count
0 the 996
1 #phillies 952
2 a 342
3 to 341
4 in 292
5 and 248
6 of 231
7 #ringthebell 230
8 #astros 194
9 for 181
10 on 154
11 #mlb 151
12 game 150
13 at 149
14 i 132

Plot the frequencies

Use seaborn to plot our DataFrame of word counts...

In [180]:
fig, ax = plt.subplots(figsize=(8, 8))

# Plot horizontal bar graph
sns.barplot(
    y="words",
    x="count",
    data=counts_no_urls.sort_values(by="count", ascending=False),
    ax=ax,
    color="#cc3000",
    saturation=1.0,
)

ax.set_title("Common Words Found in Tweets (Including All Words)", fontsize=16);

Step 2: Remove stop words and punctuation¶

Common words that do not carry much significance and are often ignored in text analysis.

We can use the nltk package.

The "Natural Language Toolkit" https://www.nltk.org/

Import and download the stop words¶
In [181]:
import nltk
nltk.download('stopwords');
[nltk_data] Downloading package stopwords to /Users/nhand/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Get the list of common stop words¶
In [182]:
stop_words = list(set(nltk.corpus.stopwords.words('english')))

stop_words[:10]
Out[182]:
["shouldn't", 'again', 'isn', 'up', 'ain', 'same', 'she', 'into', 'o', 'once']
In [183]:
len(stop_words)
Out[183]:
179
Get the list of common punctuation¶
In [184]:
import string
In [185]:
punctuation = list(string.punctuation)
In [186]:
punctuation[:5]
Out[186]:
['!', '"', '#', '$', '%']
Remove stop words from our tweets¶
In [ ]:
ignored = stop_words + punctuation
In [197]:
ignored[:10]
Out[197]:
["shouldn't", 'again', 'isn', 'up', 'ain', 'same', 'she', 'into', 'o', 'once']

Get a list where each element is a list of valid words in the tweet:

In [194]:
def is_word_valid(word):
    return word not in ignored


tweets_nsw = [
    [word for word in tweet_words if is_word_valid(word)]
    for tweet_words in words_in_tweet
]


tweets_nsw[0]
Out[194]:
['hey', 'look,', 'pooh', 'afternoon.', '#phillies', '#mlb']

This is just a double for loop written with Python's list comprehension syntax. It's equivalent to:

tweets_nsw = []
for tweet_words in words_in_tweet:

    temp = []
    for word in tweet_words:
        if is_word_valid(word):
            temp.append(word)

    tweets_nsw.append(temp)
Get our DataFrame of frequencies¶
In [198]:
counts_nsw = count_word_frequencies(tweets_nsw)
counts_nsw.head(n=15)
Out[198]:
words count
0 #phillies 952
1 #ringthebell 230
2 #astros 194
3 #mlb 151
4 game 150
5 phillies 129
6 0 80
7 astros 70
8 @phillies 68
9 last 66
10 playoff 65
11 get 65
12 first 62
13 #mets 57
14 vs 56

And plot...

In [199]:
fig, ax = plt.subplots(figsize=(8, 8))

sns.barplot(
    y="words",
    x="count",
    data=counts_nsw.sort_values(by="count", ascending=False),
    ax=ax,
    color="#cc3000",
    saturation=1.0,
)

ax.set_title("Common Words Found in Tweets (Without Stop Words)", fontsize=16);

Step 3: remove our query terms¶

Now, we'll be left with only the meaningful words...

In [201]:
search_terms = ["#phillies", "phillies", "@phillies", "#ringthebell"]
tweets_final = [[w for w in word if w not in search_terms] for word in tweets_nsw]
In [202]:
# frequency counts
counts_final = count_word_frequencies(tweets_final)

And now, plot the cleaned tweets...

In [204]:
fig, ax = plt.subplots(figsize=(8, 8))

sns.barplot(
    y="words",
    x="count",
    data=counts_final.sort_values(by="count", ascending=False),
    ax=ax,
    color="#cc3000",
    saturation=1.0,
)

ax.set_title("Common Words Found in Tweets (Cleaned)", fontsize=16);

Text mining: At home exercise¶

Get 1,000 tweets using a query string of your choice and plot the word frequencies.

Be sure to:

  • remove URLs
  • remove stop words / punctuation
  • remove your search query terms

Note: if you try to pull more than 1,000 tweets you will likely run into the rate limit and have to wait 15 minutes.

Remember: The documentation has examples of different query string use cases for filtering the results.

Use case #2: sentiment analysis¶

The goal of a sentiment analysis is to determine the attitude or emotional state of the person who sent a particular tweet.

Often used by brands to evaluate public opinion about a product.

The goal¶

Determine the "sentiment" of every word in the English language

The hard way

Train a machine learning algorithm to classify words as positive vs. negative, given an input training sample of words.

The easy way

Luckily, this is a very common task in NLP and there are several packages available that have done the hard work for you.

They provide out-of-the-box sentiment analysis using pre-trained machine learning algorithms.

We'll be using textblob

In [205]:
import textblob

Let's analyze our set of 1,000 #phillies tweets¶

Create our "text blobs"¶

Simply pass the tweet text to the TextBlob() object.

Note: it's best to remove any URLs first!

In [213]:
blobs = [textblob.TextBlob(remove_url(t.text)) for t in tweet_data]
In [214]:
blobs[0]
Out[214]:
TextBlob("Hey look, more Pooh this afternoon. #phillies #mlb")
In [215]:
blobs[0].sentiment
Out[215]:
Sentiment(polarity=0.5, subjectivity=0.5)
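
As a quick sanity check, TextBlob will score any string, so you can try it on sentences whose tone you already know; the first example below should come back with clearly positive polarity and the second clearly negative:

print(textblob.TextBlob("What a wonderful, amazing win!").sentiment)
print(textblob.TextBlob("That was a terrible, awful loss.").sentiment)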

Combine the data into a DataFrame¶

Track the polarity, subjectivity, and date of each tweet.

In [217]:
data = {}
data['date'] = [tweet.created_at for tweet in tweet_data]
data['polarity'] = [blob.sentiment.polarity for blob in blobs]
data['subjectivity'] = [blob.sentiment.subjectivity for blob in blobs]
data['text'] = [remove_url(tweet.text) for tweet in tweet_data]
data = pd.DataFrame(data)
In [218]:
data.head()
Out[218]:
date polarity subjectivity text
0 2022-10-05 21:08:25+00:00 0.500000 0.500000 Hey look, more Pooh this afternoon. #phillies ...
1 2022-10-05 21:08:21+00:00 0.000000 0.000000 Dan Bellino: 6.98% BCR - 4 Stars #Astros 10 vs...
2 2022-10-05 21:06:08+00:00 0.266667 0.555556 Can't say I'm suprised #Phillies have the long...
3 2022-10-05 21:04:47+00:00 0.333333 0.333333 @Phillies retired in Top 3rd.. Valdez collects...
4 2022-10-05 21:03:22+00:00 0.077778 0.433333 Here's a quick look at betting odds for the Wi...

How many are unbiased?¶

We can remove tweets with a polarity of zero to get a better sense of emotions.

In [219]:
zero = (data['polarity']==0).sum()
print("number of unbiased tweets = ", zero)
number of unbiased tweets =  422
In [220]:
# remove unbiased tweets
biased = data.loc[ data['polarity'] != 0 ].copy()

What does a polarized tweet look like?¶

We can find the tweet with the maximum positive/negative scores

The most negative¶

Use the idxmin() function:

In [221]:
biased['polarity'].idxmin()
Out[221]:
155
In [222]:
biased.loc[biased['polarity'].idxmin(), 'text']
Out[222]:
'It’s Cardinal Hate Week? I’m on board! #FlyEaglesFly #Phillies'

The most positive¶

Use the idxmax() function

In [223]:
biased.loc[biased['polarity'].idxmax(), 'text']
Out[223]:
'The best day of baseball IMHO. Day 1 of Wildcard. I got #Phillies, #Mariners, #Guardians and #Padres. One can dream. #MLB'

Plot a histogram of polarity¶

Important: Polarity runs from -1 (most negative) to +1 (most positive)

We can use matplotlib's hist() function:

In [225]:
# create a figure and axes
fig, ax = plt.subplots(figsize=(10, 6))

# histogram
ax.hist(biased['polarity'], bins='auto')
ax.axvline(x=0, c='k', lw=2)

# format
ax.set_xlabel("Polarity")
ax.set_title("Polarity of #phillies Tweets", fontsize=16);
In [226]:
biased['polarity'].median()
Out[226]:
0.13693181818181818
In [227]:
biased['polarity'].mean()
Out[227]:
0.13043436174094494

And subjectivity too...¶

The most objective¶

In [228]:
biased.loc[biased['subjectivity'].idxmin(), 'text']
Out[228]:
'#Astros Martin Maldonado homers (15) 393ft on a line drive to left center off #Phillies Ranger Suarez. PHI 0 @ HOU 6; BOT 2'

The most subjective¶

In [229]:
biased.loc[biased['subjectivity'].idxmax(), 'text']
Out[229]:
'@lateby15 We are glad to have Brandon wearing #phillies pinstripes!!'

The distribution of subjectivity¶

Important: Subjectivity runs from 0 (most objective) to +1 (most subjective)

In [231]:
# create a figure and axes
fig, ax = plt.subplots(figsize=(10, 6))

# histogram
ax.hist(biased['subjectivity'], bins='auto')
ax.axvline(x=0.5, c='k', lw=2)

# format
ax.set_xlabel("Subjectivity")
ax.set_title("Subjectivity of #phillies Tweets", fontsize=16);

How does polarity influence subjectivity?¶

Are positive/negative tweets more or less objective?

Seaborn's regplot() function¶

Is there a linear trend?

In [232]:
sns.regplot(x=biased['subjectivity'], y=biased['polarity']);

Seaborn's kdeplot()¶

Shade the bivariate relationship

In [235]:
sns.kdeplot(x=biased['subjectivity'], y=biased['polarity']);

Insight: the most subjective tweets tend to be most polarized as well...

Let's check for hourly trends too¶

We can plot the distribution of polarity by the tweet's hour

First, we'll add a new column that gives the day and hour of the tweet.

We can use the strftime() function via the .dt datetime accessor.

In [236]:
# this is month/day hour AM/PM
biased['date_string'] = biased['date'].dt.strftime("%-m/%d %I %p")
In [237]:
biased.head()
Out[237]:
date polarity subjectivity text date_string
0 2022-10-05 21:08:25+00:00 0.500000 0.500000 Hey look, more Pooh this afternoon. #phillies ... 10/05 09 PM
2 2022-10-05 21:06:08+00:00 0.266667 0.555556 Can't say I'm suprised #Phillies have the long... 10/05 09 PM
3 2022-10-05 21:04:47+00:00 0.333333 0.333333 @Phillies retired in Top 3rd.. Valdez collects... 10/05 09 PM
4 2022-10-05 21:03:22+00:00 0.077778 0.433333 Here's a quick look at betting odds for the Wi... 10/05 09 PM
5 2022-10-05 21:02:19+00:00 0.125000 0.166667 With his K in the 3rd, Kyle Schwarber becomes ... 10/05 09 PM

Sort the tweets in chronological order...

In [238]:
biased = biased.sort_values(by='date', ascending=True)

Make a box and whiskers plot of the polarity¶

Use Seaborn's boxplot() function

In [239]:
fig, ax = plt.subplots(figsize=(8, 14))

sns.boxplot(y='date_string', x='polarity', data=biased, ax=ax)
ax.axvline(x=0, c='k', lw=2) # neutral

# Set yticks to every other hour
yticks = ax.get_yticks()
ax.set_yticks(range(0, len(yticks), 2))
plt.setp(ax.get_yticklabels(), fontsize=10);

And subjectivity over time...¶

In [240]:
fig, ax = plt.subplots(figsize=(8,14))

sns.boxplot(y='date_string', x='subjectivity', data=biased)
ax.axvline(x=0.5, c='k', lw=2) # neutral

# Set yticks to every other hour
yticks = ax.get_yticks()
ax.set_yticks(range(0, len(yticks), 2))
plt.setp(ax.get_yticklabels(), fontsize=10);

At home exercise: sentiment analysis¶

Analyze your set of tweets from the last exercise (or get a new set), and explore the sentiments by:

  • plotting histograms of the subjectivity and polarity
  • finding the most/least subjective and polarized tweets
  • plotting the relationship between polarity and subjectivity
  • showing hourly trends in polarity/subjectivity

Or explore trends in some new way!

In [ ]:
 

That's it!¶

  • Next week: creating your own datasets through web scraping
  • See you on Monday!
In [ ]: