In [2]:
# Chapter 1: Mining Twitter: Exploring Trending Topics, Discovering What People Are Talking About, and More
# Not my original Code ...
# All Copyright belongs to the Original Author - Mining the Social Web, 2nd Edition (O'Reilly, 2013) http://bit.ly/135dHfs
# Code inspired from - https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition
import twitter
In [3]:
# SECURITY FIX: never hardcode API credentials in a notebook -- the source
# and its rendered outputs leak them.  Read them from the environment
# instead (export TWITTER_CONSUMER_KEY=... etc. before starting the kernel).
import os

CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')
OAUTH_TOKEN = os.environ.get('TWITTER_OAUTH_TOKEN', '')
OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_OAUTH_TOKEN_SECRET', '')
In [4]:
# Build an OAuth handshake object from the four credential strings, then
# create an authenticated client for Twitter's v1.1 REST API.
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)
# Nothing to see by displaying twitter_api except that it's now a
# defined variable
print twitter_api
In [5]:
# The Yahoo! Where On Earth ID (WOEID) for the entire world is 1.
# See https://dev.twitter.com/docs/api/1.1/get/trends/place and
# http://developer.yahoo.com/geo/geoplanet/
WORLD_WOE_ID = 1
INDIA_WOE_ID = 23424848
BANGALORE_WOE_ID = 2295420
# Prefix ID with the underscore for query string parameterization.
# Without the underscore, the twitter package appends the ID value
# to the URL itself as a special case keyword argument.
world_trends = twitter_api.trends.place(_id=WORLD_WOE_ID)
INDIA_trends = twitter_api.trends.place(_id=INDIA_WOE_ID)
BANGALORE_trends = twitter_api.trends.place(_id=BANGALORE_WOE_ID)
# Dump the three raw trend payloads; the bare `print` statements just
# insert blank lines between them.
print world_trends
print
print
print INDIA_trends
print
print
print BANGALORE_trends
In [6]:
import json
# Same three trend payloads as above, pretty-printed with a one-space
# indent so the nested structure is readable.
print json.dumps(world_trends, indent=1)
print
print json.dumps(INDIA_trends, indent=1)
print
print
print
print
print json.dumps(BANGALORE_trends, indent=1)
In [19]:
# Doesn't seem to work - ignore for now ...
import uuid
from IPython.display import display_javascript, display_html, display
import json
class RenderJSON(object):
    """Render a JSON-serializable object as a collapsible tree in the notebook.

    Emits a container <div> plus a require.js call that loads the
    renderjson.js library from a CDN, via IPython's rich-display hook.
    """

    def __init__(self, json_data):
        # Serialize dicts and lists to a JSON string; anything else is
        # assumed to already be a JSON-formatted string.
        if isinstance(json_data, (dict, list)):
            self.json_str = json.dumps(json_data)
        else:
            # BUG FIX: the original assigned the `json` *module* here
            # instead of the data that was passed in.
            self.json_str = json_data
        # Unique DOM id so multiple renders in one notebook don't collide.
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        # The empty container the JS below will populate.
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid),
                     raw=True
                     )
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
        document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)
In [7]:
INDIA_trends_set = set([trend['name']
#
for trend in INDIA_trends[0]['trends']])
#
BANGALORE_trends_set = set([trend['name']
#
for trend in BANGALORE_trends[0]['trends']])
#
common_trends = INDIA_trends_set.intersection(BANGALORE_trends_set)
#
print common_trends
In [8]:
q = '#rstats'
count = 100
#
# See https://dev.twitter.com/docs/api/1.1/get/search/tweets
#
search_results = twitter_api.search.tweets(q=q, count=count)
statuses = search_results['statuses']
#
# Iterate through 5 more batches of results by following the cursor
#
for _ in range(5):
print "Length of statuses", len(statuses)
try:
next_results = search_results['search_metadata']['next_results']
except KeyError, e: # No more results when next_results doesn't exist
#break -
# Here Break is used in Original text - but seems BREAK not to be used
#
# Create a dictionary from next_results, which has the following form:
# ?max_id=313519052523986943&q=NCAA&include_entities=1
#
kwargs = dict([ kv.split('=') for kv in next_results[1:].split("&") ])
search_results = twitter_api.search.tweets(**kwargs)
statuses += search_results['statuses']
#
# Show one sample search result by slicing the list...
#
print json.dumps(statuses[0], indent=1)
#
In [9]:
# Raw text of every tweet in the result set.
status_texts = [status['text'] for status in statuses]

# Every @mention across all tweets (duplicates kept -- one entry per mention).
screen_names = [mention['screen_name']
                for status in statuses
                for mention in status['entities']['user_mentions']]

# Every hashtag across all tweets (again, duplicates kept).
hashtags = [tag['text']
            for status in statuses
            for tag in status['entities']['hashtags']]

# Compute a collection of all words from all tweets.
words = [word
         for text in status_texts
         for word in text.split()]
In [15]:
# Explore the first 10 items for each...
# json.dumps(..., indent=1) pretty-prints each sample slice; the bare
# `print` statements insert blank lines between the four dumps.
print json.dumps(status_texts[0:10], indent=1)
print
print
print json.dumps(screen_names[0:10], indent=1)
print
print
print json.dumps(hashtags[0:10], indent=1)
print
print
print json.dumps(words[0:10], indent=1)
print
print
In [10]:
from collections import Counter
#
for item in [words, screen_names, hashtags]:
c = Counter(item)
#
print c.most_common()[:15] # top 15
print
In [20]:
from prettytable import PrettyTable
#
for label, data in (('Word', words),
('Screen Name', screen_names)):
ptScrnName = PrettyTable(field_names=[label, 'Count of Tweets'])
c = Counter(data)
[ ptScrnName.add_row(kv) for kv in c.most_common()[:15] ]
ptScrnName.align[label], ptScrnName.align['Count'] = 'l', 'r' # Set column alignment
print ptScrnName
In [19]:
from prettytable import PrettyTable
#
for label, data in (('Word', words),
('Screen Name', screen_names),
('Hashtag of Tweet with #rstats - Not on @DhankarRohit Timeline', hashtags)):
ptHash = PrettyTable(field_names=[label, 'Count of Tweets'])
c = Counter(data)
[ ptHash.add_row(kv) for kv in c.most_common()[:15] ]
ptHash.align[label], ptHash.align['Count'] = 'l', 'r' # Set column alignment
print ptHash
In [23]:
from prettytable import PrettyTable
#
for label, data in ('Word', words):
ptWords = PrettyTable(field_names=[label, 'Count of Tweets'])
c = Counter(data)
[ ptWords.add_row(kv) for kv in c.most_common()[:15] ]
ptWords.align[label], ptWords.align['Count'] = 'l', 'r' # Set column alignment
print ptWords
In [32]:
# A function for computing lexical diversity
#
def lexical_diversity(tokens):
    """Ratio of unique tokens to total tokens.

    Returns 0.0 for an empty sequence instead of raising
    ZeroDivisionError (robustness fix).
    """
    if not tokens:
        return 0.0
    return 1.0*len(set(tokens))/len(tokens)
#
# A function for computing the average number of words per tweet
def average_words(statuses):
    """Mean whitespace-separated word count across tweet texts.

    Returns 0.0 for an empty sequence instead of raising
    ZeroDivisionError (robustness fix).
    """
    if not statuses:
        return 0.0
    total_words = sum([ len(s.split()) for s in statuses ])
    return 1.0*total_words/len(statuses)
#
# Report lexical diversity (unique/total ratio) for words, screen names,
# and hashtags, then the mean number of words per tweet.
print "The Lexical Diversity for - Words :" ,lexical_diversity(words)
print
print "Lexical Diversity for - Screen Names :" ,lexical_diversity(screen_names)
print
print "Lexical Diversity for - #Tags :" ,lexical_diversity(hashtags)
print
print average_words(status_texts)
In [34]:
# Finding the most popular retweets
retweets = [
# Store out a tuple of these three values ...
(status['retweet_count'],
status['retweeted_status']['user']['screen_name'],
status['text'])
#
# ... for each status ...
for status in statuses
# ... so long as the status meets this condition.
if status.has_key('retweeted_status')
]
#
# Slice off the first 5 from the sorted results and display each item in the tuple
pt = PrettyTable(field_names=['Re-Tweet Count', 'Screen Name', 'Text'])
[ pt.add_row(row) for row in sorted(retweets, reverse=True)[:5] ]
pt.max_width['Text'] = 50
pt.align= 'l'
print pt
In [37]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
In [38]:
# Zipf's-law check: word frequency vs. rank on log-log axes should be
# roughly linear for natural-language text.
word_counts = sorted(Counter(words).values(), reverse=True)
plt.loglog(word_counts)
plt.ylabel("Freq")
plt.xlabel("Word Rank")
Out[38]:
In [40]:
# Generating histograms of words, screen names, and hashtags
for label, data in (('Words', words),
('Screen Names', screen_names),
('Hashtags', hashtags)):
# Build a frequency map for each set of data
# and plot the values
c = Counter(data)
plt.hist(c.values())
#
# Add a title and y-label ...
#
plt.title(label)
plt.ylabel("Number of items in bin")
plt.xlabel("Bins (number of times an item appeared)")
#
# ... and display as a new figure
#
plt.figure()
#
# Generating a histogram of retweet counts
# Using underscores while unpacking values in
# a tuple is idiomatic for discarding them
#
counts = [count for count, _, _ in retweets]
plt.hist(counts)
plt.title("Retweets")
plt.xlabel('Bins (number of times retweeted)')
plt.ylabel('Number of tweets in bin')
print counts
#
#
# DISCLAIMER - None of the code seen in this notebook is ORIGINAL - it is the copyright of the ORIGINAL AUTHOR and NOT of ROHIT.
#
#
In [ ]:
No comments:
Post a Comment