Saturday 9 January 2016

Twitter API with iPython Notebook - Initial Experiments

In [2]:
# Chapter 1: Mining Twitter: Exploring Trending Topics, Discovering What People Are Talking About, and More
# Not my original Code ...
# All Copyright belongs to the Original Author - Mining the Social Web, 2nd Edition (O'Reilly, 2013) http://bit.ly/135dHfs
# Code inspired from - https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition

import twitter
In [3]:
import os

# Never hardcode real API credentials in a notebook -- the rendered
# output gets shared/published. Read them from environment variables;
# the second argument preserves the old placeholder values when the
# variables are unset, so the cell still runs as before.
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', 'Q1')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', 'Sx')
OAUTH_TOKEN = os.environ.get('TWITTER_OAUTH_TOKEN', '7d')
OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_OAUTH_TOKEN_SECRET', 'A9')
In [4]:
# Build the OAuth handler (token/secret come first in this
# constructor's argument order) and the authenticated API client.
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)

# Nothing to see by displaying twitter_api except that it's now a
# defined variable

print twitter_api
<twitter.api.Twitter object at 0x0000000003C0A780>
In [5]:
# The Yahoo! Where On Earth ID for the entire world is 1.
# See https://dev.twitter.com/docs/api/1.1/get/trends/place and
# http://developer.yahoo.com/geo/geoplanet/

WORLD_WOE_ID = 1
INDIA_WOE_ID = 23424848
BANGALORE_WOE_ID = 2295420


# Prefix ID with the underscore for query string parameterization.
# Without the underscore, the twitter package appends the ID value
# to the URL itself as a special case keyword argument.

world_trends = twitter_api.trends.place(_id=WORLD_WOE_ID)
INDIA_trends = twitter_api.trends.place(_id=INDIA_WOE_ID)
BANGALORE_trends = twitter_api.trends.place(_id=BANGALORE_WOE_ID)


# Each call returns a one-element list whose entry holds the trends
# plus the queried location (see the printed output below). The bare
# `print` statements just add blank-line spacing between the three.
print world_trends
print
print
print INDIA_trends
print
print 
print BANGALORE_trends
[{u'created_at': u'2016-01-09T19:44:50Z', u'trends': [{u'url': u'http://twitter.com/search?q=%23BuyMadeInTheAM', u'query': u'%23BuyMadeInTheAM', u'tweet_volume': None, u'name': u'#Buname': u'#CelebCollege', u'promoted_content': None}, {u'url': u'http://twitter.com/search?q=%23FRAPOL', u'query': u'%23FRAPOL', u'tweet_volume': None, u'name': u'#FRAPOL', u'promoted_content': None}], u'as_of': u'2016-01-09T19:50:23Z', u'locations': [{u'woeid': 1, u'name': u'Worldwide'}]}]


[{u'created_at': u'2016-01-09T19:44:50Z', u'trends': [{u'url': u'http://twitter.com/search?q=%23HappyBirthdayHrithik', u'query': u'%23HappyBirthdayHrithik', u'tweet_volume': 22578, .com/search?q=%23THTRIdotcom', u'query': u'%23THTRIdotcom', u'tweet_volume': None, u'name': u'#THTRIdotcom', u'promoted_content': None}], u'as_of': u'2016-01-09T19:50:25Z', u'locations': [{u'woeid': 23424848, u'name': u'India'}]}]


[{u'created_at': u'2016-01-09T19:44:53Z', u'trends': [{u'url': u'http://twitter.com/search?q=%232YrsOfRivalsNightmareVEERAM', u'query': u'%232YrsOfRivalsNightmareVEERAM', u'tweet_volume': 32778, u'name': u'#2YrsOfRivalsNightmareVEERAM', u'promoted_content': None}, {u'url': ted_content': None}, {u'url': u'http://twitter.com/search?q=%23jallikattu', u'query': u'%23jallikattu', u'tweet_volume': None, u'name': u'#jallikattu', u'promoted_content': None}], u'as_of': u'2016-01-09T19:50:26Z', u'locations': [{u'woeid': 2295420, u'name': u'Bangalore'}]}]
In [6]:
import json
# Re-display the same trend payloads, pretty-printed with json.dumps
# for readability; the bare prints only add vertical spacing.
print json.dumps(world_trends, indent=1)
print
print json.dumps(INDIA_trends, indent=1)
print
print
print
print
print json.dumps(BANGALORE_trends, indent=1)
[
 {
  "created_at": "2016-01-09T19:44:50Z", 
  "trends": [
   {
    "url": "http://twitter.com/search?q=%23BuyMadeInTheAM", 
    "query": "%23BuyMadeInTheAM", 
    "tweet_volume": null, 
    "name": "#BuyMadeInTheAM", 
    "promoted_content": null
   }, 
   {
      }
  ], 
  "as_of": "2016-01-09T19:50:23Z", 
  "locations": [
   {
    "woeid": 1, 
    "name": "Worldwide"
   }
  ]
 }
]

[
 {
  "created_at": "2016-01-09T19:44:50Z", 
  "trends": [
   {
    "url": "http://twitter.com/search?q=%23HappyBirthdayHrithik", 
    "query": "%23HappyBirthdayHrithik", 
    "tweet_volume": 22578, 
    "name": "#HappyBirthdayHrithik", 
    "promoted_content": null
   }, 
   {
     }
  ], 
  "as_of": "2016-01-09T19:50:25Z", 
  "locations": [
   {
    "woeid": 23424848, 
    "name": "India"
   }
  ]
 }
]




[
 {
  "created_at": "2016-01-09T19:44:53Z", 
  "trends": [
   {
    "url": "http://twitter.com/search?q=%232YrsOfRivalsNightmareVEERAM", 
    "query": "%232YrsOfRivalsNightmareVEERAM", 
    "tweet_volume": 32778, 
    "name": "#2YrsOfRivalsNightmareVEERAM", 
    "promoted_content": null
   }, 
   {
       {
    "url": "http://twitter.com/search?q=%23ComedyNightsBachao", 
    "query": "%23ComedyNightsBachao", 
    "tweet_volume": null, 
    "name": "#ComedyNightsBachao", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%23SanTina", 
    "query": "%23SanTina", 
    "tweet_volume": null, 
    "name": "#SanTina", 
    "promoted_content": null
   }, 
   {
    "url": "http://twitter.com/search?q=%23ICantGiveYouAnythingBut", 
    "query": "%23ICantGiveYouAnythingBut", 
    "tweet_volume": 16742, 
    "name": "#ICantGiveYouAnythingBut", 
    "promoted_content": null
   }, 
   {
      }, 
   {
    "url": "http://twitter.com/search?q=%23jallikattu", 
    "query": "%23jallikattu", 
    "tweet_volume": null, 
    "name": "#jallikattu", 
    "promoted_content": null
   }
  ], 
  "as_of": "2016-01-09T19:50:26Z", 
  "locations": [
   {
    "woeid": 2295420, 
    "name": "Bangalore"
   }
  ]
 }
]
In [19]:
# Doesn't seem to work - ignore for now ...

import uuid
from IPython.display import display_javascript, display_html, display
import json

class RenderJSON(object):
    """Display a JSON-serializable object as an interactive, collapsible
    tree in the notebook, using the renderjson.js library fetched from a
    CDN by the browser at display time.
    """

    def __init__(self, json_data):
        # Serialize dicts AND lists up front; anything else is assumed
        # to already be a JSON-formatted string. (The original version
        # assigned the `json` MODULE here for non-dict input, which is
        # why the cell "didn't seem to work" -- the trend payloads are
        # lists, not dicts.)
        if isinstance(json_data, (dict, list)):
            self.json_str = json.dumps(json_data)
        else:
            self.json_str = json_data
        # Unique DOM id so multiple renders on one page don't collide.
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        # Emit a container div, then a script that renders the JSON tree
        # into it once renderjson.js has been loaded via require.js.
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)
In [7]:
# Collect the trend names for India and for Bangalore, then intersect
# the two sets to find topics trending in BOTH places. (The bare `#`
# lines inside the brackets are ignored by the parser.)
INDIA_trends_set = set([trend['name']
#
for trend in INDIA_trends[0]['trends']])
#
BANGALORE_trends_set = set([trend['name']
#                     
for trend in BANGALORE_trends[0]['trends']])
#
common_trends = INDIA_trends_set.intersection(BANGALORE_trends_set)
#
print common_trends
set([u'#2YrsOfRivalsNightmareVEERAM', u'#MomsBeLike', u'#VirtualRun', u'#ComedyNightsBachao', u'#ModiCrocodileTears', u'#jallikattu', u'AAMIR PROUD INDIAN', u'Netaji', u'#\u091c\u092c\u0938\u0947_\u0939\u092e_\u092c\u0921\u093c\u0947_\u0939\u0941\u090f', u'Mata', u'#SavourANewFlavour', u'#Season2', u'#CelebCollege', u'Van Gaal', u'#HNYSale', u'#MyFashionMyRight', u'#BuyMadeInTheAM', u'Manish Tewari', u'#HBDQueenDrashtiDhami', u'#FACup', u'#ManUtd', u'Feminist and Media Discourse', u'Memphis', u'#FullOnFashionWithSnapdeal', u'#SherlockSpecialOnAXN', u'Djokovic', u'#KyaKoolMusicLaunch', u'#Messi', u'#WAXIvIND', u'#WazirOutInCinemas', u'#SanTina', u'Rooney', u'Theatrical Trailer', u'#dekhajayeto', u'HBD Mihika', u'ABP News', u'#2YearsOfEpic1Nenokkadine', u'#QatarOpen', u'#BB9WithSalmanKhan', u'#ICantGiveYouAnythingBut', u'Sushma Swaraj', u'#AustinMoreRespectForHarry', u'#HappyBirthdayHrithik', u'#PearlsOfIndia', u'Sonam Kapoor'])
In [8]:
q = '#rstats'
count = 100
#
# See https://dev.twitter.com/docs/api/1.1/get/search/tweets
#
search_results = twitter_api.search.tweets(q=q, count=count)
statuses = search_results['statuses']
#
# Iterate through 5 more batches of results by following the cursor
#
for _ in range(5):
    print "Length of statuses", len(statuses)
try:
    next_results = search_results['search_metadata']['next_results']
except KeyError, e: # No more results when next_results doesn't exist
   
    #break - 
    # Here Break is used in Original text - but seems BREAK not to be used 
#
# Create a dictionary from next_results, which has the following form:
# ?max_id=313519052523986943&q=NCAA&include_entities=1
#
    kwargs = dict([ kv.split('=') for kv in next_results[1:].split("&") ])
    search_results = twitter_api.search.tweets(**kwargs)
    statuses += search_results['statuses']
#
# Show one sample search result by slicing the list...
#
print json.dumps(statuses[0], indent=1)
#
Length of statuses 100
Length of statuses 100
Length of statuses 100
Length of statuses 100
Length of statuses 100
{
 "contributors": null, 
 "truncated": false, 
 "text": "RT @EXAGolo: Deploy R code on our in-memory database with the EXASOL R SDK available on Github: https://t.co/O5B7ohzljO #rstats #datascience", 
 "is_quote_status": false, 
 "in_reply_to_status_id": null, 
 "id": 685912371412516864, 
 "favorite_count": 0, 
 "source": "<a href=\"https://roundteam.co\" rel=\"nofollow\">RoundTeam</a>", 
 "retweeted": false, 
 "coordinates": null, 
 "entities": {
  "symbols": [], 
  "user_mentions": [
   {
    "id": 2797602696, 
    "indices": [
     3, 
     11
    ], 
    "id_str": "2797602696", 
    "screen_name": "EXAGolo", 
    "name": "Mathias Golombek"
   }
  ], 
  "hashtags": [
   {
    "indices": [
     120, 
     127
    ], 
    "text": "rstats"
   }, 
   {
    "indices": [
     128, 
     140
    ], 
    "text": "datascience"
   }
  ], 
  "urls": [
    
        auth = OAuth(
            consumer_key='[your consumer key]',
            consumer_secret='[your consumer secret]',
            token='[your token]',
            token_secret='[your token secret]'
        )
        twitter_userstream = TwitterStream(auth=auth, domain='userstream.twitter.com')
        for msg in twitter_userstream.user():
            if 'direct_message' in msg:
                print msg['direct_message']['text']
    
 


DATA
    __all__ = ['NoAuth', 'OAuth', 'OAuth2', 'oauth2_dance', 'oauth_dance',...


In [9]:
# Pull the text, mentioned screen names, and hashtag texts out of
# every status in the current batch.
status_texts = [status['text'] for status in statuses]

screen_names = [mention['screen_name']
                for status in statuses
                for mention in status['entities']['user_mentions']]

hashtags = [tag['text']
            for status in statuses
            for tag in status['entities']['hashtags']]

# Compute a collection of all words from all tweets
# (whitespace-tokenized).
words = [token
         for text in status_texts
         for token in text.split()]
In [15]:
# Explore the first 10 items for each...

# The bare `print` statements just add vertical spacing between the
# four pretty-printed samples.
print json.dumps(status_texts[0:10], indent=1)
print
print
print json.dumps(screen_names[0:10], indent=1)
print
print
print json.dumps(hashtags[0:10], indent=1)
print
print
print json.dumps(words[0:10], indent=1)
print
print
[
 "RT @BigDataShift: Twitter Data Analysis and Topic Modeling in R: https://t.co/mWiAMjrssQ #abdsc #Rstats #BigData #DataScience https://t.co/\u2026", 
 "RT @EXAGolo: Deploy R code on our in-memory database with the EXASOL R SDK available on Github: https://t.co/O5B7ohzljO #rstats #datascience", 
 "RT @BigDataShift: Twitter Data Analysis and Topic Modeling in R: https://t.co/mWiAMjrssQ #abdsc #Rstats #BigData #DataScience https://t.co/\u2026", 
 "RT @JennyBryan: Programming insight of the day \ud83d\udca1:\nAnything that can be NULL, NA, or of length zero eventually will be. #rstats", 
 "RT @v_vashishta: The Metropolis\u2013Hastings algorithm https://t.co/TVvhAYOwT0 #DataScience #rstats", 
 "The Force Accounted #rstats https://t.co/0mMaXKZzdj", 
 "RT @BigDataShift: Twitter Data Analysis and Topic Modeling in R: https://t.co/mWiAMjrssQ #abdsc #Rstats #BigData #DataScience https://t.co/\u2026", 
 "RT @JennyBryan: Programming insight of the day \ud83d\udca1:\nAnything that can be NULL, NA, or of length zero eventually will be. #rstats", 
 "RT @BigDataShift: Twitter Data Analysis and Topic Modeling in R: https://t.co/mWiAMjrssQ #abdsc #Rstats #BigData #DataScience https://t.co/\u2026", 
 "RT @BigDataShift: Twitter Data Analysis and Topic Modeling in R: https://t.co/mWiAMjrssQ #abdsc #Rstats #BigData #DataScience https://t.co/\u2026"
]


[
 "BigDataShift", 
 "EXAGolo", 
 "BigDataShift", 
 "JennyBryan", 
 "v_vashishta", 
 "BigDataShift", 
 "JennyBryan", 
 "BigDataShift", 
 "BigDataShift", 
 "JennyBryan"
]


[
 "abdsc", 
 "Rstats", 
 "BigData", 
 "DataScience", 
 "rstats", 
 "datascience", 
 "abdsc", 
 "Rstats", 
 "BigData", 
 "DataScience"
]


[
 "RT", 
 "@BigDataShift:", 
 "Twitter", 
 "Data", 
 "Analysis", 
 "and", 
 "Topic", 
 "Modeling", 
 "in", 
 "R:"
]


In [10]:
from collections import Counter
#
# NOTE(review): the print below is dedented out of the loop, so only
# the counter for the LAST item (hashtags) is displayed -- the loop
# just rebuilds `c` three times and discards the first two. The
# printed output beneath this cell (all hashtag counts) confirms it.
for item in [words, screen_names, hashtags]:
    c = Counter(item)
#
print c.most_common()[:15] # top 15
print
[(u'rstats', 76), (u'DataScience', 29), (u'BigData', 18), (u'Rstats', 17), (u'datascience', 13), (u'abdsc', 12), (u'ggplot2', 10), (u'StarWars', 7), (u'RStats', 7), (u'Wikipedia', 5), (u'machinelearning', 4), (u'jobs', 4), (u'Tech', 4), (u'Soft', 4), (u'Tools', 4)]

In [20]:
from prettytable import PrettyTable
#
# NOTE(review): add_row/align/print below are dedented out of the
# loop, so only the LAST (label, data) pair -- 'Screen Name' -- is
# actually tabulated; the 'Word' table is built and then overwritten.
# NOTE(review): align['Count'] targets a field named 'Count', but the
# real column is 'Count of Tweets', so that alignment likely has no
# effect -- verify against the PrettyTable version in use.
for label, data in (('Word', words),
('Screen Name', screen_names)):
    ptScrnName = PrettyTable(field_names=[label, 'Count of Tweets'])
    c = Counter(data)
[ ptScrnName.add_row(kv) for kv in c.most_common()[:15] ]
ptScrnName.align[label], ptScrnName.align['Count'] = 'l', 'r' # Set column alignment
print ptScrnName
+---------------+-----------------+
| Screen Name   | Count of Tweets |
+---------------+-----------------+
| BigDataShift  |        12       |
| JennyBryan    |        10       |
| Rbloggers     |        8        |
| hrbrmstr      |        7        |
| EXAGolo       |        6        |
| yuvipanda     |        5        |
| MangoTheCat   |        5        |
| halfak        |        5        |
| quominus      |        4        |
| GilPress      |        4        |
| tcbanalytics  |        4        |
| datentaeterin |        3        |
| v_vashishta   |        3        |
| DiegoKuonen   |        3        |
| drivendataorg |        3        |
+---------------+-----------------+
In [19]:
from prettytable import PrettyTable
#
# NOTE(review): add_row/align/print below are dedented out of the
# loop, so only the LAST (label, data) pair -- the hashtags -- is
# actually tabulated; the word and screen-name tables are built and
# then overwritten. align['Count'] also targets a field name that
# doesn't exist ('Count of Tweets' is the real one), so it likely
# has no effect.
for label, data in (('Word', words),
('Screen Name', screen_names),
('Hashtag of Tweet with #rstats - Not on @DhankarRohit Timeline', hashtags)):
    ptHash = PrettyTable(field_names=[label, 'Count of Tweets'])
    c = Counter(data)
[ ptHash.add_row(kv) for kv in c.most_common()[:15] ]
ptHash.align[label], ptHash.align['Count'] = 'l', 'r' # Set column alignment
print ptHash
+---------------------------------------------------------------+-----------------+
| Hashtag of Tweet with #rstats - Not on @DhankarRohit Timeline | Count of Tweets |
+---------------------------------------------------------------+-----------------+
| rstats                                                        |        76       |
| DataScience                                                   |        29       |
| BigData                                                       |        18       |
| Rstats                                                        |        17       |
| datascience                                                   |        13       |
| abdsc                                                         |        12       |
| ggplot2                                                       |        10       |
| StarWars                                                      |        7        |
| RStats                                                        |        7        |
| Wikipedia                                                     |        5        |
| machinelearning                                               |        4        |
| jobs                                                          |        4        |
| Tech                                                          |        4        |
| Soft                                                          |        4        |
| Tools                                                         |        4        |
+---------------------------------------------------------------+-----------------+
In [23]:
from prettytable import PrettyTable
#
for label, data in ('Word', words):
    ptWords = PrettyTable(field_names=[label, 'Count of Tweets'])
    c = Counter(data)
[ ptWords.add_row(kv) for kv in c.most_common()[:15] ]
ptWords.align[label], ptWords.align['Count'] = 'l', 'r' # Set column alignment
print ptWords
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-23-7057b981c60b> in <module>()
      1 from prettytable import PrettyTable
      2 #
----> 3 for label, data in ('Word', words):
      4     ptWords = PrettyTable(field_names=[label, 'Count of Tweets'])
      5     c = Counter(data)

ValueError: too many values to unpack
In [32]:
# A function for computing lexical diversity
#
def lexical_diversity(tokens):
    """Ratio of distinct tokens to total tokens; 0.0 for an empty list."""
    if not tokens:
        # Guard: the original raised ZeroDivisionError on empty input.
        return 0.0
    return 1.0*len(set(tokens))/len(tokens)
#
# A function for computing the average number of words per tweet
def average_words(statuses):
    """Mean number of whitespace-separated words per status text;
    returns 0.0 for an empty list of statuses."""
    if not statuses:
        # Guard: the original raised ZeroDivisionError on empty input.
        return 0.0
    total_words = sum(len(s.split()) for s in statuses)
    return 1.0*total_words/len(statuses)
#
# Report lexical diversity (unique/total ratio) for words, screen
# names and hashtags, plus the average word count per tweet.
print "The Lexical Diversity for - Words :" ,lexical_diversity(words)
print
print  "Lexical Diversity for - Screen Names :" ,lexical_diversity(screen_names)
print
print  "Lexical Diversity for - #Tags :" ,lexical_diversity(hashtags)
print
print average_words(status_texts)
The Lexical Diversity for - Words : 0.244069912609

Lexical Diversity for - Screen Names : 0.327868852459

Lexical Diversity for - #Tags : 0.140495867769

16.02
In [34]:
# Finding the most popular retweets

retweets = [
# Store out a tuple of these three values ...
(status['retweet_count'],
status['retweeted_status']['user']['screen_name'],
status['text'])
#
# ... for each status ...
for status in statuses
# ... so long as the status meets this condition.
# (NOTE: dict.has_key is Python-2 only; the modern spelling is
# `'retweeted_status' in status`.)
if status.has_key('retweeted_status')
]
#

# Slice off the first 5 from the sorted results and display each item in the tuple
pt = PrettyTable(field_names=['Re-Tweet Count', 'Screen Name', 'Text'])
[ pt.add_row(row) for row in sorted(retweets, reverse=True)[:5] ]
pt.max_width['Text'] = 50
pt.align= 'l'
print pt
+----------------+--------------+----------------------------------------------------+
| Re-Tweet Count | Screen Name  | Text                                               |
+----------------+--------------+----------------------------------------------------+
| 246            | lisachwinter | RT @lisachwinter: List of useful packages          |
|                |              | (libraries) for Data Analysis in R                 |
|                |              | http://t.co/XV31SrwgIA via @analyticsvidhya        |
|                |              | #RStats http://t.c…                                |
| 31             | BigDataShift | RT @BigDataShift: Twitter Data Analysis and Topic  |
|                |              | Modeling in R: https://t.co/mWiAMjrssQ #abdsc      |
|                |              | #Rstats #BigData #DataScience https://t.co/…       |
| 31             | BigDataShift | RT @BigDataShift: Twitter Data Analysis and Topic  |
|                |              | Modeling in R: https://t.co/mWiAMjrssQ #abdsc      |
|                |              | #Rstats #BigData #DataScience https://t.co/…       |
| 31             | BigDataShift | RT @BigDataShift: Twitter Data Analysis and Topic  |
|                |              | Modeling in R: https://t.co/mWiAMjrssQ #abdsc      |
|                |              | #Rstats #BigData #DataScience https://t.co/…       |
| 31             | BigDataShift | RT @BigDataShift: Twitter Data Analysis and Topic  |
|                |              | Modeling in R: https://t.co/mWiAMjrssQ #abdsc      |
|                |              | #Rstats #BigData #DataScience https://t.co/…       |
+----------------+--------------+----------------------------------------------------+
In [37]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
In [38]:
# Log-log plot of word frequency against rank (frequencies sorted
# descending) -- a rough visual check for Zipf's-law behaviour.
word_counts = sorted(Counter(words).values(), reverse=True)
plt.loglog(word_counts)
plt.ylabel("Freq")
plt.xlabel("Word Rank")
Out[38]:
<matplotlib.text.Text at 0x21c9da0>
In [40]:
# Generating histograms of words, screen names, and hashtags
for label, data in (('Words', words),
('Screen Names', screen_names),
('Hashtags', hashtags)):
# Build a frequency map for each set of data
# and plot the values
    c = Counter(data)
    plt.hist(c.values())
#
# Add a title and y-label ...
#
# NOTE(review): these calls sit outside the loop, so the three
# histograms share one figure and the title reflects only the last
# label ('Hashtags').
#
plt.title(label)
plt.ylabel("Number of items in bin")
plt.xlabel("Bins (number of times an item appeared)")
#
# ... and display as a new figure
#
plt.figure()
#
# Generating a histogram of retweet counts
# Using underscores while unpacking values in
# a tuple is idiomatic for discarding them
#
counts = [count for count, _, _ in retweets]
plt.hist(counts)
plt.title("Retweets")
plt.xlabel('Bins (number of times retweeted)')
plt.ylabel('Number of tweets in bin')
print counts
#
#
# DISCLAIMER - None of the code seen in this notebook is original - it is the copyright of the ORIGINAL AUTHOR and NOT of ROHIT.
#
#
[6, 3, 31, 6, 3, 6, 31, 19, 10, 3, 31, 6, 31, 10, 3, 31, 10, 31, 31, 10, 31, 31, 31, 31, 31, 12, 2, 6, 6, 6, 4, 2, 10, 12, 19, 10, 10, 10, 19, 2, 4, 3, 10, 4, 19, 4, 3, 2, 2, 1, 10, 4, 4, 4, 4, 9, 19, 4, 4, 246, 4, 3, 3, 4, 4, 3, 4, 19, 2, 4, 19, 15, 15]
In [ ]:
 

No comments:

Post a Comment