Monday, 11 January 2016

Twitter API - LIVE Stream Mining PART -1

# Code inspired by = http://adilmoujahid.com/posts/2014/07/twitter-analytics/

#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

#Variables that contains the user credentials to access Twitter API 
access_token = '73090441-N74od'
access_token_secret = 'A########9'
consumer_key = 'Q#############1'
consumer_secret = 'S#############x'


#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        print data
        return True

    def on_error(self, status):
        print status


if __name__ == '__main__':

    #This handles Twitter authetification and the connection to Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)

    #This line filter Twitter Streams to capture data by the keywords: 'python', 'javascript', 'ruby'
    stream.filter(track=['#rstats , #python, JavaScript , Perl , JAVA, PHP , RSTUDIO, Tutorial , oreilly , oreilly , Springer , Python , Pearson , Elsevier , Bertelsmann AG , McGrawHill , Wiley , APRESS , CAMBRIDGE , HARVARD , HarperCollins, HBR , Kadokawa Publishing'])
    for tweet in stream:
        print tweet['text']

# Note only 1 JSON Dump shown below ---- 
{"created_at":"Tue Jan 12 01:47:10 +0000 2016","id":686726079554269185,"id_str":"686726079554269185","text":"@Tom_Dart Talk to Tip Top Cafe's Jim Scott, who says 'armed society is a civil society,' allows open carry https:\/\/t.co\/10VHLJKbW4 via @mySA","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":151412907,"in_reply_to_user_id_str":"151412907","in_reply_to_screen_name":"Tom_Dart","user":{"id":1599217352,"id_str":"1599217352","name":"Arthur Cavazos","screen_name":"artcavazos1","location":null,"url":null,"description":"Veteran public relations executive and business owner with a sterling reputation built on solid values, experience, and strategic thinking.","protected":false,"verified":false,"followers_count":901,"friends_count":1976,"listed_count":11,"favourites_count":320,"statuses_count":2435,"created_at":"Tue Jul 16 20:29:31 +0000 2013","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000143871263\/80968f059be13a992f9da992acba42bb_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000143871263\/80968f059be13a992f9da992acba42bb_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1599217352\/1398272181","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/10VHLJKbW4","expanded_url":"http:\/\/www.mysanantonio.com\/news\/local\/article\/San-Antonio-staple-Tip-Top-Cafe-to-allow-open-6740579.php?cmpid=twitter-desktop","display_url":"mysanantonio.com\/news\/local\/art\u2026","indices":[107,130]}],"user_mentions":[{"screen_name":"Tom_Dart","name":"Tom Dart","id":151412907,"id_str":"151412907","indices":[0,9]},{"screen_name":"mySA","name":"mySA","id":9830752,"id_str":"9830752","indices":[135,140]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1452563230776"}


In [27]:
#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

import json
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

#Variables that contains the user credentials to access Twitter API 
access_token = '73090441-N74orPfAdWYhOvWUHpKFkUkejTqSmOeTjadhfX0Yd'
access_token_secret = 'Ah7WiI4tvUcEVoBDXbQJECRSHfUHrCWE78TmU2G3nMGE9'
consumer_key = 'Q3M3ZNo3QL9PDAjOoZCMKowM1'
consumer_secret = 'SOenR4B7JzF6KMi5WtbSpAYX9MLlzSy6mlud3c3nn0on5yu3Qx'
In [28]:
tweets_data_path = 'C:/STAT/BRIDGE/Module-4/_______Python Own/Wk-1/Twitter1/_Tech_1.txt'

tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue
In [29]:
print len(tweets_data)
363
In [30]:
tweets = pd.DataFrame()
tweets['tweet'] = map(lambda tweet: tweet['text'] if 'text' in tweet else None, tweets_data)
tweets['created_at'] = map(lambda tweet: tweet['created_at'] if 'created_at' in tweet else None, tweets_data)
tweets['user_id'] = map(lambda tweet: tweet['user']['id'] if 'user' in tweet else None, tweets_data)
tweets['id_str'] = map(lambda tweet: tweet['user']['id_str'] if 'user' in tweet else None, tweets_data)
tweets['username'] = map(lambda tweet: tweet['user']['name'] if 'user' in tweet else None, tweets_data)
tweets['screen_name'] = map(lambda tweet: tweet['user']['screen_name'] if 'user' in tweet else None, tweets_data)
tweets['location'] = map(lambda tweet: tweet['user']['location'] if 'user' in tweet else None, tweets_data)
tweets['followers_count'] = map(lambda tweet: tweet['user']['followers_count'] if 'user' in tweet else None, tweets_data)
tweets['friends_count'] = map(lambda tweet: tweet['user']['friends_count'] if 'user' in tweet else None, tweets_data)
tweets['created_at'] = map(lambda tweet: tweet['user']['created_at'] if 'user' in tweet else None, tweets_data)
tweets['user_lang'] = map(lambda tweet: tweet['user']['lang'] if 'user' in tweet else None, tweets_data)
tweets['following'] = map(lambda tweet: tweet['user']['following'] if 'user' in tweet else None, tweets_data)
tweets['geo'] = map(lambda tweet: tweet['geo'], tweets_data)
#
tweets['coordinates'] = map(lambda tweet: tweet['coordinates'], tweets_data)
tweets['retweet_count'] = map(lambda tweet: tweet['retweet_count'], tweets_data)
tweets['favorite_count'] = map(lambda tweet: tweet['favorite_count'], tweets_data)
tweets['lang'] = map(lambda tweet: tweet['lang'], tweets_data)
# if k1 in d and k2 in d[k1]
tweets['country'] = map(lambda tweet: tweet['place']['country'] if 'country' in tweet else None, tweets_data)
tweets['country'] = map(lambda tweet: tweet['place']['country'] if tweet['place'] != None else None, tweets_data)

print (tweets)
                                                 tweet  \
0    https://t.co/9F9j8Usc8Y https://t.co/gHZV0Y3PP...   
1    https://t.co/HH0P8pluxb https://t.co/xRhIYSGsr...   
2    설공김추성안차장한문곽민홍신손곽길남노 https://t.co/tM1UiWQmZz ht...   
3    https://t.co/sE72SAsDpf https://t.co/XHFnN6FyN...   
4    My descarga Carreteras Secundarias porque ¨Est...   
5    https://t.co/GyC1jlHPWu https://t.co/w5yjxIuuG...   
6    #Java Multi Dimensional array/2d array\n#Tech ...   
7    💋 LUNA de #Caracas 🇻🇪 ver telefono y tarifa...   
8    RT @CanselSevenler: @cansellelcin #làoùatillap...   
9    https://t.co/7wLUtAUG82 https://t.co/QDN14LnUh...   
10   https://t.co/e9lg9NbBGo https://t.co/LW07kta5h...   
11   https://t.co/3Lv0xlvYUL https://t.co/3080T0BNz...   
12   RT @conagua_clima: #Pronóstico a muy Corto Pla...   
13   J0116-0672 - Java Developer - Java Developer, ...   
14   Neobux: Aprenda Lucrar Com Neobux. Tutorial, T...   
15   https://t.co/IxfhBonpV3 https://t.co/lRuWYgt48...   
16   Everything You Need to Know About This Weekend...   
17   https://t.co/vpSyXhyIJL https://t.co/LT1Vjvz2c...   
18   RT @JoeTraver: Flowers http://t.co/MtM3u7Z3lH ...   
19   Last survivor of 1906 San Francisco earthquake...   
20   RT @Mark_Beech: Ok, one more. Five Years. @Dav...   
21   RT @GBNdiscounts: 89% off Mastering JavaScript...   
22   RT @lyonsempirefox: JANUARY, 10 | @TherealTara...   
23   https://t.co/QdZYiQK5Ec https://t.co/eQ3YvdLPc...   
24   RT @Kmm_eunnie: แนะนำและหนำเหนอสุด~^^ https://...   
25   @PudimDUrsoPanda boa noite... ajuda num duelo ...   
26   https://t.co/O9SygnrbBq https://t.co/pkGeVQthS...   
27   https://t.co/yP9aIqZoGo https://t.co/ubEBHUJ2C...   
28   @28h3d 盗賊Lvが38にあがった!(+1) ほろにがさ、はらだたしさ、せこさ等があがっ...   
29   @TerryKingOBE -1st mention of @trjfp #Dewsbury...   
..                                                 ...   
333  RT @Mark_Beech: Ok, one more. Five Years. @Dav...   
334  #linux  #Tutorial B2 Notifier: Email Notificat...   
335  #linux  #Tutorial Ad Trackz Gold: Industry lea...   
336  RT @KayQuirk1: Be a part of #PJNET's live #Cru...   
337  Minha última criação Idoru fan está vendendo r...   
338  Se instaló Tribunal de lo Contencioso Administ...   
339  Daily Dealshttp://www.sherif.ws/main2.php?link...   
340  ❋肉いっぱい食べても痩せる??今なら無料レポートもらえるらしい。 https://t.co/...   
341  Awesome PHP Resources  https://t.co/UxJb2erxzI...   
342  外資系証券経由の注文状況 20万株の売り越し観測 https://t.co/85D8JxlJ...   
343     To aqui vendo tutorial de como baixar the sims   
344  アニメ化された名作「ギャラリーフェイク」が期間限定で16巻まで読める! https://t....   
345  甘利経済再生相 中国株、やがて落ち着くこと期待 https://t.co/85D8JxlJC...   
346  الحياء الحياء يا ابنة الاسلام...يا من جذورها ا...   
347  RT @catgirls_bot: https://t.co/DNwpCHP8wp http...   
348  dance tutorial on 7/11.\n\nTime Post: Tue Jan ...   
349  @GayGayFox 兵士Lvが87にあがった!(+1) いたいけなさ、せつなさ、ゆるぎなさ...   
350  JavaでYAMLをあつかうなら、SnakeYAMLを利用するのが良いのかしら。今更、XML...   
351  麻生財務相 あまり騒ぎすぎず静かに見た方がよい(金融市場の混乱) https://t.co/...   
352  RT @al3shqvb: حصريا مع اقوي المسلسلات التركيه ...   
353  アートの世界に酔いしれろ!名作「ギャラリーフェイク」期間限定16巻まで公開! https:/...   
354  東京市場 IMM通貨先物で円は買い越しに転換 https://t.co/85D8JxlJC4...   
355  Daily Dealshttp://www.sherif.ws/main2.php?link...   
356  RT @alruschita: ЗАРАБОТОК БЕЗ ВЛОЖЕНИЙ в новой...   
357  "Higher order functions in ES6:Easy as a =>...   
358  RT @conagua_clima: #Pronóstico a muy Corto Pla...   
359  シンガポール日経平均先物 17435円で取引開始 https://t.co/85D8JxlJ...   
360  RT @valizepeda: Mario Pino: “Monte Verde es un...   
361  Solo pic:https://t.co/i9haqVL83K  Complete gal...   
362  @FutWatch put CR7 in there https://t.co/DKSrjW...   

                         created_at     user_id      id_str  \
0    Wed May 13 13:00:30 +0000 2015  3251546824  3251546824   
1    Sun Dec 27 07:28:37 +0000 2015  4659904212  4659904212   
2    Sun Dec 27 07:26:59 +0000 2015  4660028839  4660028839   
3    Wed May 13 13:00:30 +0000 2015  3251546824  3251546824   
4    Thu Nov 05 20:37:13 +0000 2015  4121973370  4121973370   
5    Fri Jan 09 19:30:12 +0000 2015  2970230829  2970230829   
6    Tue Dec 22 12:59:10 +0000 2015  4631232013  4631232013   
7    Tue Nov 10 19:56:42 +0000 2009    89007058    89007058   
8    Sun Mar 09 20:27:58 +0000 2014  2380985478  2380985478   
9    Tue May 12 20:38:56 +0000 2015  3248770619  3248770619   
10   Mon May 04 03:42:57 +0000 2015  3184780645  3184780645   
11   Sat May 16 06:46:25 +0000 2015  3259525461  3259525461   
12   Wed Jun 30 20:11:31 +0000 2010   161417778   161417778   
13   Wed Oct 06 21:37:21 +0000 2010   199438730   199438730   
14   Wed Jan 04 17:47:59 +0000 2012   455054200   455054200   
15   Tue May 12 20:38:56 +0000 2015  3248770619  3248770619   
16   Mon Jun 16 16:00:51 +0000 2014  2571211135  2571211135   
17   Mon May 04 03:42:57 +0000 2015  3184780645  3184780645   
18   Tue Feb 08 12:56:42 +0000 2011   249135370   249135370   
19   Mon Jun 02 19:55:51 +0000 2008    14984212    14984212   
20   Wed Mar 19 06:59:38 +0000 2014  2397416503  2397416503   
21   Thu Feb 18 12:44:29 +0000 2010   115368057   115368057   
22   Fri Nov 16 19:11:18 +0000 2012   952242330   952242330   
23   Tue Nov 19 12:13:16 +0000 2013  2203004029  2203004029   
24   Wed Jun 24 07:35:02 +0000 2015  3254410958  3254410958   
25   Fri Jan 27 01:33:21 +0000 2012   475419487   475419487   
26   Mon May 04 03:42:57 +0000 2015  3184780645  3184780645   
27   Tue Nov 19 12:13:16 +0000 2013  2203004029  2203004029   
28   Thu Mar 14 06:19:09 +0000 2013  1266322362  1266322362   
29   Sat Jan 09 18:38:42 +0000 2016  4756791435  4756791435   
..                              ...         ...         ...   
333  Wed May 27 15:34:45 +0000 2015  3300481947  3300481947   
334  Sat Apr 21 02:16:29 +0000 2012   559116767   559116767   
335  Sat Apr 21 02:16:29 +0000 2012   559116767   559116767   
336  Wed Jun 04 19:52:36 +0000 2014  2546764453  2546764453   
337  Sat Jan 28 19:49:59 +0000 2012   477054476   477054476   
338  Fri Feb 17 01:25:15 +0000 2012   494565289   494565289   
339  Wed Apr 02 23:11:48 +0000 2014  2424456128  2424456128   
340  Mon Aug 18 00:39:57 +0000 2014  2740839318  2740839318   
341  Mon Jun 27 14:19:44 +0000 2011   324962982   324962982   
342  Sun Mar 08 12:36:49 +0000 2015  3068193218  3068193218   
343  Sat Apr 10 20:46:18 +0000 2010   131620275   131620275   
344  Sat Oct 12 10:19:30 +0000 2013  1956288690  1956288690   
345  Sun Mar 08 12:36:49 +0000 2015  3068193218  3068193218   
346  Sat Aug 04 00:28:54 +0000 2012   735786032   735786032   
347  Mon Jul 13 13:23:39 +0000 2015  3373958909  3373958909   
348  Sun Dec 20 04:55:02 +0000 2015  4609297812  4609297812   
349  Sat Mar 01 00:54:46 +0000 2014  2366297082  2366297082   
350  Tue Feb 02 15:01:56 +0000 2010   110721426   110721426   
351  Sun Mar 08 12:36:49 +0000 2015  3068193218  3068193218   
352  Tue Dec 29 20:44:30 +0000 2015  4646634981  4646634981   
353  Tue Aug 04 02:11:57 +0000 2015  3305635999  3305635999   
354  Sun Mar 08 12:36:49 +0000 2015  3068193218  3068193218   
355  Wed Apr 02 23:11:48 +0000 2014  2424456128  2424456128   
356  Sat Jan 09 13:40:27 +0000 2016  4754854636  4754854636   
357  Thu Sep 06 03:14:35 +0000 2012   805941937   805941937   
358  Wed Nov 04 06:47:16 +0000 2015  4121167392  4121167392   
359  Sun Mar 08 12:36:49 +0000 2015  3068193218  3068193218   
360  Thu Jul 02 22:55:56 +0000 2009    53216310    53216310   
361  Wed Feb 25 19:33:33 +0000 2015  3063038925  3063038925   
362  Fri Dec 18 00:35:42 +0000 2015  4519242623  4519242623   

                 username      screen_name                  location  \
0        Таня Жураковская  tanyazhurakovs1                      None   
1                     순유환       esawhorsea                      None   
2                     도율우       rependymak                      None   
3        Таня Жураковская  tanyazhurakovs1                      None   
4            Bajar eBooks      bajarebooks  Papyre FB2 ePub PDF MOBI   
5         Jillayne Mcatee        duhugJill                      None   
6               Tech Club  TechIssuesToday                       USA   
7     tusfantasiascom +18  tusfantasiascom                 Venezuela   
8       Montreal Retweets  MontrealRetweet  Montréal, Québec, Canada   
9     Елизавета Забокрицк  misszabokritsk1                      None   
10         Ines Stagliano    InesStagliano                      None   
11          Данила Горбин    danila_gorbin                      None   
12      Miguel López Luna  miguel_lopezPRI          Zapopan, Jalisco   
13         TechWriter2015   TechWriter2015              ------------   
14           AfjMarketing    Serviceinline            SAIBA MAIS ...   
15    Елизавета Забокрицк  misszabokritsk1                      None   
16      Cynthia Priestley  CynthiaPriestl1                      None   
17         Ines Stagliano    InesStagliano                      None   
18              sinmeikan        sinmeikan                  ,千葉県市川市。   
19                pwitkin          pwitkin             San Francisco   
20             Abe Garver        AbbGarver             Massachusetts   
21   Felipe Renan Donatti   feliperdonatti                      None   
22                      ️     willaholloyd           London, England   
23           Hermy Golden      fuvupotixeh                      None   
24              ᴮᴬᴿᴱʕ•ᴥ•ʔ          BareExo                      None   
25         Nathi Ferreira        FcHtrazom                      None   
26         Ines Stagliano    InesStagliano                      None   
27           Hermy Golden      fuvupotixeh                      None   
28               darkness            28h3d                 somewhere   
29          TRJFPDewsbury            trjfp                      None   
..                    ...              ...                       ...   
333              Jackson      aJxRutledgpe            Washington, DC   
334                wenner         wenner79                   Jakarta   
335                wenner         wenner79                   Jakarta   
336                 MADDD        Da_Momayz       Land of NO FACEBOOK   
337       Bruna Fernandez  BrunaFernandez6                      None   
338        MVM Televisión    MVMTelevision  Oaxaca de Juarez, México   
339           DEALS DEALS      Randajohn90                       USA   
340              二の腕ダイエット        sapiko114                      None   
341        Meryem Akdoğan      MrymAkdogan                      None   
342        パチンコ・パチスロで稼ぐ方法  sougofollowgame                      None   
343               Julieta          Juxubis                       SJC   
344                    電柱     colorjomanda                      None   
345        パチンコ・パチスロで稼ぐ方法  sougofollowgame                      None   
346       waleed almouzan         215e6831                    الرياض   
347   Zenonk (wallpapers)    Zenonk_Felipe                      None   
348               minhpro         minhpro9                      None   
349             【帝】GAYFOX        GayGayFox                       横浜県   
350               cobot_1          cobot_1                      下総の国   
351        パチンコ・パチスロで稼ぐ方法  sougofollowgame                      None   
352       محمد عباس الحسن      brgalthok28                      None   
353                    まる        miyatnmmw                      None   
354        パチンコ・パチスロで稼ぐ方法  sougofollowgame                      None   
355           DEALS DEALS      Randajohn90                       USA   
356       Татьяна Петрова   petrov_tatjana                    Россия   
357         Tze-Chien Chu      TzeChienChu             Taipei,Taiwan   
358             acuosfera     acuosferagdl                      None   
359        パチンコ・パチスロで稼ぐ方法  sougofollowgame                      None   
360         Felipe Zúñiga      felipzuniga                    Chile    
361      HardCoreTeenFuck  hardecoreteenfu                    France   
362        Meshach Gopaul         73Gopaul                      None   

     followers_count  friends_count user_lang following   geo coordinates  \
0                  0              0        ko      None  None        None   
1                  0              0        ko      None  None        None   
2                  1              0        ko      None  None        None   
3                  0              0        ko      None  None        None   
4                 10             59        es      None  None        None   
5                  0              0        ko      None  None        None   
6                394            854        en      None  None        None   
7              69918           1446        es      None  None        None   
8               2872           1445        fr      None  None        None   
9                  0              0        ko      None  None        None   
10                 0              0        ko      None  None        None   
11                 0              0        ko      None  None        None   
12              1081            879        es      None  None        None   
13               763            926        en      None  None        None   
14              1310            578        pt      None  None        None   
15                 0              0        ko      None  None        None   
16               340            992        en      None  None        None   
17                 0              0        ko      None  None        None   
18              5279           5777        ja      None  None        None   
19                27            155        en      None  None        None   
20                26            614        en      None  None        None   
21               586           2125        en      None  None        None   
22              4598            841        fr      None  None        None   
23                 0              0        ko      None  None        None   
24                14             65        en      None  None        None   
25               843           1360        pt      None  None        None   
26                 0              0        ko      None  None        None   
27                 0              0        ko      None  None        None   
28               929            969        ja      None  None        None   
29                23            147        en      None  None        None   
..               ...            ...       ...       ...   ...         ...   
333               70            380        en      None  None        None   
334              806            708        en      None  None        None   
335              806            708        en      None  None        None   
336             1157           1608        en      None  None        None   
337                0              0        pt      None  None        None   
338            11135            995        es      None  None        None   
339               61             32        en      None  None        None   
340             1209           1246        ja      None  None        None   
341              249            343        tr      None  None        None   
342             1050           1213        ja      None  None        None   
343              207            152        pt      None  None        None   
344               48             52        ja      None  None        None   
345             1050           1213        ja      None  None        None   
346               71            108        ar      None  None        None   
347               29            109        pt      None  None        None   
348               67            961        vi      None  None        None   
349              335            181        ja      None  None        None   
350              170            200        ja      None  None        None   
351             1050           1213        ja      None  None        None   
352                1              1        ar      None  None        None   
353                0              1        ja      None  None        None   
354             1050           1213        ja      None  None        None   
355               61             32        en      None  None        None   
356               62           1786        ru      None  None        None   
357               47            112        en      None  None        None   
358                3             10        es      None  None        None   
359             1050           1213        ja      None  None        None   
360             5715           4784        es      None  None        None   
361             6664           7056     en-gb      None  None        None   
362               23             58        en      None  None        None   

     retweet_count  favorite_count lang        country  
0                0               0  und           None  
1                0               0   ko           None  
2                0               0   ko           None  
3                0               0  und           None  
4                0               0   es           None  
5                0               0  und           None  
6                0               0  und           None  
7                0               0   es           None  
8                0               0   tr           None  
9                0               0  und           None  
10               0               0  und           None  
11               0               0  und           None  
12               0               0   es           None  
13               0               0  und           None  
14               0               0   pt           None  
15               0               0  und           None  
16               0               0   en  United States  
17               0               0  und           None  
18               0               0   en           None  
19               0               0   en           None  
20               0               0   en           None  
21               0               0   en           None  
22               0               0   en           None  
23               0               0  und           None  
24               0               0   th           None  
25               0               0   pt           None  
26               0               0  und           None  
27               0               0  und           None  
28               0               0   ja           None  
29               0               0   en           None  
..             ...             ...  ...            ...  
333              0               0   en           None  
334              0               0   fr           None  
335              0               0   en           None  
336              0               0   en           None  
337              0               0   pt           None  
338              0               0   es           None  
339              0               0   en           None  
340              0               0   ja           None  
341              0               0   en           None  
342              0               0   ja           None  
343              0               0   pt           None  
344              0               0   ja           None  
345              0               0   ja           None  
346              0               0   ar           None  
347              0               0  und           None  
348              0               0   en           None  
349              0               0   ja           None  
350              0               0   ja           None  
351              0               0   ja           None  
352              0               0   ar           None  
353              0               0   ja           None  
354              0               0   ja           None  
355              0               0   en           None  
356              0               0   ru           None  
357              0               0   en           None  
358              0               0   es           None  
359              0               0   ja           None  
360              0               0   es           None  
361              0               0   it           None  
362              0               0   en           None  

[363 rows x 17 columns]
In [31]:
tweets_by_loc = tweets['location'].value_counts()
print len(tweets_by_loc)
130
In [33]:
tweets_by_loc = tweets['location'].value_counts()                # determines most frequent tweet countries
print tweets_by_loc          # prints out countries and how many tweets came from each in descending order
USA                           9
おもしろ動画 パワースポット・古代ローマ          6
Москва                        5
Jakarta                       4
UK                            4
Россия                        3
United States                 3
xalapa, ver.                  3
Otonokizaka Academy           2
Sousa - Paraíba               2
NYC                           2
London, England               2
------------                  2
Rio de Janeiro                2
México                        2
Singapore                     2
Everywhere                    2
某サイトにウロチョロ                    1
Seattle•NYC•CHI•Hogwarts      1
Tokyo                         1
北の大地                          1
Land of NO FACEBOOK           1
14ちゃんの猫目の間                    1
Janesville, WI                1
San Luis Obispo, Ca           1
Jakarta, Indonesia            1
Bilbao                        1
Leicester / Wellingborough    1
広島                            1
愛知県                           1
                             ..
,千葉県市川市。                      1
Leiden                        1
2014/09/18〜                   1
Vienna, Austria               1
السعودية/الرياض               1
Catalunya                     1
New York                      1
Taiwan                        1
大雨が降る場所                       1
Chile                         1
Iquique                       1
Panamá                        1
Chicago, Illinois             1
Toronto, Ontario, Canada      1
France                        1
Italia                        1
Japaaaaaan Hokkkkkkkaido      1
Glen Allen, Virginia          1
後日談の世界より                      1
天国と地獄の狭間                      1
North Carolina, USA           1
Cwb                           1
Venezuela                     1
Almost there                  1
SJC                           1
Half of a metal Podcast.      1
Cambridge, MA                 1
Korea                         1
Taipei,Taiwan                 1
Yokohama, Japan               1
Name: location, dtype: int64
In [34]:
tweets_by_lang = tweets['user_lang'].value_counts()                # determines most frequent tweet countries
print tweets_by_lang          # prints out countries and how many tweets came from each in descending order
en       111
ja        81
ko        68
es        26
pt        24
ru        13
ar        12
fr         8
en-gb      4
it         3
en-GB      2
id         2
de         2
th         2
zh-cn      1
tr         1
ca         1
vi         1
pl         1
Name: user_lang, dtype: int64
In [35]:
tweets_by_lang = tweets['user_lang'].value_counts()

fig, ax = plt.subplots()
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('User_lang', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Top 25 user_lang', fontsize=25, fontweight='bold',color='black')
tweets_by_lang[:25].plot(ax=ax, kind='bar', color='black')
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0xfa02518>
In [36]:
tweets_by_loc = tweets['location'].value_counts()

fig, ax = plt.subplots()
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Locations', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Top 25 Locations', fontsize=25, fontweight='bold',color='black')
tweets_by_loc[:25].plot(ax=ax, kind='bar', color='black')
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0xf98d438>
In [38]:
tweets_by_followers_count = tweets['followers_count'].value_counts() # Need to re- check seems something wrong here ...

fig, ax = plt.subplots()
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('followers_count', fontsize=15)
ax.set_ylabel(' Count' , fontsize=15)
ax.set_title('Top 15 followers_count', fontsize=25, fontweight='bold',color='black')
tweets_by_followers_count[:15].plot(ax=ax, kind='bar', color='black')
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x107f49b0>
In [24]:
import re
In [45]:
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match:
        return match.group()
    return ''

if __name__ == '__main__':
    
    tweets_data_path = 'C:/STAT/BRIDGE/Module-4/_______Python Own/Wk-1/Twitter1/_Tech_1.txt'

    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue

    tweets = pd.DataFrame()

    texts = []
    langs = []
    countries = []
    for line, tweet in enumerate(tweets_data):
        #print line, tweet
        try:
            texts.append(tweet['text'])
#            langs.append(tweet['lang'])
#            countries.append(tweet['place']['country'] if tweet['place'] != None else None)
        except:
            print "Error line %d" % (line)
            

    tweets['text'] = texts
#    tweets['lang'] = langs
#    tweets['country'] = countries

    #Mining
    tweets['python'] = tweets['text'].apply(lambda tweet: word_in_text('python', tweet))
    tweets['javascript'] = tweets['text'].apply(lambda tweet: word_in_text('javascript', tweet))
    tweets['ruby'] = tweets['text'].apply(lambda tweet: word_in_text('ruby', tweet))

    tweets['programming'] = tweets['text'].apply(lambda tweet: word_in_text('programming', tweet))
    tweets['tutorial'] = tweets['text'].apply(lambda tweet: word_in_text('tutorial', tweet))

    tweets['relevant'] = tweets['text'].apply(lambda tweet: word_in_text('programming', tweet) or word_in_text('tutorial', tweet))

    tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet))

    tweets_relevant = tweets[tweets['relevant'] == True]
    tweets_relevant_with_link = tweets_relevant[tweets_relevant['link'] != '']

    print tweets_relevant_with_link[tweets_relevant_with_link['python'] == True]['link']
    print tweets_relevant_with_link[tweets_relevant_with_link['javascript'] == True]['link']
    print tweets_relevant_with_link[tweets_relevant_with_link['ruby'] == True]['link']
    plt.show()
Series([], Name: link, dtype: object)
21    https://t.co/iZScYeD2Ij
Name: link, dtype: object
Series([], Name: link, dtype: object)

No comments:

Post a Comment