# Code inspired by: http://adilmoujahid.com/posts/2014/07/twitter-analytics/
# Import the required classes from the tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
# Variables that contain the user credentials to access the Twitter API.
# Each value is read from an environment variable so that real keys never
# have to be stored in the script. The literal fallbacks are the redacted
# placeholders that were already in this file; they keep the names defined
# when the environment is not configured.
import os

access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '73090441-N74od')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'A########9')
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'Q#############1')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'S#############x')
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    """Stream listener that echoes everything it receives to stdout."""

    def on_data(self, data):
        """Print the raw message received from the stream.

        Returns True so the stream keeps delivering messages.
        """
        # Call form of print works on both Python 2 and Python 3.
        print(data)
        return True

    def on_error(self, status):
        """Print the error status reported by the stream.

        Returns False for status 420 and None otherwise (as before).
        """
        print(status)
        if status == 420:
            # 420 is the streaming API's rate-limit response. Per the tweepy
            # streaming guide, returning False from on_error disconnects the
            # stream instead of reconnecting and lengthening the penalty.
            return False
if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the
    # Twitter Streaming API.
    listener = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, listener)

    # Keywords and phrases to track, one list entry per term. The original
    # passed a single comma-joined string (with 'oreilly' listed twice);
    # separate entries make each term explicit.
    keywords = [
        '#rstats',
        '#python',
        'JavaScript',
        'Perl',
        'JAVA',
        'PHP',
        'RSTUDIO',
        'Tutorial',
        'oreilly',
        'Springer',
        'Python',
        'Pearson',
        'Elsevier',
        'Bertelsmann AG',
        'McGrawHill',
        'Wiley',
        'APRESS',
        'CAMBRIDGE',
        'HARVARD',
        'HarperCollins',
        'HBR',
        'Kadokawa Publishing',
    ]

    # filter() is called without an async flag, so it runs the stream in this
    # thread and hands every message to StdOutListener.on_data. The former
    # `for tweet in stream:` loop is removed: a Stream object is not iterable,
    # so the loop would only raise TypeError once filter() returned. To print
    # just the tweet text, decode the JSON payload inside on_data instead.
    stream.filter(track=keywords)
# Note: only one JSON dump is shown below ----
{"created_at":"Tue Jan 12 01:47:10 +0000 2016","id":686726079554269185,"id_str":"686726079554269185","text":"@Tom_Dart Talk to Tip Top Cafe's Jim Scott, who says 'armed society is a civil society,' allows open carry https:\/\/t.co\/10VHLJKbW4 via @mySA","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":151412907,"in_reply_to_user_id_str":"151412907","in_reply_to_screen_name":"Tom_Dart","user":{"id":1599217352,"id_str":"1599217352","name":"Arthur Cavazos","screen_name":"artcavazos1","location":null,"url":null,"description":"Veteran public relations executive and business owner with a sterling reputation built on solid values, experience, and strategic thinking.","protected":false,"verified":false,"followers_count":901,"friends_count":1976,"listed_count":11,"favourites_count":320,"statuses_count":2435,"created_at":"Tue Jul 16 20:29:31 +0000 
2013","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000143871263\/80968f059be13a992f9da992acba42bb_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000143871263\/80968f059be13a992f9da992acba42bb_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1599217352\/1398272181","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/10VHLJKbW4","expanded_url":"http:\/\/www.mysanantonio.com\/news\/local\/article\/San-Antonio-staple-Tip-Top-Cafe-to-allow-open-6740579.php?cmpid=twitter-desktop","display_url":"mysanantonio.com\/news\/local\/art\u2026","indices":[107,130]}],"user_mentions":[{"screen_name":"Tom_Dart","name":"Tom Dart","id":151412907,"id_str":"151412907","indices":[0,9]},{"screen_name":"mySA","name":"mySA","id":9830752,"id_str":"9830752","indices":[135,140]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1452563230776"}
In [27]:#Import the necessary methods from tweepy library from tweepy.streaming import StreamListener from tweepy import OAuthHandler from tweepy import Stream import json import pandas as pd %matplotlib inline import matplotlib.pyplot as plt import matplotlib matplotlib.style.use('ggplot') #Variables that contains the user credentials to access Twitter API access_token = '73090441-N74orPfAdWYhOvWUHpKFkUkejTqSmOeTjadhfX0Yd' access_token_secret = 'Ah7WiI4tvUcEVoBDXbQJECRSHfUHrCWE78TmU2G3nMGE9' consumer_key = 'Q3M3ZNo3QL9PDAjOoZCMKowM1' consumer_secret = 'SOenR4B7JzF6KMi5WtbSpAYX9MLlzSy6mlud3c3nn0on5yu3Qx'In [28]:tweets_data_path = 'C:/STAT/BRIDGE/Module-4/_______Python Own/Wk-1/Twitter1/_Tech_1.txt' tweets_data = [] tweets_file = open(tweets_data_path, "r") for line in tweets_file: try: tweet = json.loads(line) tweets_data.append(tweet) except: continueIn [29]:print len(tweets_data)363In [30]:tweets = pd.DataFrame() tweets['tweet'] = map(lambda tweet: tweet['text'] if 'text' in tweet else None, tweets_data) tweets['created_at'] = map(lambda tweet: tweet['created_at'] if 'created_at' in tweet else None, tweets_data) tweets['user_id'] = map(lambda tweet: tweet['user']['id'] if 'user' in tweet else None, tweets_data) tweets['id_str'] = map(lambda tweet: tweet['user']['id_str'] if 'user' in tweet else None, tweets_data) tweets['username'] = map(lambda tweet: tweet['user']['name'] if 'user' in tweet else None, tweets_data) tweets['screen_name'] = map(lambda tweet: tweet['user']['screen_name'] if 'user' in tweet else None, tweets_data) tweets['location'] = map(lambda tweet: tweet['user']['location'] if 'user' in tweet else None, tweets_data) tweets['followers_count'] = map(lambda tweet: tweet['user']['followers_count'] if 'user' in tweet else None, tweets_data) tweets['friends_count'] = map(lambda tweet: tweet['user']['friends_count'] if 'user' in tweet else None, tweets_data) tweets['created_at'] = map(lambda tweet: tweet['user']['created_at'] if 'user' in tweet else 
None, tweets_data) tweets['user_lang'] = map(lambda tweet: tweet['user']['lang'] if 'user' in tweet else None, tweets_data) tweets['following'] = map(lambda tweet: tweet['user']['following'] if 'user' in tweet else None, tweets_data) tweets['geo'] = map(lambda tweet: tweet['geo'], tweets_data) # tweets['coordinates'] = map(lambda tweet: tweet['coordinates'], tweets_data) tweets['retweet_count'] = map(lambda tweet: tweet['retweet_count'], tweets_data) tweets['favorite_count'] = map(lambda tweet: tweet['favorite_count'], tweets_data) tweets['lang'] = map(lambda tweet: tweet['lang'], tweets_data) # if k1 in d and k2 in d[k1] tweets['country'] = map(lambda tweet: tweet['place']['country'] if 'country' in tweet else None, tweets_data) tweets['country'] = map(lambda tweet: tweet['place']['country'] if tweet['place'] != None else None, tweets_data) print (tweets)tweet \ 0 https://t.co/9F9j8Usc8Y https://t.co/gHZV0Y3PP... 1 https://t.co/HH0P8pluxb https://t.co/xRhIYSGsr... 2 설공김추성안차장한문곽민홍신손곽길남노 https://t.co/tM1UiWQmZz ht... 3 https://t.co/sE72SAsDpf https://t.co/XHFnN6FyN... 4 My descarga Carreteras Secundarias porque ¨Est... 5 https://t.co/GyC1jlHPWu https://t.co/w5yjxIuuG... 6 #Java Multi Dimensional array/2d array\n#Tech ... 7 💋 LUNA de #Caracas 🇻🇪 ver telefono y tarifa... 8 RT @CanselSevenler: @cansellelcin #làoùatillap... 9 https://t.co/7wLUtAUG82 https://t.co/QDN14LnUh... 10 https://t.co/e9lg9NbBGo https://t.co/LW07kta5h... 11 https://t.co/3Lv0xlvYUL https://t.co/3080T0BNz... 12 RT @conagua_clima: #Pronóstico a muy Corto Pla... 13 J0116-0672 - Java Developer - Java Developer, ... 14 Neobux: Aprenda Lucrar Com Neobux. Tutorial, T... 15 https://t.co/IxfhBonpV3 https://t.co/lRuWYgt48... 16 Everything You Need to Know About This Weekend... 17 https://t.co/vpSyXhyIJL https://t.co/LT1Vjvz2c... 18 RT @JoeTraver: Flowers http://t.co/MtM3u7Z3lH ... 19 Last survivor of 1906 San Francisco earthquake... 20 RT @Mark_Beech: Ok, one more. Five Years. @Dav... 
21 RT @GBNdiscounts: 89% off Mastering JavaScript... 22 RT @lyonsempirefox: JANUARY, 10 | @TherealTara... 23 https://t.co/QdZYiQK5Ec https://t.co/eQ3YvdLPc... 24 RT @Kmm_eunnie: แนะนำและหนำเหนอสุด~^^ https://... 25 @PudimDUrsoPanda boa noite... ajuda num duelo ... 26 https://t.co/O9SygnrbBq https://t.co/pkGeVQthS... 27 https://t.co/yP9aIqZoGo https://t.co/ubEBHUJ2C... 28 @28h3d 盗賊Lvが38にあがった!(+1) ほろにがさ、はらだたしさ、せこさ等があがっ... 29 @TerryKingOBE -1st mention of @trjfp #Dewsbury... .. ... 333 RT @Mark_Beech: Ok, one more. Five Years. @Dav... 334 #linux #Tutorial B2 Notifier: Email Notificat... 335 #linux #Tutorial Ad Trackz Gold: Industry lea... 336 RT @KayQuirk1: Be a part of #PJNET's live #Cru... 337 Minha última criação Idoru fan está vendendo r... 338 Se instaló Tribunal de lo Contencioso Administ... 339 Daily Dealshttp://www.sherif.ws/main2.php?link... 340 ❋肉いっぱい食べても痩せる??今なら無料レポートもらえるらしい。 https://t.co/... 341 Awesome PHP Resources https://t.co/UxJb2erxzI... 342 外資系証券経由の注文状況 20万株の売り越し観測 https://t.co/85D8JxlJ... 343 To aqui vendo tutorial de como baixar the sims 344 アニメ化された名作「ギャラリーフェイク」が期間限定で16巻まで読める! https://t.... 345 甘利経済再生相 中国株、やがて落ち着くこと期待 https://t.co/85D8JxlJC... 346 الحياء الحياء يا ابنة الاسلام...يا من جذورها ا... 347 RT @catgirls_bot: https://t.co/DNwpCHP8wp http... 348 dance tutorial on 7/11.\n\nTime Post: Tue Jan ... 349 @GayGayFox 兵士Lvが87にあがった!(+1) いたいけなさ、せつなさ、ゆるぎなさ... 350 JavaでYAMLをあつかうなら、SnakeYAMLを利用するのが良いのかしら。今更、XML... 351 麻生財務相 あまり騒ぎすぎず静かに見た方がよい(金融市場の混乱) https://t.co/... 352 RT @al3shqvb: حصريا مع اقوي المسلسلات التركيه ... 353 アートの世界に酔いしれろ!名作「ギャラリーフェイク」期間限定16巻まで公開! https:/... 354 東京市場 IMM通貨先物で円は買い越しに転換 https://t.co/85D8JxlJC4... 355 Daily Dealshttp://www.sherif.ws/main2.php?link... 356 RT @alruschita: ЗАРАБОТОК БЕЗ ВЛОЖЕНИЙ в новой... 357 "Higher order functions in ES6:Easy as a =>... 358 RT @conagua_clima: #Pronóstico a muy Corto Pla... 359 シンガポール日経平均先物 17435円で取引開始 https://t.co/85D8JxlJ... 360 RT @valizepeda: Mario Pino: “Monte Verde es un... 
361 Solo pic:https://t.co/i9haqVL83K Complete gal... 362 @FutWatch put CR7 in there https://t.co/DKSrjW... created_at user_id id_str \ 0 Wed May 13 13:00:30 +0000 2015 3251546824 3251546824 1 Sun Dec 27 07:28:37 +0000 2015 4659904212 4659904212 2 Sun Dec 27 07:26:59 +0000 2015 4660028839 4660028839 3 Wed May 13 13:00:30 +0000 2015 3251546824 3251546824 4 Thu Nov 05 20:37:13 +0000 2015 4121973370 4121973370 5 Fri Jan 09 19:30:12 +0000 2015 2970230829 2970230829 6 Tue Dec 22 12:59:10 +0000 2015 4631232013 4631232013 7 Tue Nov 10 19:56:42 +0000 2009 89007058 89007058 8 Sun Mar 09 20:27:58 +0000 2014 2380985478 2380985478 9 Tue May 12 20:38:56 +0000 2015 3248770619 3248770619 10 Mon May 04 03:42:57 +0000 2015 3184780645 3184780645 11 Sat May 16 06:46:25 +0000 2015 3259525461 3259525461 12 Wed Jun 30 20:11:31 +0000 2010 161417778 161417778 13 Wed Oct 06 21:37:21 +0000 2010 199438730 199438730 14 Wed Jan 04 17:47:59 +0000 2012 455054200 455054200 15 Tue May 12 20:38:56 +0000 2015 3248770619 3248770619 16 Mon Jun 16 16:00:51 +0000 2014 2571211135 2571211135 17 Mon May 04 03:42:57 +0000 2015 3184780645 3184780645 18 Tue Feb 08 12:56:42 +0000 2011 249135370 249135370 19 Mon Jun 02 19:55:51 +0000 2008 14984212 14984212 20 Wed Mar 19 06:59:38 +0000 2014 2397416503 2397416503 21 Thu Feb 18 12:44:29 +0000 2010 115368057 115368057 22 Fri Nov 16 19:11:18 +0000 2012 952242330 952242330 23 Tue Nov 19 12:13:16 +0000 2013 2203004029 2203004029 24 Wed Jun 24 07:35:02 +0000 2015 3254410958 3254410958 25 Fri Jan 27 01:33:21 +0000 2012 475419487 475419487 26 Mon May 04 03:42:57 +0000 2015 3184780645 3184780645 27 Tue Nov 19 12:13:16 +0000 2013 2203004029 2203004029 28 Thu Mar 14 06:19:09 +0000 2013 1266322362 1266322362 29 Sat Jan 09 18:38:42 +0000 2016 4756791435 4756791435 .. ... ... ... 
333 Wed May 27 15:34:45 +0000 2015 3300481947 3300481947 334 Sat Apr 21 02:16:29 +0000 2012 559116767 559116767 335 Sat Apr 21 02:16:29 +0000 2012 559116767 559116767 336 Wed Jun 04 19:52:36 +0000 2014 2546764453 2546764453 337 Sat Jan 28 19:49:59 +0000 2012 477054476 477054476 338 Fri Feb 17 01:25:15 +0000 2012 494565289 494565289 339 Wed Apr 02 23:11:48 +0000 2014 2424456128 2424456128 340 Mon Aug 18 00:39:57 +0000 2014 2740839318 2740839318 341 Mon Jun 27 14:19:44 +0000 2011 324962982 324962982 342 Sun Mar 08 12:36:49 +0000 2015 3068193218 3068193218 343 Sat Apr 10 20:46:18 +0000 2010 131620275 131620275 344 Sat Oct 12 10:19:30 +0000 2013 1956288690 1956288690 345 Sun Mar 08 12:36:49 +0000 2015 3068193218 3068193218 346 Sat Aug 04 00:28:54 +0000 2012 735786032 735786032 347 Mon Jul 13 13:23:39 +0000 2015 3373958909 3373958909 348 Sun Dec 20 04:55:02 +0000 2015 4609297812 4609297812 349 Sat Mar 01 00:54:46 +0000 2014 2366297082 2366297082 350 Tue Feb 02 15:01:56 +0000 2010 110721426 110721426 351 Sun Mar 08 12:36:49 +0000 2015 3068193218 3068193218 352 Tue Dec 29 20:44:30 +0000 2015 4646634981 4646634981 353 Tue Aug 04 02:11:57 +0000 2015 3305635999 3305635999 354 Sun Mar 08 12:36:49 +0000 2015 3068193218 3068193218 355 Wed Apr 02 23:11:48 +0000 2014 2424456128 2424456128 356 Sat Jan 09 13:40:27 +0000 2016 4754854636 4754854636 357 Thu Sep 06 03:14:35 +0000 2012 805941937 805941937 358 Wed Nov 04 06:47:16 +0000 2015 4121167392 4121167392 359 Sun Mar 08 12:36:49 +0000 2015 3068193218 3068193218 360 Thu Jul 02 22:55:56 +0000 2009 53216310 53216310 361 Wed Feb 25 19:33:33 +0000 2015 3063038925 3063038925 362 Fri Dec 18 00:35:42 +0000 2015 4519242623 4519242623 username screen_name location \ 0 Таня Жураковская tanyazhurakovs1 None 1 순유환 esawhorsea None 2 도율우 rependymak None 3 Таня Жураковская tanyazhurakovs1 None 4 Bajar eBooks bajarebooks Papyre FB2 ePub PDF MOBI 5 Jillayne Mcatee duhugJill None 6 Tech Club TechIssuesToday USA 7 tusfantasiascom +18 tusfantasiascom 
Venezuela 8 Montreal Retweets MontrealRetweet Montréal, Québec, Canada 9 Елизавета Забокрицк misszabokritsk1 None 10 Ines Stagliano InesStagliano None 11 Данила Горбин danila_gorbin None 12 Miguel López Luna miguel_lopezPRI Zapopan, Jalisco 13 TechWriter2015 TechWriter2015 ------------ 14 AfjMarketing Serviceinline SAIBA MAIS ... 15 Елизавета Забокрицк misszabokritsk1 None 16 Cynthia Priestley CynthiaPriestl1 None 17 Ines Stagliano InesStagliano None 18 sinmeikan sinmeikan ,千葉県市川市。 19 pwitkin pwitkin San Francisco 20 Abe Garver AbbGarver Massachusetts 21 Felipe Renan Donatti feliperdonatti None 22 ️ willaholloyd London, England 23 Hermy Golden fuvupotixeh None 24 ᴮᴬᴿᴱʕ•ᴥ•ʔ BareExo None 25 Nathi Ferreira FcHtrazom None 26 Ines Stagliano InesStagliano None 27 Hermy Golden fuvupotixeh None 28 darkness 28h3d somewhere 29 TRJFPDewsbury trjfp None .. ... ... ... 333 Jackson aJxRutledgpe Washington, DC 334 wenner wenner79 Jakarta 335 wenner wenner79 Jakarta 336 MADDD Da_Momayz Land of NO FACEBOOK 337 Bruna Fernandez BrunaFernandez6 None 338 MVM Televisión MVMTelevision Oaxaca de Juarez, México 339 DEALS DEALS Randajohn90 USA 340 二の腕ダイエット sapiko114 None 341 Meryem Akdoğan MrymAkdogan None 342 パチンコ・パチスロで稼ぐ方法 sougofollowgame None 343 Julieta Juxubis SJC 344 電柱 colorjomanda None 345 パチンコ・パチスロで稼ぐ方法 sougofollowgame None 346 waleed almouzan 215e6831 الرياض 347 Zenonk (wallpapers) Zenonk_Felipe None 348 minhpro minhpro9 None 349 【帝】GAYFOX GayGayFox 横浜県 350 cobot_1 cobot_1 下総の国 351 パチンコ・パチスロで稼ぐ方法 sougofollowgame None 352 محمد عباس الحسن brgalthok28 None 353 まる miyatnmmw None 354 パチンコ・パチスロで稼ぐ方法 sougofollowgame None 355 DEALS DEALS Randajohn90 USA 356 Татьяна Петрова petrov_tatjana Россия 357 Tze-Chien Chu TzeChienChu Taipei,Taiwan 358 acuosfera acuosferagdl None 359 パチンコ・パチスロで稼ぐ方法 sougofollowgame None 360 Felipe Zúñiga felipzuniga Chile 361 HardCoreTeenFuck hardecoreteenfu France 362 Meshach Gopaul 73Gopaul None followers_count friends_count user_lang following geo coordinates \ 0 
0 0 ko None None None 1 0 0 ko None None None 2 1 0 ko None None None 3 0 0 ko None None None 4 10 59 es None None None 5 0 0 ko None None None 6 394 854 en None None None 7 69918 1446 es None None None 8 2872 1445 fr None None None 9 0 0 ko None None None 10 0 0 ko None None None 11 0 0 ko None None None 12 1081 879 es None None None 13 763 926 en None None None 14 1310 578 pt None None None 15 0 0 ko None None None 16 340 992 en None None None 17 0 0 ko None None None 18 5279 5777 ja None None None 19 27 155 en None None None 20 26 614 en None None None 21 586 2125 en None None None 22 4598 841 fr None None None 23 0 0 ko None None None 24 14 65 en None None None 25 843 1360 pt None None None 26 0 0 ko None None None 27 0 0 ko None None None 28 929 969 ja None None None 29 23 147 en None None None .. ... ... ... ... ... ... 333 70 380 en None None None 334 806 708 en None None None 335 806 708 en None None None 336 1157 1608 en None None None 337 0 0 pt None None None 338 11135 995 es None None None 339 61 32 en None None None 340 1209 1246 ja None None None 341 249 343 tr None None None 342 1050 1213 ja None None None 343 207 152 pt None None None 344 48 52 ja None None None 345 1050 1213 ja None None None 346 71 108 ar None None None 347 29 109 pt None None None 348 67 961 vi None None None 349 335 181 ja None None None 350 170 200 ja None None None 351 1050 1213 ja None None None 352 1 1 ar None None None 353 0 1 ja None None None 354 1050 1213 ja None None None 355 61 32 en None None None 356 62 1786 ru None None None 357 47 112 en None None None 358 3 10 es None None None 359 1050 1213 ja None None None 360 5715 4784 es None None None 361 6664 7056 en-gb None None None 362 23 58 en None None None retweet_count favorite_count lang country 0 0 0 und None 1 0 0 ko None 2 0 0 ko None 3 0 0 und None 4 0 0 es None 5 0 0 und None 6 0 0 und None 7 0 0 es None 8 0 0 tr None 9 0 0 und None 10 0 0 und None 11 0 0 und None 12 0 0 es None 13 0 0 und None 14 0 0 pt None 
15 0 0 und None 16 0 0 en United States 17 0 0 und None 18 0 0 en None 19 0 0 en None 20 0 0 en None 21 0 0 en None 22 0 0 en None 23 0 0 und None 24 0 0 th None 25 0 0 pt None 26 0 0 und None 27 0 0 und None 28 0 0 ja None 29 0 0 en None .. ... ... ... ... 333 0 0 en None 334 0 0 fr None 335 0 0 en None 336 0 0 en None 337 0 0 pt None 338 0 0 es None 339 0 0 en None 340 0 0 ja None 341 0 0 en None 342 0 0 ja None 343 0 0 pt None 344 0 0 ja None 345 0 0 ja None 346 0 0 ar None 347 0 0 und None 348 0 0 en None 349 0 0 ja None 350 0 0 ja None 351 0 0 ja None 352 0 0 ar None 353 0 0 ja None 354 0 0 ja None 355 0 0 en None 356 0 0 ru None 357 0 0 en None 358 0 0 es None 359 0 0 ja None 360 0 0 es None 361 0 0 it None 362 0 0 en None [363 rows x 17 columns]In [31]:tweets_by_loc = tweets['location'].value_counts() print len(tweets_by_loc)130In [33]:tweets_by_loc = tweets['location'].value_counts() # determines most frequent tweet countries print tweets_by_loc # prints out countries and how many tweets came from each in descending orderUSA 9 おもしろ動画 パワースポット・古代ローマ 6 Москва 5 Jakarta 4 UK 4 Россия 3 United States 3 xalapa, ver. 3 Otonokizaka Academy 2 Sousa - Paraíba 2 NYC 2 London, England 2 ------------ 2 Rio de Janeiro 2 México 2 Singapore 2 Everywhere 2 某サイトにウロチョロ 1 Seattle•NYC•CHI•Hogwarts 1 Tokyo 1 北の大地 1 Land of NO FACEBOOK 1 14ちゃんの猫目の間 1 Janesville, WI 1 San Luis Obispo, Ca 1 Jakarta, Indonesia 1 Bilbao 1 Leicester / Wellingborough 1 広島 1 愛知県 1 .. ,千葉県市川市。 1 Leiden 1 2014/09/18〜 1 Vienna, Austria 1 السعودية/الرياض 1 Catalunya 1 New York 1 Taiwan 1 大雨が降る場所 1 Chile 1 Iquique 1 Panamá 1 Chicago, Illinois 1 Toronto, Ontario, Canada 1 France 1 Italia 1 Japaaaaaan Hokkkkkkkaido 1 Glen Allen, Virginia 1 後日談の世界より 1 天国と地獄の狭間 1 North Carolina, USA 1 Cwb 1 Venezuela 1 Almost there 1 SJC 1 Half of a metal Podcast. 
1 Cambridge, MA 1 Korea 1 Taipei,Taiwan 1 Yokohama, Japan 1 Name: location, dtype: int64In [34]:tweets_by_lang = tweets['user_lang'].value_counts() # determines most frequent tweet countries print tweets_by_lang # prints out countries and how many tweets came from each in descending orderen 111 ja 81 ko 68 es 26 pt 24 ru 13 ar 12 fr 8 en-gb 4 it 3 en-GB 2 id 2 de 2 th 2 zh-cn 1 tr 1 ca 1 vi 1 pl 1 Name: user_lang, dtype: int64In [35]:tweets_by_lang = tweets['user_lang'].value_counts() fig, ax = plt.subplots() ax.tick_params(axis='x', labelsize=15) ax.tick_params(axis='y', labelsize=10) ax.set_xlabel('User_lang', fontsize=15) ax.set_ylabel('Number of tweets' , fontsize=15) ax.set_title('Top 25 user_lang', fontsize=25, fontweight='bold',color='black') tweets_by_lang[:25].plot(ax=ax, kind='bar', color='black')Out[35]:<matplotlib.axes._subplots.AxesSubplot at 0xfa02518>In [36]:tweets_by_loc = tweets['location'].value_counts() fig, ax = plt.subplots() ax.tick_params(axis='x', labelsize=15) ax.tick_params(axis='y', labelsize=10) ax.set_xlabel('Locations', fontsize=15) ax.set_ylabel('Number of tweets' , fontsize=15) ax.set_title('Top 25 Locations', fontsize=25, fontweight='bold',color='black') tweets_by_loc[:25].plot(ax=ax, kind='bar', color='black')Out[36]:<matplotlib.axes._subplots.AxesSubplot at 0xf98d438>In [38]:tweets_by_followers_count = tweets['followers_count'].value_counts() # Need to re- check seems something wrong here ... 
fig, ax = plt.subplots()
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('followers_count', fontsize=15)
ax.set_ylabel(' Count', fontsize=15)
ax.set_title('Top 15 followers_count', fontsize=25, fontweight='bold', color='black')
tweets_by_followers_count[:15].plot(ax=ax, kind='bar', color='black')
# Out[38]: <matplotlib.axes._subplots.AxesSubplot at 0x107f49b0>

# In [24]:
import re


# In [45]:
def word_in_text(word, text):
    """Return True if `word` is found in `text`, ignoring case.

    Both arguments are lower-cased and `word` is then passed to `re.search`,
    so it is interpreted as a regular expression pattern, not a literal.
    A `text` of None is treated as "no match" rather than raising.
    """
    if text is None:
        return False
    return re.search(word.lower(), text.lower()) is not None


def extract_link(text):
    """Return the first URL found in `text`, or '' when there is none.

    A URL is anything starting with http://, https:// or www. and running up
    to the next whitespace, '<', '>' or '"'. A `text` of None yields ''.
    """
    if text is None:
        return ''
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    return match.group() if match else ''


if __name__ == '__main__':
    tweets_data_path = 'C:/STAT/BRIDGE/Module-4/_______Python Own/Wk-1/Twitter1/_Tech_1.txt'

    # Load one JSON document per line; lines that are not valid JSON
    # (blank keep-alive lines, truncated records) are skipped.
    tweets_data = []
    with open(tweets_data_path, "r") as tweets_file:
        for raw_line in tweets_file:
            try:
                tweets_data.append(json.loads(raw_line))
            except ValueError:
                continue

    # Collect the text of every record that has one. Records without a
    # 'text' field are reported by their index and left out.
    # (Extraction of 'lang' and 'country' was commented out in the original
    # cell, so the unused `langs` / `countries` lists are dropped.)
    texts = []
    for index, tweet in enumerate(tweets_data):
        try:
            texts.append(tweet['text'])
        except (KeyError, TypeError):
            print("Error line %d" % index)

    tweets = pd.DataFrame()
    tweets['text'] = texts

    # Mining: one boolean column per keyword of interest.
    for keyword in ('python', 'javascript', 'ruby', 'programming', 'tutorial'):
        tweets[keyword] = tweets['text'].apply(
            lambda text, keyword=keyword: word_in_text(keyword, text))
    tweets['relevant'] = tweets['text'].apply(
        lambda text: word_in_text('programming', text) or word_in_text('tutorial', text))
    tweets['link'] = tweets['text'].apply(extract_link)

    # Keep only relevant tweets that actually carry a link.
    # astype(bool) keeps the masks valid even when the frame is empty and
    # the columns come back with object dtype.
    tweets_relevant = tweets[tweets['relevant'].astype(bool)]
    tweets_relevant_with_link = tweets_relevant[tweets_relevant['link'] != '']

    for language in ('python', 'javascript', 'ruby'):
        mask = tweets_relevant_with_link[language].astype(bool)
        print(tweets_relevant_with_link[mask]['link'])

    plt.show()

# Output of the original run:
# Series([], Name: link, dtype: object)
# 21    https://t.co/iZScYeD2Ij
# Name: link, dtype: object
# Series([], Name: link, dtype: object)
No comments:
Post a Comment