Friday 8 January 2016

Web Scraping with “RSelenium”

Rohit @Strategic Leadership India

Web Scraping with “RSelenium”

# R Code Starts here ...
# Install RSelenium from GitHub if it is not already available:
# devtools::install_github("ropensci/RSelenium")

# Attach packages from the default library paths. The earlier hard-coded
# lib.loc ("~/R/win-library/3.1") exists only on one Windows/R 3.1 setup and
# makes library() fail elsewhere; RSelenium was also being attached twice.
library(RSelenium)
library(XML)

# Start the Selenium server if one isn't running, using the default server
# initialisation values.
# NOTE(review): startServer() is reported as deprecated/defunct in newer
# RSelenium releases -- confirm against the installed version.
startServer()

# Create a remote driver and open a browser session (the original author
# notes that this opens a Firefox window). TRUE is spelled out because T is
# an ordinary, reassignable variable.
remDr <- remoteDriver$new()
remDr$open(silent = TRUE)

# Load the mobile-phone listing page that the rest of the script scrapes.
remDr$navigate("http://www.shopclues.com/mobiles/mobile-phones.html")
#
# Product titles: every element on the page matching the ".name" CSS selector.
webElems <- remDr$findElements(using = "css selector", ".name")
# getElementText() is called once per element; unlist() flattens the
# per-element results into a single vector.
CSS_Text_Headers <- unlist(
  lapply(webElems, function(elem) {
    elem$getElementText()
  })
)
# Auto-prints the whole vector -- not recommended, the output is large.
CSS_Text_Headers
[1] “Zen Ultraphone Powermax ( 4200 MAh, 5" IPS, Quadcore, 1GB….”
[2] “Zen X8i Shine + Speakers+ Aluma Wallet + Watch”
[3] “Nosama VIRAAT The Phone With Inbuilt Power Bank”
[4] “Nokia 105 (Nokia Pack Of 3)”
[5] “Zopo ZP951 Speed 7 With 3GB RAM + 16GB ROM”
[6] “ADCOM MOBILE X5 WITH VOICE CHANGER FEATURE - Black & Orange”
[7] “Gionee F103 - 3 GB RAM”
[8] “OnePlus One(64GB)- 1 Year Manufacturer Warranty”
[9] “Kenxinda K1( 1GB RAM, MTK 1.3GHz Quadcore) & Free Leather….”
[10] “NOSAMA CHARCOAL 5 QS1 WITH 13 MP ROTATING CAMERA”
[11] “Samsung Galaxy J7”
[12] “Panasonic P55 Novo 2GB 16GB Champ Gold”
[13] “Gionee Marathon M3”
[14] “SAMSUNG GALAXY J5”
[15] “Adcom X14 Chatty With Whatsapp/Facebook-black”
[16] “Rage Yo C Mobile Dual Sim With Bluetooth, MP3 Player Red &….”
[17] “REACH SENSE 402”
[18] “I-Smart IS 201i LITE ( WhatsApp And Advanced Auto Call Recording)”
[19] “I-SMART 111 LITE (with Gujrati , Bengali & Hindi Language)….”
[20] “I-Smart 301i ELITE ( With Whats App )”
[21] “Intex Aqua Power Plus (Champagne)(2GB RAM+ 16GB ROM, 4000mAh Battery)”
[22] “Lava IRIS Fuel F1 (4000mah Battery, 2gb, 5, 8/2mp Camera, 8gb) 1….”
[23] “Google Nexus 5x (32GB, Corbon )”
[24] “I-SMART IS 100L (1000 MAh Battery & Digital Camera, Bluetooth,….”
[25] “Zopo ZP951 Speed 7 With 3GB RAM + 16GB ROM - Black”
[26] “FORME K09 Red”
[27] “Josh Topaz Dual SIM Mobile Buy One Get One Free”
[28] “Gionee Elife S7”
[29] “Google Nexus 5X (16GB, Carbon)”
[30] “Kestrel KM 100 Multimedia Phone (Black)”
[31] “Vell Com V-07 Dual Sim GSM With Whatsapp Multimedia Camera Mobile”
[32] “MicroSoft Lumia 640 XL Dual Sim”
[33] “INTEX MOBILE AquaPlay”
[34] “HTC Desire 526G + (16 GB)”
[35] “Micromax Bolt A58 Red”
[36] “HTC Desire 626g Plus ( Blue )”
[37] “Samsung Galaxy Core 2”
[38] “Samsung Galaxy Core Prime”
[39] “Panasonic Eluga S (White)”
[40] “Vox Kick K4 Android Kitkat Smart Phone Black”
[41] “Celkon Campus Nova A352E (White)”
[42] “Panasonic Eluga A (White)”
[43] “XOLT Mobile Phone XS1”
[44] “Forme Summer S700 (Red)”
[45] “MTS CG 131 ( ZTE D286 ) CDMA + GSM MOBILE FOR TATA RELIANCE MTS….”
[46] “ZTE N799D BLADE EG EVDO+GSM CDMA ANDROID 4.1 ALL Reliance Tata….”
# Wrap the scraped titles in a one-column data frame and preview both ends.
df <- as.data.frame(CSS_Text_Headers)
head(df)
tail(df)
                                           CSS_Text_Headers
1 Zen Ultraphone Powermax ( 4200 MAh, 5" IPS, Quadcore, 1GB….
2 Zen X8i Shine + Speakers+ Aluma Wallet + Watch
3 Nosama VIRAAT The Phone With Inbuilt Power Bank
4 Nokia 105 (Nokia Pack Of 3)
5 Zopo ZP951 Speed 7 With 3GB RAM + 16GB ROM
6 ADCOM MOBILE X5 WITH VOICE CHANGER FEATURE - Black & Orange
CSS_Text_Headers
41 Celkon Campus Nova A352E (White)
42 Panasonic Eluga A (White)
43 XOLT Mobile Phone XS1
44 Forme Summer S700 (Red)
45 MTS CG 131 ( ZTE D286 ) CDMA + GSM MOBILE FOR TATA RELIANCE MTS….
46 ZTE N799D BLADE EG EVDO+GSM CDMA ANDROID 4.1 ALL Reliance Tata….
#
# Price blocks: every element matching the ".product-price" CSS selector.
webElems1 <- remDr$findElements(using = "css selector", ".product-price")
# Pull the visible text of each element and flatten into one vector.
CSS_Text_Headers1 <- unlist(
  lapply(webElems1, function(elem) {
    elem$getElementText()
  })
)
# One-column data frame of the raw price strings; preview first and last rows.
df1 <- as.data.frame(CSS_Text_Headers1)
head(df1)
tail(df1)
            CSS_Text_Headers1
1 Rs. 4,799.5,799 Rs.7,699
2 Rs. 1,299.1,499 Rs.4,126
3 Rs. 1,899.2,499
4 Rs. 3,420.4,016 Rs.4,295
5 Rs. 12,999.14,499 Rs.14,600
6 Rs. 649.899 Rs.999
CSS_Text_Headers1
41 Rs. 2,099.2,499 Rs.2,810
42 Rs. 7,140.10,447
43 Rs. 772.1,349
44 Rs. 1,395.1,999
45 Rs. 2,299.2,550
46 Rs. 6,999.7,999
# ".old-price" can match more than one element per product, so this vector
# does not line up one-to-one with the product titles.
# The ".old-price" selector is suggested by the CSS SelectorGadget tool, but
# the author could not find it in the page source -- open question.
webElems2 <- remDr$findElements(using = "css selector", ".old-price")
# Pull the visible text of each element and flatten into one vector.
CSS_Text_Headers2 <- unlist(
  lapply(webElems2, function(elem) {
    elem$getElementText()
  })
)
# One-column data frame of the old prices; preview first and last rows.
df2 <- as.data.frame(CSS_Text_Headers2)
head(df2)
tail(df2)
CSS_Text_Headers2
1 Rs.5,799
2 Rs.7,699
3 Rs.1,499
4 Rs.4,126
5 Rs.2,499
6 Rs.4,016
CSS_Text_Headers2
65 Rs.2,810
66 Rs.10,447
67 Rs.1,349
68 Rs.1,999
69 Rs.2,550
70 Rs.7,999
#
# Current prices: every element matching the ".price" CSS selector.
webElems3 <- remDr$findElements(using = "css selector", ".price")
# Pull the visible text of each element and flatten into one vector.
CSS_Text_Headers3 <- unlist(
  lapply(webElems3, function(elem) {
    elem$getElementText()
  })
)
# One-column data frame of the price strings; preview first and last rows.
df3 <- as.data.frame(CSS_Text_Headers3)
head(df3)
tail(df3)
CSS_Text_Headers3
1
2 4,799
3 1,299
4 1,899
5 3,420
6 12,999
CSS_Text_Headers3
42 2,099
43 7,140
44 772
45 1,395
46 2,299
47 6,999
#
# Combining titles and prices side by side is left disabled: in the sample
# output shown in this file df ends at row 46 while df3 ends at row 47 (its
# first row is blank), so the two columns have different lengths.
#DF_Main<-cbind(df,df3)
# View(DF_Main)
# The alignment is therefore done offline. The author notes that knitting the
# document ("Knitter", presumably knitr) also fails on this length mismatch.
# Print the current working directory.
getwd()
[1] “C:/STAT/RVest-2”
# Save each scraped table as a CSV file for offline clean-up.
#
# write.csv() replaces write.table(sep = ","), which had two problems:
#   * with row names enabled, write.table() writes a header row that has one
#     field fewer than the data rows, so spreadsheets show the column name
#     above the row-number column; write.csv() writes a blank header cell for
#     the row names and keeps the header aligned;
#   * write.table() escapes embedded double quotes with a backslash, while
#     write.csv() doubles them, which is the CSV convention (the scraped
#     titles contain quotes, e.g. 5" IPS).
# File paths are unchanged.
write.csv(df, "C:/STAT/RVest-2/df.csv")
write.csv(df1, "C:/STAT/RVest-2/df1.csv")
write.csv(df2, "C:/STAT/RVest-2/df2.csv")
write.csv(df3, "C:/STAT/RVest-2/df3.csv")

# Vendors: collect every element on the page that carries an href attribute.
webElems4 <- remDr$findElements("css selector", "[href]")
# Unlike the blocks that read element text, this one reads the href attribute
# value of each element.
# lapply() is used instead of sapply(): sapply() changes its return type
# depending on the input, whereas lapply() always returns a list, matching
# how the element-text vectors are built. The flattened result is unchanged.
# NOTE(review): the original author reported a traceback/debug error at this
# step and chose to ignore it; the cause is not visible here -- confirm.
CSS_HREF_Attr <- unlist(
  lapply(webElems4, function(elem) {
    elem$getElementAttribute("href")
  })
)
df4 <- as.data.frame(CSS_HREF_Attr)
# The author's run returned 560 URLs, including "javascript:void(0);"
# placeholder values.
head(df4)
tail(df4)
                                                              CSS_HREF_Attr
# Needs to be tackled offline 

# R Code Ends here ...


Contact…

No comments:

Post a Comment