Data scraping/web scraping
- Crawling
- Extraction: parsing, searching, reformatting…
Icon made by Freepik from http://www.flaticon.com
10/30/2017
# Authenticate with the Twitter REST API via OAuth.
# Replace the placeholder strings with the credentials from your own
# Twitter developer app before running.
library(twitteR)

consumer_key    <- "your consumer key"
consumer_secret <- "your consumer secret"
access_token    <- "your token"
access_secret   <- "your secret"

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
# Collect a small seed network around a Twitter search query: for each
# matching tweet, look up the author's profile, location, and follower IDs,
# and store them in a global variable named "<screenName>_<tweetId>".
seed_user <- getUser('@chekao')
seed_user$getFollowerIDs()

# try different search operators (check twitter search)
tweets <- searchTwitter('vaccines AND autism', n = 3)

# vapply() is type-stable (always a character vector), unlike sapply()
target_users  <- vapply(tweets, function(x) x$screenName, character(1))
target_tweets <- vapply(tweets, function(x) x$getId(), character(1))

# retweet related functions are buggy, need to report
# seq_along() handles the zero-results case safely (1:length() does not)
for (f in seq_along(target_tweets)) {
  temp_name <- paste0("@", target_users[f])
  temp_seed <- getUser(temp_name)
  temp_location <- temp_seed$getLocation()
  # follower lookup can fail (protected/suspended accounts); keep going
  temp_followers <- try(temp_seed$getFollowerIDs())
  if (inherits(temp_followers, "try-error")) {
    temp_followers <- NULL
  }
  assign(
    paste(target_users[f], target_tweets[f], sep = "_"),
    c(temp_seed, temp_location, temp_followers)
  )
}
# Authenticate with the Facebook Graph API and cache the token to disk.
# Replace the placeholder strings with your own Facebook app credentials.
library(Rfacebook)

fb_oauth <- fbOAuth(
  app_id = "your app id",
  app_secret = "your app secret",
  extended_permissions = TRUE
)

# Save the token so later sessions can reuse it without re-authenticating.
save(fb_oauth, file = "token")
# next time you can just load the token
# load("token")
# Basic Graph API calls using the saved OAuth token object.
getUsers("me", token = fb_oauth)               # public profile
head(getLikes(user = "me", token = fb_oauth))  # likes
updateStatus("hiho", fb_oauth)                 # fb post via R
# yhk_friends <- getFriends(fb_oauth, simplify = TRUE)  # get friends who are using the app
# Graph API calls using a short-lived access-token string
# (e.g. copied from the Graph API Explorer) instead of an OAuth object.
library(Rfacebook)

fb_oauth <- "your temp token"

getUsers("me", token = fb_oauth)               # public profile
head(getLikes(user = "me", token = fb_oauth))  # likes
updateStatus("hiho", fb_oauth)                 # fb post via R
yhk_friends <- getFriends(fb_oauth, simplify = TRUE)
# Scrape a public page: fetch recent posts, then the likes on the newest
# post, then the public profiles of the users who liked it.
pages <- getPage("DonaldTrump", fb_oauth, n = 5)

first_post_id <- pages$id[1]
target_post <- getPost(first_post_id, fb_oauth, n = 100, likes = TRUE, comments = FALSE)

target_user <- getUsers(target_post$likes$from_id, fb_oauth)
# adapted the script from Jose Gonzalez
library(RCurl)
library(RJSONIO)

# Build a Google Geocoding API request URL.
#   address     - free-text address to geocode
#   return.call - response format, "json" (default) or "xml"
#   sensor      - legacy "sensor" flag, kept for backward compatibility
# Returns the percent-encoded request URL as a single string.
query_url <- function(address, return.call = "json", sensor = "false") {
  # The Geocoding API is HTTPS-only and served from maps.googleapis.com;
  # the old http://maps.google.com root no longer works.
  root <- "https://maps.googleapis.com/maps/api/geocode/"
  u <- paste0(root, return.call, "?address=", address, "&sensor=", sensor)
  # URLencode() percent-encodes spaces and other unsafe characters
  URLencode(u)
}
# get and parse json result
# Geocode an address via the Google Geocoding API.
#   address - free-text address to geocode
#   verbose - if TRUE, print each address before the request is sent
# Returns c(lat, lng, location_type, formatted_address) on success,
# or c(NA, NA, NA, NA) when the API status is not "OK".
geoCode <- function(address, verbose = FALSE) {
  if (verbose) cat(address, "\n")
  u <- query_url(address)
  doc <- getURL(u)
  x <- fromJSON(doc, simplify = FALSE)
  # Throttle requests to stay under the API rate limit. In the original
  # this sleep sat after return() and was unreachable, so the intended
  # throttling never happened.
  Sys.sleep(0.5)
  if (x$status == "OK") {
    result <- x$results[[1]]
    lat <- result$geometry$location$lat
    lng <- result$geometry$location$lng
    location_type <- result$geometry$location_type
    formatted_address <- result$formatted_address
    c(lat, lng, location_type, formatted_address)
  } else {
    c(NA, NA, NA, NA)
  }
}
# adapted the script from Jose Gonzalez
library(ggmap)

# Geocode the target address, then draw a road map centered on it.
target_loc <- geoCode("ann arbor sph")

# geoCode() returns c(lat, lng, ...) as character; convert for get_map()
center_lon <- as.numeric(target_loc[2])
center_lat <- as.numeric(target_loc[1])

sphmap <- get_map(
  location = c(lon = center_lon, lat = center_lat),
  zoom = 10,
  maptype = "roadmap",
  scale = 2
)
ggmap(sphmap)