Data scraping / web scraping
- Crawling: programmatically visiting pages to collect them
- Extraction: parsing, searching, and reformatting the collected content
Icon made by Freepik from http://www.flaticon.com
10/30/2017
# Authenticate against the Twitter REST API.
# Fill in the credentials of your own app (apps.twitter.com).
library(twitteR)
consumer_key <- "your consumer key"
consumer_secret <- "your consumer secret"
access_token <- "your token"
access_secret <- "your secret"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

# Look up a single seed user and list the IDs of their followers.
seed_user <- getUser('@chekao')
seed_user$getFollowerIDs()

# Search recent tweets matching both terms; n = 3 keeps the demo small.
tweets <- searchTwitter('vaccines AND autism', n = 3)

# vapply() pins the result to a character vector; sapply() can silently
# change its return type (e.g. to list()) when the search comes back empty.
target_users <- vapply(tweets, function(x) x$screenName, character(1))
target_tweets <- vapply(tweets, function(x) x$getId(), character(1))

#retweet related functions are buggy, need to report
# For each hit: fetch the author's profile, location, and follower IDs,
# then stash the bundle in a variable named "<screenName>_<tweetId>".
# seq_along() is safe on zero search results, unlike 1:length().
for (f in seq_along(target_tweets)) {
  temp_name <- paste0("@", target_users[f])
  temp_seed <- getUser(temp_name)
  temp_location <- temp_seed$getLocation()
  # try() so one protected/suspended account does not abort the whole loop
  temp_followers <- try(temp_seed$getFollowerIDs())
  assign(
    paste(target_users[f], target_tweets[f], sep = "_"),
    c(temp_seed, temp_location, temp_followers)
  )
}
# try different search operators (check twitter search)
# One-time Facebook OAuth: register an app at developers.facebook.com,
# then exchange its id/secret for a reusable token object.
library(Rfacebook)
fb_oauth <- fbOAuth(
  app_id = "your app id",
  app_secret = "your app secret",  # fixed placeholder typo ("you app secret")
  extended_permissions = TRUE
)
save(fb_oauth, file = "token")  # next time you can just load the token
# load("token")
getUsers("me", token = fb_oauth)  # public profile
head(getLikes(user = "me", token = fb_oauth))  # likes
updateStatus("hiho", fb_oauth)  # fb post via R
# yhk_friends <- getFriends(fb_oauth, simplify = TRUE)  # get friends who are using the app
# Alternative authentication: paste a short-lived Graph API token
# (e.g. from the Graph API Explorer) instead of the full OAuth dance.
library(Rfacebook)
fb_oauth <- "your temp token"
getUsers("me",token=fb_oauth) #public profile
head(getLikes(user="me", token=fb_oauth)) #likes
updateStatus("hiho", fb_oauth) #fb post via R
# NOTE(review): with a temp token getFriends() only returns friends who
# also authorized the app — presumably why the list may come back short.
yhk_friends <- getFriends(fb_oauth, simplify = TRUE)
# Pull the 5 most recent posts from a public page, then the likes on
# the first post, then the public profiles of the users who liked it.
pages<-getPage("DonaldTrump",fb_oauth,n=5)
target_post<-getPost(pages$id[1], fb_oauth, n=100, likes = TRUE, comments = FALSE)
target_user<-getUsers(target_post$likes$from_id, fb_oauth)
# adapted the script from Jose Gonzalez
library(RCurl)
library(RJSONIO)
#write query
#' Build a Google Maps geocoding request URL for one address.
#'
#' @param address Free-text address to geocode.
#' @param return.call Response format expected from the API, "json" or "xml".
#' @param sensor "true"/"false" flag required by the legacy (keyless) API.
#' @return A percent-encoded request URL as a length-1 character vector.
query_url <- function(address, return.call = "json", sensor = "false") {
  root <- "http://maps.google.com/maps/api/geocode/"
  # paste0() is the idiomatic spelling of paste(..., sep = "")
  u <- paste0(root, return.call, "?address=", address, "&sensor=", sensor)
  # URLencode() percent-encodes spaces and other unsafe characters
  URLencode(u)
}
#get and parse json result
#' Geocode one address via the Google Maps HTTP API.
#'
#' @param address Free-text address to resolve.
#' @param verbose Print the address before querying?
#' @return c(lat, lng, location_type, formatted_address) on success, or
#'   four NAs when the API response status is not "OK".
geoCode <- function(address, verbose = FALSE) {
  if (verbose) cat(address, "\n")
  u <- query_url(address)
  doc <- getURL(u)
  x <- fromJSON(doc, simplify = FALSE)
  if (x$status == "OK") {
    first <- x$results[[1]]
    lat <- first$geometry$location$lat
    lng <- first$geometry$location$lng
    location_type <- first$geometry$location_type
    formatted_address <- first$formatted_address
    # Throttle BEFORE returning: the original called Sys.sleep(0.5) after
    # return(), which is unreachable dead code, so the intended rate limit
    # against the Google API never actually applied.
    Sys.sleep(0.5)
    return(c(lat, lng, location_type, formatted_address))
  } else {
    return(c(NA, NA, NA, NA))
  }
}
# adapted the script from Jose Gonzalez
# Plot the geocoded location on a Google roadmap tile via ggmap.
library(ggmap)
target_loc <- geoCode("ann arbor sph")
# geoCode() returns c(lat, lng, location_type, formatted_address) as a
# character vector, hence as.numeric(); note element 2 is the longitude
# and element 1 the latitude.
sphmap <- get_map(location = c(lon = as.numeric(target_loc[2]),
lat = as.numeric(target_loc[1])), zoom = 10,
maptype = "roadmap", scale = 2)
ggmap(sphmap)