/<node_id>/<edge_name>?fields=f1,f2,...
me node: me/photos?fields=fromme node resolves to id of the user that owns the current tokenmetadata=1 to show available fields/edges on a node
<node_id>?metadata=1me?metadata=1<node_id>?fields=f1{subf1,subf2,...},f2,...me?fields=photos{from,id}<node_id>?fields=f1.fields(subf1,subf2,...),f2,...me?fields=photos.fields(from,id)me?fields=photos{from}me/photos?fields=fromto restrict the number of records returned
me?fields=photos.limit(10)me/photos?limit=10to return a count for totals regardless of the actual number of records returned
<photo_id>?fields=likes.summary(true)<photo_id>/likes?summary=trueto restrict the valid starting timeframe of the data returned
me/posts?since=1420070400multiple modifiers can be chained together
<photo_id>?fields=likes.limit(1).summary(true)<photo_id>/likes?limit=1&summary=trueIs posts an edge or a field of me?
me/postsme?fields=postsmetadata=1 query string to investigate!likes an edge or a field of posts?since modifier to specify a starting timesummary modifier to get a total countsTL;DR? me/posts?fields=likes.fields(name).summary(true)&since=1420070400&limit=100
GET method to query the graph APIPOST method to create/update data onto the graph APIDELETE method to delete data from the graph APImessage field on me/feedlikes edge on a post nodethe bottom-right of the Graph API Explorer:
curl -i -X GET \
"https://graph.facebook.com/v2.4/me?fields=context&access_token=CAACEdEose0cBAL34H6NiMZB3ZCnPaZBnShQoSY9GZCh81kDLbQZArxKGEPY981H7KfBUjG99jThga2OxQ7owu03IZCgoEcjMDmVSyeZAzos3JZBvWEzbRbfX0DZAl0Au2ybbbZCNZBOsZCYGmjKqCLyTHftwrnOerU07Pismb3QBxYommKEo7oGsWTIIREpbKu4VlHMJY7Q7ZBY00aAZDZD"me Nodeif ( ! "httr" %in% rownames(installed.packages()) )
install.packages("httr")
library("httr")
cURL section from the “get code” button# the target API url
api_addr <- "https://graph.facebook.com/v2.4/me"
# a valid token
token <- "CAACEdEose0cBAPZC40ytYxVPs1y2vOm6pL4qcQY8sIVeK6e3ncJzog2ZA5qMi6X1ZBd5SlycMvo0pTb6FaZBSIOvA6EZByHe5y2T0SgWHGWajzHWTi375ht8jq8eajLUC1x1JZBfOZBjYZAOMYkRVrBZBkPW6QjjhW3FGj1ZAVVzGm71PVk3XZCrSLECGUNgVEwTLS1g4XcbhXXXgZDZD"
# build the query string for the GET method
qs <- list(metadata=1, access_token=token)
r <- GET(api_addr, query=qs)
print(r)
## Response [https://graph.facebook.com/v2.4/me?metadata=1&access_token=CAACEdEose0cBANFZBRTbr6T3FSQ2XZBFbUMQFH0RveWKJubi76xWbnjnZBBU8xq6nCdDFePqnkMkJqnH2QHR8xbdsIaSGabo3kY1CszsOzF88C1mJZA2Xm48s2GXcDahy3ossOQlgBG0hBa0ZBpPffwkLAUB2gH3IrvqiQ1O9XuZCWIvve0HpFMJ1ZCkOstcHkMwy5TYu25SQZDZD]
## Date: 2016-01-22 02:59
## Status: 200
## Content-Type: application/json; charset=UTF-8
## Size: 26.6 kB
str(r, max.level=1)
## List of 10
## $ url : chr "https://graph.facebook.com/v2.4/me?metadata=1&access_token=CAACEdEose0cBANFZBRTbr6T3FSQ2XZBFbUMQFH0RveWKJubi76xWbnjnZBBU8xq6nCd"| __truncated__
## $ status_code: int 200
## $ headers :List of 15
## ..- attr(*, "class")= chr [1:2] "insensitive" "list"
## $ all_headers:List of 1
## $ cookies :'data.frame': 0 obs. of 7 variables:
## $ content : raw [1:26581] 7b 22 6e 61 ...
## $ date : POSIXct[1:1], format: "2016-01-22 02:59:17"
## $ times : Named num [1:6] 0 0.0155 0.017 0.7676 1.0796 ...
## ..- attr(*, "names")= chr [1:6] "redirect" "namelookup" "connect" "pretransfer" ...
## $ request :List of 7
## ..- attr(*, "class")= chr "request"
## $ handle :Class 'curl_handle' <externalptr>
## - attr(*, "class")= chr "response"
# parse json
parsed_contents <- content(r) # see ?content for more details
str(parsed_contents, max.level=2)
## List of 3
## $ name : chr "Yueh Hsuan Chung"
## $ metadata:List of 3
## ..$ fields :List of 51
## ..$ type : chr "user"
## ..$ connections:List of 68
## $ id : chr "100000862115668"
# extract all fields
parsed_contents$metadata$fields %>%
sapply(function(x) x[["name"]]) %>%
writeLines
## id
## about
## age_range
## bio
## birthday
## context
## currency
## devices
## education
## email
## favorite_athletes
## favorite_teams
## first_name
## gender
## hometown
## inspirational_people
## install_type
## installed
## interested_in
## is_shared_login
## is_verified
## languages
## last_name
## link
## location
## locale
## meeting_for
## middle_name
## name
## name_format
## payment_pricepoints
## test_group
## political
## relationship_status
## religion
## security_settings
## significant_other
## sports
## quotes
## third_party_id
## timezone
## token_for_business
## updated_time
## shared_login_upgrade_required_by
## verified
## video_upload_limits
## viewer_can_send_gift
## website
## work
## public_key
## cover
# extract all edges (connections)
parsed_contents$metadata$connections %>%
names %>%
writeLines
## favorite_requests
## request_history
## accounts
## achievements
## adaccounts
## adaccountgroups
## adcontracts
## admined_groups
## adnetworkanalytics
## albums
## applications
## apprequests
## apprequestformerrecipients
## books
## brand_teams
## businesses
## businesssettinglogs
## checkins
## commission_splits
## conversations
## domains
## events
## family
## feed
## stream_filters
## friendlists
## friendrequests
## friends
## ids_for_business
## insights
## integrated_plugin_feed
## invitable_friends
## games
## groups
## home
## inbox
## likes
## locations
## movies
## music
## notifications
## notify_me
## objects
## outbox
## ownerapps
## payment.subscriptions
## payments
## payment_transactions
## permissions
## photos
## picture
## tagged_places
## platformrequests
## pokes
## posts
## privacy_options
## promotable_domains
## promotable_events
## ratings
## scores
## screennames
## taggable_friends
## tagged
## television
## threads
## updates
## videos
## video_broadcasts
Scenario: find a post, get a list of names from all users liked it
likes data on it
<post_id>/likes or <post_id>?fields=likesme/posts?fields=likes.limit(1).summary(true)=> post without any like
=> post with little likes not paginated
=> post with lots of likes that get paginated
=> the final page for the above kind
getAllLikes1 <- function(token, node) {
  # Fetch all "likes" on a node, paging with the `after` cursor.
  #
  # token: a valid Graph API access token.
  # node:  node id (e.g. a post id) whose /likes edge is crawled.
  # Returns a list of like records (each carrying the requested `name`
  # field plus `id`), or an empty list when the node has no likes.
  require(httr)
  result <- list()
  api_addr <- sprintf("https://graph.facebook.com/v2.4/%s/likes", node)
  qs <- list(fields="name", access_token=token)
  r <- GET(api_addr, query=qs)
  res <- content(r)
  if ( !length(res$data) ) {
    result
  } else {
    result <- c(result, res$data)
    # The loop condition checks `paging$next` but advances via the
    # `after` cursor: guard against a response that advertises a next
    # page without a fresh cursor, which would refetch the same page
    # forever.
    while ( "next" %in% names(res$paging) ) {
      cursor_next <- res$paging$cursors$after
      if ( is.null(cursor_next) || identical(cursor_next, qs$after) ) break
      qs$after <- cursor_next
      r <- GET(api_addr, query=qs)
      res <- content(r)
      result <- c(result, res$data)
    }
    result
  }
}
getAllLikes2 <- function(token, node) {
  # Fetch all "likes" on a node by following the prebuilt `paging$next`
  # URL (which already embeds the access token and cursor). Produces the
  # same result as getAllLikes1(); see the all.equal() check below.
  #
  # token: a valid Graph API access token.
  # node:  node id (e.g. a post id) whose /likes edge is crawled.
  # Returns a list of like records, or an empty list for no likes.
  require(httr)  # was missing here; getAllLikes1() loads it — keep consistent
  result <- list()
  api_addr <- sprintf("https://graph.facebook.com/v2.4/%s/likes", node)
  qs <- list(fields="name", access_token=token)
  r <- GET(api_addr, query=qs)
  res <- content(r)
  if ( !length(res$data) ) {
    result
  } else {
    result <- c(result, res$data)
    while ( "next" %in% names(res$paging) ) {
      next_query <- res$paging$`next`
      r <- GET(next_query)
      res <- content(r)
      result <- c(result, res$data)
    }
    result
  }
}
node <- "100000862115668_932029203502475"
result1 <- getAllLikes1(token=token, node=node)
result2 <- getAllLikes2(token=token, node=node)
all.equal(result1, result2) # should return T
## [1] TRUE
# tidy into data.frame
head(do.call(rbind, result1))
## name id
## [1,] "Tindy Cheng" "315371435293768"
## [2,] "ChunKuei Chu" "969673423047100"
## [3,] "林瑋瑋" "756609997705112"
## [4,] "Mark Yang" "852448504770317"
## [5,] "蘇中才" "1487000218"
## [6,] "陳智泓" "1161704643846945"
me/posts?fields=created_time,likes.fields(id,name)&since=1420070400# think about a blueprint of your crawler
# need a helper to convert datetime string to timestamp
str2Timestamp <- function(dts) {}
# need a function to crawl all post id in a given time
getPostId <- function(dts, token) {}
# need a function to crawl all like info given a post id
getLikes <- function(pid, token) {}
# unix time conversion
# Convert a datetime string (e.g. "2015-01-01") to an integer unix timestamp.
#
# dts: a datetime string accepted by as.POSIXct().
# tz:  timezone used to interpret `dts`; default "" keeps the original
#      behavior (system timezone). Pass tz="UTC" for reproducible results.
# NOTE: the previous `origin="1970-01-01"` argument was a no-op —
# as.POSIXct() only honors `origin` for *numeric* input, so the string was
# always parsed in the system timezone anyway.
str2Timestamp <- function(dts, tz="")
  as.integer(as.POSIXct(dts, tz=tz))
# try it
str2Timestamp("2015-01-01")
## [1] 1420041600
# get post id given a start time
# Crawl ids and creation times of all of the current user's posts since `dts`.
#
# dts:   a date string (e.g. "2015-01-01"), converted via str2Timestamp().
# token: a valid Graph API access token.
# Returns a data.table with columns `time` (POSIXct) and `pid` (character),
# or an empty list when no posts fall in the window.
getPostId <- function(dts, token) {
require(data.table)
require(magrittr)
require(httr)
result <- list()
api_addr <- "https://graph.facebook.com/v2.4/me/posts"
# `since` restricts the edge to posts created at/after the given unix time
qs <- list(since=str2Timestamp(dts), access_token=token)
r <- GET(api_addr, query=qs)
res <- content(r)
if ( !length(res$data) ) {
result
} else {
result <- c(result, res$data)
# follow the prebuilt "next" URL until the API stops providing one
while ( "next" %in% names(res$paging) ) {
next_query <- res$paging$`next`
r <- GET(next_query)
res <- content(r)
result <- c(result, res$data)
}
# reshape list-of-records into a data.table; created_time comes back as
# ISO-8601 with offset ("%Y-%m-%dT%H:%M:%S%z"), parsed into POSIXct
result %<>%
lapply(function(x) c(time=x$created_time, pid=x$id)) %>%
do.call(rbind, .) %>%
as.data.table %>%
.[, time:=as.POSIXct(time, format="%Y-%m-%dT%H:%M:%S%z")]
result
}
}
# try it
getPostId("2015-01-01", token=token) %>% head
## time pid
## 1: 2016-01-21 17:11:09 100000862115668_1061898027182258
## 2: 2016-01-19 13:43:22 100000862115668_1060871233951604
## 3: 2016-01-19 03:25:07 100000862115668_1060646857307375
## 4: 2016-01-13 10:21:21 100000862115668_1057543660951028
## 5: 2016-01-11 13:54:39 100000862115668_1056527544385973
## 6: 2016-01-10 22:40:32 100000862115668_1056155424423185
# caution!
# one post may have multiple timestamps due to attached photo editing
# get likes given a post id
# Crawl all likes (id and name) for a single post.
#
# pid:   a post id (compound <uid>_<postid> form).
# token: a valid Graph API access token.
# Returns a data.table with columns id, name, pid, or an empty list
# when the post has no likes.
getLikes <- function(pid, token) {
  require(data.table)
  require(magrittr)
  require(httr)
  result <- list()
  fields <- c("id", "name")
  api_addr <- sprintf("https://graph.facebook.com/v2.4/%s/likes", pid)
  # pass `fields` through the query list instead of hand-building it into
  # the URL string — same request, and consistent with getAllLikes1()/
  # getPostId() which let httr encode the query
  qs <- list(fields=paste(fields, collapse=','), access_token=token)
  r <- GET(api_addr, query=qs)
  res <- content(r)
  if ( !length(res$data) ) {
    result
  } else {
    result <- c(result, res$data)
    while ( "next" %in% names(res$paging) ) {
      next_query <- res$paging$`next`
      r <- GET(next_query)
      res <- content(r)
      result <- c(result, res$data)
    }
    # tag every like row with the post id it belongs to
    result %>% rbindlist %>% cbind(pid=pid)
  }
}
# try it
getLikes(pid=getPostId("2015-01-01", token=token)$pid[1], token=token) %>% head
## id name pid
## 1: 10152101438344021 Chloe Lee 100000862115668_1061898027182258
## 2: 10203196401577752 Teresa Tc 100000862115668_1061898027182258
## 3: 1322255751 Summit Suen 100000862115668_1061898027182258
## 4: 889680407714198 陳宗薊 100000862115668_1061898027182258
all_posts <- getPostId("2014-01-01", token=token) %>%
.[, list(time=max(time)), by="pid"]
all_likes <- lapply(all_posts$pid, getLikes, token=token) %>%
do.call(rbind, .) %>%
merge(all_posts, by="pid")
head(all_likes)
## pid id name time
## 1: 100000862115668_1044466062258788 10152414169987377 Allen Lai 2015-12-20 15:59:24
## 2: 100000862115668_1044466062258788 729042290449332 楊承恩 2015-12-20 15:59:24
## 3: 100000862115668_1044466062258788 10204030571792411 Mansun Kuo 2015-12-20 15:59:24
## 4: 100000862115668_1044466062258788 1708592169 王雅人 2015-12-20 15:59:24
## 5: 100000862115668_1044466062258788 681600348569352 施文祥 2015-12-20 15:59:24
## 6: 100000862115668_1044466062258788 10202519467522063 葉祐欣 2015-12-20 15:59:24
library(parallel)
system.time(
all_likes1 <- lapply(all_posts$pid, getLikes, token=token)
)
## user system elapsed
## 2.555 0.075 39.829
# for Windows guys: parallel by Rscripting
cl <- makeCluster(detectCores())
system.time(
all_likes2 <- parLapply(cl, all_posts$pid, getLikes, token=token)
)
## user system elapsed
## 0.006 0.001 11.638
stopCluster(cl)
# parallel by forking: *nix only
# system.time(
# all_likes3 <- mclapply(all_posts$pid, getLikes, token=token,
# mc.cores=detectCores())
# )
all.equal(all_likes1, all_likes2)
## [1] TRUE
library(ggplot2)
setorder(all_likes, time)
all_likes <- all_likes[, timed:=as.Date(time)]
all_likes <- all_likes[, timed:=as.Date(time)]
like_trending1 <- all_likes[, list(n_like=.N), by="timed"]
like_trending2 <- all_likes[, list(n_like=.N), by="name,timed"]
top_likes <- all_likes[, list(n_like=.N), by="name"] %>% setorder(-n_like)
like_trending1 %>%
ggplot(aes(x=timed, y=n_like)) + geom_line() + geom_point()
like_trending2[name %in% top_likes$name[1:5]] %>%
ggplot(aes(x=timed, y=n_like, color=name)) + geom_line() + geom_point() +
theme(text=element_text(family='Heiti TC Light')) # required to show chinese on OSX
me/friendsdata-uid or uidposts edge of your friends
# (related edge: <uid>/posts)
# Fetch the current user's friend list from the /me/friends edge,
# following pagination until exhausted.
# Returns a data.table with columns `name` and `id`, or an empty list
# when the API returns no data.
getFriends <- function(token) {
  require(data.table)
  require(magrittr)
  require(httr)
  collected <- list()
  endpoint <- "https://graph.facebook.com/v2.4/me/friends"
  resp <- GET(endpoint, query=list(access_token=token))
  parsed <- content(resp)
  if ( !length(parsed$data) ) {
    collected
  } else {
    collected <- c(collected, parsed$data)
    while ( "next" %in% names(parsed$paging) ) {
      resp <- GET(parsed$paging$`next`)
      parsed <- content(resp)
      collected <- c(collected, parsed$data)
    }
    rbindlist(collected)
  }
}
getFriends(token=token) %>% head
## name id
## 1: Han-Wen Chang 348900235
## 2: Ning Chen 503482470
## 3: 陳奎銘 523887614
## 4: Pin Wu 524871968
## 5: Mei-Yu Chen 631698692
## 6: Linhots Tsao 710176842
# Crawl ids and creation times of a given user's posts since `dts`.
# Same logic as getPostId(), generalized to an arbitrary uid.
#
# uid:   a user id whose /posts edge is crawled (requires permission).
# dts:   a date string (e.g. "2015-01-01"), converted via str2Timestamp().
# token: a valid Graph API access token.
# Returns a data.table with columns `time` (POSIXct) and `pid` (character),
# or an empty list when no posts are visible/available.
getFriendPostId <- function(uid, dts, token) {
require(data.table)
require(magrittr)
require(httr)
result <- list()
api_addr <- sprintf("https://graph.facebook.com/v2.4/%s/posts", uid)
qs <- list(since=str2Timestamp(dts), access_token=token)
r <- GET(api_addr, query=qs)
res <- content(r)
if ( !length(res$data) ) {
result
} else {
result <- c(result, res$data)
# follow the prebuilt "next" URL until pagination is exhausted
while ( "next" %in% names(res$paging) ) {
next_query <- res$paging$`next`
r <- GET(next_query)
res <- content(r)
result <- c(result, res$data)
}
# reshape into a data.table and parse ISO-8601 timestamps with offset
result %<>%
lapply(function(x) c(time=x$created_time, pid=x$id)) %>%
do.call(rbind, .) %>%
as.data.table %>%
.[, time:=as.POSIXct(time, format="%Y-%m-%dT%H:%M:%S%z")]
result
}
}
getFriendPostId("100000862115668", "2015-01-01", token=token) %>% head
## time pid
## 1: 2016-01-21 17:11:09 100000862115668_1061898027182258
## 2: 2016-01-19 13:43:22 100000862115668_1060871233951604
## 3: 2016-01-19 03:25:07 100000862115668_1060646857307375
## 4: 2016-01-13 10:21:21 100000862115668_1057543660951028
## 5: 2016-01-11 13:54:39 100000862115668_1056527544385973
## 6: 2016-01-10 22:40:32 100000862115668_1056155424423185
all_friends <- getFriends(token=token)
# since it may take some time, let's do it parallel
cl <- makeCluster(detectCores())
clusterExport(cl, "str2Timestamp")
all_friends_posts <- parLapplyLB(cl, all_friends$id, getFriendPostId,
dts="2015-01-01", token=token)
stopCluster(cl)
names(all_friends_posts) <- all_friends$name
post_counts <- sapply(all_friends_posts,
function(x) if (!length(x)) 0 else nrow(x))
paste("number of posts available:", names(post_counts), post_counts, sep=" | ") %>%
head %>%
write.table(quote=FALSE, row.names=FALSE, col.names=FALSE)
## number of posts available: | Han-Wen Chang | 0
## number of posts available: | Ning Chen | 0
## number of posts available: | 陳奎銘 | 32
## number of posts available: | Pin Wu | 0
## number of posts available: | Mei-Yu Chen | 259
## number of posts available: | Linhots Tsao | 83
# Like a post: send an empty POST to the post's /likes edge.
#
# pid:   the post id to like.
# token: a valid Graph API access token.
# Returns the raw httr response object.
postLike <- function(pid, token) {
  endpoint <- sprintf("https://graph.facebook.com/v2.4/%s/likes", pid)
  POST(endpoint, query=list(access_token=token))
}
# try it
# Publish a plain-text post onto the current user's feed.
#
# mesg:  the message text to post.
# token: a valid Graph API access token with publish permission.
# Returns the parsed JSON response; on success a list whose $id holds
# the new post id in compound <uid>_<postid> form.
postArticle <- function(mesg, token) {
  # the original sprintf() had no format placeholders (a no-op);
  # use the literal endpoint directly
  api_addr <- "https://graph.facebook.com/v2.4/me/feed/"
  r <- POST(api_addr, body=list(access_token=token, message=mesg))
  content(r)
}
# notice that the resulting pid is a compound id in <uid_pid> format
(test_pid <- postArticle("test", token=token))
## $id
## [1] "100000862115668_1062252643813463"
postLike(pid=test_pid, token=token)
## Response [https://graph.facebook.com/v2.4/100000862115668_1062252643813463/likes?access_token=CAACEdEose0cBANFZBRTbr6T3FSQ2XZBFbUMQFH0RveWKJubi76xWbnjnZBBU8xq6nCdDFePqnkMkJqnH2QHR8xbdsIaSGabo3kY1CszsOzF88C1mJZA2Xm48s2GXcDahy3ossOQlgBG0hBa0ZBpPffwkLAUB2gH3IrvqiQ1O9XuZCWIvve0HpFMJ1ZCkOstcHkMwy5TYu25SQZDZD]
## Date: 2016-01-22 03:02
## Status: 200
## Content-Type: application/json; charset=UTF-8
## Size: 16 B
# delete the test post
# Delete a node (e.g. a post) by id via the DELETE method.
#
# pid:   id of the node to delete.
# token: a valid Graph API access token owning the node.
# Returns the raw httr response object.
deletePost <- function(pid, token) {
  DELETE(sprintf("https://graph.facebook.com/v2.4/%s", pid),
         query=list(access_token=token))
}
deletePost(test_pid, token=token)
## Response [https://graph.facebook.com/v2.4/100000862115668_1062252643813463?access_token=CAACEdEose0cBANFZBRTbr6T3FSQ2XZBFbUMQFH0RveWKJubi76xWbnjnZBBU8xq6nCdDFePqnkMkJqnH2QHR8xbdsIaSGabo3kY1CszsOzF88C1mJZA2Xm48s2GXcDahy3ossOQlgBG0hBa0ZBpPffwkLAUB2gH3IrvqiQ1O9XuZCWIvve0HpFMJ1ZCkOstcHkMwy5TYu25SQZDZD]
## Date: 2016-01-22 03:02
## Status: 200
## Content-Type: application/json; charset=UTF-8
## Size: 16 B
# # loop for all posts given a user
# me_id <- "100000862115668"
# all_my_posts <- getFriendPostId(me_id, "2015-01-01", token=token)
# lapply(all_my_posts$pid, postLike, token=token)
…a sequence of characters that define a search pattern, mainly for use in pattern matching with strings, or string matching, i.e. “find and replace”-like operations.
-wikipedia
有沒有⋯⋯的八卦?
if ( ! "rvest" %in% rownames(installed.packages()) )
install.packages("rvest")
library("rvest")
# crawl PTT Gossiping article titles
gossip <- GET("https://www.ptt.cc/bbs/Gossiping/index.html",
set_cookies(over18=1))$content %>% read_html
(
gossip_titles <- gossip %>%
html_nodes("div .title") %>%
html_text %>%
iconv(from="utf8", to="utf8") # for Windows
)
## [1] "\n\t\t\t\n\t\t\t\tRe: [新聞] 中油調高加油站毛利 下週油價降幅恐減\n\t\t\t\n\t\t\t"
## [2] "\n\t\t\t\n\t\t\t\t[問卦] 是不想被統一? 還是不想被中國統一?\n\t\t\t\n\t\t\t"
## [3] "\n\t\t\t\n\t\t\t\t[問卦] 有沒有什麼工作是和想像中完全不同的?\n\t\t\t\n\t\t\t"
## [4] "\n\t\t\t\n\t\t\t\t[問卦] 從救世主變成過街老鼠是什麼感覺?\n\t\t\t\n\t\t\t"
## [5] "\n\t\t\t\n\t\t\t\t[新聞] 今年數學「15級分」增加 考「半衰期」\n\t\t\t\n\t\t\t"
## [6] "\n\t\t\t\n\t\t\t\t[爆卦] 嘉義好市多之持續秒退滅頂\n\t\t\t\n\t\t\t"
## [7] "\n\t\t\t\n\t\t\t\t[問卦] 為何阿嘎不跟藏鏡人在一起?\n\t\t\t\n\t\t\t"
## [8] "\n\t\t\t\n\t\t\t\tRe: [問卦] 國民黨有誰有能力爭黨主席?\n\t\t\t\n\t\t\t"
## [9] "\n\t\t\t\n\t\t\t\tRe: [問卦] 有沒有今年學測數學題目的八卦\n\t\t\t\n\t\t\t"
## [10] "\n\t\t\t\n\t\t\t\tRe: [問卦] 遊樂園裡最危險的是海盜船嗎?\n\t\t\t\n\t\t\t"
## [11] "\n\t\t\t\n\t\t\t\t[問卦] 有沒有關雲\"長\"怎麼唸的八卦?\n\t\t\t\n\t\t\t"
## [12] "\n\t\t\t\n\t\t\t\t[公告] 八卦板板規(2015.12.01)\n\t\t\t\n\t\t\t"
## [13] "\n\t\t\t\n\t\t\t\tFw: [徵求] 1/20行車紀錄器,九里路上(里港往九如)\n\t\t\t\n\t\t\t"
## [14] "\n\t\t\t\n\t\t\t\t[協尋] 1/20台中死亡車禍 徵目擊\n\t\t\t\n\t\t\t"
## [15] "\n\t\t\t\n\t\t\t\tFw: [協尋] 1/19早上6:30行經台南北安橋上的行車記錄器\n\t\t\t\n\t\t\t"
## [16] "\n\t\t\t\n\t\t\t\t[公告] 2016一月置底689閒聊區\n\t\t\t\n\t\t\t"
(gossip_titles_cleansed <- gsub("\n\t*", '', gossip_titles))
## [1] "Re: [問卦] 有沒有生鐵鍋的八卦" "[問卦] 有沒有凌晨三點鐘電視機自己開的八卦"
## [3] "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一" "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳"
## [5] "[新聞] 美國這次很挺喔!美智庫:小英已對北京釋" "Re: [問卦] 台灣對中國的魅力到底在哪裡"
## [7] "Re: [爆卦] NCC中嘉案公聽會開始了(黃國昌出席)" "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳"
## [9] "(本文已被刪除) [kisweet999]" "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一"
## [11] "[問卦] 怎樣的景點在座魯宅會一去再去?" "Re: [問卦] 陸客不來倒楣的到底是誰?"
## [13] "Re: [爆卦] KMT臥底記(?" "[新聞] 傳多人因低溫死亡 新北消防:無法斷定"
## [15] "Re: [FB] 馬英九八年感謝文-林雅強" "[問卦] 米國的戰機設計有問題?"
## [17] "[問卦] 400g阿華田的巧克力醬夠肥宅吃幾天?" "[問卦] 為何中東人會入侵歐洲?"
## [19] "[公告] 八卦板板規(2015.12.01)" "Fw: [徵求] 1/20行車紀錄器,九里路上(里港往九如)"
## [21] "Fw: [徵求] 基隆中華路死亡車禍現場監視器或行車記錄器" "[協尋] 1/20台中死亡車禍 徵目擊"
## [23] "[公告] 2016一月置底689閒聊區"
# something wrong?
grep("[問卦]", gossip_titles_cleansed, value=TRUE)
## [1] "Re: [問卦] 有沒有生鐵鍋的八卦" "[問卦] 有沒有凌晨三點鐘電視機自己開的八卦"
## [3] "Re: [問卦] 台灣對中國的魅力到底在哪裡" "Re: [爆卦] NCC中嘉案公聽會開始了(黃國昌出席)"
## [5] "[問卦] 怎樣的景點在座魯宅會一去再去?" "Re: [問卦] 陸客不來倒楣的到底是誰?"
## [7] "Re: [爆卦] KMT臥底記(?" "[問卦] 米國的戰機設計有問題?"
## [9] "[問卦] 400g阿華田的巧克力醬夠肥宅吃幾天?" "[問卦] 為何中東人會入侵歐洲?"
## [11] "[公告] 八卦板板規(2015.12.01)"
# how about...
grep("\\[問卦\\]", gossip_titles_cleansed, value=TRUE)
## [1] "Re: [問卦] 有沒有生鐵鍋的八卦" "[問卦] 有沒有凌晨三點鐘電視機自己開的八卦"
## [3] "Re: [問卦] 台灣對中國的魅力到底在哪裡" "[問卦] 怎樣的景點在座魯宅會一去再去?"
## [5] "Re: [問卦] 陸客不來倒楣的到底是誰?" "[問卦] 米國的戰機設計有問題?"
## [7] "[問卦] 400g阿華田的巧克力醬夠肥宅吃幾天?" "[問卦] 為何中東人會入侵歐洲?"
grep("^\\[問卦\\]", gossip_titles_cleansed, value=TRUE)
## [1] "[問卦] 有沒有凌晨三點鐘電視機自己開的八卦" "[問卦] 怎樣的景點在座魯宅會一去再去?"
## [3] "[問卦] 米國的戰機設計有問題?" "[問卦] 400g阿華田的巧克力醬夠肥宅吃幾天?"
## [5] "[問卦] 為何中東人會入侵歐洲?"
grep("^Re: ", gossip_titles_cleansed, value=TRUE)
## [1] "Re: [問卦] 有沒有生鐵鍋的八卦" "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一"
## [3] "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳" "Re: [問卦] 台灣對中國的魅力到底在哪裡"
## [5] "Re: [爆卦] NCC中嘉案公聽會開始了(黃國昌出席)" "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳"
## [7] "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一" "Re: [問卦] 陸客不來倒楣的到底是誰?"
## [9] "Re: [爆卦] KMT臥底記(?" "Re: [FB] 馬英九八年感謝文-林雅強"
grep("?$", gossip_titles_cleansed, value=TRUE)
## [1] "[問卦] 怎樣的景點在座魯宅會一去再去?" "[問卦] 為何中東人會入侵歐洲?"
grep("[a-zA-Z]", gossip_titles_cleansed, value=TRUE)
## [1] "Re: [問卦] 有沒有生鐵鍋的八卦" "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一"
## [3] "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳" "Re: [問卦] 台灣對中國的魅力到底在哪裡"
## [5] "Re: [爆卦] NCC中嘉案公聽會開始了(黃國昌出席)" "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳"
## [7] "(本文已被刪除) [kisweet999]" "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一"
## [9] "Re: [問卦] 陸客不來倒楣的到底是誰?" "Re: [爆卦] KMT臥底記(?"
## [11] "Re: [FB] 馬英九八年感謝文-林雅強" "[問卦] 400g阿華田的巧克力醬夠肥宅吃幾天?"
## [13] "Fw: [徵求] 1/20行車紀錄器,九里路上(里港往九如)" "Fw: [徵求] 基隆中華路死亡車禍現場監視器或行車記錄器"
ABCabc123. ^ $ * + ? { } [ ] \ | ( )\d \D \s \S \w \W \b \B.\n) for Perl REtest_str <- c("hello world", "你好啊", "\n")
grep('.', test_str, value=TRUE, perl=TRUE)
## [1] "hello world" "你好啊"
grep('.', test_str, value=TRUE, perl=FALSE)
## [1] "hello world" "你好啊" "\n"
^ and $^ limits the beginning character$ limits the ending charactergrep('^hello', test_str, value=TRUE)
## [1] "hello world"
grep('world$', test_str, value=TRUE)
## [1] "hello world"
grep('^[h你]', test_str, value=TRUE)
## [1] "hello world" "你好啊"
*, +, and ?*: the previous unit must repeat 0 or many times+: the previous unit must repeat 1 or many times?: the previous unit must repeat 0 or 1 timesgrep("l*", test_str, value=TRUE)
## [1] "hello world" "你好啊" "\n"
grep("l+", test_str, value=TRUE)
## [1] "hello world"
grep("l?", test_str, value=TRUE)
## [1] "hello world" "你好啊" "\n"
{m,n}grep("l{0,10}", test_str, value=TRUE)
## [1] "hello world" "你好啊" "\n"
grep("l{2}", test_str, value=TRUE)
## [1] "hello world"
grep("l{,2}", test_str, value=TRUE)
## [1] "hello world" "你好啊" "\n"
grep("l{2,}", test_str, value=TRUE)
## [1] "hello world"
[][0-9$] matches any of numbers 0 to 9 or the dollar sign($)$ is treated “as-is”"-: ranger
[0-9a-zA-Z]^: negator
[^0-9]grep("[a-z]", test_str, value=TRUE)
## [1] "hello world"
grep("[0-9]", test_str, value=TRUE)
## character(0)
# want to match the dash (-) ?
grep("[a-z-]", c("abc", "123-456"), value=TRUE)
## [1] "abc" "123-456"
()test_str2 <- c("hello", "olleh", "hellohello")
test_str2 %>% grep("^hello$", ., value=TRUE)
## [1] "hello"
test_str2 %>% grep("^[hello]$", ., value=TRUE)
## character(0)
test_str2 %>% grep("^(hello)?$", ., value=TRUE)
## [1] "hello"
test_str2 %>% grep("^(hello)+$", ., value=TRUE)
## [1] "hello" "hellohello"
|grep("^h|^你", test_str, value=TRUE)
## [1] "hello world" "你好啊"
\grep("^\\^", "^123", value=TRUE)
## [1] "^123"
\d: any decimal digit => [0-9]\D: any non-digit => [^0-9]\s: any white space => [ \t\n\r\f\v]\S: any non-white space => [^ \t\n\r\f\v]\w: any alphanumeric => [a-zA-Z0-9_]\W: any non-alphanumeric => [^a-zA-Z0-9_]\b: matches the empty string, but only at the beginning or end of a word (\w)base::grep and base::greplgrep: match and return
grepl# return numeric index
grep("^Re: ", gossip_titles_cleansed)
## [1] 1 3 4 6 7 8 10 12 13 15
# then can used as selecting vector
gossip_titles_cleansed[grep("^Re: ", gossip_titles_cleansed)]
## [1] "Re: [問卦] 有沒有生鐵鍋的八卦" "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一"
## [3] "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳" "Re: [問卦] 台灣對中國的魅力到底在哪裡"
## [5] "Re: [爆卦] NCC中嘉案公聽會開始了(黃國昌出席)" "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳"
## [7] "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一" "Re: [問卦] 陸客不來倒楣的到底是誰?"
## [9] "Re: [爆卦] KMT臥底記(?" "Re: [FB] 馬英九八年感謝文-林雅強"
# which effectively equals...
grep("^Re: ", gossip_titles_cleansed, value=TRUE)
## [1] "Re: [問卦] 有沒有生鐵鍋的八卦" "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一"
## [3] "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳" "Re: [問卦] 台灣對中國的魅力到底在哪裡"
## [5] "Re: [爆卦] NCC中嘉案公聽會開始了(黃國昌出席)" "Re: [新聞] 「柯市漲」又要錢 搭貓纜70元起跳"
## [7] "Re: [新聞] 旅行社接到通知...3月後陸客將銳減三分之一" "Re: [問卦] 陸客不來倒楣的到底是誰?"
## [9] "Re: [爆卦] KMT臥底記(?" "Re: [FB] 馬英九八年感謝文-林雅強"
# the no-match case
grep("1p3fjhi1o", gossip_titles_cleansed)
## integer(0)
grep("1p3fjhi1o", gossip_titles_cleansed, value=TRUE)
## character(0)
# the fixed pattern: no regex, everything matched as-is
grep("^Re: ", gossip_titles_cleansed, value=TRUE, fixed=TRUE)
## character(0)
identical(grep("[新聞]", gossip_titles_cleansed, value=TRUE, fixed=TRUE),
grep("\\[新聞\\]", gossip_titles_cleansed, value=TRUE, fixed=FALSE))
## [1] TRUE
base::gsubsub to replace only oncegsub("\\].*", '', gossip_titles_cleansed)
## [1] "Re: [問卦" "[問卦" "Re: [新聞" "Re: [新聞"
## [5] "[新聞" "Re: [問卦" "Re: [爆卦" "Re: [新聞"
## [9] "(本文已被刪除) [kisweet999" "Re: [新聞" "[問卦" "Re: [問卦"
## [13] "Re: [爆卦" "[新聞" "Re: [FB" "[問卦"
## [17] "[問卦" "[問卦" "[公告" "Fw: [徵求"
## [21] "Fw: [徵求" "[協尋" "[公告"
gsub("^.*\\[", '', gossip_titles_cleansed)
## [1] "問卦] 有沒有生鐵鍋的八卦" "問卦] 有沒有凌晨三點鐘電視機自己開的八卦"
## [3] "新聞] 旅行社接到通知...3月後陸客將銳減三分之一" "新聞] 「柯市漲」又要錢 搭貓纜70元起跳"
## [5] "新聞] 美國這次很挺喔!美智庫:小英已對北京釋" "問卦] 台灣對中國的魅力到底在哪裡"
## [7] "爆卦] NCC中嘉案公聽會開始了(黃國昌出席)" "新聞] 「柯市漲」又要錢 搭貓纜70元起跳"
## [9] "kisweet999]" "新聞] 旅行社接到通知...3月後陸客將銳減三分之一"
## [11] "問卦] 怎樣的景點在座魯宅會一去再去?" "問卦] 陸客不來倒楣的到底是誰?"
## [13] "爆卦] KMT臥底記(?" "新聞] 傳多人因低溫死亡 新北消防:無法斷定"
## [15] "FB] 馬英九八年感謝文-林雅強" "問卦] 米國的戰機設計有問題?"
## [17] "問卦] 400g阿華田的巧克力醬夠肥宅吃幾天?" "問卦] 為何中東人會入侵歐洲?"
## [19] "公告] 八卦板板規(2015.12.01)" "徵求] 1/20行車紀錄器,九里路上(里港往九如)"
## [21] "徵求] 基隆中華路死亡車禍現場監視器或行車記錄器" "協尋] 1/20台中死亡車禍 徵目擊"
## [23] "公告] 2016一月置底689閒聊區"
gsub("\\].*|^.*\\[", '', gossip_titles_cleansed)
## [1] "問卦" "問卦" "新聞" "新聞" "新聞" "問卦" "爆卦" "新聞"
## [9] "kisweet999" "新聞" "問卦" "問卦" "爆卦" "新聞" "FB" "問卦"
## [17] "問卦" "問卦" "公告" "徵求" "徵求" "協尋" "公告"
stringr libraryhttr, ggplot2, and many othersif ( ! "stringr" %in% rownames(installed.packages()) )
install.packages("stringr")
library("stringr")
str_extract and str_extract_all# only the matched part will be returned; no-match resolves to NA
str_extract(gossip_titles_cleansed, "\\[問卦\\]")
## [1] "[問卦]" "[問卦]" NA NA NA "[問卦]" NA NA NA NA "[問卦]" "[問卦]"
## [13] NA NA NA "[問卦]" "[問卦]" "[問卦]" NA NA NA NA NA
str_extract(gossip_titles_cleansed, "\\[問卦\\].*")
## [1] "[問卦] 有沒有生鐵鍋的八卦" "[問卦] 有沒有凌晨三點鐘電視機自己開的八卦"
## [3] NA NA
## [5] NA "[問卦] 台灣對中國的魅力到底在哪裡"
## [7] NA NA
## [9] NA NA
## [11] "[問卦] 怎樣的景點在座魯宅會一去再去?" "[問卦] 陸客不來倒楣的到底是誰?"
## [13] NA NA
## [15] NA "[問卦] 米國的戰機設計有問題?"
## [17] "[問卦] 400g阿華田的巧克力醬夠肥宅吃幾天?" "[問卦] 為何中東人會入侵歐洲?"
## [19] NA NA
## [21] NA NA
## [23] NA
str_extract(gossip_titles_cleansed, "^\\[問卦\\] [A-Za-z]+")
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# match (and extract) multiple times
str_extract_all(gossip_titles_cleansed, "有", simplify=TRUE)
## [,1] [,2]
## [1,] "有" "有"
## [2,] "有" "有"
## [3,] "" ""
## [4,] "" ""
## [5,] "" ""
## [6,] "" ""
## [7,] "" ""
## [8,] "" ""
## [9,] "" ""
## [10,] "" ""
## [11,] "" ""
## [12,] "" ""
## [13,] "" ""
## [14,] "" ""
## [15,] "" ""
## [16,] "有" ""
## [17,] "" ""
## [18,] "" ""
## [19,] "" ""
## [20,] "" ""
## [21,] "" ""
## [22,] "" ""
## [23,] "" ""
# many-to-many pattern matching (with recycling)
str_extract(letters, rep("[a-g]", 26))
## [1] "a" "b" "c" "d" "e" "f" "g" NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
str_extract(letters, rep("[a-g]", 13))
## [1] "a" "b" "c" "d" "e" "f" "g" NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
str_extract(letters, rep("[a-g]", 52))
## [1] "a" "b" "c" "d" "e" "f" "g" NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA "a" "b" "c"
## [30] "d" "e" "f" "g" NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
str_match and str_match_all# without grouper
str_match(gossip_titles_cleansed, "\\[.*\\]")
## [,1]
## [1,] "[問卦]"
## [2,] "[問卦]"
## [3,] "[新聞]"
## [4,] "[新聞]"
## [5,] "[新聞]"
## [6,] "[問卦]"
## [7,] "[爆卦]"
## [8,] "[新聞]"
## [9,] "[kisweet999]"
## [10,] "[新聞]"
## [11,] "[問卦]"
## [12,] "[問卦]"
## [13,] "[爆卦]"
## [14,] "[新聞]"
## [15,] "[FB]"
## [16,] "[問卦]"
## [17,] "[問卦]"
## [18,] "[問卦]"
## [19,] "[公告]"
## [20,] "[徵求]"
## [21,] "[徵求]"
## [22,] "[協尋]"
## [23,] "[公告]"
# with grouper
str_match(gossip_titles_cleansed, "\\[(.*)\\]")
## [,1] [,2]
## [1,] "[問卦]" "問卦"
## [2,] "[問卦]" "問卦"
## [3,] "[新聞]" "新聞"
## [4,] "[新聞]" "新聞"
## [5,] "[新聞]" "新聞"
## [6,] "[問卦]" "問卦"
## [7,] "[爆卦]" "爆卦"
## [8,] "[新聞]" "新聞"
## [9,] "[kisweet999]" "kisweet999"
## [10,] "[新聞]" "新聞"
## [11,] "[問卦]" "問卦"
## [12,] "[問卦]" "問卦"
## [13,] "[爆卦]" "爆卦"
## [14,] "[新聞]" "新聞"
## [15,] "[FB]" "FB"
## [16,] "[問卦]" "問卦"
## [17,] "[問卦]" "問卦"
## [18,] "[問卦]" "問卦"
## [19,] "[公告]" "公告"
## [20,] "[徵求]" "徵求"
## [21,] "[徵求]" "徵求"
## [22,] "[協尋]" "協尋"
## [23,] "[公告]" "公告"
str_match(gossip_titles_cleansed, "Re: \\[(.*)\\]")
## [,1] [,2]
## [1,] "Re: [問卦]" "問卦"
## [2,] NA NA
## [3,] "Re: [新聞]" "新聞"
## [4,] "Re: [新聞]" "新聞"
## [5,] NA NA
## [6,] "Re: [問卦]" "問卦"
## [7,] "Re: [爆卦]" "爆卦"
## [8,] "Re: [新聞]" "新聞"
## [9,] NA NA
## [10,] "Re: [新聞]" "新聞"
## [11,] NA NA
## [12,] "Re: [問卦]" "問卦"
## [13,] "Re: [爆卦]" "爆卦"
## [14,] NA NA
## [15,] "Re: [FB]" "FB"
## [16,] NA NA
## [17,] NA NA
## [18,] NA NA
## [19,] NA NA
## [20,] NA NA
## [21,] NA NA
## [22,] NA NA
## [23,] NA NA
str_match(gossip_titles_cleansed, "\\[(.?)(.?)\\]")
## [,1] [,2] [,3]
## [1,] "[問卦]" "問" "卦"
## [2,] "[問卦]" "問" "卦"
## [3,] "[新聞]" "新" "聞"
## [4,] "[新聞]" "新" "聞"
## [5,] "[新聞]" "新" "聞"
## [6,] "[問卦]" "問" "卦"
## [7,] "[爆卦]" "爆" "卦"
## [8,] "[新聞]" "新" "聞"
## [9,] NA NA NA
## [10,] "[新聞]" "新" "聞"
## [11,] "[問卦]" "問" "卦"
## [12,] "[問卦]" "問" "卦"
## [13,] "[爆卦]" "爆" "卦"
## [14,] "[新聞]" "新" "聞"
## [15,] "[FB]" "F" "B"
## [16,] "[問卦]" "問" "卦"
## [17,] "[問卦]" "問" "卦"
## [18,] "[問卦]" "問" "卦"
## [19,] "[公告]" "公" "告"
## [20,] "[徵求]" "徵" "求"
## [21,] "[徵求]" "徵" "求"
## [22,] "[協尋]" "協" "尋"
## [23,] "[公告]" "公" "告"
# match once or many times
gossip_titles_cleansed %>% head %>% str_match("[a-z]")
## [,1]
## [1,] "e"
## [2,] NA
## [3,] "e"
## [4,] "e"
## [5,] NA
## [6,] "e"
gossip_titles_cleansed %>% head %>% str_match_all("[a-z]")
## [[1]]
## [,1]
## [1,] "e"
##
## [[2]]
## [,1]
##
## [[3]]
## [,1]
## [1,] "e"
##
## [[4]]
## [,1]
## [1,] "e"
##
## [[5]]
## [,1]
##
## [[6]]
## [,1]
## [1,] "e"
# make a comprehensive function for multi-page crawling
# Crawl the latest `npage` index pages of a PTT board in parallel and
# return the posts whose title matches regex `pat`.
#
# npage:  how many index pages (newest first) to crawl.
# pat:    a regular expression matched against cleaned titles.
# target: board name (default "Gossiping").
# Returns a data.table with columns nrec (push count), title, link.
# Raises an error when the front page does not return HTTP 200.
crawlPttTitles <- function(npage, pat, target="Gossiping") {
require(data.table)
require(parallel)
require(stringr)
require(rvest)
require(httr)
# make it fast by parallelizing across all cores; on.exit guarantees
# the cluster is torn down even if the crawl errors out
cl <- makeCluster(detectCores())
on.exit(stopCluster(cl))
# crawl front page (over18 cookie bypasses the age gate)
response <- GET(sprintf("https://www.ptt.cc/bbs/%s/index.html", target),
set_cookies(over18=1))
if ( (rcode <- response$status_code) != 200 )
stop(sprintf("Got status code %s.", rcode))
# get page index: the "prev page" button links to index<N>.html, from
# which the newest page number is derived
lastpage_idx <- response$content %>% read_html %>%
html_nodes("div .btn.wide") %>%
iconv(from="utf8", to="utf8") %>% # for Windows
grep("上頁", ., value=TRUE) %>% {
as.integer(str_match(., "index([0-9]+)\\.html")[,2])
}
all_pages <- sprintf("https://www.ptt.cc/bbs/%s/index%s.html",
target, (lastpage_idx+1):(lastpage_idx-npage+2))
# worker: grep titles with given regex on one index page
oneshot <- function(page_url, pat) {
require(data.table)
require(rvest)
require(httr)
ptt <- GET(page_url, set_cookies(over18=1))$content %>% read_html
# deleted posts don't have <a href></a>; use that to mask them out
not_deleted <- html_nodes(ptt, "div .title") %>%
as.character %>%
grepl("<a href", .)
titles <- html_nodes(ptt, "div .title") %>%
html_text %>%
iconv(from="utf8", to="utf8") %>% # for Windows
gsub("\n\t*", '', .) %>%
.[not_deleted]
links <- html_nodes(ptt, "div .title a") %>%
html_attr("href") %>%
iconv(from="utf8", to="utf8") # for Windows
nrec <- html_nodes(ptt, "div .nrec") %>%
html_text %>%
iconv(from="utf8", to="utf8") %>% # for Windows
.[not_deleted]
res <- data.table(nrec=nrec, title=titles, link=links) %>%
.[grepl(pat, title)]
res
}
res <- parLapplyLB(cl, all_pages, oneshot, pat=pat)
# drop pages that matched nothing, then stack the rest
res[sapply(res, function(x) length(x) > 0)] %>% rbindlist
}
# check result
system.time(
wanted <- crawlPttTitles(1000, "^\\[問卦\\] 有沒有.*的八卦[??]")
)
## user system elapsed
## 0.295 0.036 45.860
wanted %>% head
## nrec title link
## 1: [問卦] 有沒有當完兵後變笨的八卦? /bbs/Gossiping/M.1453480878.A.2ED.html
## 2: 11 [問卦] 有沒有寒流還沒來的八卦? /bbs/Gossiping/M.1453480326.A.095.html
## 3: 6 [問卦] 有沒有冬季到台北來看雪的八卦? /bbs/Gossiping/M.1453480570.A.B1E.html
## 4: 9 [問卦] 有沒有漢字拉丁化的八卦? /bbs/Gossiping/M.1453479929.A.654.html
## 5: 6 [問卦] 有沒有冰雨的八卦? /bbs/Gossiping/M.1453480272.A.94F.html
## 6: [問卦] 有沒有非洲螢火蟲的八卦? /bbs/Gossiping/M.1453479677.A.670.html
# tidy result
wanted <- wanted[grep("爆", nrec), cnt := 150L] %>%
.[grep("^X", nrec), cnt := 0L] %>%
.[nrec == '', cnt := 0L] %>%
.[is.na(cnt), cnt:=as.integer(nrec)]
# not enough? let's do some visualization
library(RColorBrewer)
library(wordcloud)
buzz <- str_match(wanted$title, "有沒有(.*)的八卦?")[,2] %T>%
{
par(family='Heiti TC Light')
wordcloud(., wanted$cnt, min.freq=3, scale=c(4,.5),
rot.per=0, fixed.asp=FALSE,
colors=brewer.pal(8, name="Set2"), random.color=TRUE)
}
ptt_movie <- crawlPttTitles(1000, "^\\[.*雷\\]", "movie")
rei <- str_match(ptt_movie$title, "^\\[(.*)雷\\]《(.*)》") %>%
.[!is.na(.[,1]),-1] %>%
as.data.table %>% {
setnames(., c("rei", "title"))
setorder(., title)
.[, list(.N), by="title,rei"]
}
head(rei, 20) %>% write.table(row.names=FALSE, col.names=FALSE, quote=FALSE)
## 絕地救援 The Martian 好 1
## 007惡魔四伏 負 1
## 007:惡魔四伏 普 1
## 45年 好 1
## 52個星期二 好 1
## Inside Out 好 1
## 三心一意 好 1
## 久美子的奇異旅程 普 1
## 他媽媽的藏寶圖 負無 1
## 侏羅紀公園 有 1
## 保全員之死 屌 1
## 全家玩到趴 普 1
## 再見,在也不見 普 1
## 別跟山過不去 普 1
## 刺客聶隱娘 普 1
## 劇場靈 普 1
## 危樓愚夫 超好 1
## 原力覺醒 好微 1
## 可惜不是 大負 1
## 命運.晚餐 普好 1
# Crawl the first `npage` pages of a Dcard forum's JSON API in parallel
# and return the article titles matching regex `pat`.
#
# target: forum short name (default "bg").
# npage:  number of API pages to fetch.
# pat:    a regular expression matched against titles.
# Returns a character vector of matching titles (possibly empty).
crawlDcardTitles <- function(target="bg", npage, pat) {
require(data.table)
require(parallel)
require(stringr)
require(rvest)
require(httr)
cl <- makeCluster(detectCores())
on.exit(stopCluster(cl))
pages <- sprintf("https://www.dcard.tw/api/forum/%s/%s", target, 1:npage)
# worker: fetch one API page, keep titles that match the pattern;
# non-200 responses are skipped (NULL drops out in the final unlist)
crawlSinglePage <- function(page_url, pat) {
require(data.table)
require(rvest)
require(httr)
response <- GET(page_url)
if ( (rcode <- response$status_code) != 200 ) {
NULL
} else {
content(response) %>%
sapply(function(x) x$version[[1]]$title) %>%
grep(pat, ., value=TRUE)
}
}
parLapplyLB(cl, pages, crawlSinglePage, pat=pat) %>% unlist
}
system.time(
dcardtitles <- crawlDcardTitles("bg", 1000, "室友")
)
## user system elapsed
## 0.020 0.015 62.509
dcardtitles # rarer than what I thought...
## [1] "救救綠光室友"
# Crawl Dcard articles whose *content* matches `article_pat` at least
# `npat` times, optionally pre-filtering by a title pattern.
#
# target:      forum short name (default "bg").
# npage:       number of forum API pages to scan for article ids.
# title_pat:   optional regex on titles; NULL keeps every article.
# article_pat: regex counted inside each article body.
# npat:        minimum number of matches an article must contain.
# Returns a character vector of article bodies (possibly empty).
crawlDcardArticles <- function(target="bg", npage,
title_pat=NULL, article_pat, npat) {
require(parallel)
require(rvest)
require(httr)
cl <- makeCluster(detectCores())
on.exit(stopCluster(cl))
pages <- sprintf("https://www.dcard.tw/api/forum/%s/%s", target, 1:npage)
# worker 1: collect article ids from one forum page, optionally
# keeping only those whose title matches `pat`
crawlSinglePageTitleId <- function(page_url, pat) {
require(rvest)
require(httr)
response <- GET(page_url)
if ( (rcode <- response$status_code) != 200 ) {
NULL
} else {
json <- content(response)
if ( !is.null(pat) ) {
idx <- sapply(json, function(x) x$version[[1]]$title) %>%
grep(pat, .)
sapply(json[idx], function(x) x$id)
} else {
content(response) %>% sapply(function(x) x$id)
}
}
}
# worker 2: fetch one article; return its body only when the pattern
# occurs at least `npat` times, otherwise NULL (dropped by unlist)
crawlSingleArticle <- function(page_url, pat, npat) {
require(stringr)
require(rvest)
require(httr)
response <- GET(page_url)
if ( (rcode <- response$status_code) != 200 ) {
NULL
} else {
json <- content(response)
matched_len <- json$version[[1]]$content %>%
str_match_all(pat) %>%
unlist %>%
length
if ( matched_len >= npat ) {
json$version[[1]]$content
} else {
NULL
}
}
}
# stage 1: ids -> stage 2: per-article fetch + pattern count
parLapplyLB(cl, pages, crawlSinglePageTitleId, pat=title_pat) %>%
unlist %>%
sprintf("https://www.dcard.tw/api/post/%s/%s", target, .) %>%
parLapplyLB(cl, ., crawlSingleArticle,
pat=article_pat, npat=npat) %>%
unlist
}
system.time(
dcardarticles <- crawlDcardArticles("bg", 100, article_pat="室友", npat=3)
)
## user system elapsed
## 0.024 0.013 76.582
length(dcardarticles)
## [1] 5
dcardarticles %>% head %>% substr(start=1, stop=100)
## [1] "不知道是不是發男女版,但有涉及到男男女女,所以我就發這了\n\n曾經我跟一個女孩L在一起\n一開始我對她沒有任何感覺,就跟普通朋友一樣\n但她常常約我,跟我聊天...\n讓我慢慢地對她也有好感\n於是我就答應了她"
## [2] "在一次聚會中認識了你\n之後了解你很擅長交朋友而且男女都有\n是個很細心的男生\n\n後來我們\n一個禮拜有四堂共同課\n幾乎有三堂他會坐在我旁邊\n會幫對方互相留位子,翹課時也相互照應\n有時上課他會突然靠得很近,"
## [3] "看著Dcard上分享當小三遇小三養小三的文章 \n觸碰到內心深處的記憶\n\n時間可以沖淡一切\n但在深陷泥沼中 總是不知不覺向下陷\n\n-----------------正文開始---------------"
## [4] "手機排版請見諒\n\n在開始講故事以前\n我要先說說故事的背景\n我的還有她的\n\n我是一個魯了差點超過20年體重突破三位數的魯蛇(ps胖子朋友不要放棄啊冬天是你的主場!!!!\n一個台獨分子\n\n她是來從大陸來台"
## [5] "首po手機排版請見諒\U0001f60f\n上大學後,每天在狄卡爬文抽卡是小大一的必經之路。12點一過和室友討論誰抽到的卡友也是生活當中的樂趣。然而不知道是狄卡設計還是什麼的,總覺得我們寢室常被銃鏮。\n\n "
data.table
ggplot2