library(data.table)
library(stringr)
# download the R source code if you don't have it yet
# download.file("http://cran.csie.ntu.edu.tw/src/base/R-3/R-3.2.1.tar.gz", "R-3.2.1.tar.gz")
# untar("R-3.2.1.tar.gz")
# get all file extensions in R source code
all_src_files <- list.files("./R-3.2.1/src/", recursive=TRUE, full.names=TRUE)
all_ext <- str_extract(all_src_files, "\\.[a-zA-Z]+$")
file_counts <- as.data.table(sort(table(all_ext), decreasing=TRUE), keep.rownames=TRUE)
library(wordcloud)
par(mar=c(0, 0, 0, 0))
wordcloud(file_counts$V1, file_counts$V2, min.freq=1, scale=c(7,1), rot.per=.35, random.order=F)
library(ggvis)
file_counts[1:10] %>% ggvis(~reorder(V1, -V2), ~V2) %>% layer_bars() %>%
add_axis('x', title="Source Type") %>% add_axis('y', title="File Count")
# now focus on only source type files
src_ext <- c('c', 'h', 'R', 'f')
srcfiles <- list()
for ( ext in src_ext )
srcfiles[[ext]] <- grep(sprintf("\\.%s$", ext), all_src_files, value=TRUE)
ext_dist <- sapply(srcfiles, length)
round(ext_dist / sum(ext_dist), 2)
## c h R f
## 0.37 0.15 0.46 0.02
As you can see, R is heavily written in C. Indeed, most primitive functions are written in C, and vectorization is implemented in C; the speedy part of R is its C code, and that is exactly why it is speedy.
For example, + (the add function) is a primitive function written in C and vectorized there.
`+` # no R source code printed because it is written in C
## function (e1, e2) .Primitive("+")
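To get a feel for what C-level vectorization buys, compare adding two long vectors with the vectorized + against an explicit R-level loop. This is only a minimal sketch (the vectors are made up and the exact timings will vary by machine):
x <- runif(1e6); y <- runif(1e6)
system.time(z1 <- x + y)    # vectorized: the element-wise loop runs in C
system.time({               # the same work as an explicit R-level loop
  z2 <- numeric(length(x))  # pre-allocate
  for ( i in seq_along(x) ) z2[i] <- x[i] + y[i]
})
identical(z1, z2)           # same result, very different speed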
In addition to the distribution of file counts in the R source, we can take a further look at the distribution of line counts for each source type.
# unix only
wc_res <- sapply(srcfiles, function(x)
sum(as.integer(sapply(sprintf("wc -l < %s", x), system, intern=TRUE))))
round(wc_res / sum(wc_res), 2)
## c h R f
## 0.45 0.07 0.20 0.27
line_counts <- as.data.table(wc_res, keep.rownames=TRUE)
line_counts %>% ggvis(~rn, ~wc_res) %>% layer_bars() %>%
add_axis('x', title="Source Type") %>% add_axis('y', title="Line Count", title_offset=60)
Finally, we find that only 20.09% of the R source lines are written in R, while 45.2% are written in C and 27.31% in Fortran.
Rcpp
Why Rcpp?
Imagine you are dealing with hexdumps of files and planning to use their n-gram frequencies as features. Your input data may look like this:
samplefile <- "index.html"
system(sprintf("xxd -g1 %s | head", samplefile), intern=TRUE) # unix only
## [1] "0000000: 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 3e 0a <!DOCTYPE html>."
## [2] "0000010: 0a 3c 68 74 6d 6c 20 78 6d 6c 6e 73 3d 22 68 74 .<html xmlns=\"ht"
## [3] "0000020: 74 70 3a 2f 2f 77 77 77 2e 77 33 2e 6f 72 67 2f tp://www.w3.org/"
## [4] "0000030: 31 39 39 39 2f 78 68 74 6d 6c 22 3e 0a 0a 3c 68 1999/xhtml\">..<h"
## [5] "0000040: 65 61 64 3e 0a 0a 3c 6d 65 74 61 20 63 68 61 72 ead>..<meta char"
## [6] "0000050: 73 65 74 3d 22 75 74 66 2d 38 22 3e 0a 3c 6d 65 set=\"utf-8\">.<me"
## [7] "0000060: 74 61 20 68 74 74 70 2d 65 71 75 69 76 3d 22 43 ta http-equiv=\"C"
## [8] "0000070: 6f 6e 74 65 6e 74 2d 54 79 70 65 22 20 63 6f 6e ontent-Type\" con"
## [9] "0000080: 74 65 6e 74 3d 22 74 65 78 74 2f 68 74 6d 6c 3b tent=\"text/html;"
## [10] "0000090: 20 63 68 61 72 73 65 74 3d 75 74 66 2d 38 22 20 charset=utf-8\" "
We first tidy the input into a long character sequence…
hexdumplines <- system(sprintf("xxd -g1 %s | cut -d \" \" -f 2-17", samplefile), intern=TRUE) # unix only
hexdump <- unlist(strsplit(paste(hexdumplines, collapse=' '), ' '))
hexdump <- hexdump[hexdump != '']
str(hexdump)
## chr [1:1780975] "3c" "21" "44" "4f" "43" "54" "59" "50" "45" "20" "68" "74" "6d" "6c" ...
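Note that xxd is unix only. As a hedged, platform-independent alternative, we could read the bytes directly in R (assuming the file fits comfortably in memory); as.character() on a raw vector yields the same two-character hex strings:
# read the file as raw bytes and convert to hex strings, e.g. "3c" "21" "44" ...
raw_bytes <- readBin(samplefile, what="raw", n=file.info(samplefile)$size)
hexdump_alt <- as.character(raw_bytes)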
Then what? Try looping in native R code, which is rather straightforward.
# first trial: use native R loop (could be terrible for large-scale processing...)
generateNgramFreqR <- function(x, n) {
len <- length(x) - (n - 1)
out <- character(len) # pre-allocate size, very important
for ( i in 1:len ) {
out[i] <- x[i]
if ( n > 1 )
for ( j in 1:(n-1) )
out[i] <- paste0(out[i], x[i+j])
}
unlist(as.list(sort(table(out), decreasing=TRUE)))
}
system.time(r_res <- generateNgramFreqR(hexdump, 2))
## user system elapsed
## 11.452 0.231 12.246
r_res[1:10]
## 2532 2533 3245 4141 3243 3230 3239 3238 2537 3925
## 140886 38566 23687 21677 20037 18886 18817 18718 18242 17731
Or we can use an external library that focuses on text mining, for example tau (Buchta et al. 2015).
# second trial: use external library
generateNgramFreqTau <- function(x, n) {
require(tau)
tau_res <- textcnt(x, n=n, method="string", split="[[:space:]]+")
names(tau_res) <- gsub(' ', '', names(tau_res))
sort(tau_res, decreasing=TRUE)
}
system.time(tau_res <- generateNgramFreqTau(hexdump, 2))
## user system elapsed
## 7.656 0.213 7.960
tau_res[1:10]
## 2532 2533 3245 4141 3243 3230 3239 3238 2537 3925
## 140886 38566 23687 21677 20037 18886 18817 18718 18242 17731
# check if the results are equal
identical(tau_res, r_res)
## [1] TRUE
But the performance of both solutions above is unsatisfying when it comes to large-scale problems. Think about dealing with hexdump strings from tens of thousands of files: we will need a more efficient solution.
Sometimes, to make things better, all you have to do is find ANOTHER package that handles the problem at hand more powerfully. But sometimes you just don't have much to choose from. In this very case of generating n-grams, though there is another well-known general text mining package, tm (Feinerer and Hornik 2015; Feinerer, Hornik, and Meyer 2008), available, it is not suitable as a faster solution to our problem. Indeed, it is prohibitively slow.
# third trial: use another external library, hopefully faster? (Nope.)
generateNgramFreqTm <- function(x, n) {
require(tm)
op <- options(mc.cores=NULL)
options(mc.cores=1)
on.exit(options(op))
ngramTokenizer <- function(x) RWeka::NGramTokenizer(x, RWeka::Weka_control(min=n, max=n))
corp <- Corpus(VectorSource(paste(x, collapse=' ')))
tdm <- TermDocumentMatrix(corp, control=list(tokenize=ngramTokenizer))
res <- rowSums(as.matrix(tdm))
names(res) <- gsub(' ', '', names(res))
sort(res, decreasing=TRUE)
}
# full-size input not run, too slow
length(hexdumplines)
## [1] 111311
system.time(tm_res <- generateNgramFreqTm(hexdumplines[1:10000], 2))
## user system elapsed
## 14.295 0.271 14.643
tm_res[1:10]
## 2532 2533 3239 3238 3243 3245 3344 4425 3925 2537
## 21069 6151 3534 3533 3529 3479 3464 3354 3248 3204
Let’s try Rcpp to see what we could further achieve.
# final trial: use Rcpp to write the function in C++
library(Rcpp)
cppFunction('
CharacterVector generateNgramFreq (CharacterVector hexvector, int ngram) {
int len = hexvector.size() - (ngram - 1);
CharacterVector out(len);
for (int i = 0; i < len; i++) {
out(i) = hexvector[i];
for (int j = 1; j < ngram; j++) {
out(i) += hexvector[i+j];
}
}
return out;
}')
generateNgramFreq
## function (hexvector, ngram)
## .Primitive(".Call")(<pointer: 0x10b559510>, hexvector, ngram)
generateNgramFreqCpp <- function(x, n)
unlist(as.list(sort(table(generateNgramFreq(x, n)), decreasing=TRUE)))
system.time(cpp_res <- generateNgramFreqCpp(hexdump, 2))
## user system elapsed
## 0.396 0.045 0.442
cpp_res[1:10]
## 2532 2533 3245 4141 3243 3230 3239 3238 2537 3925
## 140886 38566 23687 21677 20037 18886 18817 18718 18242 17731
identical(cpp_res, r_res)
## [1] TRUE
Now it’s time to benchmark these solutions to see the performance difference. You’ll see that the Rcpp solution is more than 10X faster than its alternatives. Depending on the scale of your problem, the gap could be even larger.
library(microbenchmark)
microbenchmark(generateNgramFreqR(hexdump, 2),
generateNgramFreqTau(hexdump, 2),
generateNgramFreqCpp(hexdump, 2),
times=10)
## Unit: milliseconds
## expr min lq mean median uq max neval
## generateNgramFreqR(hexdump, 2) 9706.8352 10054.8189 10419.5566 10532.4037 10725.8482 11177.2476 10
## generateNgramFreqTau(hexdump, 2) 6125.0972 7266.1923 7515.5358 7630.1056 7925.4634 8134.4764 10
## generateNgramFreqCpp(hexdump, 2) 416.7993 453.6783 614.8183 599.8169 746.5086 907.3503 10
Move on to the next case. Suppose you’d like to monitor the system logs on your computer, counting the processes that trigger logging in a sliding-window manner.
syslogname <- "system.log" # this is for Mac, for Linux you may use "syslog" instead
syslogs <- grep(syslogname, dir("/var/log", full.names=TRUE), value=TRUE)
loglines <- unlist(lapply(syslogs,
function(x) {
con <- gzfile(x)
on.exit(close(con))
readLines(con)
}))
head(loglines)
## [1] "Jul 28 00:06:40 kyledembpr kernel[0]: Previous sleep cause: 5"
## [2] "Jul 28 00:06:40 kyledembpr kernel[0]: The USB device Card Reader (Port 3 of Hub at 0x15000000) may have caused a wake by being disconnected"
## [3] "Jul 28 00:06:40 kyledembpr.client.tw.trendnet.org discoveryd[635]: Basic DNSResolver UDNSServer:: PowerState is DarkWake"
## [4] "Jul 28 00:06:40 kyledembpr kernel[0]: AppleThunderboltNHIType2::prePCIWake - power up complete - took 64307 us"
## [5] "Jul 28 00:06:40 kyledembpr kernel[0]: AppleThunderboltGenericHAL::earlyWake - complete - took 14 milliseconds"
## [6] "Jul 28 00:06:40 kyledembpr kernel[0]: IOThunderboltSwitch<0xffffff8025614c00>(0x0)::listenerCallback - Thunderbolt HPD packet for route = 0x0 port = 11 unplug = 0"
Here we use regular expressions to extract the fields we are interested in, convert the datetime strings to POSIX time objects, and take a look at the most frequent process names.
logdat <- as.data.table(str_match(loglines, "(^.*[0-9]+ [0-9]{2}:[0-9]{2}:[0-9]{2})([^:]+:)")[,2:3])
setnames(logdat, c("ts", "cate"))
logdat[, `:=`(cate=str_match(cate, " ([a-zA-Z]+)\\[[0-9]+\\]")[,2],
ts=as.POSIXct(ts, format="%b %e %H:%M:%S"))]
logdat[, tsint:=as.integer(ts)]
logdat <- logdat[!is.na(cate)]
setorder(logdat, ts)
logdat
## ts cate tsint
## 1: 2015-07-21 00:00:01 SubmitDiagInfo 1437408001
## 2: 2015-07-21 00:00:05 powerd 1437408005
## 3: 2015-07-21 00:00:05 WindowServer 1437408005
## 4: 2015-07-21 00:00:05 WindowServer 1437408005
## 5: 2015-07-21 00:00:10 powerd 1437408010
## ---
## 19360: 2015-07-28 11:18:55 Dock 1438053535
## 19361: 2015-07-28 11:18:56 Dock 1438053536
## 19362: 2015-07-28 11:18:56 Dock 1438053536
## 19363: 2015-07-28 11:18:56 Dock 1438053536
## 19364: 2015-07-28 11:18:56 Dock 1438053536
sort(table(logdat$cate), decreasing=TRUE)[1:10]
##
## Dock kernel WindowServer discoveryd
## 9081 2742 1618 1339
## sharingd storedownloadd GoogleSoftwareUpdateDaemon Book
## 632 585 364 337
## nsurlstoraged CalendarAgent
## 287 240
Suppose the task is to count, for each process, the number of records within the last 5 minutes, evaluated every 1 minute. This is a moving-window operation which could be painful should you use pure R code to solve it. Of course there are dedicated packages aiming at this kind of task, but again, for this very case, none of them to the best of my knowledge works efficiently.
# first trial: again, naive approach first
movingWindowCountR <- function(DT, wsize, interval, select=NULL) {
require(data.table)
sts <- min(DT$tsint)
ninter <- ceiling(diff(range(DT$tsint)) / interval)
if ( !is.null(select) )
DT <- DT[cate %in% select]
res <- list()
for ( ca in unique(DT$cate) ) {
sts_ <- sts
cnt <- integer(ninter)
for ( i in 1:ninter ) {
cnt[i] <- nrow(DT[cate == ca][tsint <= sts_ & tsint > sts_ - wsize])
sts_ <- sts_ + interval
}
res[[ca]] <- cnt
}
res
}
# run for only two processes because it is too slow...
system.time(r_res <- movingWindowCountR(logdat, 300, 60, c("kernel", "discoveryd")))
## user system elapsed
## 32.629 0.679 34.034
Let’s plot the moving-window-count time series for process “kernel”.
kernel_mvcnt <- data.table(count=r_res[["kernel"]])
kernel_mvcnt[, t:=1:.N]
kernel_mvcnt %>% ggvis(~t, ~count) %>% layer_lines()
The bottleneck is, of course, the loop. R is not good at explicit looping. When you just can't find a way to avoid coding like this, it is probably a strong sign that you should use Rcpp.
But before that, let’s struggle a little bit more.
# second trial: play tricky
movingWindowCountR2 <- function(DT, wsize, interval, select=NULL) {
require(data.table)
allts <- data.table(tsint=min(DT$tsint):max(DT$tsint))
if ( !is.null(select) )
DT <- DT[cate %in% select]
res <- lapply(split(DT, DT$cate),
function(x) {
x <- merge(x, allts, by="tsint", all=TRUE)
tsunit_cnt <- x[, list(cnt=sum(!is.na(cate))), by="tsint"]
tsunit_cumcnt <- c(rep(0, wsize), cumsum(tsunit_cnt$cnt))
as.integer(tail(tsunit_cumcnt, -wsize) - head(tsunit_cumcnt, -wsize))
})
lapply(res, function(x) x[seq(1, length(x), interval)])
}
# run for all processes
system.time(r2_res <- movingWindowCountR2(logdat, 300, 60))
## user system elapsed
## 63.921 4.575 70.119
identical(r_res[["kernel"]], r2_res[["kernel"]])
## [1] TRUE
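The trick in movingWindowCountR2 is that a per-second count vector can be turned into sliding-window counts by differencing two shifted cumulative sums, so no explicit window loop is needed. A minimal sketch with made-up numbers (toy counts, window size of 3 seconds):
cnt <- c(1, 0, 2, 1, 0, 3)              # toy per-second event counts
wsize <- 3                              # window size in seconds
cum <- c(rep(0, wsize), cumsum(cnt))    # cumulative counts, padded with zeros
tail(cum, -wsize) - head(cum, -wsize)   # events within the last 3 seconds, per second
# gives 1 1 3 3 3 4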
Our second solution is considerably faster, though a little bit harder to code and maintain. What if we simply rewrite the loop in the first solution in Rcpp?
# final trial
cppFunction('
NumericVector movingWindowCount(NumericVector ts, int wsize, int interval, int sts, int nt) {
NumericVector out(nt);
for (int i = 0; i < nt; i++) {
NumericVector cnts = ts[(ts <= sts) & (ts > sts - wsize)];
out(i) = cnts.size();
sts += interval;
}
return out;
}')
movingWindowCountCpp <- function(DT, wsize, interval, select=NULL) {
require(data.table)
sts <- min(DT$tsint)
ninter <- ceiling(diff(range(DT$tsint)) / interval)
if ( !is.null(select) )
DT <- DT[cate %in% select]
lapply(split(DT, DT$cate),
function(x) as.integer(movingWindowCount(x$ts, wsize, interval, sts, ninter)))
}
# run for all processes
system.time(rcpp_res <- movingWindowCountCpp(logdat, 300, 60))
## user system elapsed
## 5.337 0.349 5.780
identical(rcpp_res[["kernel"]], r_res[["kernel"]])
## [1] TRUE
length(rcpp_res)
## [1] 86
xargs <- list(logdat, 300, 60, c("kernel", "discoveryd"))
microbenchmark(do.call(movingWindowCountR, xargs),
do.call(movingWindowCountR2, xargs),
do.call(movingWindowCountCpp, xargs),
times=3)
## Unit: seconds
## expr min lq mean median uq max neval
## do.call(movingWindowCountR, xargs) 35.646060 35.828431 36.963566 36.010802 37.622318 39.233835 3
## do.call(movingWindowCountR2, xargs) 1.466984 1.593590 1.680163 1.720196 1.786753 1.853309 3
## do.call(movingWindowCountCpp, xargs) 1.101173 1.115042 1.122784 1.128911 1.133590 1.138268 3
For a benchmark that processes only a small number of processes, solution 2 may outperform the Rcpp solution. Nevertheless, the latter scales considerably better. Let’s examine this point further. (It also depends on your machine: for example, on my MacBook 12" solution 2 is faster overall at 2 processes, but on my MacBook Pro 13" the Rcpp solution consistently outperforms the others.)
allcate <- unique(logdat$cate)
nround <- 20
elapsed_r2 <- numeric(nround)
elapsed_rcpp <- numeric(nround)
for ( i in 1:nround ) {
xargs <- list(logdat, 300, 60, allcate[1:i])
stime <- proc.time()
do.call(movingWindowCountR2, xargs)
elapsed_r2[i] <- (proc.time() - stime)["elapsed"]
stime <- proc.time()
do.call(movingWindowCountCpp, xargs)
elapsed_rcpp[i] <- (proc.time() - stime)["elapsed"]
}
bres <- rbind(data.table(n=1:nround, t=elapsed_r2, alg="r2"),
data.table(n=1:nround, t=elapsed_rcpp, alg="rcpp"))
bres %>% ggvis(~n, ~t, fill=~alg) %>% layer_points() %>%
add_axis('x', title="nround") %>% add_axis('y', title="Seconds Elapsed")
As you can see, the Rcpp solution is already remarkably fast on a fair-sized problem, and sometimes (as in the current case) the speed gain grows rapidly with the scale of the problem. The promising point is that we don’t actually add much complexity to our coding. (Okay, maybe just a little if one is new to Rcpp.) Only a small part of the code is rewritten in Rcpp, and the performance gain is tremendous.
In this last example, we combine data.table with Rcpp functions. There are other cases where you find the solutions offered by existing packages lacking in efficiency. Let's assume a MOOC-like student-course activity log in its simplest format: we log all activities of all students in all courses, with timestamps. (Assume 26 courses in total, labeled by English letters; students are indexed by incremental integers.)
# generate a simulated data as our playground
genSimData <- function(nlog, nuser, seed) {
set.seed(seed)
dat <- data.table(user=sample(1:nuser, nlog, replace=TRUE,
prob=rbeta(1:nuser, .1, 2)),
course=sample(letters, nlog, replace=TRUE, prob=1:26),
ts=runif(nlog, as.integer(as.POSIXct("2015-01-01")),
as.integer(as.POSIXct("2015-06-30"))))
dat[, t:=as.POSIXct(ts, origin="1970-01-01")]
setorder(dat, user, course, ts)
dat
}
simlog <- genSimData(nlog=1e6, nuser=1e3, seed=528491)
head(simlog)
## user course ts t
## 1: 2 a 1420055427 2015-01-01 03:50:26
## 2: 2 a 1420219054 2015-01-03 01:17:33
## 3: 2 a 1420320164 2015-01-04 05:22:44
## 4: 2 a 1421227260 2015-01-14 17:21:00
## 5: 2 a 1421467857 2015-01-17 12:10:57
## 6: 2 a 1422182483 2015-01-25 18:41:22
Now, based on such a dataset, assume we'd like to extract a feature: how many courses in total had a student registered for at the time of his/her last activity record in each course? For example, if student 1's last activity in course a is today, in how many courses in total (including course a) does she already have at least one activity record?
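To make the target output concrete, here is a toy sketch with made-up numbers (one hypothetical student and three courses, not drawn from the simulated log):
# hypothetical first/last activity times (in days) of one student in three courses
toy <- data.table(course=c("a", "b", "c"),
                  first=c(1, 5, 20),    # first activity in each course
                  last=c(30, 8, 25))    # last activity in each course
# for each course, count the courses already started by that course's last activity
toy[, res:=sapply(last, function(l) sum(first <= l))]
toy
# course a: last=30, all three courses started by then -> 3
# course b: last=8, only courses a and b started by then -> 2
# course c: last=25, all three courses started by then -> 3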
This problem requires a somewhat twisted by-group operation on a data.frame, and hence could be challenging if the scale is huge.
# naive approach first
adHocOperateR <- function(DT) {
require(data.table)
DT <- copy(DT)
DT[, `:=`(tsmin=min(ts), tsmax=max(ts)), by="user,course"]
DT <- unique(DT[, list(user, course, tsmin, tsmax)])
res <- integer()
for ( i in unique(DT$user) ) {
tmpdat <- DT[user == i]
for ( j in tmpdat$course ) {
res <- c(res, sum(tmpdat$tsmin <= tmpdat[course == j, tsmax]))
}
}
cbind(DT[, list(user, course)], res)
}
system.time(res_r1 <- adHocOperateR(simlog))
## user system elapsed
## 10.231 0.142 10.555
res_r1[sample(nrow(res_r1), 10)]
## user course res
## 1: 658 z 25
## 2: 386 q 26
## 3: 504 y 24
## 4: 481 p 15
## 5: 284 j 26
## 6: 184 q 26
## 7: 629 k 26
## 8: 884 d 26
## 9: 118 k 25
## 10: 320 b 26
The naive approach scales well in nlog but not in nuser, because the latter determines how many loop iterations actually run in our solution. Relying only on data.table, can we improve? The answer is yes. The following solution uses the so-called “grouping by each i” technique to drastically improve performance.
# let's utilize data.table better
adHocOperateR2 <- function(DT) {
require(data.table)
DT <- copy(DT)
DT[, `:=`(tsmin=min(ts), tsmax=max(ts)), by="user,course"]
tmpdat1 <- unique(DT[, list(user, course, tsmin)])
tmpdat2 <- unique(DT[, list(user, course, tsmax)])
setkey(tmpdat1, user)
setkey(tmpdat2, user)
cbind(tmpdat1[, list(user, course)],
res=tmpdat1[tmpdat2, list(res=sum(tsmin <= tsmax)), by=.EACHI][, res])
}
system.time(res_r2 <- adHocOperateR2(simlog))
## user system elapsed
## 0.120 0.024 0.145
setkey(res_r1, user)
identical(res_r1, res_r2)
## [1] TRUE
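The key step above is the join tmpdat1[tmpdat2, ..., by=.EACHI]: for every row of tmpdat2 (one user-course pair with its tsmax), j is evaluated over that row's matching rows in tmpdat1 (all of the user's courses with their tsmin). A minimal sketch on toy data (made-up values):
X <- data.table(user=c(1, 1, 2), tsmin=c(10, 20, 5), key="user")   # plays the role of tmpdat1
Y <- data.table(user=c(1, 2), tsmax=c(15, 30), key="user")         # plays the role of tmpdat2
X[Y, list(res=sum(tsmin <= tsmax)), by=.EACHI]
# user 1: tsmin 10, 20 vs tsmax 15 -> res 1; user 2: tsmin 5 vs tsmax 30 -> res 1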
Time for Rcpp. Is it possible to be even faster?
cppFunction('
NumericVector adHocOperate (NumericVector first, NumericVector last) {
int len = first.size();
NumericVector out(len);
for (int i = 0; i < len; i++) {
out(i) = 0;
for (int j = 0; j < len; j++) {
if (first(j) <= last(i)) {
out(i) += 1;
}
}
}
return out;
}')
adHocOperateCpp <- function(DT) {
require(data.table)
DT <- copy(DT)
DT[, `:=`(tsmin=min(ts), tsmax=max(ts)), by="user,course"]
DT <- unique(DT[, list(user, course, tsmin, tsmax)])
cbind(DT[, list(user, course)],
res=as.integer(DT[, list(res=adHocOperate(.SD$tsmin, .SD$tsmax)), by="user"][, res]))
}
system.time(res_cpp <- adHocOperateCpp(simlog))
## user system elapsed
## 0.102 0.012 0.116
setkey(res_cpp, user)
identical(res_cpp, res_r1)
## [1] TRUE
Well, the result suggests that the Rcpp solution performs roughly on par with the data.table solution. Time to inspect the scalability issue.
# make the data bigger
simlog <- genSimData(nlog=1e7, nuser=1e4, seed=528491)
microbenchmark(adHocOperateR2(simlog),
adHocOperateCpp(simlog),
times=10)
## Unit: seconds
## expr min lq mean median uq max neval
## adHocOperateR2(simlog) 1.515727 1.594130 1.658769 1.667782 1.712563 1.821021 10
## adHocOperateCpp(simlog) 1.263045 1.344909 1.513549 1.562751 1.600723 1.666989 10
Finally, the Rcpp solution still outperforms, but not by a wide margin. The data.table solution does a fairly good job. The reason is not surprising: data.table itself is largely written in C.
Here is the takeaway from the 3rd example: when performance is the key factor in your computing job and you find yourself facing a severe performance bottleneck, you either rewrite the key operation of your code in Rcpp, or you choose a package that is written in C and hopefully solves your problem. To be a more independent data analyst, it is really worth learning Rcpp. As you can see in the above three examples, sometimes it only takes a little simple coding and your task will be done lightning fast.
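As a hedged side note for when the C++ part grows bigger: instead of the inline cppFunction() calls used above, the same code can live in a standalone C++ source file (or a code string) compiled with Rcpp::sourceCpp(), where the // [[Rcpp::export]] attribute exposes a function to R. A minimal sketch with a toy function (sumCpp is hypothetical, not from the examples above):
library(Rcpp)
sourceCpp(code='
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double sumCpp(NumericVector x) {
  double total = 0;
  for (int i = 0; i < x.size(); i++) total += x[i];
  return total;
}')
sumCpp(c(1, 2, 3))  # should return 6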
sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.3 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tm_0.6-2 NLP_0.1-8 tau_0.0-18 microbenchmark_1.4-2 Rcpp_0.11.5
## [6] ggvis_0.4.2 wordcloud_2.5 RColorBrewer_1.1-2 stringr_1.0.0 data.table_1.9.5
##
## loaded via a namespace (and not attached):
## [1] formatR_1.2 plyr_1.8.2 RWekajars_3.7.12-1 tools_3.2.0 digest_0.6.8 jsonlite_0.9.16
## [7] evaluate_0.7 gtable_0.1.2 rstudioapi_0.3.1 shiny_0.12.1 DBI_0.3.1 yaml_2.1.13
## [13] parallel_3.2.0 proto_0.3-10 rJava_0.9-6 dplyr_0.4.2 knitr_1.10.5 grid_3.2.0
## [19] R6_2.0.1 rmarkdown_0.7 RWeka_0.4-24 ggplot2_1.0.1 reshape2_1.4.1 magrittr_1.5
## [25] codetools_0.2-11 scales_0.2.4 htmltools_0.2.6 MASS_7.3-40 assertthat_0.1 mime_0.3
## [31] xtable_1.7-4 colorspace_1.2-6 httpuv_1.3.2 stringi_0.4-1 lazyeval_0.1.10 munsell_0.4.2
## [37] slam_0.1-32 chron_2.3-45
Buchta, Christian, Kurt Hornik, Ingo Feinerer, and David Meyer. 2015. Tau: Text Analysis Utilities. http://CRAN.R-project.org/package=tau.
Dowle, M, T Short, S Lianoglou, A Srinivasan with contributions from R Saporta, and E Antonyan. Data.table: Extension of Data.frame. https://github.com/Rdatatable/data.table/wiki.
Eddelbuettel, Dirk, and Romain Francois. 2011. “Rcpp: Seamless R and C++ Integration.” Journal of Statistical Software 40 (8): 1–18. http://www.jstatsoft.org/v40/i08.
Feinerer, Ingo, and Kurt Hornik. 2015. Tm: Text Mining Package. http://CRAN.R-project.org/package=tm.
Feinerer, Ingo, Kurt Hornik, and David Meyer. 2008. “Text Mining Infrastructure in R.” Journal of Statistical Software 25 (5): 1–54. http://www.jstatsoft.org/v25/i05/.
Hornik, Kurt, Christian Buchta, and Achim Zeileis. 2009. “Open-Source Machine Learning: R Meets Weka.” Computational Statistics 24 (2): 225–32. doi:10.1007/s00180-008-0119-7.
Wickham, Hadley. 2015. Stringr: Simple, Consistent Wrappers for Common String Operations. http://CRAN.R-project.org/package=stringr.