Kyle Chung
November 28th, 2013 at Trend Micro
illust. by RedEyeHare
sudo apt-get install r-base-coresudo yum install R-core (recommend using EPEL)bigmemory, rPython%load_ext rmagicYour role matters!
sessionInfo() # type ?sessionInfo for document help
getwd()
setwd('C:\\Dropbox')
# or simply:
setwd('C:/Dropbox')
save.image() # default to destined file '.RData'
q()
edit(file='C:/Program Files/R/R-3.0.0/etc/Rprofile.site')
.libPaths()
[1] "C:/Users/everdark/Documents/R/win-library/3.0"
[2] "C:/Program Files/R/R-3.0.2/library"
(.libPaths(c(.libPaths(), getwd())))
[1] "C:/Users/everdark/Documents/R/win-library/3.0"
[2] "C:/Program Files/R/R-3.0.2/library"
[3] "C:/Dropbox/R/Course"
object.size to check memory usage of object(s)gc for garbage collection
library or require to load external packagesinstall.packages to install CRAN packagesas function for type convertionx <- 999L; y = 0 # append 'L' to force integer type
x # the same as print(x)
[1] 999
print(y)
[1] 0
power2 <- function(z) z^2
power2(z = 3); z # Error: object 'z' not found
power2(z <- 4); z # z is assigned numeric 4
z <- 'whatever remains'
tellTheTruth <- function(z) return(TRUE)
tellTheTruth(z <- 'I won\'t appear.')
[1] TRUE
z
[1] "whatever remains"
changeX <- function(x) {
x <- x + 100
gvar <<- 'I am a GLOBAL variable.'
x
}
x <- 0; changeX(x=x)
[1] 100
x
[1] 0
gvar
[1] "I am a GLOBAL variable."
x <- 123
typeof(x)
[1] "double"
x <- 'abc'
typeof(x)
[1] "character"
rm(x) # remove the object
y <- x <- 'abc'
identical(tracemem(x), tracemem(y))
[1] TRUE
y <- 'abc'
identical(tracemem(x), tracemem(y))
[1] FALSE
identical(x, y)
[1] TRUE
V <- c(10L,20,30) # c for concatenate
typeof(V) # notice for the implicit casting
[1] "double"
V[2] # try ?'['
[1] 20
TRUE, FALSE, T, Ftypeof or mode to check the variable type
?modex <- integer()
c(typeof(x), class(x)) # try also storage.mode(x)
[1] "integer" "integer"
y <- data.frame()
c(typeof(y), class(y))
[1] "list" "data.frame"
(mixed <- c('123', 123)) # implicit casting
[1] "123" "123"
(mixed <- c(mixed, 4000)[2:4]) # V points to a new vector
[1] "123" "4000" NA
names(mixed) <- c('1st', '2nd', '3rd'); mixed
1st 2nd 3rd
"123" "4000" NA
c(1,2) * c(100,100,100) # '*' is element-wise, see ?'*'
Warning: longer object length is not a multiple of shorter object length
[1] 100 200 100
%*%c(1,2,3) %*% c(100,100,100) # inner product
[,1]
[1,] 600
c(1,2,3) %o% c(100,100,100) # outer product
[,1] [,2] [,3]
[1,] 100 100 100
[2,] 200 200 200
[3,] 300 300 300
V <- c(10,20,30,40)
V[c(2,4)]
[1] 20 40
V[2:length(V)] # see ?seq for a generalized ':' function
[1] 20 30 40
bool_index <- c(TRUE,FALSE,FALSE,FALSE)
V[bool_index]
[1] 10
V1 <- c(10,20,30,40)
(V2 <- V1[c(rep(1,2),2:4)]) # take the 1st element twice
[1] 10 10 20 30 40
V2[-c(2,4)] # negative indexing
[1] 10 20 40
V2[V2 > mean(V2)] # indeed, '>'(V2, mean(V2))
[1] 30 40
names(V1) <- c('one', 'two', 'three', 'four')
str(V1) # a very useful function to check structure of an object
Named num [1:4] 10 20 30 40
- attr(*, "names")= chr [1:4] "one" "two" "three" "four"
V1[c('two', 'three')]
two three
20 30
V1[grep('^t', names(V1))] # see also ?grepl and ?regex
two three
20 30
(M <- matrix(c(1,2,3,4), nrow=2, byrow=FALSE))
[,1] [,2]
[1,] 1 3
[2,] 2 4
str(M) # simply a numeric vector with 2 dimensions
num [1:2, 1:2] 1 2 3 4
dim(M) # try length(M)
[1] 2 2
colnames(M) <- c('c1', 'c2')
rownames(M) <- c('r1', 'r2')
identical(M[,2], M[1:2,'c2']) # test exact equality
[1] TRUE
M[1,1] <- 0; M
c1 c2
r1 0 3
r2 2 4
dim(M[1,])
NULL
str(M[1,]) # rowname attribute is dropped, too
Named num [1:2] 0 3
- attr(*, "names")= chr [1:2] "c1" "c2"
str(M[1,,drop=FALSE]) # you'll need this one day
num [1, 1:2] 0 3
- attr(*, "dimnames")=List of 2
..$ : chr "r1"
..$ : chr [1:2] "c1" "c2"
cbind or rbindM1 <- matrix(1:4, 2, 2) # use positional argument
M2 <- matrix(0, 2, 2) # recycle occurs
cbind(M1, M2)
[,1] [,2] [,3] [,4]
[1,] 1 3 0 0
[2,] 2 4 0 0
unique.matrix(rbind(M1, M2)) # distinct by row
[,1] [,2]
[1,] 1 3
[2,] 2 4
[3,] 0 0
There are many implementations of sparse matrix in R. The base package
Matrixprovide a fairly flexible classdgCMatrix(which, of course, extends classmatrix).
i <- c(1,3:8); j <- c(2,9,6:10); x <- 7 * (1:7)
(A <- Matrix::sparseMatrix(i, j, x = x))
8 x 10 sparse Matrix of class "dgCMatrix"
[1,] . 7 . . . . . . . .
[2,] . . . . . . . . . .
[3,] . . . . . . . . 14 .
[4,] . . . . . 21 . . . .
[5,] . . . . . . 28 . . .
[6,] . . . . . . . 35 . .
[7,] . . . . . . . . 42 .
[8,] . . . . . . . . . 49
mylist <- list(name='Kyle', gender='male', 18)
str(mylist) # notice that tag name is not required
List of 3
$ name : chr "Kyle"
$ gender: chr "male"
$ : num 18
nested <- list(old=mylist, new='new')
(atomic <- c(c(1,2,3), 4, 5)) # no nesting
[1] 1 2 3 4 5
str(recursive <- list(list('a',2), TRUE))
List of 2
$ :List of 2
..$ : chr "a"
..$ : num 2
$ : logi TRUE
c(
mylist$name, # use '$' operator
mylist[['name']], # use '[[' with tag name
mylist[[1]] # use '[[' with numeric index
)
[1] "Kyle" "Kyle" "Kyle"
mylist[1:2] # try also mylist[c('name', 'gender')]
$name
[1] "Kyle"
$gender
[1] "male"
mylist$newtag <- 'something new'
mylist[[3]] <- NULL
mylist[['gender']] <- 'female'
mylist
$name
[1] "Kyle"
$gender
[1] "female"
$newtag
[1] "something new"
unlist to convert recursive vectors into atomic oneextends('data.frame') # return superclass of data.frame
[1] "data.frame" "list" "oldClass" "vector"
DataFrames class from the pandas package in Python?DF <- data.frame(small=letters, big=toupper(letters), rn=round(runif(26),1))
head(DF) # print the first 6 rows
small big rn
1 a A 0.9
2 b B 0.6
3 c C 0.2
4 d D 0.6
5 e E 0.4
6 f F 0.5
merge for inner/outter joinrbind, cbind (these functions are polymorphic)aggregate, sweepreshapesplit.data.frameorderPackage
data.tableprovides a new class 'data.table' which extends 'data.frame' but with more efficient computing capability for fairly large dataset.
(fchar <- factor(sample(letters[1:5], 5, replace=TRUE)))
[1] d b b c c
Levels: b c d
c(class(fchar), typeof(fchar))
[1] "factor" "integer"
str(fchar)
Factor w/ 3 levels "b","c","d": 3 1 1 2 2
x <- factor(c(1, 10, 100, 10))
levels(x)
[1] "1" "10" "100"
as.character(x)
[1] "1" "10" "100" "10"
as.numeric(x)
[1] 1 2 3 2
grep('00', x) # grep use implicit casting
[1] 3
NA, which represents missing valuesc(length(NA), length(NULL))
[1] 1 0
NaN, which represents not-a-number0/0
[1] NaN
if (expression_A) {
code_block_A
} else if (expression_B) {
code_block_B
} else {
code_block_C
}
ifelsex <- c(1, 2, 3)
ifelse(x > mean(x), 'Yes', 'NO')
[1] "NO" "NO" "Yes"
& and |&& and ||x <- c(1, 0, 1, 1)
y <- c(1, 0, 0, 0)
x & y # return Boolean vector
[1] TRUE FALSE FALSE FALSE
x && y # only the FIRST element is tested
[1] TRUE
TRUE
if (1) print(TRUE) is valid (not for Java you know…)i <- 1
repeat {
i <- i + 1
if (i > 10) break
}
i
[1] 11
while (TRUE) loopnext to skip the current iterationbreak to break the entire loop{} is required for multi-line blocki <- 1
while (i < 10) {
i <- i + 1
}
i
[1] 10
for (i in 1:10) print(i) # result not printed
for (c in letters) print(c)
getA <- c(1, 2, 3)
B <- c(100, 200, 300)
for (obj in c('A', 'B')) print(mean(get(obj)))
[1] 2
[1] 200
for (obj in list(A,B)) print(mean(obj)) # result not printed
try for simple applicationtryCatch for more flexible and formal usageresult <-
tryCatch({
# main expression block
# last valuated expression will be returned in case of success
}, warning=function(cond) {
# warning handling block, wrapped in function
# argument is a condition class auto-generated in case of warning
}, error=function(cond) {
# error handling block, wrapped in function
# argument is a condition class auto-generated in case of error
}, finally {
# the finally block that always evaluated
})
expr <- quote(1 + '1')
out <- tryCatch({
eval(expr)
}, error=function(cond) {
# simply return the condition object
cond
}, finally={
# be ware that the finally block is NOT wrapped in function
print('Finally!')
})
[1] "Finally!"
out
<simpleError in 1 + "1": non-numeric argument to binary operator>
class(out)
[1] "simpleError" "error" "condition"
try_list <- list(expr1=quote(1 + 1), expr2=quote(1 + '1'))
sapply(try_list, eval)
Error: non-numeric argument to binary operator
tryEval <- function(expr) {
out <- tryCatch({
eval(expr)
}, error=function(cond) {
message('the original error message:', '\n', cond)
return(NA)
})
out
}
sapply(try_list, tryEval)
the original error message:
Error in 1 + "1": non-numeric argument to binary operator
expr1 expr2
2 NA
Nothing more than a specific typed object, but deserve its own section cause it is the core of R programming.
doSomething <- function(x) {
x # this is the same as return(x)
}
function itself is a function object to create function
{ is a function, too
?'{'environment(doSomething)
<environment: R_GlobalEnv>
ls to list all variables in global (default) scopereturn is not require
return for a conditional breakstop to break the function with error messagewarning to generate warning messagejustInteger <- function(x)
if ( !is.integer(x) ) stop('please give me integer')
justInteger(1L)
justInteger('1')
Error: please give me integer
What do you think about the follwing prgramming fact in R?
V1 <- c(10,20,30,40); names(V1)
NULL
names(V1) <- c('one', 'two', 'three', 'four'); V1
one two three four
10 20 30 40
Apparently names is a function, and we assign values to the result of its call
?'names<-' and mode(get('names<-'))V1 <- c(10,20,30,40)
V1 <- 'names<-'(V1, value=c('one', 'two', 'three', 'four'))
V1
one two three four
10 20 30 40
V1[1] <- 11 will replace the first element of V1 with numeric 11'$<-' for list operationplot, print, summary, and many others are polymorphicmethods to query applicaple classes for a given generic functionmethods(summary)[1:20] # restrict to first 20 methods
[1] "summary.aov" "summary.aovlist" "summary.aspell"
[4] "summary.connection" "summary.data.frame" "summary.Date"
[7] "summary.default" "summary.ecdf" "summary.factor"
[10] "summary.ggplot" "summary.glm" "summary.infl"
[13] "summary.lm" "summary.loess" "summary.loglm"
[16] "summary.manova" "summary.matrix" "summary.mlm"
[19] "summary.negbin" "summary.nls"
# reduce sample dataset
iris1 <- iris[,1:2]
# linear regression without intercept
lm_model <-
lm(data=iris1,
Sepal.Length~ -1+Sepal.Width)
# summarize a data.frame
summary(iris1)
Sepal.Length Sepal.Width
Min. :4.30 Min. :2.00
1st Qu.:5.10 1st Qu.:2.80
Median :5.80 Median :3.00
Mean :5.84 Mean :3.06
3rd Qu.:6.40 3rd Qu.:3.30
Max. :7.90 Max. :4.40
summary(lm_model)
Call:
lm(formula = Sepal.Length ~ -1 + Sepal.Width, data = iris1)
Residuals:
Min 1Q Median 3Q Max
-2.524 -1.036 0.482 0.990 2.841
Coefficients:
Estimate Std. Error t value Pr(>|t|)
Sepal.Width 1.8690 0.0326 57.2 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.23 on 149 degrees of freedom
Multiple R-squared: 0.957, Adjusted R-squared: 0.956
F-statistic: 3.28e+03 on 1 and 149 DF, p-value: <2e-16
apply
lapply
sapply for a vector versionmapply
(M <- matrix(sample(1:15), 3, 5))
[,1] [,2] [,3] [,4] [,5]
[1,] 15 2 6 10 4
[2,] 8 13 14 1 12
[3,] 11 5 7 3 9
apply(M, 2, mean)
[1] 11.333 6.667 9.000 4.667 8.333
apply(M, 2, function(x) sum(x)/length(x)) # use anonymous function
[1] 11.333 6.667 9.000 4.667 8.333
# Indeed we have...
colMeans(M)
[1] 11.333 6.667 9.000 4.667 8.333
str(cars) # remember that a data.frame is a list
'data.frame': 50 obs. of 2 variables:
$ speed: num 4 4 7 7 8 9 10 10 10 11 ...
$ dist : num 2 10 4 22 16 10 18 26 34 17 ...
apply(cars, 2, mean) # return a vector
speed dist
15.40 42.98
lapply(cars, mean) # return a list
$speed
[1] 15.4
$dist
[1] 42.98
c(mode(get('if')), mode(get('for')))
[1] "function" "function"
c(mode(get('+')), mode(get('&&')), mode(get('<-')))
[1] "function" "function" "function"
mode(get('{'))
[1] "function"
Now we shall go back to the section for error handling.
A function applied to a vector is actually applied to each element individually (i.e., A map function)
import time
mil = range(1, 1000001)
start_time = time.time()
mil = [m + 100 for m in mil]
print time.time() - start_time, 'seconds'
0.101000070572 seconds
mil <- c(1:1000000)
system.time( # explicit loop with side effect
for ( i in seq_len(length(mil)) ) mil[i] <- mil[i] + 100
)
user system elapsed
3.12 0.00 3.13
mil <- c(1:1000000)
system.time( # high-level loop wraper without side effect
mil <- sapply(mil, function(x) x + 100)
)
user system elapsed
3.89 0.03 3.93
mil <- c(1:10000000) # one more zero
system.time( # What the hell are you two just doing?
mil <- mil + 100
)
user system elapsed
0.15 0.00 0.16
Here comes the data.
?files for more)
dir: list files under the curent working directoryfile.info: examine information of a given filefile.exist: check the existence of a filesystem to execute your system commmand
system('ls') is conceptually the same as dirshell for more flexible shell commanddir()[grep('\\.html$', dir())]
# is the same as
shell('ls | grep "\\.html$"', intern=TRUE)
writeLinestemp_file <- file()
cat(
'this is the first line
and this is the second line
this is the end of file
', file=temp_file) # be ware of the quoting position
readLines(temp_file) # connection will be auto-closed
[1] "this is the first line" "and this is the second line"
[3] "this is the end of file"
?connectionfilegzfile
url
XML, rjsonreadLines
temp_file <- file()
cat('1,,three', file=temp_file)
scan(temp_file, what=list(col1=1L,col2=1L,col=''), sep=',')
$col1
[1] 1
$col2
[1] NA
$col
[1] "three"
?read.table
read.csv, read.fwf, …nrows=100 to read the initial 100 rows for testing purposecomment.char='' to disable comment char if you are dealing with messy string data to prevent from long string crashstringsAsFactors=FALSE if you don't like characters to be converted into factorscolClasses to pre-defined data schema for speedupfill=TRUE if your data is unbalanced (have missing columns for some rows)data.frameThere are a bunch of ways to do data visualization in R, with either base environment or leveraing other add-on pacakges and even other languages– for example, JavaScript.
plot, barplot, boxplot, matplot, …
plot is a generic function doing X-Y plotingbarplot is used to generate barplot (what else?)boxplot for boxplotmatplot for multi-line plotingrgl::plot3dcase <- iris[,3:5]
colnames(case) = gsub('\\.', '', colnames(case))
case$Name <- paste('N',
round(runif(nrow(case)),3)*1000,
sep='')
head(case)
PetalLength PetalWidth Species Name
1 1.4 0.2 setosa N904
2 1.4 0.2 setosa N503
3 1.3 0.2 setosa N133
4 1.5 0.2 setosa N858
5 1.4 0.2 setosa N659
6 1.7 0.4 setosa N741
plot(x=case$PetalLength,
y=case$PetalWidth,
xlab='PetalLength',
ylab='PetalWidth',
col=c(1:3)[case$Species],
pch=19)
legend('bottomright',
levels(case$Species),
col=c(1:3), pch=19)
title('Scatter Plot of Iris Data')
library(ggplot2)
AES <- aes(x=PetalLength,
y=PetalWidth,
group=Species,
color=Species)
ggplot(case, AES) +
geom_point(size=3) +
theme(legend.position='top')
rcharts, d3network, googleVis, …gridSVG, SVGAnnotation, …shiny (a RStudio product)library(rCharts)
nvd3 <- nvd3Plot(
PetalLength ~ PetalWidth, data=case, type='scatterChart',
group='Species',
xAxis=list(axisLabel='PetalWidth'),
yAxis=list(axisLabel='PetalLength'),
chart=list(showDistX=TRUE, showDistY=TRUE)
); nvd3
see my RPubs article for the source code.
library(ggplot2)
library(gridSVG)
AES <- aes(x=PetalLength,
y=PetalWidth,
group=Species,
color=Species)
gg <- ggplot(case, AES) +
geom_point(size=3) +
theme(legend.position='top')
ggsvg <- grid.export("ggplot_scatter.html",
xmldecl=NULL, addClasses=TRUE)
file.show('ggplot_scatter.html')
Zoom out to check scalability!
# read back svg inline
raw_svgcode <- readLines('./svg_demo/ggplot_scatter.html', warn=FALSE)
# add d3.js library path
d3js_library_url <- 'https://dl.dropboxusercontent.com/u/210177/d3/d3.v3.min.js'
modified_svgcode <- c(paste('<script src="',
d3js_library_url,
'"></script>',
sep=''), raw_svgcode)
# convert data to json (in object-of-objecs format)
head(gg$data)
tojson <- apply(gg$data, 1, function(x) list(x))
tfile <- file()
cat(
'<script> data=',
rjson::toJSON(lapply(tojson, function(x) unlist(x))),
'</script>'
,
file=tfile
)
importData <- readLines(tfile, warn=FALSE)
close(tfile)
modified_svgcode <- c(modified_svgcode, importData)
# bind data
tfile <- file()
cat(
'
<script>
scatterpoints = d3.select(".points")
.selectAll("use")
.data(data);
</script>
',
file=tfile
)
bindData <- readLines(tfile, warn=FALSE)
close(tfile)
modified_svgcode <- c(modified_svgcode, bindData)
# add simple tooltip (use browser default title facility)
tfile <- file()
cat(
'
<script>
d3.selectAll("use")
.append("title")
.text(function(d) {return d.Name;});
</script>
',
file=tfile
)
addTooltip <- readLines(tfile, warn=FALSE)
close(tfile)
modified_svgcode <- c(modified_svgcode, addTooltip)
# add hyperlink event
tfile <- file()
cat(
'
<script>
d3.selectAll("use")
.on("click", function(d) {
var url = "http://google.com/search?q=";
url += d.Name;
window.location.href = url;}
);
</script>
',
file=tfile
)
addHref <- readLines(tfile, warn=FALSE)
close(tfile)
modified_svgcode <- c(modified_svgcode, addHref)
writeLines(modified_svgcode, './svg_demo/ggplot_scatter_modified.html')
file.show('./svg_demo/ggplot_scatter_modified.html')
ff, ffbase, sqldfparallel for embarrassingly parallel problemrandomForest, arules, e1071, …RODBCrmr2, rhdfs, rhbase, …