Fundamentals of R Programming -
m1<-matrix(3:8,nrow=2) #R can infer the - Data Frame Subsetting
- getwd(), setwd(x) other dimension - df[3,2], df[3, “age”] #select single element
- ls(): list all the objects in the current workspace - m2<-3:8 - df[3,] #select entire row
- rm(x): remove object “x” from workspace dim(m2)<-c(3,2) #3 rows, 2 col - df[,2], df[, “age”] #select entire col
- dir(): list all files and subfolders in the current wd - Can recycle too - df[c(3,5), c(2,3)], df[c(3,5), c("age", "child")]
- list.files(), file.exists(x) - m1<-matrix(3:5,ncol=3,nrow=2) #select multiple portions
- m1<-matrix(3:6,ncol=3,nrow=2) - df[2] - returns a dataframe
Atomic Data Types #4 is not a factor of 6, warning given - df$age - returns a vector
Logical - m3<-matrix(3:8,ncol = 3,nrow = - df[[“age”]] - returns a vector
Integer 2,byrow = TRUE) - df[[2]] - returns a vector
Numeric - rownames(m3)<-c("Row1","Row2") - Data Frame Extension
- as.numeric(“abc”) = NA [coercion] - colnames(m3)<-c("Col.1","Col.2","Col.3") - df$height <- height, df[["height"]] <- height
- as.integer(x) - is.matrix(x), is.vector(x) - cbind(df, weight)
- as.integer(5>6) = 0 - Matrix multiplication: G %*% F - rbind(df, tom)
Character - Transpose: t(m) - Data Frame Sorting
● Length of string - Common vectors: Unit, Zero 1. sort(df$age)
○ nchar("Singapore") = 9 - Common matrices: Unit, Zero 2. ranks <- order(df$age)
● Find the starting pos of a substring - Diagonal matrix: (1) diag(m) or (2) diag(c(2,2,5)) #the order of the current indexes if sorted
○ regexpr(“ex”, “longtext”) = 6 - Identity matrix: diag(3) 3. df[ranks,]
○ regexpr("exs","longtext") = -1 - Inverse matrix: solve(m) - Other sorting related functions:
○ regexpr("a","banana") = 2 - Determinant: det(m) max(df$age), which.max(df$age),
● Find the starting pos of all substrings - Num of linearly independent columns: rank(df$age) #just ranks the elements
○ gregexpr("a","banana") = [2,4,6] - matA <- qr(m) - Data Frame Indexing
● Find pos of strings in a vector containing the - matA$rank - index <- df$height > 171
substring - dim(m), nrow(m), ncol(m) ## [1] FALSE TRUE FALSE FALSE FALSE
○ txt<-c("arm","foot","lefroo", "bafoobar") - colSums(m), rowSums(m), sum(m) - sum(index) #1
grep("foo", txt) = [2,4] - colMeans(m), rowMeans(m), mean(m) - df$name[index] #Pete
● Extract a substring from a string - cbind(m1, m2), rbind(m1, m2) - which(df$height > 171)
○ substr("Singapore",2,4) = "ing" Factor - match(c("Anne", "Julia", "Cath"), df$name)
● Substitute substring in a string - special variables used to store categorical ## [1] 1 4 5
○ sub("or","es","Singapore") = "Singapese" variables, more efficient use of memory (stored - c("Anne", "Julia", "Cath", "Bob") %in%
● Substitute all substrings in a string as a vector of integer values) df$name
○ gsub("a","o","banana") = "bonono" - a<-c(0,1,0,0,1) ## [1] TRUE TRUE TRUE FALSE
Complex value a.f<-factor(a,labels = c("Male","Female")) - Basic Data Wrangling
- class(-1-2i) = “complex” - a<-c("One","Two","Three","One","Three") - library(dplyr)
- sqrt(-1+0i)=0+1i a.f<-factor(a) - mutate(df, bmi = weight/height^2*10000)
- levels of the factor are ordered, by default, by ||
Data Structures alphabetical order (e.g. “Three” before “Two”) df$bmi <- df$weight/df$height^2*10000
- But, can manually assign the order of the levels: - attach(df)
a<-c("One","Two","Three","One","Three") df$bmi <- weight/height^2*10000
a.f<-factor(a,levels=c("One","Two","Three")) detach(df)
b<-as.integer(a.f) - filter(df, bmi > 18.5 & bmi < 24.9)
b = [1 2 3 1 3] ||
- Generating factors by specifying the pattern of df[df$bmi > 18.5 & df$bmi < 24.9,]
Vector their levels: - select(df, name, height, weight, bmi)
- Instantiation: gl(2,8,labels=c("male","female")) ||
1) a<-c(1,3,4) List df[,c("name", "height", "weight", "bmi")]
2) a<-c("desks" = 1, "tables" = 3, "chairs" = 4) - Mike<-list(Name="Mike",Salary=10000,Age=43, - %>%
3) a<-c(desks = 1, tables = 3, chairs = 4) Children=c("Tom","Lily","Alice"))
- Coercion (coerced to the most flexible type; - Mike[2], Mike[‘Salary’], Mike$Salary, Mike[c(1,2)], Useful functions:
order: logical < integer < numeric < complex < Mike[c("Salary","Age")] - round(num, 2) #round to 2dp
character < list): - str(): Displays the internal structure of R object - unique(data$marital) #output all unique values
- a <- c("Name",1) = [“Name” “1”] - Nested list: list(list(1,2),c(3,4)) - table(df$age)
typeof(a) = “character” - Combine 2 lists with c(): c(list(1,2),c(3,4)) - group_by(…)
Array - summarise(Count=n()) #creates a new data
- a<-c(TRUE, FALSE, TRUE) - array(1:12,c(2,2,3)) frame. It returns one row for each combination of
a<-as.integer(a) grouping variables
a = [1 0 1] Other useful functions: - Center: mean(), median()
- Other ways to instantiate: - unlist(strsplit(salary_data, " ")) #split the string - Spread: sd(), IQR(), mad()
- seq(1,9,2) → [1 3 5 7 9] into components (vectors) - Range: min(), max(),
- rep(c(2,3,4), 3) → [2 3 4 2 3 4 2 3 4] - which(logical object) - Position: first(), last(), nth(),
- Vector arithmetic (+ - * / ^ on all elements in the - which.max(v) #index of the max value in a vector - Count: n(), n_distinct()
vector) - cat(...) #concat and print - Logical: any(), all()
- Arithmetic using another vector of same length - Concat vectors after converting to character - slice_max(colname), slice_min(colname) #select rows with highest or
- sum(x), mean(x), prod(x) - paste0(1:10, collapse=””) = “12345678910” lowest values of a variable
- Vector subsetting: - paste(“1”, “2”, sep = ", ") = “1, 2” - tidyr::spread(x, y) #create new cols from the x
- a[1], a[“desks”], a[c(1,2)], a[c(“desks”, - cut(1:10, c(0,5,10), labels=c("low", "high")) values, and y is the spread data under those
“tables”)] #low low low low low high high high high high new cols
- a[-3] = a[1:2] (factor)
- a[c(TRUE, FALSE, TRUE)] --------------------------------------------------------------------------
- a[c(TRUE, FALSE)] → will recycle Basic Data Wrangling warning(), stop(), tryCatch({expr},
- a[c(TRUE, FALSE, TRUE, TRUE)] → will Data Frame error=function(cond){}, warning = function(cond){},
have NA - df <- data.frame(v1, v2, v3) finally={})
Matrix - names(df) <- c("Name", "Age", "Child")
- Instantiation - df <- data.frame("Name" = name, "Age" = age,
- m1<-matrix(3:8,ncol=3,nrow=2) #fills the "Child" = child) #can do without the “”s also
first col first - stringsAsFactors = FALSE
Advanced Data Wrangling - nodes <- xmlChildren(root) #list of child nodes - inner_join(tab1, tab2) #no NAs
readr: nodes[[2]] - full_join(tab1, tab2) #all the NAs
- books <- getNodeSet(data, - semi_join(tab1, tab2) #keep the part of the first
"/library/catalog/book") #list of nodes that match table for which we have information in the
the criterion second table, but doesn’t add the columns of the
- books <- getNodeSet(data, second
"/library/catalog/book[@type='HardCover']") - anti_join(tab1, tab2) #keep the part of the first
- xmlToList(data) table for which we have NO information in the
- xmlToDataFrame(books) second table, but doesn’t add the columns of the
Data Problems: second
readxl: - Causes of missing data: Set Operators
- Structural missing data - intersect(1:10, 6:15)
- Human error in data entry or collection - library(dplyr)
- System error in capturing data tab1 <- tab[1:5,]
- Loopholes in business process tab2 <- tab[3:7,]
- Handling Missing Data intersect(tab1, tab2) #rows 3:5
- Data can be completed w info in other - union(tab1, tab2)
Normal data loading: attributes (e.g. lat and lon) - setdiff(tab1, tab2) #output the rows that are in
- employees <- read.delim("employees.txt", - Data can be completed w info from other tab1 but not in tab2
sep=",") records (e.g. find the developer of the - setequal(tab1, tab2) ## [1] FALSE
read.table("employees.txt", header=T, sep=",") same property)
- library(readxl) - If there is no way we can replenish missing Programming Structure and Functions
read_excel("employees.xlsx", data from alternative sources, can create a - if(cond){if_output}else{else_output}
sheet="employees") new col and label NAs as “Unknown” - ifelse(cond, if_output, else_output)
- read_excel("employees.xlsx", sheet=2) - Handle Missing Numeric Data #works on vectors
Download Data from the Internet - Convert numeric data into categorical data - z <- c(TRUE, TRUE, FALSE)
- library(curl) - Replace the data with value 0 any(z) ## [1] TRUE
library(XML) - Replace the data with mean value - z <- c(TRUE, TRUE, FALSE)
theurl <- "[url http link]" - Common causes of Data problems all(z) ## [1] FALSE
url <- curl(theurl) - Data entry problem - avg <- function(x){
urldata <- readLines(url) - Logical error s <- sum(x)
data <- readHTMLTable(urldata, - Outdated n <- length(x)
stringsAsFactors = FALSE) - Different standard s/n
class(data) #list - What causes logical error in data? }
- Why CSV? - Logical mistakes in the system where the - for (i in range of values){
- More efficient in storing huge amounts of data was generated from operations that use i, which is changing across
data - Mishandling done by the data processor if the range of values
- Can be used across platforms data is secondary data }
Import Data more Efficiently - Data might be INCONSISTENT Other functions:
- system.time(care.data<-read.csv("hospital-data.csv")) Reshaping data - apply(x, MARGIN, FUNC, ...)
- library(tidyr) #apply a function to the margin of a matrix or a
new_tidy_data <- wide_data %>% gather(year, dataframe
VS fertility, '1960':'2015') #convert wide to tidy data #for matrix, margin=1 indicates rows, 2 for cols,
- new_tidy_data <- wide_data %>% gather(year, c(1,2) for rows and cols
colclass <- c("character","character","character", fertility, -country) #specify to not gather country e.g. apply(z,2,CountOddorEven, TRUE)
"character","character", - new_tidy_data <- wide_data %>% gather(year, - x <- list(A=1:4, B=seq(0.1,1,by=0.1))
"character","factor","factor","factor", fertility, -country, convert = TRUE) lapply(x, mean) #returns list
"character","factor","factor","factor") #convert to integer - sapply(x, mean) #returns vector instead of list
- new_wide_data <- new_tidy_data %>% - vapply(x, mean, numeric(1))
system.time(care.data<-read.csv("hospital-data.csv", spread(year, fertility) #tidy to wide #performs like sapply() but can specify return
colClasses = colclass)) Separate and Unite value type
Open Data - dat %>% separate(key, c("year", - rapply(x, function(x){x^2}) #returns a vector
- www.kaggle.com "first_variable_name", "second_variable_name"), #recursive apply
- www.kdnuggets.com/dataset/index.html fill = "right") - mapply(rep, 1:5, c(4,4,4,4,4))
- data.gov.sg #separate key col into the 3 cols #take multiple vectors as inputs
Obtain data via JSON Interface #fill tells us where to place the NA values if there ||
- JSON - lightweight data-interchange format - isn’t a 3rd variable matrix(c(rep(1,4), rep(2,4), rep(3,4), rep(4,4),
completely language independent - standard that - dat %>% separate(key, c("year", rep(5,4)), nrow = 4, ncol = 5)
is accepted by all programming languages - used "variable_name"), extra = "merge") - x <- 1:10
as a medium to exchange data between different #can also merge the extra variable y <-
parties - dat %>% separate(key, c("year", factor(c("A","A","A","B","B","B","B","C","C","C"))
- library(jsonlite) "variable_name"), extra = "merge") %>% tapply(x,y,sum) #sumofA, sumofB, sumofC
url <- spread(variable_name, value) #applies the specified FUNC to each group of an
"https://api.data.gov.sg/v1/transport/carpark-availability" - dat %>% separate(key, c("year", array, grouped based on levels of certain factors
"first_variable_name", "second_variable_name"), - Pivot table
data <- fromJSON(url) fill = "right") %>% unite(variable_name, - tapply(murders$total, murders$region,
a <- as.data.frame(data$items$carpark_data) first_variable_name, second_variable_name) sum)
Introduction to XML %>% spread(variable_name, value) %>% - tapply(murders$total,
- XML - eXtensible Markup Language - designed rename(fertility = fertility_NA) cut(murders$population, breaks = c(0,
to store and transport data Combining Data 1e+06, 1e+07, 1e+08)), mean)
- HTML - HyperText markup Language - designed - identical(results_us_election_2016$state, - Note: tapply() only works if the studied
to display data murders$state) ## [1] FALSE numbers are in one vector
- XML is able to store all kinds of data - For HTML, #cannot simply join 2 tables - split(murders, murders$region)
the text must be pre-defined - left_join(murders, results_us_election_2016, by #split a dataframe into a list of data frames based
- library(XML) = "state") on a factor array
data <- xmlParse("books.xml") - tab1 %>% left_join(tab2)
- root <- xmlRoot(data) #simply the top level node - tab1 %>% right_join(tab2)