getwd()
# Importing Titanic
Titanic <- read.csv(file="titanic.csv")
View(Titanic)
summary(Titanic)
class(Titanic)
str(Titanic)
names(Titanic)
head(Titanic, n=10)
tail(Titanic, n=10)
Titanic[1:50,1:2]
Titanic.Sorted <- Titanic[order(Titanic$Name),]
View(Titanic.Sorted)
#factoring
Titanic$Survived <- as.factor(Titanic$Survived)
Titanic$Pclass <- as.factor(Titanic$Pclass)
Titanic$Sex <- as.factor(Titanic$Sex)
Titanic$Embarked <- as.factor(Titanic$Embarked)
# Installing ggplot2
install.packages("ggplot2")
library(ggplot2)
#Survival Rate
table(Titanic$Survived)
prop.table(table(Titanic$Survived))
#Using ggplots
ggplot(Titanic,aes(x=Survived)) +
theme_bw() + geom_bar()
#survival against gender
ggplot(Titanic,aes(x=Sex, fill = Survived)) + geom_bar() + labs(y="Passenger Count", title="Survival rate by sex")
#survival rate by class of ticket
ggplot(Titanic,aes(x=Pclass, fill = Survived)) + geom_bar()
#Survival rate by class of ticket and gender
ggplot(Titanic,aes(x=Sex, fill = Survived)) + facet_wrap( ~ Pclass) + geom_bar()
#what is the distribution of passenger ages
ggplot(Titanic,aes(x=Age, fill = Survived)) + geom_histogram(binwidth = 5)
#boxplot
ggplot(Titanic,aes(x=Survived,y=Age, color = Survived)) + geom_boxplot()
#what is survival rate by age when segmented by gender and class of ticket?
ggplot(Titanic,aes(x=Age,fill=Survived)) + facet_wrap(Sex ~ Pclass) + geom_density(alpha = 0.5)
#use hist for same question
ggplot(Titanic,aes(x=Age,fill=Survived)) + facet_wrap(Sex ~ Pclass) + geom_histogram(binwidth = 5)
#scatterplot
ggplot(Titanic,aes(x=Age, y=Fare, color=Survived, shape=Pclass)) + geom_point()
#mean age
meanAge <- mean(Titanic$Age)
#you won't get the mean because of the missing values
summary(Titanic)
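# printing meanAge shows the NA result; mean() propagates missing values unless na.rm = TRUE is set
meanAge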
#omit missing values
NAValues <- is.na(Titanic)
View(NAValues)
table(NAValues)
TitaniNew <- na.omit(Titanic)
meanAge1 <- mean(TitaniNew$Age)
#Doing it within functions
x <- c(3,NA,4,7)
mean(x, na.rm = TRUE)
meanAge <- mean(Titanic$Age, na.rm = TRUE)
#Subset
cTitanic <- subset(TitaniNew,TitaniNew$Embarked == "C")
View(cTitanic)
fTitanic <- subset(TitaniNew,TitaniNew$Sex == "female")
View(fTitanic)
aTitanic <- subset(TitaniNew,TitaniNew$Age < 20)
View(aTitanic)
afTitanic <- subset(TitaniNew, TitaniNew$Age < 20 & TitaniNew$Sex == "female" )
View(afTitanic)
#correlation
cor(TitaniNew$Age,TitaniNew$Fare)
corM <- cor(TitaniNew[c(6,10)])  # columns 6 and 10 are Age and Fare
print(corM)
#Corrplot
install.packages("corrplot")
library(corrplot)
corrplot(corM)
getwd()
# Working with a new dataset - Airquality
# Various ways to clean the data
myData <- airquality
summary(myData)
View(myData)
#creating columns and rows with NA values
myData[154,] <- c(NA)
myData[,7] <- c(NA)
myData
View(myData)
any(is.na(myData))
str(myData)
head(myData)
tail(myData)
#getting rid of columns(deleting columns or rows)
myData <- myData[,-7]
myData <- myData[-154,]
any(is.na(myData))
#how many missing values and where?
colSums(is.na(myData))
myDataClean <- na.omit(myData)
nrow(myDataClean)
myDataClean2 <- myData[complete.cases(myData),]
nrow(myDataClean2)
#Remove a column since its not part of your model
myDataClean3 <- na.omit(myData[-1])
colSums(is.na(myDataClean3))
myDataClean4 <- myData[,colSums(is.na(myData)) < 10]
colSums(is.na(myDataClean4))
myFinalData <- na.omit(myDataClean4)
View(myFinalData)
# Load mtcars dataset
View(mtcars)
head(mtcars)
myMTCars <- mtcars
library(ggplot2)
myMTCars$cyl <- as.factor(myMTCars$cyl)
#Annotation
ggplot(myMTCars, aes(x=wt, y=mpg)) +
geom_point(aes(colour=cyl)) +
geom_smooth(method = "lm", se = FALSE) +
annotate("rect", xmin=5.1, xmax=5.5, ymin=0, ymax=40, alpha=.1, fill="blue") +
annotate("text", x=4.7, y=4.4, label="Lincoln Continental") +
annotate("segment", x=5.1, xend=5.41, y=5, yend=10.3, colour="blue", size=1, arrow=arrow())
#Working with maps
install.packages('maps')
library(maps)
world_map <- map_data("world")
View(world_map)
sort(unique(world_map$region))
#Map East Asia
east_asia <- map_data("world", region = c("Japan", "China", "North Korea", "South Korea"))
ggplot(east_asia, aes(x=long, y=lat, group=group, fill=region)) +
geom_polygon(colour="black") +
scale_fill_brewer(palette="Set2")
##Create Yourself: Map of South Asia
south_asia <- map_data("world", region = c("Bangladesh", "India", "Nepal", "Pakistan", "Sri Lanka"))
ggplot(south_asia, aes(x=long, y=lat, group=group, fill=region)) +
geom_polygon(colour="black") +
scale_fill_brewer(palette="Set2")
##Create Yourself: Map the World
world_map <- map_data("world", region = sort(unique(world_map$region)))
ggplot(world_map, aes(x=long, y=lat, group=group)) +
geom_polygon(fill= "white" , colour="black")
# Get map data for USA
states_map <- map_data("state")
#'group' is a grouping variable for each polygon. A region or subregion might have multiple polygons, for example, if it includes islands.
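# For illustration, a region split into several polygons appears with several group values;
# "michigan" is used here as an assumed example (the exact split depends on the maps package version)
unique(states_map$group[states_map$region == "michigan"])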
# Import crime_map dataset
usacrime_map <- read.csv("crime_map.csv")
View(usacrime_map)
install.packages("mapproj")
library(mapproj)
ggplot(usacrime_map, aes(map_id = region, fill=Assault)) +
geom_map(map = states_map, colour="black") +
scale_fill_gradient2(low="#559999", mid="grey90", high="#BB650B",
midpoint=median(usacrime_map$Assault)) +
expand_limits(x = states_map$long, y = states_map$lat) +  # use the map's coordinates; the crime data has no long/lat columns
coord_map("polyconic")
# Basic Data Manipulation
# introduction of dplyr Package - Purpose: Data Transformation
# Install or load 'dplyr' package
install.packages("dplyr")
library("dplyr")
install.packages("hflights")
library("hflights")
# Load built in dataset hflights, you need to load the library for this
flights <- hflights
flights
View(flights)
# See dplyr cheatsheet
library(dplyr)
#install.packages('hflights')
library(hflights)
flights <- hflights
flights
flights <- tbl_df(hflights)
flights
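# Note: tbl_df() is deprecated in recent dplyr releases; as_tibble() is the current equivalent
flights <- as_tibble(hflights)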
# Review only flights from 1st Jan
# 1. base R approach to view all flights on January 1
flights[flights$Month==1 & flights$DayofMonth==1, ]
# 2. dplyr approach
# note: you can use comma or ampersand to represent AND condition
filter(flights, Month==1, DayofMonth==1)
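# the same AND condition written with an explicit ampersand
filter(flights, Month==1 & DayofMonth==1)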
# 3. use pipe for OR condition
filter(flights, UniqueCarrier=="AA" | UniqueCarrier=="UA")
# 4. you can also use %in% operator
filter(flights, UniqueCarrier %in% c("AA", "UA"))
# 5. base R approach to select DepTime, ArrTime, and FlightNum columns
flights[ , c("DepTime", "ArrTime", "FlightNum")]
# 6. SELECT: Pick columns by name
select(flights, DepTime, ArrTime, FlightNum)
# 7.
# nesting method to select UniqueCarrier and DepDelay columns and filter for delays over 60 minutes
filter(select(flights, UniqueCarrier, DepDelay), DepDelay > 60)
# 8. chaining method
flights %>%
select(UniqueCarrier, DepDelay) %>%
filter(DepDelay > 60)
# 9. arrange: Reorder rows
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(DepDelay)
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay))
# 10. mutate: Add new variables
# base R approach to create a new variable Speed (in mph)
flights$Speed <- flights$Distance / flights$AirTime*60
flights[, c("Distance", "AirTime", "Speed")]
# dplyr approach (prints the new variable but does not store it)
flights %>%
select(Distance, AirTime) %>%
mutate(Speed = Distance/AirTime*60)
# 11. store the new variable
flights <- flights %>% mutate(Speed = Distance/AirTime*60)
# 12. summarise: Reduce variables to values
flights %>%
group_by(Dest) %>%
summarise(avg_delay = mean(ArrDelay, na.rm=TRUE))
# 13. for each carrier, calculate the percentage of flights cancelled or diverted
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(mean), Cancelled, Diverted)
# funs create a list of functions calls.
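# A sketch of the same summary with current dplyr (>= 1.0), where summarise_each()/funs() are deprecated in favour of across()
flights %>%
group_by(UniqueCarrier) %>%
summarise(across(c(Cancelled, Diverted), mean))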
# 14. for each day of the year, count the total number of flights and sort in descending order
flights %>%
group_by(Month, DayofMonth) %>%
summarise(flight_count = n()) %>%
arrange(desc(flight_count))
# 15. for each destination, count the total number of flights and the number of distinct planes that flew there
flights %>%
group_by(Dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(TailNum))
# 16. for each destination, show the number of cancelled and not cancelled flights
flights %>%
group_by(Dest) %>%
select(Cancelled) %>%
table()
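# a dplyr-native alternative to the base table(): count flights by destination and cancellation status
flights %>% count(Dest, Cancelled)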
# 17. for each month, calculate the number of flights and the change from the previous month
flights %>%
group_by(Month) %>%
summarise(flight_count = n()) %>%
mutate(change = flight_count -lag(flight_count))
#dplyr on iris data
iris<- iris
View(iris)
dplyr::glimpse(iris)
utils::View(iris)
#calculate the mean sepal width of each of the species.
iris%>%
group_by(Species)%>%
summarise(avg=mean(Sepal.Width))
#arrange the width in ascending order
iris%>%
group_by(Species)%>%
summarise(avg=mean(Sepal.Width))%>%
arrange(avg)
#calculate mean of sepal width
mean(iris$Sepal.Width)
dplyr::filter(iris, Sepal.Width>4)
dplyr::slice(iris, 1:10)
dplyr::select(iris,starts_with("Petal"))
lapply(iris[,c(1:4)], mean)
count(iris, Species, wt = Sepal.Length)
#creating a new variable which is the sum of petal and sepal length
New <- mutate(iris, New = Sepal.Length + Petal.Length)
#dplyr on family income and expenditure
data<- read.csv("FamilyIncomeExpenditure.csv")
summary(data)
#create a new dataset which includes observations of households with income greater than 600,000 and whose main source of income is wages/salaries
ftdata<- filter(data, data$Total.Household.Income>600000,
data$Main.Source.of.Income=="Wage/Salaries")
#checking missing values
colSums(is.na(data))
#average of all columns (remember it will only give results for numeric columns and a warning for character columns)
summarise_each(data, funs(mean))
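# A sketch of the same idea with across(), restricted to numeric columns so the character-column warnings are avoided (assumes dplyr >= 1.0)
data %>% summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))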
#using lapply to calculate average of all columns
lapply(data[1:21], mean)
#average of household income
summarise(data, avg=mean(data$Total.Household.Income))
#creating new variable which converts source of income into numeric codes
numdata <- as.numeric(factor(data$Main.Source.of.Income))  # wrap in factor() so the character values map to numeric codes
table(numdata)
prop.table(table(numdata))
#creating new variable of total expenditure
data <- mutate(data, TotalExpenditure =
Total.Food.Expenditure + Clothing..Footwear.and.Other.Wear.Expenditure +
Housing.and.water.Expenditure + Medical.Care.Expenditure +
Transportation.Expenditure + Communication.Expenditure)
View(data)
#to store the variable using the pipe
data <- data %>% mutate(TotalExpenditure =
Total.Food.Expenditure + Clothing..Footwear.and.Other.Wear.Expenditure +
Housing.and.water.Expenditure + Medical.Care.Expenditure +
Transportation.Expenditure + Communication.Expenditure)
slice(data,10:15)
sample<-sample_n(data, 10, replace = TRUE)
View(sample)
glimpse(data)
count(data, Total.Household.Income)
#what are the average income and average expenditure according to the main source of income?
data%>%
group_by(Main.Source.of.Income)%>%
summarise_each(funs(mean),Total.Household.Income, TotalExpenditure)
#what are the average number of bedrooms and total family members according to type of household?
data%>%
group_by(Type.of.Household)%>%
summarise_each(funs(mean),Number.of.bedrooms,Total.Number.of.Family.members)
#what number of households have access to electricity according to their household type?
data%>%
group_by(Type.of.Household) %>%
select(Electricity) %>%
table()
#hierarchical clustering
View(mtcars)
mtcars <- mtcars
#scaling
mt <- scale(mtcars)
View(mt)
#distance matrix
dismat <- dist(mt, method = "euclidean")
#another way
dist(as.matrix(mt))
hc <- hclust(dismat)
View(hc)
plot(hc, hang = -1, main = "Hclustering")
hc
#HCAverage
hcAvg <- hclust(dismat, method = "average")
plot(hcAvg,hang = -1)
#cut the dendrogram
clust3 <- cutree(hcAvg, k=3)
clust3
table(clust3)
rect.hclust(hcAvg, k=3, border = "red")
#try for 4
clust4 <- cutree(hcAvg, k=4)
clust4
table(clust4)
rect.hclust(hcAvg, k=4, border = "red")
#finding optimal value
install.packages("NbClust")
library(NbClust)
nb <- NbClust(mtcars, distance = "euclidean", min.nc = 2, max.nc = 10, method = "complete",
index = "all")
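# The returned object stores the per-index recommendations; Best.nc summarises the suggested number of clusters
nb$Best.nc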
#silhouette
library(cluster)
plot(silhouette(cutree(hc,k=3),dismat))
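# The average silhouette width (third column, sil_width) gives a single quality score; values closer to 1 indicate better-separated clusters
mean(silhouette(cutree(hc, k = 3), dismat)[, "sil_width"])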
#kmeans clustering
pc<- read.csv(file="ProteinConsumption.csv")
pcn <- pc[,-1]
library("cluster")
colSums(is.na(pcn))
set.seed(666)
kout3 <- kmeans(pcn,centers = 3,nstart = 10)
kout3
kout3$cluster
table(kout3$cluster)
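# A rough elbow-method sketch for choosing k: total within-cluster sum of squares for k = 1 to 8 (the k range is illustrative)
wss <- sapply(1:8, function(k) kmeans(pcn, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster SS")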
# Last year's assignment
OAthletes<- read.csv("Olympic Athletes.csv")
m<-is.na(OAthletes)
table(m)
colSums(is.na(OAthletes))
NewOAth <- na.omit(OAthletes)
#2. Compare the average age of athletes who play Badminton and those who play Table Tennis
Badminton<- subset(OAthletes,OAthletes$Sport=="Badminton")
mean(Badminton$Age)
#can do it with NewOAth too
Badmin <- subset(NewOAth,NewOAth$Sport=="Badminton")
mean(Badmin$Age)
TableTennis<- subset(OAthletes,OAthletes$Sport=="Table Tennis")
table(OAthletes$Sport)
mean(TableTennis$Age)
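# A compact dplyr alternative for the same comparison (assumes dplyr is still loaded)
NewOAth %>%
filter(Sport %in% c("Badminton", "Table Tennis")) %>%
group_by(Sport) %>%
summarise(avg_age = mean(Age, na.rm = TRUE))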
#3a. Create a new attribute in the existing data which represents the total number of medals won by an athlete. Name this attribute "TotalMedals".
NewOAth$TotalMedals <- NewOAth$Gold.Medals + NewOAth$Silver.Medals + NewOAth$Bronze.Medals
#3b. What is the highest number of medals won by the swimmer Michael Phelps in any Olympic Games? Also mention the year of the Olympic Games when he secured the maximum number of medals.
MP <- subset(NewOAth,Athlete=="Michael Phelps" )
MP
#4. Create a separate smaller dataset with the following five attributes for the Olympic Games from 2000 to 2012: name of athlete, gold medals won, silver medals won, bronze medals won and the total number of medals won.
#Name this data set "OAthletes.small".
OAthletes.small <- NewOAth[c(1,6:9)]
library(ggplot2)
#5. Use an appropriate visual to find out the maximum number of medals secured by any athlete in swimming in the 2012 Olympics.
#You are required to save the plot in PNG format and submit it with the R script file.
Games2012<- subset(NewOAth,Year==2012 & Sport=="Swimming")
data_plot <- data.frame(Medals = c("Gold", "Silver", "Bronze", "Total"),
total = c(sum(Games2012$Gold.Medals), sum(Games2012$Silver.Medals),
sum(Games2012$Bronze.Medals), sum(Games2012$TotalMedals)))
ggplot(data_plot, aes(Medals, total))+
geom_bar(stat = "identity")
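# To save the plot in PNG format as the question requires (the file name is illustrative)
ggsave("medals_2012_swimming.png", width = 6, height = 4)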
#Visualization 1
myMTCars <- mtcars
Cyl <- as.factor(myMTCars$cyl)
ggplot(data = myMTCars, mapping = aes(x = wt, y = mpg, color=Cyl)) + geom_point()+
stat_smooth(method = 'lm')
# Visualization 2
library(ggplot2)
density <- ggplot(data=iris, aes(x=Sepal.Width))
density + geom_histogram(binwidth=0.2, color="black", fill="steelblue", aes(y=..density..)) +
geom_density(stat="density", alpha=0.2, fill="blue") +
xlab("Sepal Width") + ylab("Density") + ggtitle("Histogram & Density Curve")