0% found this document useful (0 votes)
37 views17 pages

#Factoring: #Importingtitanic

This document loads and explores a Titanic passenger dataset using R. It imports the dataset, sorts and factors variables, calculates summary statistics, and creates various visualizations of survival rates by variables like gender, class, and age using ggplot. It also explores missing age data, calculates correlations, and maps passenger survival trends. In the second half, it loads airquality data and demonstrates data cleaning techniques like handling NAs and subsets the mtcars dataset to create annotated plots.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
37 views17 pages

#Factoring: #Importingtitanic

This document loads and explores a Titanic passenger dataset using R. It imports the dataset, sorts and factors variables, calculates summary statistics, and creates various visualizations of survival rates by variables like gender, class, and age using ggplot. It also explores missing age data, calculates correlations, and maps passenger survival trends. In the second half, it loads airquality data and demonstrates data cleaning techniques like handling NAs and subsets the mtcars dataset to create annotated plots.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 17

# Titanic data import and first exploration -------------------------------
getwd()

# Read the passenger data from a CSV in the working directory.
# Assumes titanic.csv has Name, Survived, Pclass, Sex, Embarked columns
# (confirmed by the usage below).
Titanic <- read.csv(file = "titanic.csv")

# First pass over the data frame.
View(Titanic)           # interactive spreadsheet view
summary(Titanic)        # per-column summary statistics
class(Titanic)          # should print "data.frame"
str(Titanic)            # column types and a preview of values
names(Titanic)          # column names
head(Titanic, n = 10)   # first 10 rows
tail(Titanic, n = 10)   # last 10 rows
Titanic[1:50, 1:2]      # first 50 rows of the first two columns

# Sort passengers alphabetically by name.
Titanic.Sorted <- Titanic[order(Titanic$Name), ]
View(Titanic.Sorted)

# Convert the categorical columns to factors so plots and models treat
# them as discrete categories rather than numbers/strings.
for (col in c("Survived", "Pclass", "Sex", "Embarked")) {
  Titanic[[col]] <- as.factor(Titanic[[col]])
}

# Load ggplot2, installing it first only if it is missing. The original
# called install.packages() unconditionally, re-downloading the package
# on every run of the script.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Overall survival rate: raw counts, then proportions.
table(Titanic$Survived)
prop.table(table(Titanic$Survived))

# ggplot2 visualisations of survival patterns -----------------------------

# Overall survival counts.
ggplot(Titanic, aes(x = Survived)) +
  theme_bw() +
  geom_bar()

# Survival by gender. (In the original the title string was split across
# two physical lines by the document extraction, which is a syntax
# error; it is rejoined here.)
ggplot(Titanic, aes(x = Sex, fill = Survived)) +
  geom_bar() +
  labs(y = "Passenger Count", title = "Survival rate by sex")

# Survival rate by class of ticket.
ggplot(Titanic, aes(x = Pclass, fill = Survived)) +
  geom_bar()

# Survival rate by class of ticket and gender.
ggplot(Titanic, aes(x = Sex, fill = Survived)) +
  facet_wrap(~ Pclass) +
  geom_bar()

# Distribution of passenger ages, split by survival.
ggplot(Titanic, aes(x = Age, fill = Survived)) +
  geom_histogram(binwidth = 5)

# Age distribution by survival, as a boxplot.
ggplot(Titanic, aes(x = Survived, y = Age, color = Survived)) +
  geom_boxplot()

# Survival rate by age, segmented by gender and ticket class (densities).
ggplot(Titanic, aes(x = Age, fill = Survived)) +
  facet_wrap(Sex ~ Pclass) +
  geom_density(alpha = 0.5)

# The same question with histograms.
ggplot(Titanic, aes(x = Age, fill = Survived)) +
  facet_wrap(Sex ~ Pclass) +
  geom_histogram(binwidth = 5)

# Scatterplot of age vs fare; Fare was passed positionally in the
# original, which aes() maps to y — made explicit here.
ggplot(Titanic, aes(x = Age, y = Fare, color = Survived, shape = Pclass)) +
  geom_point()

# Handling missing values in Age ------------------------------------------

# This returns NA: mean() propagates missing values by default.
meanAge <- mean(Titanic$Age)
summary(Titanic)

# Locate the NAs: a logical matrix, TRUE where a cell is missing.
NAValues <- is.na(Titanic)
View(NAValues)
table(NAValues)

# Drop every row containing at least one NA; now the mean is defined.
TitaniNew <- na.omit(Titanic)
meanAge1 <- mean(TitaniNew$Age)

# Alternatively, handle the NAs inside the call with na.rm = TRUE.
x <- c(3, NA, 4, 7)
mean(x, na.rm = TRUE)
meanAge <- mean(Titanic$Age, na.rm = TRUE)

# Subsetting the cleaned data ----------------------------------------------
# subset() evaluates conditions inside the data frame, so bare column
# names are the idiomatic form (the original repeated "TitaniNew$",
# which is redundant and error-prone).

# Passengers who embarked at port "C".
cTitanic <- subset(TitaniNew, Embarked == "C")
View(cTitanic)

# Female passengers.
fTitanic <- subset(TitaniNew, Sex == "female")
View(fTitanic)

# Passengers younger than 20.
aTitanic <- subset(TitaniNew, Age < 20)
View(aTitanic)

# Female passengers younger than 20.
afTitanic <- subset(TitaniNew, Age < 20 & Sex == "female")
View(afTitanic)

# Correlation between age and fare -----------------------------------------
cor(TitaniNew$Age, TitaniNew$Fare)

# Select the two numeric columns by name rather than by fragile
# positional index (columns 6 and 10 in the original layout were
# Age and Fare).
corM <- cor(TitaniNew[c("Age", "Fare")])
print(corM)

# Visualise the correlation matrix, installing corrplot only if needed.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
corrplot(corM)

# Data cleaning techniques on the built-in airquality dataset -------------
getwd()

myData <- airquality
summary(myData)
View(myData)

# Deliberately introduce an all-NA row (154) and an all-NA column (7)
# so there is something to clean up.
myData[154, ] <- NA
myData[, 7] <- NA

myData
View(myData)

any(is.na(myData))   # TRUE: the data now contains missing values
str(myData)
head(myData)
tail(myData)

# Delete the artificial column and row again.
myData <- myData[, -7]
myData <- myData[-154, ]

any(is.na(myData))   # still TRUE: airquality itself has NAs

# How many missing values per column, and where?
colSums(is.na(myData))

# Option 1: drop every row with at least one NA.
myDataClean <- na.omit(myData)
nrow(myDataClean)

# Option 2: equivalent row filter via complete.cases().
myDataClean2 <- myData[complete.cases(myData), ]
nrow(myDataClean2)

# Option 3: remove the first column (not part of the model), then omit.
myDataClean3 <- na.omit(myData[-1])
colSums(is.na(myDataClean3))

# Option 4: keep only columns with fewer than 10 missing values.
myDataClean4 <- myData[, colSums(is.na(myData)) < 10]
colSums(is.na(myDataClean4))

myFinalData <- na.omit(myDataClean4)
View(myFinalData)

# Annotated scatterplot on the mtcars dataset ------------------------------
View(mtcars)
head(mtcars)
myMTCars <- mtcars
library(ggplot2)

# Treat cylinder count as a discrete category.
myMTCars$cyl <- as.factor(myMTCars$cyl)

# Weight vs mpg with a linear trend line, plus annotations.
# BUG FIX: in the original script the annotate() calls were detached
# from the ggplot expression (a stray "+" on its own line), so they
# were evaluated but never added to the plot. They are joined into a
# single chain here.
ggplot(myMTCars, aes(x = wt, y = mpg)) +
  geom_point(aes(colour = cyl)) +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("rect", xmin = 5.1, xmax = 5.5, ymin = 0, ymax = 40,
           alpha = .1, fill = "blue") +
  annotate("text", x = 4.7, y = 4.4, label = "Lincoln Continental") +
  annotate("segment", x = 5.1, xend = 5.41, y = 5, yend = 10.3,
           colour = "blue", size = 1, arrow = arrow())

# Working with maps --------------------------------------------------------
if (!requireNamespace("maps", quietly = TRUE)) {
  install.packages("maps")
}
library(maps)

world_map <- map_data("world")
View(world_map)
sort(unique(world_map$region))

# Map of East Asia.
east_asia <- map_data("world",
                      region = c("Japan", "China",
                                 "North Korea", "South Korea"))
ggplot(east_asia, aes(x = long, y = lat, group = group, fill = region)) +
  geom_polygon(colour = "black") +
  scale_fill_brewer(palette = "Set2")

## Create Yourself: map of South Asia.
south_asia <- map_data("world",
                       region = c("Bangladesh", "India", "Nepal",
                                  "Pakistan", "Sri Lanka"))
ggplot(south_asia, aes(x = long, y = lat, group = group, fill = region)) +
  geom_polygon(colour = "black") +
  scale_fill_brewer(palette = "Set2")

## Create Yourself: map the whole world.
world_map <- map_data("world", region = sort(unique(world_map$region)))
ggplot(world_map, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "black")

# Get map data for the USA (one row per state-polygon vertex).
states_map <- map_data("state")

# "group" is a grouping variable for each polygon. A region or
# subregion might have multiple polygons, for example if it includes
# islands. (This comment was split across lines by the document
# extraction, which made the script unparsable.)

# Import the crime dataset; assumed to have region/Assault columns —
# confirmed by the usage below.
usacrime_map <- read.csv("crime_map.csv")
View(usacrime_map)

if (!requireNamespace("mapproj", quietly = TRUE)) {
  install.packages("mapproj")
}
library(mapproj)

# Choropleth of assault rates on the USA state map.
ggplot(usacrime_map, aes(map_id = region, fill = Assault)) +
  geom_map(map = states_map, colour = "black") +
  scale_fill_gradient2(low = "#559999", mid = "grey90", high = "#BB650B",
                       midpoint = median(usacrime_map$Assault)) +
  expand_limits(x = usacrime_map$long, y = usacrime_map$lat) +
  coord_map("polyconic")

# Basic data manipulation --------------------------------------------------
# Introduction to the dplyr package - purpose: data transformation.
# (See the dplyr cheatsheet.)

# Install-if-missing, then load. The original called install.packages()
# unconditionally and repeated the library() calls twice.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library("dplyr")

if (!requireNamespace("hflights", quietly = TRUE)) {
  install.packages("hflights")
}
library("hflights")

# hflights is a built-in dataset of the hflights package; the library
# must be loaded for it to be visible.
flights <- hflights
flights
View(flights)

# Convert to a tibble for compact printing. tbl_df() was deprecated and
# later removed from dplyr; as_tibble() is the supported replacement.
flights <- as_tibble(hflights)
flights

# Filtering rows and selecting columns -------------------------------------

# 1. Base R: view all flights on January 1.
flights[flights$Month == 1 & flights$DayofMonth == 1, ]

# 2. dplyr approach. A comma between conditions means AND, exactly
#    like chaining with "&".
filter(flights, Month == 1, DayofMonth == 1)

# 3. "|" expresses an OR condition.
filter(flights, UniqueCarrier == "AA" | UniqueCarrier == "UA")

# 4. %in% is the equivalent membership test against a set.
filter(flights, UniqueCarrier %in% c("AA", "UA"))

# 5. Base R column selection by name.
flights[, c("DepTime", "ArrTime", "FlightNum")]

# 6. SELECT: dplyr picks columns by bare name.
select(flights, DepTime, ArrTime, FlightNum)

# 7. Nesting: select carrier and delay, then keep delays over 60 min.
filter(select(flights, UniqueCarrier, DepDelay), DepDelay > 60)

# 8. The same pipeline with chaining, which reads top to bottom.
flights %>%
  select(UniqueCarrier, DepDelay) %>%
  filter(DepDelay > 60)

# Arranging, mutating and summarising ---------------------------------------

# 9. arrange(): reorder rows by departure delay, ascending then
#    descending.
flights %>%
  select(UniqueCarrier, DepDelay) %>%
  arrange(DepDelay)

flights %>%
  select(UniqueCarrier, DepDelay) %>%
  arrange(desc(DepDelay))

# 10. mutate(): add new variables.
# Base R approach: create Speed (in mph).
flights$Speed <- flights$Distance / flights$AirTime * 60
flights[, c("Distance", "AirTime", "Speed")]

# dplyr approach (prints the new variable but does not store it).
flights %>%
  select(Distance, AirTime) %>%
  mutate(Speed = Distance / AirTime * 60)

# 11. Store the new variable.
flights <- flights %>% mutate(Speed = Distance / AirTime * 60)

# 12. summarise(): reduce variables to values — average arrival delay
#     per destination.
flights %>%
  group_by(Dest) %>%
  summarise(avg_delay = mean(ArrDelay, na.rm = TRUE))

# 13. Per carrier, the share of flights cancelled or diverted.
#     summarise_each()/funs() are deprecated and removed from current
#     dplyr; across() is the supported form.
flights %>%
  group_by(UniqueCarrier) %>%
  summarise(across(c(Cancelled, Diverted), mean))

# 14. Per day of the year, the total number of flights, sorted in
#     descending order.
flights %>%
  group_by(Month, DayofMonth) %>%
  summarise(flight_count = n()) %>%
  arrange(desc(flight_count))

# 15. Per destination, the total number of flights and the number of
#     distinct planes that flew there.
flights %>%
  group_by(Dest) %>%
  summarise(flight_count = n(), plane_count = n_distinct(TailNum))

# 16. Per destination, the number of cancelled and not-cancelled
#     flights.
flights %>%
  group_by(Dest) %>%
  select(Cancelled) %>%
  table()

# 17. Per month, the number of flights and the change from the
#     previous month.
flights %>%
  group_by(Month) %>%
  summarise(flight_count = n()) %>%
  mutate(change = flight_count - lag(flight_count))

# dplyr on the iris dataset -------------------------------------------------
iris <- iris
View(iris)

dplyr::glimpse(iris)
utils::View(iris)

# Mean sepal width per species.
iris %>%
  group_by(Species) %>%
  summarise(avg = mean(Sepal.Width))

# The same, arranged in ascending order of the mean width.
iris %>%
  group_by(Species) %>%
  summarise(avg = mean(Sepal.Width)) %>%
  arrange(avg)

# Overall mean sepal width.
mean(iris$Sepal.Width)

dplyr::filter(iris, Sepal.Width > 4)

dplyr::slice(iris, 1:10)

dplyr::select(iris, starts_with("Petal"))

# Column means of the four numeric measurements.
lapply(iris[, c(1:4)], mean)

# Count rows per species, weighted by sepal length. Inside dplyr verbs
# columns are referenced by bare name; the original's "iris$Species"
# produced an awkward column literally named `iris$Species`.
count(iris, Species, wt = Sepal.Length)

# New variable: the sum of sepal and petal length. Bare column names
# are the idiomatic form inside mutate().
New <- mutate(iris, New = Sepal.Length + Petal.Length)

# dplyr on family income and expenditure data --------------------------------
# Assumes the CSV has the income/expenditure columns referenced below —
# confirmed only by this script's own usage.
data <- read.csv("FamilyIncomeExpenditure.csv")
summary(data)

# Households with income greater than 600,000 whose main source of
# income is wages/salaries.
ftdata <- filter(data,
                 Total.Household.Income > 600000,
                 Main.Source.of.Income == "Wage/Salaries")

# Checking missing values per column.
colSums(is.na(data))

# Average of all columns (numeric columns only; character columns give
# NA with a warning). summarise_each()/funs() are deprecated; across()
# is the supported form.
summarise(data, across(everything(), mean))

# Using lapply to calculate the average of all columns.
lapply(data[1:21], mean)

# Average household income.
summarise(data, avg = mean(Total.Household.Income))

# Encode the source of income as numeric codes. Since R 4.0 read.csv()
# yields character columns, on which as.numeric() alone returns NA;
# converting through a factor restores the intended integer codes and
# is a no-op on older R where the column was already a factor.
numdata <- as.numeric(as.factor(data$Main.Source.of.Income))
table(numdata)
prop.table(table(numdata))

# New variable: total expenditure across the main expenditure columns.
# (The original repeated this as "data %>% mutate(data, ...)", which
# passes the data frame twice and is a bug; one mutate stores it.)
data <- mutate(data, TotalExpenditure =
                 Total.Food.Expenditure +
                 Clothing..Footwear.and.Other.Wear.Expenditure +
                 Housing.and.water.Expenditure +
                 Medical.Care.Expenditure +
                 Transportation.Expenditure +
                 Communication.Expenditure)
View(data)

slice(data, 10:15)
sample <- sample_n(data, 10, replace = TRUE)
View(sample)
# glimpse(View(data)) in the original passed View()'s invisible NULL
# to glimpse(); glimpse the data frame directly.
glimpse(data)
count(data, Total.Household.Income)

# Mean income and total expenditure by source of income.
data %>%
  group_by(Main.Source.of.Income) %>%
  summarise(across(c(Total.Household.Income, TotalExpenditure), mean))

# Mean number of bedrooms and family members by household type.
data %>%
  group_by(Type.of.Household) %>%
  summarise(across(c(Number.of.bedrooms,
                     Total.Number.of.Family.members), mean))

# Number of households with electricity access by household type.
data %>%
  group_by(Type.of.Household) %>%
  select(Electricity) %>%
  table()

# Hierarchical clustering on mtcars ------------------------------------------
View(mtcars)
mtcars <- mtcars

# Standardise the variables so no single column dominates the distances.
mt <- scale(mtcars)
View(mt)

# Euclidean distance matrix between cars.
dismat <- dist(mt, method = "euclidean")
# An equivalent alternative form.
dist(as.matrix(mt))

# Complete linkage (hclust's default method).
hc <- hclust(dismat)
View(hc)
plot(hc, hang = -1, main = "Hclustering")
hc

# Average linkage.
hcAvg <- hclust(dismat, method = "average")
plot(hcAvg, hang = -1)

# Cut the dendrogram into 3 clusters.
clust3 <- cutree(hcAvg, k = 3)
clust3
table(clust3)
rect.hclust(hcAvg, k = 3, border = "red")

# Try 4 clusters.
clust4 <- cutree(hcAvg, k = 4)
clust4
table(clust4)
rect.hclust(hcAvg, k = 4, border = "red")

# Find an "optimal" cluster count across many validity indices,
# installing NbClust only if it is missing.
if (!requireNamespace("NbClust", quietly = TRUE)) {
  install.packages("NbClust")
}
library(NbClust)

nb <- NbClust(mtcars, distance = "euclidean", min.nc = 2, max.nc = 10,
              method = "complete", index = "all")

# Silhouette plot for the 3-cluster complete-linkage solution.
library(cluster)
plot(silhouette(cutree(hc, k = 3), dismat))

# k-means clustering on protein consumption data -----------------------------
pc <- read.csv(file = "ProteinConsumption.csv")
pcn <- pc[, -1]   # drop the first (label) column before clustering

library("cluster")
colSums(is.na(pcn))   # verify there are no missing values

# Fixed seed so the random starts are reproducible.
set.seed(666)

# Three clusters, keeping the best of 10 random starts.
kout3 <- kmeans(pcn, centers = 3, nstart = 10)
kout3
kout3$cluster
table(kout3$cluster)

# Last year's assignment: Olympic athletes -----------------------------------
# (This heading was a bare prose line in the original, which is a
# syntax error in an R script.)

OAthletes <- read.csv("Olympic Athletes.csv")

# Where are the missing values?
m <- is.na(OAthletes)
table(m)
colSums(is.na(OAthletes))

# Complete cases only.
NewOAth <- na.omit(OAthletes)

# 2. Compare the average age of athletes who play Badminton and those
#    who play Table Tennis.
Badminton <- subset(OAthletes, Sport == "Badminton")
mean(Badminton$Age)

# The same can be done with the NA-free data.
Badmin <- subset(NewOAth, Sport == "Badminton")
mean(Badmin$Age)

TableTennis <- subset(OAthletes, Sport == "Table Tennis")
table(OAthletes$Sport)
mean(TableTennis$Age)

# 3a. New attribute "TotalMedals": the total number of medals won by
#     an athlete.
NewOAth$TotalMedals <-
  NewOAth$Gold.Medals + NewOAth$Silver.Medals + NewOAth$Bronze.Medals

# 3b. Highest number of medals won by the swimmer Michael Phelps in any
#     Olympic Games, and the year he secured the maximum.
MP <- subset(NewOAth, Athlete == "Michael Phelps")
MP

# 4. Smaller dataset with the athlete name and the four medal counts
#    for Games from 2000 to 2012, named "OAthletes.small" as the task
#    instructs (the original had a typo: "OAtheletes.small").
OAthletes.small <- NewOAth[c(1, 6:9)]

library(ggplot2)

# 5. Visual for the maximum number of medals secured by any athlete in
#    swimming at the 2012 Olympics; save the plot as PNG for submission.
Games2012 <- subset(NewOAth, Year == 2012 & Sport == "Swimming")

data_plot <- data.frame(
  Medals = c("Gold", "Silver", "Bronze", "Total"),
  total = c(sum(Games2012$Gold.Medals),
            sum(Games2012$Silver.Medals),
            sum(Games2012$Bronze.Medals),
            sum(Games2012$TotalMedals))
)

ggplot(data_plot, aes(Medals, total)) +
  geom_bar(stat = "identity")

# Visualization 1: mpg vs weight, coloured by cylinder count -----------------
myMTCars <- mtcars

# Factor version of the cylinder count, kept as a separate variable (as
# in the original) so the plot legend is titled "Cyl".
Cyl <- as.factor(myMTCars$cyl)

ggplot(data = myMTCars, mapping = aes(x = wt, y = mpg, color = Cyl)) +
  geom_point() +
  stat_smooth(method = "lm")

# Visualization 2: histogram plus density curve of iris sepal width ----------
library(ggplot2)
density <- ggplot(data = iris, aes(x = Sepal.Width))
# The "..density.." dot-dot notation is deprecated in ggplot2 (>= 3.4);
# after_stat(density) is the supported way to map the computed density.
density +
  geom_histogram(binwidth = 0.2, color = "black", fill = "steelblue",
                 aes(y = after_stat(density))) +
  geom_density(stat = "density", alpha = 0.2, fill = "blue") +
  xlab("Sepal Width") + ylab("Density") +
  ggtitle("Histogram & Density Curve")

You might also like