R notes
R notes
age[-c(2,5)]
# some specialities of R
0/0
1/0
a<- c(Inf,-Inf,Inf)
a
1/a
0/a
Inf/a
Inf-Inf
z<- 2+3i
z
#operations on vectors
log(age)
log(age,2)
log(age^2)
27.11.2020
# relational operators
x=c(10,5,12,45,1,63,14)
x>15
# Methods to find no of elements greater than 15
(1)
sum(x>15)
(2)
length(x[x>15])
(3)
length(which(x>15))
30.11.2020
#PROBLEMS
? How to create the following vectors:
(a) (0.1^3 * 0.2^1, 0.1^6 * 0.2^4, ..., 0.1^36 * 0.2^34)
Page 3
solution:
# Exponents of 0.1 run 3, 6, ..., 36 and exponents of 0.2 run 1, 4, ..., 34.
# Both sequences have 12 terms, so the product is taken element by element.
x <- 0.1^seq(3, 36, by = 3) * 0.2^seq(1, 34, by = 3)
(b) (2,2^2/2,2^3/3,.....2^25/25)
solution
x= (2^(seq(1,25,1)))/seq(1,25,1)
2.12.2020
# matrix-retangular arrangement of values
# keyword-"matrix"
# how to create a matrix
x<-matrix(0,3,3)
x
# when all the elements are not the same
Page 4
x1<-matrix(1:9,3,3)
x1
here the elements are successive
# to fill across rows
x2<-matrix(1:9,3,3,byrow=T)
x2
Another way
x1<-matrix(1:9,nrow=3,ncol=3)
x1
# to create a vector is with all kinds of elements
y<-matrix(c(10,4,2,12,5,7,15,8,0),nrow=3,ncol=3)
class(y)
dim(y)
# to extract submatrix
# to extract a submatrix from consecutive rows and columns
y[1:2,1:2] #u will get a matrix from first two rows and first two columns
y[2:3,2:3]
y[1:3,2:3]
y[1:2,]
4.12.2020
# to add a row
rbind(y,c(5,6,7))
rbind(y,rowMeans(y))
# matrix multiplication
y%*%y
#QUESTIONS
A and B are two matrices of order 3x3 where A has elements 1,5,15,4,6,17,3,10,21
and B has 2,10,3,17,3,5,1,7,6 .Obtain (AB)^-1.
ANS
A=matrix(c(1,5,15,4,6,17,3,10,21),3,3,byrow=T)
B= matrix(c(2,10,3,17,3,5,1,7,6),3,3,byrow=T)
First check
det(A%*%B)==0
solve(A%*%B)
sum((x)/(3+j)) or
sum(i^4)*sum(1/(3+j))
# Build the 5 x 5 symmetric band matrix whose (i, j) entry is |i - j| + 1:
# 1 on the main diagonal, 2 on the first off-diagonals, and so on up to 5 in
# the two far corners. outer() forms every i - j difference in one step.
y <- abs(outer(1:5, 1:5, "-")) + 1
Construct the matrix A.Check whether a^3=0 and replace the third column of A by the
sum of second and third column.
a<-matrix(c(1,1,3,5,2,6,-2,-1,-3),3,3,byrow=T)
z<-a%*%a
z%*%a
z%*%a==0
x=(a[,2]+a[,3])
a[,3]<-x
Create a matrix B with 15 rows each row with elements 10,-10,10 and find B^TB
b<-matrix(rep(c(10,-10,10),times=15),15,3,byrow=T)
t(b)%*%b
# Outer function
y=c(2,4)
vec%o%vec # each element will get multiplied to all other elements and a
matrix is formed
vec%o%y
Alternative way
outer(vec,y,"^") # each value raised to the power y
outer(vec,y,"*")
eg
z<-c(1,5,6,9,10,45,68,98)
t<- sort(z)
p=t[-c(1,length(t))]
sum(p)
Page 7
mean(p)
11.12.2020
102030405060708090100
x<-1:10
y<-1:10
outer(x,y,"*")
(b) 01234
12345
23456
34567
45678
x<-0:4
y<-0:4
outer(x,y,"+")
(c) 01234
12340
23401
34012
40123
x<-0:4
y<-0:4
z=outer(x,y,"+")
z%%5
a<-matrix(sample(1:10,60,replace=T),6,10)
a)Find the number of entries in each row which are greater than 4.
length(which(a[1,]>4))
length(which(a[2,]>4))
length(which(a[3,]>4))
length(which(a[4,]>4))
length(which(a[5,]>4))
length(which(a[6,]>4))
OR
sum(a[1,]>4)
sum(a[2,]>4).....
THE METHOD TO BE USED
rowSums(a>4)
14.12.2020
#apply function,list,lapply,sapply
#apply
Page 8
#Use the apply function to find the median of the columns of matrix a.
apply(a,2,median)
# to find sd of each column
apply(a,2,sd)
# to find the maximum value in each row
apply(a,1,max)
# to find he log of each element in a
apply(a,1:2,log)
Note: range() applied to a matrix returns the overall minimum and maximum, not the difference between max and min.
# To obtain the range of each column of a
apply(a,2,function(x)max(x)-min(x))
x<-c(12,9,3,5,7,15)
median(x) #second quartile
quantile(x,0.25) #Q1
quantile(x,0.5) #Q2
quantile(x,0.75) #Q3
quantile(x,0.8) #80 th percentile
a<-c("India","Japan","US","UK")
b<-c(153.5,170.7,170.2,167.5)
d<-c(TRUE,TRUE,FALSE,FALSE)
#to create a list
list.obj<-list(a,b,d)
?Create a list containing numeric vector from 1:10 and matrix of order 4x4
having numbers from 1 to 16.
list.var<-list(1:10,matrix(1:16,4,4))
Page 9
#using lapply
lapply(list.obj,length) #to find length
lapply(list.var,max)
15.12.2020
#CONDITIONAL STATEMENTS,LOOPS
#ifelse,if
#ifelse
x <- -10:10
# ifelse() is vectorised: it picks sqrt(x) where x > 0 and x^2 elsewhere.
ifelse(x>0,sqrt(x),x^2)
# Mixing numbers with a string makes the whole result character.
ifelse(x>0,sqrt(x),"neagative")
#if
# if() takes a single TRUE/FALSE, so one element is tested at a time.
# Prints the base-2 log of the 14th element only when it is positive.
if (x[14] > 0) {
  print(log(x[14], 2))
}
# Same check for the 20th element.
if (x[20] > 0) {
  print(log(x[20], 2))
}
#LOOPING
for(i in c(-10,-9,3,6,7))
{
if(i<0)
{
print(i^2)
}else
{
print(log(i,2))
}
}
for(i in 1:5)
{
Page 10
for(j in 5:1)
print(i+j)
}
? You have two numeric vectors say xvec,yvec each of size 10 sampled from integers
1 to 100 without replacement.Display the total number of xvec less than yvec
values.
xvec <- sample(1:100, 10)
yvec <- sample(1:100, 10)
# outer() compares every xvec[i] with every yvec[j]; summing the resulting
# logical matrix counts the pairs with xvec[i] < yvec[j]. The counter has its
# own name so that base::sum is not masked by a variable.
n_less <- sum(outer(xvec, yvec, "<"))
print(n_less)
18.12.2020
#while
x<-5
while(x<=10)
{
print(x-1)
x=x+2
}
[1]
quadratic<-function(x) #here x is a local variable
{
a=2
b=4
d=7
print(a+b*x+d*x^2) # now give values to x
}
quadratic(6)
quadratic(c(2,4))
quadratic(2:7)
[2]
add<-function(x,y)
{
print(x+y)
}
add(c(2,4),c(5,6))
add(x=3,y=4)
add(10000,1000000)
Page 11
[3]
power<-function(x,y)
{
a=2
b=3
print(a*x+b*y)
}
power(2,3)
power(1,1)
power(3,2)
[4]
#to check symmetricity
mat<-function(x)
{
ifelse((t(x)==x),"symm","asymm")
}
mat(matrix(c(1,0,0,0,1,0,0,0,1),3,3,byrow=T))
#using if
# Report whether a matrix is symmetric.
#
# x: a matrix.
# Prints "yes" when x is square and every element equals the corresponding
# element of t(x); prints "no" otherwise.
mat <- function(x) {
  # if() needs a single TRUE/FALSE, but t(x) == x is a whole logical matrix,
  # so it is collapsed with all(). The dimension check comes first because
  # comparing a non-square matrix with its transpose is an error.
  if (nrow(x) == ncol(x) && all(t(x) == x)) {
    print("yes")
  } else {
    print("no")
  }
}
mat(matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), 3, 3, byrow = TRUE))
#using return
power<-function(x,y)
{
z<-x^y
return(z)
}
power(3,4)
mathop<-function(x=5,y=10)
{
a=x+y
b=x-y
d=x*y
e=x/y
z<-list(a,b,d,e)
return(z)
}
mathop()
?Write a function of the name "descriptive" that accepts a vector of numeric values
and returns the mean,sd,median,max,min,range of the vector.
# Summarise a numeric vector.
#
# x: numeric vector.
# Returns a named list with elements mean, sd, median, max, min and range,
# where range is the spread max(x) - min(x).
descriptive <- function(x) {
  lo <- min(x)
  hi <- max(x)
  list(
    mean = mean(x),
    sd = sd(x),
    median = median(x),
    max = hi,
    min = lo,
    range = hi - lo
  )
}
des<-descriptive(c(5,6,7,8))
des$median
21.12.2020
?Write functions tmpFn1 and tmpFn2 such that if xVec is the vector (x1,x2,x3,..
xn) then tmpFn1(xVec) returns the vector (x1,x2^2,x3^3....) and tmpFn2(xVec) returns
the vector (x1,x2^2/2,x3^3/3......)
Ans
tmpFn1<-function(xvec)
{
z<-xvec^c(1:length(xvec))
return(z)
}
xvec<-c(1:5)
tmpFn1(xvec)
OR
tmpFn1<-function(xvec)
{
for(i in 1:length(xvec))
{
xvec[i]=xvec[i]^i
}
return(xvec)
}
xvec<-c(1:5)
tmpFn1(xvec)
OR
xvec<-c(1:5)
# Raise each element of xvec to the power of its position:
# (x1, x2^2, x3^3, ...).
#
# xvec: numeric vector of any length.
# Returns a vector of the same length as xvec.
tmpFn1 <- function(xvec) {
  # seq_along() replaces the hard-coded 1:5 so any length works.
  xvec^seq_along(xvec)
}
tmpFn1(xvec)
OR
tmpFn1=function(xvec)
{
i=1:length(xvec)
z=xvec^i
return(z)
}
xvec<-c(1:10)
tmpFn1(xvec)
# Element-wise power x^n.
#
# x: numeric vector of bases.
# n: numeric vector of exponents (default 1); recycled against x by the
#    usual R rules.
# Returns x^n.
tmpfn1 <- function(x, n = 1) {
  # The former `n=n+1` sat after return() and could never run; removed.
  x^n
}
tmpfn1(1:6,1:6)
# Raise each element of x to the power of its position: (x1, x2^2, x3^3, ...).
#
# x: numeric vector of any length.
# Returns a vector of the same length as x.
tmpFn1 <- function(x) {
  # The exponents follow the length of x instead of being fixed at 1..5.
  x^seq_along(x)
}
tmpFn1(1:5)
b)
# Compute (x1, x2^2/2, x3^3/3, ...): each element raised to its position and
# divided by that position.
#
# xvec: numeric vector of any length.
# Returns a vector of the same length as xvec.
tmpFn2 <- function(xvec) {
  # seq_along() gives an empty index for empty input, unlike 1:length(xvec).
  i <- seq_along(xvec)
  xvec^i / i
}
xvec<-1:10
tmpFn2(xvec)
OR
tmpFn2<-function(xvec)
{
for(i in 1:length(xvec))
{
xvec[i]=(xvec[i]^i)/i
}
return(xvec)
}
xvec<-1:10
tmpFn2(xvec)
# Compute (x1, x2^2/2, x3^3/3, ...).
#
# x: numeric vector of any length.
# Returns a vector of the same length as x.
tmpFn2 <- function(x) {
  # The positions follow the length of x instead of being fixed at 1..5.
  a <- seq_along(x)
  x^a / a
}
tmpFn2(1:5)
tmpfn2<-function(x,n)
{
z=(x^n)/n
return(z)
}
tmpfn2(1:6,1:6)
# Compute 1 + sum over j of xvec[j]^j / j.
#
# xvec: numeric vector.
# n:    accepted so existing two-argument calls keep working, but not used;
#       the exponents are always the positions 1..length(xvec).
# Returns a single number.
tmpfn3 <- function(xvec, n = NULL) {
  j <- seq_along(xvec)
  1 + sum(xvec^j / j)
}
xvec<-1:5
n<-1:5
tmpfn3(xvec,n)
# Moving average of every three consecutive elements of xvec.
#
# xvec: numeric vector.
# i:    accepted so existing calls keep working, but not used; the window
#       positions are derived from xvec.
# Returns a vector of length(xvec) - 2 averages, or numeric(0) when xvec has
# fewer than three elements.
tmpFn <- function(xvec, i = NULL) {
  n <- length(xvec)
  # 3:n would count downwards for n < 3, so short input is handled explicitly.
  if (n < 3) {
    return(numeric(0))
  }
  k <- 3:n
  (xvec[k - 2] + xvec[k - 1] + xvec[k]) / 3
}
xvec<-c(1:5,6:1)
tmpFn(xvec)
OR
# Moving average of every three consecutive elements of xvec.
#
# xvec: numeric vector.
# Returns a vector of length(xvec) - 2 averages, or numeric(0) when xvec has
# fewer than three elements.
tmpFn <- function(xvec) {
  n <- length(xvec)
  # Guard short input: 3:n would otherwise run backwards (3, 2, ...).
  if (n < 3) {
    return(numeric(0))
  }
  idx <- 3:n
  (xvec[idx - 2] + xvec[idx - 1] + xvec[idx]) / 3
}
xvec<-c(1:5)
tmpFn(xvec)
Page 14
# Print the piecewise function
#   f(y) = y^2 + 2y + 3   for y < 0
#   f(y) = y + 3          for 0 <= y < 2
#   f(y) = y^2 + 4y - 7   for y >= 2
# for each element of xvec, one value per line.
#
# xvec: numeric vector.
tmpFn <- function(xvec) {
  for (i in seq_along(xvec)) {
    y <- xvec[i]
    # Exactly one branch runs per element. Previously the last `if` had no
    # braces, so its print() ran for every element.
    if (y < 0) {
      print(y^2 + 2 * y + 3)
    } else if (y < 2) {
      print(y + 3)
    } else {
      print(y^2 + 4 * y - 7)
    }
  }
}
tmpFn(seq(-3,3,0.1))
# Vectorised piecewise function:
#   x^2 + 2x + 3 for x < 0,  x + 3 for 0 <= x < 2,  x^2 + 4x - 7 for x >= 2.
#
# x: numeric vector.
# Returns a numeric vector of the same length as x.
tmpFn <- function(x) {
  ifelse(x < 0, x^2 + 2 * x + 3, ifelse(x < 2, x + 3, x^2 + 4 * x - 7))
}
tmpFn(seq(-3, 3, 0.1))
# Double every odd entry of a matrix and leave the even entries unchanged.
#
# x: numeric matrix (or vector).
# Returns an object of the same shape as x.
mat <- function(x) {
  # The test must be a logical mask of the same shape as x. Passing the odd
  # values themselves as the test returned a shorter, wrong result.
  ifelse(x %% 2 == 1, 2 * x, x)
}
mat(matrix(c(1,1,3,5,2,6,-2,-1,-3),3,3,byrow=T))
mat<-function(x)
{
x[x%%2==1]<- 2*x[x%%2==1]
print(x)
}
mat(matrix(c(1,1,3,5,2,6,-2,-1,-3),3,3,byrow=T))
mat(x)
# Compute 1 + sum over k of xvec[k]^k / k.
#
# xvec: numeric vector.
# n:    kept for compatibility with existing calls; it is not used because
#       the exponents are always the positions of xvec.
# Returns a single number.
tmpfn3 <- function(xvec, n = NULL) {
  k <- seq_along(xvec)
  terms <- xvec^k / k
  1 + sum(terms)
}
xvec<-1:5
n<-1:5
tmpfn3(xvec,n)
Now write a function testLoop2 which takes a single argument yvec which is
a vector.The function should return
sum(e^j ) j=1:n
# a)
# Print the first n - 1 terms of the sequence x1 = 1, x2 = 2,
# x_j = x_{j-1} + 2 / x_{j-1} for j >= 3.
#
# n: number such that n - 1 terms are produced; must be at least 3 because
#    the first two terms are always printed.
# Returns the vector of terms invisibly.
testLoop1 <- function(n) {
  stopifnot(n >= 3)
  xvec <- rep(NA_real_, n - 1)
  xvec[1] <- 1
  xvec[2] <- 2
  print(xvec[1])
  print(xvec[2])
  # seq_len(n - 1)[-(1:2)] is empty when n == 3, whereas 3:(n - 1) would
  # count downwards (3, 2) and overwrite xvec[2].
  for (j in seq_len(n - 1)[-(1:2)]) {
    xvec[j] <- xvec[j - 1] + 2 / xvec[j - 1]
    print(xvec[j])
  }
  invisible(xvec)
}
testLoop1(10)
OR
# Print the first n - 1 terms of the sequence x1 = 1, x2 = 2,
# x_j = x_{j-1} + 2 / x_{j-1} for j >= 3.
#
# n: must be at least 3; n - 1 terms are printed.
# Returns the vector of terms invisibly.
testLoop1 <- function(n) {
  stopifnot(n >= 3)
  # xvec is created locally; the function previously relied on an xvec that
  # happened to exist outside it.
  xvec <- numeric(n - 1)
  x0 <- xvec[1] <- 1
  x1 <- xvec[2] <- 2
  print(x0)
  print(x1)
  j <- 3
  # A while loop simply does nothing when n - 1 < 3.
  while (j <= n - 1) {
    xvec[j] <- xvec[j - 1] + 2 / xvec[j - 1]
    print(xvec[j])
    j <- j + 1
  }
  invisible(xvec)
}
testLoop1(10)
# b)
# Sum of e^j for j = 1, ..., n, where n is the length of yvec.
#
# yvec: a vector; only its length is used.
# Returns a single number (0 for an empty vector).
testLoop2 <- function(yvec) {
  m <- 0
  # seq_along() yields no iterations for empty input; 1:length(yvec) would
  # iterate over 1 and 0.
  for (j in seq_along(yvec)) {
    m <- m + exp(j)
  }
  m
}
testLoop2(c(2,3,4))
OR
# Vectorised version: sum of e^j for j = 1, ..., length(yvec).
#
# yvec: a vector; only its length is used.
# Prints the sum and returns it (print() passes its argument through).
testLoop2 <- function(yvec) {
  # seq_len(0) is empty, so an empty yvec gives 0 instead of e^1 + e^0.
  z <- sum(exp(seq_len(length(yvec))))
  print(z)
}
# Lag-1 and lag-2 autocorrelation-style ratios of a vector.
#
# xvec: numeric vector.
# Returns an unnamed list of two numbers: the sum of products of deviations
# from the mean taken 1 apart, and 2 apart, each divided by the total sum of
# squared deviations.
tmpFn1 <- function(xvec) {
  len <- length(xvec)
  dev <- xvec - mean(xvec)
  total_ss <- sum(dev^2)
  # Ratio for a given lag k: pairs (dev[k+1], dev[1]), ..., (dev[len], dev[len-k]).
  lag_ratio <- function(k) {
    sum(dev[(k + 1):len] * dev[1:(len - k)]) / total_ss
  }
  list(lag_ratio(1), lag_ratio(2))
}
tmpFn1(seq(2,56,3))
Page 16
28.12.2020
letters
LETTERS
tolower(LETTERS)
toupper(letters)
I<-LETTERS
for(i in 1:length(I))
{
print(c(I[i],tolower(I[i])))
}
OR
Z<-LETTERS
for(i in 1:length(Z))
{
print(Z[i])
print(tolower(Z[i]))
}
#ANONYMOUS FUNCTIONS
(function(x) {x*x})(-5:5)
(function(x,y) {x+y}) (2:5,3:6)
(function(x,y=5) {x+y}) (1:3)
(function(x,y){z<-x+y;z^2}) (1:3,2:4)
30.12.2020
d=0
primeno<-c(2:100)
for(i in 2:length(primeno))
{
for(j in 1 :i)
{
if(i%%j==0)
{
d=d+1
}
}
if(d==2)
{
print(i)
}
d<-0
}
OR
# Collect the primes up to 100. 2 is seeded directly because the divisor
# range 2:(i - 1) only makes sense from i = 3 onwards.
primeVec <- 2
for (i in 3:100) {
  # i is prime when none of 2, ..., i - 1 divides it.
  if (sum(i %% (2:(i - 1)) == 0) == 0) {
    primeVec <- c(primeVec, i)
  }
}
primeVec
if(num%%j==0)
print("not prime")
break
}
print(num)
}
z<-table(c("a","z","c","a","c","d"))
class(z)
age<-c(8,8,9,10,9,8,10,9,9)
height<-c(100,105,100,102,100,100,105,102,100)
kids<-table(age,height)
kids
4.1.2021
#DATAFRAMES
age<-c(20,22,21,21,20)
state<-c("D","M","C","B","K")
gender<-c("M","M","F","F","F")
cgpa<-c(8.5,9.7,6.8,8.9,5.3)
major<-c("S","M","M","S","S")
stud.details<-data.frame(age,state,cgpa,gender,major)
Page 18
class(stud.details)
dd<-data.frame()
fix(dd)
#construct a matrix
e<-matrix(1:9,3,3)
rownames(e)<-c("stud1","stud2","stud3")
colnames(e)<-c("present","absent","halfday")
#converting the matrix to a dataframe
as.data.frame(e)
Q.Print the age, state and major of 3rd and 5th student.
stud.details[c(3,5),c(1,2,5)]
Q.Print details of all students whose major is statistics and whose cgpa is
above 8.
stud.details[(stud.details$major=="S")& (stud.details$cgpa>8),]
table(stud.details$age,stud.details$gender)
6.1.2021
1.
Page 19
Age<-c(21,20,21,22,21,20,22,21)
Gender<-c("M","M","F","F","M","F","M","M")
freq.table<-table(Age,Gender)
freq.table
age_margin<-margin.table(freq.table,1)
#or
rowSums(freq.table)
gender_margin<-margin.table(freq.table,2)
#or
colSums(freq.table)
2.
gender<-c("M","F","F","M","F","M","M","F","M","M")
qualification<-c("UG","PG","BTECH","BTECH","UG","UG","PG","PG","BTECH","UG")
age<-c(24,27,28,25,21,34,26,25,34,27)
marks<-c(68,78,67,77,86,56,89,90,55,67)
#creating dataframe
dd<-data.frame(gender,qualification,age,marks)
cbind(dd,marks1)
8.1.2021
#non inclusion of NA
#how to omit NAs in the data
let
x<-c(40,29,31,NA,45,NA)
To perform operations like sum(x),max(x) ... first the NA values needs to be
omitted.
sum(na.omit(x))
mean(na.omit(x))
#in-built dataframes
read.table("pathname",header,sep)
eg of pathname: C:/Users/Program files/File name
If the data set is comma separated use sep="," and if it is colon separated use
sep=":"
Eg
age_height<-read.table("C:/Users/acer/Desktop/age_height.txt",header=T)
Method 2:
read.table(file.choose(),header=T)
#
str(iris) #will give information on columns of data frame
attach(iris)
iris$Petal.Length[which(iris$Petal.Length>4)]
?Find the sd values of the first 4 columns of airquality after omitting NA values
apply(na.omit(airquality)[,1:4],2,sd)
11.1.2021
iris
sample1<-iris[sample(1:50,10),]
sample2<-iris[sample(51:100,10),]
sample3<-iris[sample(101:150,10),]
sampleiris<-rbind(sample1,sample2,sample3)
#mean
apply(sampleiris[1:10,c(2,4)],2,mean)
apply(sampleiris[11:20,c(2,4)],2,mean)
apply(sampleiris[21:30,c(2,4)],2,mean)
b)
attach(sampleiris)
#summary
Page 21
summary(Sepal.Length)
summary(Petal.Length)
OR
summary(sampleiris[,c(1,3)])
c)
#order
sampleiris[order(Sepal.Length,Petal.Length),]
d)
#ratio
ratio1<-Sepal.Length/Sepal.Width
ratio2<-Petal.Length/Petal.Width
sampleiris$ratio1<-ratio1
sampleiris$ratio2<-ratio2
sampleiris
OR
ratio1=Sepal.Length/Sepal.Width
ratio2=Petal.Length/Petal.Width
cbind(sampleiris,ratio1,ratio2)
e)
# For each block of ten rows of sampleiris, count how many values in
# column 3 exceed that block's own mean.
m1 <- mean(sampleiris[1:10, 3])
sum(sampleiris[1:10, 3] > m1)  # compare with m1 (was `m`, not this block's mean)
m2 <- mean(sampleiris[11:20, 3])
sum(sampleiris[11:20, 3] > m2)
m3 <- mean(sampleiris[21:30, 3])
sum(sampleiris[21:30, 3] > m3)
2.#reading a file
read.table("C:/Users/SUMA P P/Documents/disease.status.txt",header=T,sep="\t")
read.table(file.choose(),header=T,sep="\t")
13.01.2021
#subset
?Display the rows in sample iris for which sepal length is more than 5.
attach(sampleiris)
sampleiris[Sepal.Length>5,]
subset(sampleiris,Sepal.Length>5)
? Display the rows of sampleiris with petal length and petal width for which
sepal length is above 5.
subset(sampleiris,Sepal.Length>5,select=c(3,4))
subset(sampleiris,Sepal.Length>5,select=c(Petal.Length,Petal.Width))
subset(sampleiris,Sepal.Length>5,select=-c(1,2,5))
? Display the rows of sampleiris with petal length and petal width for which
sepal length is above 5 and petal length>4.
subset(sampleiris,Sepal.Length>5 & Petal.Length>4,select=c(3,4))
?Display mean of petal length and petal width based on rows of sampleiris
satisfying above condition.
a<-subset(sampleiris,Sepal.Length>5 & Petal.Length>4,select=c(3,4))
apply(a,2,mean)
OR
mean(a$Petal.Length)
mean(a$Petal.Width)
Consider airquality
subset(airquality,Wind>9) #NA values will also be shown
subset(na.omit(airquality),Wind>9)
#transform function
?Attach a new column in sampleiris as the sqrt of Sepal length
attach(sampleiris)
new.sepal.length<-sqrt(Sepal.Length)
cbind(sampleiris,new.sepal.length)
?Do square root transform of both petal length and sepal length.
transform(sampleiris,new1=sqrt(Sepal.Length),new2=sqrt(Petal.Length))
18.1.2021
#within function
#within-we can transform the variables and the transformed variables can be
used for further operations or manipulations
?Do a square root transformation on sepal length and petal length in sample
iris and find the difference btw the transformed variables.
attach(sampleiris)
dd<-within(sampleiris,{
sqrtseplen<-sqrt(Sepal.Length)
sqrtpetlen<-sqrt(Petal.Length)
dif<-sqrtseplen-sqrtpetlen})
1)Save as marks.txt
studmarks<-read.table("C:/Users/SUMA P P/Documents/marks.txt",header=T)
3)Create two new variables mark1_100 and mark2_100 where they are obtained by
transforming marks1 and marks2 values to 100.
transform(studmarks,marks1_100=marks1*5,marks2_100=marks2*5)
OR
transform(studmarks,marks1_100=(marks1/20)*100,marks2_100=(marks2/20)*100)
4)Do Q3 using within command.Also obtain average marks of each student based
on marks1_100 and marks2_100.Save the output in the object dd.Name the average
marks as avgmark
dd<-within(studmarks,{
marks1_100<-marks1*5
marks2_100<-marks2*5
avgmark<-(marks1_100+marks2_100)/2})
OR
dd<-within(studmarks,{
marks1_100<-(marks1/20)*100
marks2_100<-(marks2/20)*100
avgmark<-(marks1_100+marks2_100)/2})
5)Export the dataframe dd to text file and excel file named "newmarks" and
store in desktop.
#text file
Page 23
write.table(dd,"C:/Users/SUMA P P/Desktop/newmarks.txt",row.names=F)
#excel file
write.table(dd,"C:/Users/SUMA P P/Desktop/newmarks.xls",row.names=F,sep="\t")
20.01.2021
1.#first 20
head(airquality,20)
2.#colnames
colnames(airquality)
3.#order
attach(airquality)
head(airquality[order(Temp),],20)
4.
airquality[order(Temp,Solar.R),]
5.
attach(airquality)
na.omit(airquality[order(Temp,Solar.R),])
6.
summary(na.omit(airquality)[,-c(5,6)])
7.
apply(na.omit(airquality)[,-5],2,function(x)c(sd(x),max(x)-min(x)))
8.
attach(airquality)
apply(airquality[Month==5,c(3,4)],2,mean)
apply(airquality[Month==6,c(3,4)],2,mean)
apply(airquality[Month==7,c(3,4)],2,mean)
apply(airquality[Month==8,c(3,4)],2,mean)
apply(airquality[Month==9,c(3,4)],2,mean)
OR
tapply(Wind,Month,mean,na.rm=T)
tapply(Temp,Month,mean,na.rm=T)
OR
for(i in 5:9)
{
x<-apply(subset(airquality,Month==i,c(3,4)),2,function(x)c(mean(x),sd(x)))
print(x)
}
9.
attach(airquality)
transform(airquality,log_ozone=log(Ozone,2),log_temp=log(Temp,2))
10.1)
within(na.omit(airquality),{
log_ozone<-log(Ozone,2)
diff1<- Ozone-log_ozone})
2)
within(na.omit(airquality),{
log_temp<-log2(Temp)
diff2<- Temp-log_temp})
11.
attach(airquality)
subset(airquality,Ozone>20)
12.
subset(airquality,Ozone>20,select=c(Solar.R,Wind))
13.
m<-subset(na.omit(airquality),Ozone>20,c(Solar.R,Wind))
apply(m,2,function(x)c(mean(x),sd(x)))
Page 24
14.
attach(airquality)
Solar.complete<-Solar.R
Solar.complete[is.na(Solar.complete)]<-mean(na.omit(Solar.complete))
cbind(airquality,Solar.complete)
OR
airquality$Solar.complete<-ifelse(is.na(Solar.R),mean(Solar.R,na.rm=T),Solar.R)
15.
airquality.subset<-na.omit(subset(airquality,Ozone>20,c(Solar.R,Wind)))
write.table(airquality.subset,"C:/Users/SUMA P P/Desktop/airquality.xls",row.names=F,sep="\t")
22.1.2021
#merge
a<-iris[1:10,]
b<-iris[51:60,]
#rowwise merge
rbind(a,b)
a<-iris[1:10,1:2]
b<-iris[1:10,3:4]
#columnwise merge
cbind(a,b)
#split
#categorical variables can be split
attach(iris)
g<-split(iris,Species)
class(g) # class of g is list
str(g) #each category i.e,each Species is a dataframe
Q.Display the mean and sd of first four columns of the species setosa.
apply(g$setosa[,1:4],2,function(x)c(mean(x),sd(x)))
Q.Use orange data set in R and display the mean and sd of age and cicumference
of tree types:
1)Using subset function
attach(Orange)
apply(subset(Orange,Tree==1,select=c(2,3)),2,function(x)c(mean(x),sd(x)))
apply(subset(Orange,Tree==2,select=c(2,3)),2,function(x)c(mean(x),sd(x)))
apply(subset(Orange,Tree==3,select=c(2,3)),2,function(x)c(mean(x),sd(x)))
apply(subset(Orange,Tree==4,select=c(2,3)),2,function(x)c(mean(x),sd(x)))
apply(subset(Orange,Tree==5,select=c(2,3)),2,function(x)c(mean(x),sd(x)))
OR
s<-c(type1=t$'1'[,2:3],type2=t$'2'[,2:3],type3=t$'3'[,2:3],type4=t$'4'[,2:3],type5=t$'5'[,2:3])
lapply(s,function(x)c(MEAN=mean(x),SD=sd(x)))
OR
# Split Orange by tree, then compute mean and sd of columns 2 and 3 for
# each piece; the result is a list with one 2 x 2 matrix per tree.
lapply(split(Orange, Orange$Tree), function(x) {
  apply(x[, 2:3], 2, function(x) c(Mean = mean(x), SD = sd(x)))
})
OR
for(i in 1:5)
Page 25
{
b<-subset(Orange,Tree==i)
print(apply(b[,-1],2,function(x)c(MEAN=mean(x),SD=sd(x))))
}
25.1.2021
#cut function
#It helps to categorise a continuos variable
g<-c(2,1,3,2,4,5,1,2,5,2,4,6)
cut(g,3)
Each value in g is assigned to an interval.The cut point/width of interval is found
by range divided by number of intervals.That is,
5/3=1.667
#Here the interval is right closed.To make it right open we use:
cut(g,3,right=F)
#Instead of specifying number of intervals we can specify the breaks.
cut(g,breaks=c(0,3,6))
#Including labels for the interval:
b<-cut(g,breaks=c(0,3,6),labels=c("I","K"))
class(b):'factor'
table(b)
a<-c(15000,10000,5000,24000,32000,3000,45000,26000,18000,9000)
s<-cut(a,breaks=c(0,10000,25000,50000),labels=c("low","middle","high"),right=F)
table(s)
plot(s)
#PLOTTING IN R
#Traditonal way of plotting is available in graphic packages
#grid approach to plotting: gpplot,lattice packages
Q.Practice question:
Consider the following set of random values of some characteristic interest:
123.5,142.7,155.3,120.4,112.8,110.9,152.6,147.2
Take a sample of size n=8 with replacement from the above values and give it as
argument to the function named 'boot'. The function should find the mean and
variance of the sampled observations and the results are added as a row to
a dataframe named "sam".
The above process has to be repeated 100 times. Display the contents of
sam.
boot<-function(x)
{
samp<-matrix(x,100,8)
Mean<-apply(samp,1,mean)
SD<-apply(samp,1,sd)
sam<-data.frame(Mean,SD)
print(sam)
}
x<-sample(c(123.5,142.7,155.3,120.4,112.8,110.9,152.6,147.2),800,replace=T)
boot(x)
Page 26
x<-sample(c(115,123,146,234,134,145,167,156),800,replace=T)
boot(x)
OR
# Preallocate one row per bootstrap replicate.
sam <- data.frame(Mean = rep(0, 100), Variance = rep(0, 100))
xvec <- c(123.5, 142.7, 155.3, 120.4, 112.8, 110.9, 152.6, 147.2)

# Mean and variance of one resample.
# x: numeric vector. Returns c(mean, variance).
boot <- function(x) {
  c(mean(x), var(x))
}

# boot() is defined once, outside the loop. Each iteration draws a fresh
# resample of the same size as xvec and stores its statistics in row i.
for (i in 1:100) {
  sam[i, ] <- boot(sample(xvec, replace = TRUE))
}
sam
OR
# Bootstrap the mean and sd of x.
#
# x: numeric vector of observed values.
# Draws 100 samples of size 8 with replacement from x and returns a data
# frame with 100 rows and columns "Mean" and "SD", one row per sample.
boot <- function(x) {
  # replicate() collects the 100 (mean, sd) pairs as the columns of a 2 x 100
  # matrix. This avoids growing a data frame with rbind() inside a loop and
  # removes the dependence on a `sam` object defined outside the function.
  stats <- replicate(100, {
    y <- sample(x, 8, replace = TRUE)
    c(mean(y), sd(y))
  })
  sam <- as.data.frame(t(stats))
  colnames(sam) <- c("Mean", "SD")
  sam
}
boot(c(123.5, 142.7, 155.3, 120.4, 112.8, 110.9, 152.6, 147.2))
29.01.2021
#PLOTTING BARPLOTS
emp<-(c(rep("unemp",10),rep("emp",5)))
#Since the given data has only characters,extract the frequencies using table().
a<-table(emp)
barplot(a)
#using a table
emp<-(c(rep("unemp",10),rep("emp",5)))
gender<-c(rep("m",2),rep("f",3),rep("m",6),rep("f",4))
z<-table(emp,gender)
#To plot a bar diagram after reversing order of gender and emp
barplot(t(z),beside=T)
year<-c(2015:2020)
admissions<-c(67,75,60,72,66,72)
#Here we do not have a categorical data hence using a table doesn't make much sense
Consider:
year<-c(2015:2020)
admissions<-c(67,75,60,72,66,72)
dropouts<-c(10,3,12,10,5,12)
stats<-data.frame(year,admissions,dropouts)
barplot(cbind(admissions,dropouts)~year,data=stats,ylim=c(0,100))
1.02.2021
a)Draw a mutiple barplot of total males and total females across age groups and
color the bars.Also give proper legend,
Give the title as 'Barplot of counts'.
barplot(cbind(males,females)~age_group,data=dd,beside=T)
barplot(cbind(males,females)~age_group,data=dd,beside=T,legend=c("males","females"),
col=c("red","green"),main="Barplot of counts")
b) Find the odds of males and females and draw the multiple barplot and give
the title as barplot of odds.
odd_male<-dd$males/dd$females
odd_female<-dd$females/dd$males
ff<-data.frame(age_group,odd_male,odd_female)
barplot(cbind(odd_male,odd_female)~age_group,data=ff,beside=T,main="Barplot of odds")
Page 28
#histogram
marks<-c(35,38,42,47,30,56,67,63,71,79,83,94,58)
hist(marks) #the data should be continuous
hist(marks,breaks=3) #specifying the number of bins to be produced
In the above case we still get 4 bins as R finds 3 bins as less appealing.To
force R to produce 3 bins:
hist(marks,breaks=c(30,50,70,100))
#Here we get density in y axis.To convert it into frequency and to include
the lower limits
hist(marks,breaks=c(30,50,70,100),freq=T,include.lowest=T)
#To color
hist(marks,breaks=c(30,50,70,100),freq=T,include.lowest=T,col=rainbow(3))
p=hist(marks,col=rainbow(3),density=3)
names(p)
p$breaks #gives the break points
p$counts #gives the count/frequency of each interval
p$density #relative frequency 3/13,2/13.....
p$mids #gives the mid value
p$xname
p$equidist
plot(p)
text(p$mids,p$counts)
#Line plots
plot(marks) #we get a line plot
plot(log(marks)) #transformation on marks
plot(marks,main="line plot of marks",type="l")
plot(marks,type="c",lty=2)
lty-line type
3.2.2021
math_marks<-c(12,19,9,16,18)
stat_marks<-c(14,20,7,12,14)
Q.Draw the line plot of Sepal length,sepal width,petal length and petal width
Page 29
using a single plot.Give suitable title to the plot.Label the y axis and give
proper legends.Write inference.
# lines() can only add to an existing plot, so the first measurement column
# opens the plot (fixing the y range for all four series) and columns 2-4 are
# then added on top with matching line type and colour indices.
plot(iris[, 1], type = "l", lty = 1, col = 1, ylim = c(0, 8),
     main = "Line plot of iris measurements", ylab = "Measurement")
for (i in 2:4) {
  lines(iris[, i], lty = i, col = i)
}
# Only the four numeric columns are drawn, so Species is left out of the legend.
legend("topright", legend = colnames(iris)[1:4], lty = 1:4, col = 1:4)
iris
attach(iris)
plot(iris[,1],type="l",lty=4,col="red",main="Line plot",ylim=c(0,10),ylab="iris")
lines(iris[,2],type="l",lty=2,col="blue")
lines(iris[,3],type="l",lty=2,col="green")
lines(iris[,4],type="l",lty=2,col="black")
legend("topright",legend=colnames(iris[,-5]),lty=c(4,2,2,2),
col=c("red","blue","green","black"))
tmpFn<-function(x)
{
ifelse(x < 0, x^2 + 2*x + 3, ifelse(x < 2, x+3, x^2 + 4*x - 7))
}
x<-c(.5,.7,1.2,2.4,2.6,3.5)
y<-tmpFn(x)
plot(x,y,type="l",lty=4,col="red",ylab="tmpFn",main="tmpFn values")
a<-c(0.5,1.2,2.4,3.5,0.7,2.6)
tmpFn<-function(x)
{
ifelse(x < 0, x^2 + 2*x + 3, ifelse(x < 2, x+3, x^2 + 4*x - 7)) }
x<-tmpFn(a)
plot(x,type="b",lty=2,col="blue",ylim=c(3,20))
5.2.2021
#scatter plot
Scatter plot makes sense only for bivariate distribution
height<-c(123,149,95,116,168)
weight<-c(45,60,34,52,64)
plot(height,weight,pch=10,col="green",main="scatterplot of height/weight")
attach(iris)
plot(Sepal.Length,Petal.Length,pch=20,col="black")
2.Draw the scatter plot of Sepal length vs Petal length for each of the
species.
plot(iris[1:50,1],iris[1:50,3],pch=20,main="Setosa",xlab="Sepal length",
ylab="Petal length")
plot(iris[51:100,1],iris[51:100,3],pch=20,main="Vesicolor",xlab="Sepal length",
ylab="Petal length")
plot(iris[101:150,1],iris[101:150,3],pch=20,main="Virginica",xlab="Sepal length",
ylab="Petal length")
OR
plot(b[,c(1,3)], main="versicolor")
height<-c(123,149,95,116,168)
weight<-c(45,60,34,52,64)
par(mfrow=c(1,2))
plot(height,type="l",col="red")
plot(weight,type="l",col="green")
#pairwise plots
if a dataframe is plotted each column in dataframe will be plotted against each other
plot(iris[,-5])
or
pairs(iris[,1:4])
#pairs function can be used to check collinearity that is whether the independent
variables are actually independent.
10.2.2021
#boxplot
Boxplots are mainly used to know whether distribution from which the samples
are drawn is skewed and whether outliers are present in the sample.
Q3+Q1-2*Q2=0 for a symmetric distribution (this is the numerator of Bowley's coefficient of skewness)
x<-c(5,10,12,3,14,28,18,10,16,22)
boxplot(x)
summary(x)
Boxplot is also called box and whisker.
The bold line is the median.
If Q3-Q2>Q2-Q1 -positive skewness
If Q3-Q2<Q2-Q1 -negative skewness
x<-c(5,10,12,3,14,28,18,10,16,22,80)
boxplot(x,main="Boxplot",col=rainbow(1))
Outlier is indicated by a dot.
Boxplot is used to make inferences about the data.
From the boxplot constructed we can infer that:
The data is almost symmetric and there is a presence of outlier in the data.
12.2.2021
#Boxplot of mpg corresponding to cylinders in data "mtcars"
boxplot(mpg~cyl,data=mtcars,main="car mileage data",xlab="no of cylinders",
ylab="miles per gallon")
dbinom(5,20,0.2)
boxplot(Petal.Width~Species,data=iris,ylim=c(0,8))
Generate 100 random numbers & calculate the density of these numbers
# Draw 100 values from Binomial(size = 50, prob = 0.7) and evaluate the
# probability mass function of that same distribution at each draw.
y <- rbinom(100, 50, 0.7)
dbinom(y, 50, 0.7)
15.2.2021
Example:
height<-c(123,140,127,135,153,165,172,181,162,163)
gender<-c("m","m","f","m","f","f","f","m","m","f")
dd<-data.frame(height,gender)
boxplot(height~gender,data=dd)
Inference:
For females Q3-Q2<Q2-Q1 ,hence negatively skewed
For males Q3-Q2>Q2-Q1 ,hence positively skewed
The spread of height of males is larger than that of females.
Probability distributions
Prefix letters to remember:
d-density,p-distribution function (cdf),q-quantile,r-random number generation
Keywords for each distribution:
binom,pois,exp,norm,gamma,unif
?Find P[X<=3]
Method 1:
sum(dbinom(0,5,.5)+dbinom(1,5,.5)+dbinom(2,5,.5)+dbinom(3,5,.5))
Method 2:
pbinom(3,5,.5)
?Find P[X>3]
P[X>3]=1-P[X<=3]
1-pbinom(3,5,.5)
OR
pbinom(3,5,0.5,lower.tail=F)
?Find P[X<3]
sum(dbinom(0:2,5,.5))
?Find P[3<=X<=5]
sum(dbinom(3:5,5,.5))
or
pbinom(5,5,.5)-pbinom(3,5,.5)+dbinom(3,5,.5)
#Quantile values
Given a probablity if we want to find the value of x that satisfies the
prob value ,we use the quantile function
Example;
Let X~exp(theta)
F(x)=1-exp(-theta*x)
Equate F(x)=u i.e, 1-exp(-theta*x)=u
exp(-theta*x)=1-u
(-theta*x)=ln(1-u)
x= -ln(1-u)/theta
QUESTIONS
Let X follow binomial distribution with n=10,p=.3.
a)Evaluate the binomial probs for x=2,4,6,8
dbinom(seq(2,8,2),10,.3)
b)Evaluate:
i) P[X<=4]
pbinom(4,10,.3)
ii) P[3<=X<=7]
pbinom(7,10,.3)-pbinom(3,10,.3)+dbinom(3,10,.3)
iii) P[2<X<5]
sum(dbinom(3:4,10,.3))
iv) P[3<X<=8]
pbinom(8,10,.3)-pbinom(3,10,.3)
v) P[X>=7]
1-pbinom(6,10,.3)
OR
pbinom(6,10,.3,lower.tail=F)
17.2.2021
#Poisson distribution
a)P[X=5]
dpois(5,4.5)
b) P[X>4]
ppois(4,4.5,lower.tail=F)
OR
1-ppois(4,4.5)
qpois(.1,4.5)
?Let X~P(lambda=1.2)
a) Find P[X<=3]
ppois(3,1.2)
b) Find P[X>5]
ppois(5,1.2,lower.tail=F)
Question
Sol:
x <- c(0, 1, 2, 3, 4, 5, 6)
freq <- c(53, 45, 38, 24, 17, 8, 4)
# Sample mean of the frequency table, used as the Poisson rate. A separate
# name is used so the variable does not share a name with the mean() function.
lambda_hat <- sum(x * freq) / sum(freq)
# Expected frequency of each x is N * P[X = x], so the pmf dpois() is needed.
# ppois() is the cdf and would give cumulative expected counts.
exp_freq <- dpois(x, lambda_hat) * sum(freq)
b)cdf at 3.5
ppois(3.5,1.2)
#Exponential distribution
It is also called lifetime distribution.
dexp(5,1.3)
pexp(5,1.3)
qexp(.5,1.3)
rexp(10,1.3)
Q.Generate 100 obs from exponential distribution with theta =1.3 and obtain
the histogram and density plot of exp distribution.
y=rexp(100,1.3)
#plotting a histogram
hist(y,freq=F,col="green")
#to plot against density values
par(mfrow=c(1,2))
life<-c(67.3,78.1,87.3,90.4,95.1,101.8,107.4,89.2,97.3,75.2)
m<-dexp(sort(life),1/90.3)
plot(sort(life),m,type="l",main="density",col=rainbow(1))
Page 34
s<-pexp(sort(life),1/90.3)
plot(sort(life),s,type="l",main="cdf",col=rainbow(2))
19.2.2021
#adding a curve on the histogram
y<-rexp(100,1.2)
hist(y,freq=F,col="purple")
curve(dexp(y,1.3),col="green",add=T,xname="y")
Q.It is assumed that the following random obs of X are from exponential
distribution with theta=.8
X:1.2,1.6,.6,1.5,2.7,2.9,3.1,.7,.58,3.1,4.6,4.9
Draw the histogram of X and add the exponential density curve in the histogram.
Does the plot reveal whether X is exponentially distributed.
x<-c(1.2,1.6,.6,1.5,2.7,2.9,3.1,.7,.58,3.1,4.6,4.9)
hist(x,freq=F,col="red")
curve(dexp(x,.8),add=T,col="yellow")
# if we use any variable name other than "x", we need to include the
# argument "xname"
# eg: curve(dexp(y,.8),add=T,col="yellow",xname="y")
The plot suggests that X is not exponentially distributed, as the density curve
does not follow the shape of the histogram.
#Normal distribution
If parameters are not specified,R considers it as standard normal dist.
a)P[Z<=3]
pnorm(3)
b)P[-3<Z<3]
pnorm(3)-pnorm(-3)
#This can also be found directly by 3-sigma rule
Q.Generate 1000 random obs from standard normal dist.Obtain the histogram and
plot the density curve on the histogram.
x<-rnorm(1000)
hist(x,freq=F,col="yellow")
curve(dnorm(x),add=T,col="green")
a)P[X<=2]
pnorm(2,mean=1.2,sd=sqrt(4))
b)P[-2<X<0]
pnorm(0,1.2,2)-pnorm(-2,1.2,2)
c)70th percentile of X
qnorm(.7,1.2,2)
normally distributed with mean 72 and variance 2.3. How many students have
obtained marks below 60, marks above 80 and marks btw 40 and 60.Assume that
the total no of students who took the test is 500.
Sol.#Obtain the corresponding prob and multiply with the total no of students
#below 60
pnorm(60,72,sqrt(2.3))*500
#above 80
500*pnorm(80,72,sqrt(2.3),lower.tail=F)
#btw 40 and 60
500*(pnorm(60,72,sqrt(2.3))-pnorm(40,72,sqrt(2.3)))
Q.1.Select 1000 random nos from normal distribution with mean 5 and sd 2.
2.Calcuate the mean of the generated obs
3.Repeat steps 1 and 2 200 times.
4.Plot the histogram of sample mean
5.Does the shape look like a bell shape? Why or why not?
Solution.
1.
y<-rnorm(1000,5,2)
2.
mean(y)
3.
# Repeat 200 times: draw 1000 observations from N(mean = 5, sd = 2) and keep
# the sample mean. replicate() gathers the 200 means into one numeric vector.
x <- replicate(200, mean(rnorm(1000, 5, 2)))
4.
hist(x,main="Histogram of sample mean",col="blue")
5.
The shape is bell shaped. Since the generated values are normal, their sample
means are also normally distributed (with mean 5 and sd 2/sqrt(1000)).
22.2.2021
#How to know whether the given random observations are generated from a
specified or assumed distribution?
Question:
Page 36
Generate 1000 random numbers from an exponential dist with theta 1.5.Obtain the
QQ plot.
x<-rexp(1000,1.5)
sample<-quantile(x,seq(.01,.99,.01))
dist<-qexp(seq(.01,.99,.01),1.5)
plot(sample,dist,type="l")
Interpretation:Based on QQ plot it is observed that the random sample comes
from a distribution with theta 1.5
#HYPOTHESIS TESTING
?.The following data on X have been generated from a normal dist with unknown
mean and unknown sd.
X: 123.4,11.4,127.8,132.6,143.7,110.5,108.9,109.4,106.3
Stating the hypothesis ,verify whether the population mean of X is 110.3.
Solution:
Let mu denote the pop mean of X.The hypothesis to be tested is:
H : mu=110.3 against K : mu!=110.3
The test used in testing H against K is one sample t test.
The test statistic is given as:t= (xbar-mu_0)/(s/sqrt(n)) which follows t dist
with n-1 df where n is the sample size.
x<-c(123.4,11.4,127.8,132.6,143.7,110.5,108.9,109.4,106.3)
Function:
t.test(x,mu=110.3)
If its one tailed or two tailed an extra argument 'alternative' should be given.
If mu is not specified R assumes the value to be zero.
#Interpretation:
Since p value is .8754 which is much larger than 5% we conclude that there is
no sample evidence to reject the null hypothesis at 5%.Thus we accept the
null hypothesis and conclude that the sample observations are consistent with
a normal dist with mean 110.3.
Question
Use iris data set. Test whether the population mean of sepal length is 4.5
against mean is greater than 4.5.
Solution:
Let mu denote the pop mean of sepal length..The hypothesis to be tested is:
H : mu=4.5 against K: mu>4.5
The test used in testing H against K is one sample t test.
The test statistic is given as:t= (xbar-mu_0)/(s/sqrt(n)) which follows t dist
with n-1 df where n is the sample size.
# code
# One-sample t test of H: mu = 4.5 against K: mu > 4.5 on iris sepal length.
# The column is read from iris directly instead of via attach(), so a
# same-named object in the workspace cannot be picked up by mistake.
t.test(iris$Sepal.Length, mu = 4.5, alternative = "greater")
#Interpretation:
Page 37
Since p value is less than 2.2e-16 which is much less than 5% we conclude that
there is strong sample evidence against the null hypothesis at 5%.Thus we
reject the null hypothesis and conclude that the population mean of sepal length
is greater than 4.5.
thick <- c(7.5, 7.60, 7.65, 7.70, 7.55, 7.55, 7.40, 7.40, 7.50, 7.50)
# Let mu be the mean thickness (in hundredths of an inch) of pieces of gum.
# Two-sided one-sample t test of H: mu = 7.5.
t.test(thick, mu = 7.5)
# Interpretation: we fail to reject the null hypothesis as the p value
# 0.2848 is not less than 0.01.
# Two-sided one-sample t test of H: mu = 44 on the ten observations in y.
y <- c(44, 31, 52, 48, 46, 39, 43, 36, 41, 49)
t.test(y, mu = 44)
Let mu_1 and mu_2 denote resp the pop means of two independent samples say X
and Y.The hypothesis to be tested is:
H: mu_1=mu_2 against K: mu_1(!=,<,>)mu_2
The test statistic is t= (x1bar-x2bar)/(S*sqrt(1/n1+1/n2)), where S is the
pooled standard deviation, which follows t dist with (n1+n2-2) df.
Question
Let X equal the weight in grams of Low fat strawberry kudo and Y the weight of
Low Fat Blueberry Kudo. Assume the distribution follow normal distribution.
Let 21.7,21.0,21.2,20.7,20.4,21.9,20.2,21.6,20.6 be n=9 observations of X and
let 21.5,20.5,20.3,21.6,21.7,21.3,23.0,21.3,18.9,20.0,20.4,20.8,20.3 be m=13
observations of Y.Does the data support the claim that mean weight of X is
smaller than that of Y. Test at 5% level of significance. Assume the
population variance to be equal.
Solution
Let mu_1 and mu_2 denote resp the pop means of two independent samples say X
and Y.The hypothesis to be tested is:
H: mu_1=mu_2 against K: mu_1<mu_2
The test statistic is t= (x1bar-x2bar)/(S*sqrt(1/n1+1/n2)), where S is the
pooled standard deviation, which follows t dist with (n1+n2-2) df
x <- c(21.7, 21, 21.2, 20.7, 20.4, 21.9, 20.2, 21.6, 20.6)
y <- c(
  21.5, 20.5, 20.3, 21.6, 21.7, 21.3, 23, 21.3, 18.9, 20, 20.4, 20.8, 20.3
)
# code
# Pooled-variance two-sample t test of H: mu_1 = mu_2 against K: mu_1 < mu_2.
# TRUE is spelled out because T is an ordinary, reassignable variable.
t.test(x, y, alternative = "less", var.equal = TRUE)
#Interpretation:
Since p value is greater than .05 we do not reject the null hypothesis and
conclude that the pop mean of X and Y are same.
While doing this test we assume that variances of X and Y are equal.If
the population variances are not equal then we need to give the argument as
var.equal=F. This will give the Welch test rather than the pooled t test.
# Code
# Formula form of the pooled two-sample t test: weight split by group g.
# The data frame is passed through data = instead of attach(), so weight and
# g cannot be taken from same-named objects in the workspace.
# NOTE(review): thickness is not defined in this part of the notes;
# presumably a data frame with columns weight and g - confirm.
t.test(weight ~ g, data = thickness, alternative = "less", var.equal = TRUE)
24.2.2021
# The two samples whose variances are compared.
x <- c(21.7, 21, 21.2, 20.7, 20.4, 21.9, 20.2, 21.6, 20.6)
y <- c(
  21.5, 20.5, 20.3, 21.6, 21.7, 21.3, 23, 21.3, 18.9, 20, 20.4, 20.8, 20.3
)
# Code
# F test for equality of the variances of x and y.
var.test(x, y)
# using data frame
# F test for equality of variances of weight between the groups in g.
# data = replaces attach(), so workspace objects named weight or g cannot
# mask the data-frame columns.
# NOTE(review): thickness is not defined in this part of the notes;
# presumably a data frame with columns weight and g - confirm.
var.test(weight ~ g, data = thickness)
#Interpretation:
Based on p value we conclude that H is not rejected at 5%.
Before doing a t test the test for equality of variance should be done.
If variance is not equal based on the test then use argument var.equal=F.
Here our interest is in the test of equality of means of more than 2 pop
simultaneously assuming that the populations are independent and normally
distributed.
If there are 'k' populations with means mu1,mu2,...muk then assuming variances
are equal hypothesis to be tested is
H:mu1=mu2=.....muk against K:H is false
ANOVA Table
Source    | df  | sum of squares(SS) | mean SS          | F ratio   | p value
Treatment | k-1 | SST                | MSST = SST/(k-1) | MSST/MSSE |
Errors    | n-k | SSE                | MSSE = SSE/(n-k) |           |
Total     | n-1 | SST+SSE            |                  |           |
Total variability is split into variability due to two factors.
Question
Consider 3 treatments A,B,C with sample observations of their yield.
A:10.3,12.2,14.5,11.6,10.7
B:20.4,27.1,28.2,29.4,26.9,32.1,20.8
C:30.1,33.2,38.9,40.1,42.6,37.5
Test whether the yield of the 3 treatments are equal at 5% level.
Solution:
H: Yield of 3 treatments are equal vs K: H is false
Create a data frame with yield values and corresponding group variables.
# Yields of the three treatments (the third treatment is stored as d).
a <- c(10.3, 12.2, 14.5, 11.6, 10.7)
b <- c(20.4, 27.1, 28.2, 29.4, 26.9, 32.1, 20.8)
d <- c(30.1, 33.2, 38.9, 40.1, 42.6, 37.5)
y <- c(a, b, d)
# Group label for every observation, in the same order as y.
g <- rep(c("1", "2", "3"), times = c(length(a), length(b), length(d)))
treat <- data.frame(yield = y, g = g)
# code
# One-way ANOVA of yield on treatment group. The data frame is passed via
# data = rather than attach(): with attach(), a workspace object named yield
# or g silently takes precedence over the data-frame column.
h <- aov(yield ~ g, data = treat)
summary(h)
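#Output:
Df Sum Sq Mean Sq F value Pr(>F)
g 2 10.94 5.472 2.993e+31 <2e-16 ***
Residuals 15 0.00 0.000
# NOTE: this recorded output does not match the yields listed above. A
# residual sum of squares of 0 with a treatment sum of squares of 10.94 is
# what results when the response is the group code (1,2,3) itself, i.e. the
# name yield referred to some other object when the model was fitted.
# For the yields given, treatment SS is about 1735.6 (2 df) and residual SS
# about 230.6 (15 df), so F is about 56.4 and the p value is far below 5%.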
#Interpretation:
Since the p value is less than 5% we reject the null hypothesis.
Thus there is no sample evidence to conclude that the mean yield of the three
treatments are equal.
Question
Use Orange data set and test whether the mean circumference of the trees are
Page 39
equal.
Solution:
H: Mean circumference of the trees are equal
K:H is false
# code
# One-way ANOVA of circumference on Tree for the built-in Orange data set.
# The data set is passed via data = instead of attach(), so workspace objects
# named circumference or Tree cannot mask its columns.
s <- aov(circumference ~ Tree, data = Orange)
summary(s)
#output
Df Sum Sq Mean Sq F value Pr(>F)
Tree 4 11841 2960 0.883 0.486
Residuals 30 100525 3351
#Interpretation
Since p value is greater than .05 we accept the null hypothesis.The mean
circumference of trees are equal.
Example:
Let M denote Machine and O denote operator
M1 M2 M3
O1 4.5 3.6 5.7
O2 10.5 6.3 4.7
O3 3.3 4.7 8.1
The values denote the time(in hrs) to manufacture a product.Test whether the
mean manufacturing time of the operators are same and mean manufacturing time
by machines are the same.
Solution:
#Hypothesis
H1: mean manufacturing time of operators are same
against K1: H1 is false
H2: mean manufacturing time of machines are same
against K2: H2 is false
# code
# Times entered row by row from the table: o1 on (m1, m2, m3), then o2, o3.
time <- c(4.5, 3.6, 5.7, 10.5, 6.3, 4.7, 3.3, 4.7, 8.1)
# Machine label cycles m1, m2, m3 within each operator row.
machine <- rep(c("m1", "m2", "m3"), times = 3)
# Operator label is constant across the three machines of a row.
operator <- rep(c("o1", "o2", "o3"), each = 3)
s <- data.frame(time, machine, operator)
# Two-way ANOVA (main effects only) of time on machine and operator.
# The data frame is passed via data = instead of attach().
h <- aov(time ~ machine + operator, data = s)
summary(h)
#Output
Df Sum Sq Mean Sq F value Pr(>F)
machine 2 3.216 1.608 0.221 0.811
operator 2 10.416 5.208 0.715 0.543
Residuals 4 29.138 7.284
#Interpretation
Since both p values are above 5% both the hypotheses H1 and H2 are accepted.