UNIVERSITY OF NEW SOUTH WALES
SCHOOL OF MATHEMATICS AND STATISTICS
MATH3821 Statistical Modelling and Computing
Term Two 2020
Week 1 Lab Solutions
1. (a)
x <- c(3, 1, 4, 5, 9, 3)
y <- c(2, 7, 1, 8, 2, 8)
(b)
x + y
## [1] 5 8 5 13 11 11
(c)
z <- x + y
(d)
rm(z) # Not really necessary here since we replace z with new values below
z <- c(1, 4, 1)
x + z
## [1] 4 5 5 6 13 4
y + z
## [1] 3 11 2 9 6 9
2. (a)
x <- matrix(c(3, 4, 1, 5), ncol = 2)
y <- matrix(c(2, 1, 7, 2), ncol = 2)
(b)
x + y
## [,1] [,2]
## [1,] 5 8
## [2,] 5 7
(c)
x * y
## [,1] [,2]
## [1,] 6 7
## [2,] 4 10
(d)
x %*% y
## [,1] [,2]
## [1,] 7 23
## [2,] 13 38
(e)
1
( z <- t(x) )
## [,1] [,2]
## [1,] 3 4
## [2,] 1 5
3. (a) i.
( x <- seq(from = 1, to = 100, by = 1) ) # Or you could type x <- 1:100
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
ii.
( x <- seq(from = 1, length = 20, by = 0.1) )
## [1] 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6
## [18] 2.7 2.8 2.9
iii.
( x <- seq(from = 1, to = 2, length = 6) )
## [1] 1.0 1.2 1.4 1.6 1.8 2.0
(b) i.
x <- c(3, 1, 4, 1, 5)
( y <- rep(x, each = 2) )
## [1] 3 3 1 1 4 4 1 1 5 5
ii.
( z <- rep(x, times = 2) )
## [1] 3 1 4 1 5 3 1 4 1 5
iii.
( z <- rep(x, times = c(4, 4, 4, 2, 2)) )
## [1] 3 3 3 3 1 1 1 1 4 4 4 4 1 1 5 5
4. (a)
kiama <- c(83, 51, 87, 60, 28, 95, 8, 27, 15, 10, 18, 16, 29, 54, 91, 8, 17, 55, 10, 35)
(b)
summary(kiama)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 15.75 28.50 39.85 56.25 95.00
(c)
hist(kiama, xlab = "Durations", main = "Histogram of Kiama Blowhole Durations",
col = "blue") # Produces histogram
2
Histogram of Kiama Blowhole Durations
8
6
Frequency
4
2
0
0 20 40 60 80 100
Durations
(d)
boxplot(kiama, xlab = "Durartions", col = "orangered",
main = "Boxplot of Kiama Blowhole Durations") # Produces boxplot
Boxplot of Kiama Blowhole Durations
80
60
40
20
Durartions
(e)
3
plot(kiama, col = "blue", pch = 19, ylab = "Durations")
80
Durations
60
40
20
5 10 15 20
Index
# If only one vector is supplied to plot, values are plotted
# against integers from 1 to the length of the vector
5. (a)
firearm <- matrix(c(
1983, 4.31,
1984, 4.42,
1985, 4.52,
1986, 4.35,
1987, 4.39,
1988, 4.21,
1989, 3.4,
1990, 3.61,
1991, 3.67,
1992, 3.61,
1993, 2.98,
1994, 2.95,
1995, 2.72,
1996, 2.96,
1997, 2.3), byrow = TRUE, ncol = 2)
# Note: Instead of the above intruction, you cal also type:
# firearm <- matrix(scan(), byrow = TRUE, ncol = 2)
# and then copy-paste the values only (from Acrobat-Reader)
# and then type ENTER twice
firearm <- data.frame(Year = firearm[, 1], Rate = as.numeric(firearm[, 2]))
(b)
4
plot(Rate ~ Year, data = firearm, pch = 19, col = "purple") # Suggestion of a linear trend
4.5
4.0
3.5
Rate
3.0
2.5
1984 1986 1988 1990 1992 1994 1996
Year
(c)
firearm.lm <- lm(Rate ~ Year, data = firearm)
summary(firearm.lm)
##
## Call:
## lm(formula = Rate ~ Year, data = firearm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38142 -0.16824 -0.01667 0.22071 0.30701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 306.3199 29.8444 10.26 1.33e-07 ***
## Year -0.1521 0.0150 -10.14 1.53e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.251 on 13 degrees of freedom
## Multiple R-squared: 0.8878, Adjusted R-squared: 0.8792
## F-statistic: 102.9 on 1 and 13 DF, p-value: 1.527e-07
# The F-test and partial t-test for the slope indicate
# strong evidence of a linear trend
# Also, R^2 = 0.88 - 88% of the variance for Rate is explained by Year
(d)
par(mfrow = c(2, 2))
# Split graphsheet into four, two rows and two
5
# columns so that linear model diagnostic plots
# can fit on one graphsheet.
plot(firearm.lm)
Standardized residuals
Residuals vs Fitted Normal Q−Q
5 5
Residuals
0.2
0.5
−1.5
−0.4
7 1 1 7
2.5 3.0 3.5 4.0 4.5 −1 0 1
Fitted values Theoretical Quantiles
Standardized residuals
Standardized residuals
Scale−Location Residuals vs Leverage
7 1
5 14
0.8
0.0
Cook's distance 15
−2.0
0.0
1 0.5
2.5 3.0 3.5 4.0 4.5 0.00 0.05 0.10 0.15 0.20 0.25
Fitted values Leverage
# No observable pattern in the residuals vs fitted value plot suggest
# that the variance of the errors is the same for each value of Year.
# Normal Q-Q plot tends to follow a straight line (theoretical and
# residual quantile match up) and so the normality assumption is OK
6. EyeColour <- factor(rep(c("Brown", "Green", "Blue"), times = c(8, 5, 6)))
Flicker <- c(26.8, 27.9, 23.7, 25.0, 26.3, 24.8, 25.7,
24.5, 26.4, 24.2, 28.0, 26.9, 29.1, 25.7,
27.2, 29.9, 28.5, 29.4, 28.3)
Flicker.df <- data.frame(EyeColour = EyeColour, Flicker = Flicker)
plot.design(Flicker.df)
6
Blue
25.5 26.0 26.5 27.0 27.5 28.0
mean of Flicker
Green
Brown
EyeColour
Factors
lm(Flicker ~ EyeColour - 1) # The 3 means
##
## Call:
## lm(formula = Flicker ~ EyeColour - 1)
##
## Coefficients:
## EyeColourBlue EyeColourBrown EyeColourGreen
## 28.17 25.59 26.92
plot.design(Flicker.df, fun = median)
28.5
Blue
27.5
median of Flicker
Green
26.5
25.5
Brown
EyeColour
Factors
7
res <- plot(Flicker.df, col = c("Blue", "Brown", "Green")) # Argument col is for color!
30
29
28
Flicker
27
26
25
24
Blue Brown Green
EyeColour
# Same as: boxplot(Flicker ~ EyeColour)
res$stats[3, ] # The 3 medians
## [1] 28.40 25.35 26.90
( Flicker.anova <- anova(lm(Flicker ~ EyeColour, data = Flicker.df)) )
## Analysis of Variance Table
##
## Response: Flicker
## Df Sum Sq Mean Sq F value Pr(>F)
## EyeColour 2 22.997 11.4986 4.8023 0.02325 *
## Residuals 16 38.310 2.3944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same as:
Flicker.aov <- aov(Flicker ~ EyeColour, data = Flicker.df)
summary(Flicker.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## EyeColour 2 23.00 11.499 4.802 0.0232 *
## Residuals 16 38.31 2.394
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# From the F-test in the ANOVA table, with a 5% level of significance it seems there
# are significant differences between at least some of the eye colour groups.
7.
height <- c(68, 61, 63, 70, 69, 65, 72)
weight <- c(155, 99, 115, 205, 170, 125, 220)
8
hw.lm <- lm(height ~ weight)
summary(hw.lm)
##
## Call:
## lm(formula = height ~ weight)
##
## Residuals:
## 1 2 3 4 5 6 7
## 1.1911 -1.0847 -0.4345 -1.0270 0.9257 0.7219 -0.2924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.73302 1.48022 36.30 2.99e-07 ***
## weight 0.08436 0.00918 9.19 0.000256 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.03 on 5 degrees of freedom
## Multiple R-squared: 0.9441, Adjusted R-squared: 0.9329
## F-statistic: 84.45 on 1 and 5 DF, p-value: 0.000256
hwnewdata.df <- data.frame(weight = 100)
( hw.pred <- predict(hw.lm, newdata = hwnewdata.df, interval = "prediction") )
## fit lwr upr
## 1 62.1691 59.05013 65.28806
stdres <- rstandard(hw.lm)
studres <- rstudent(hw.lm)
plot(stdres ~ hw.lm$fitted, xlab = "Fitted values", ylab = "Standardized Residuals")
abline(h = 0)
1.0
Standardized Residuals
0.5
0.0
−1.0 −0.5
62 64 66 68 70 72
Fitted values
9
plot(studres ~ hw.lm$fitted, xlab = "Fitted values", ylab = "Studentized Residuals")
abline(h = 0)
1.0
Studentized Residuals
0.5
−0.5 0.0
−1.5
62 64 66 68 70 72
Fitted values
8. speed <- c(700, 850, 820, 640, 920, 480, 460, 500, 570, 580,
500, 550, 480, 600, 610, 900, 880, 899, 780, 899,
590, 540, 560, 570, 555, 520, 660, 525, 610, 645)
gender <- factor(rep(c("Male", "Female"), times = c(15, 15)))
group <- factor(rep(c("X", "Y", "Z", "X", "Y", "Z"), each = 5))
reading.df <- data.frame(speed = speed, gender = gender, group = group)
# Equally effective => alpha_1=alpha_2=alpha_3=alpha => A_d(alpha,alpha,alpha) = alpha 1
# We use a Fisher test between nested models
anova(lm(speed ~ -1 + group + gender), lm(speed ~ 1 + gender))
## Analysis of Variance Table
##
## Model 1: speed ~ -1 + group + gender
## Model 2: speed ~ 1 + gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 26 109476
## 2 28 612691 -2 -503215 59.756 1.892e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same as: summary(aov(speed ~ group + gender, data = reading.df))
# The three courses are not equally effective
plot.design(reading.df)
10
X
550 600 650 700 750 800
mean of speed
Female
Male
Z
Y
gender group
Factors
par(mfrow = c(1, 2))
plot(reading.df)
1.0 1.2 1.4 1.6 1.8 2.0
900
700
speed
500
1.8
gender
1.4
1.0
3.0
group
2.0
1.0
500 600 700 800 900 1.0 1.5 2.0 2.5 3.0
par(mfrow = c(1, 1))
interaction.plot(group, gender, speed)
11
850
gender
Female
Male
mean of speed
750
650
550
X Y Z
group
# From the plot there doesn't seem to be evidence of an interaction between the
# two factors. Lines for different gender groups are roughly parallel
reading.lm <- lm(speed ~ group * gender, data = reading.df)
( reading.anova <- anova(reading.lm))
## Analysis of Variance Table
##
## Response: speed
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 503215 251608 56.6157 8.186e-10 ***
## gender 1 25404 25404 5.7164 0.0250 *
## group:gender 2 2817 1408 0.3169 0.7314
## Residuals 24 106659 4444
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same as:
# reading.aov <- aov(speed ~ group * gender, data = reading.df)
# summary(reading.aov)
# The interaction term is not significant. Let's fit the model with no interaction:
( reading.anova <- anova(lm(speed ~ group + gender, data = reading.df)) )
## Analysis of Variance Table
##
## Response: speed
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 503215 251608 59.7557 1.892e-10 ***
## gender 1 25404 25404 6.0334 0.02103 *
## Residuals 26 109476 4211
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# It seems that both factors are helpful for explaining variation in the response.
plot(lm(speed ~ group + gender, data = reading.df)) # A few diagnostic plots
12
Residuals vs Fitted
50 100
Residuals
0
−50
1
−150
500 550 600 650 700 750 800 850
Fitted values
lm(speed ~ group + gender)
Normal Q−Q
5
2
Standardized residuals
1
0
−1
1
−2
−2 −1 0 1 2
Theoretical Quantiles
lm(speed ~ group + gender)
13
Scale−Location
4
1.5
5
Standardized residuals
1.0
0.5
0.0 1
500 550 600 650 700 750 800 850
Fitted values
lm(speed ~ group + gender)
Constant Leverage:
Residuals vs Factor Levels
5
2
Standardized residuals
1
0
−1
1
−2
4
−3
group :
X Y Z
Factor Level Combinations
14