toluca <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%201%20Data%20Sets/CH01TA01.txt", sep ="" , header = FALSE)
#Look at the first 6 entries
head(toluca)
V1 V2
1 80 399
2 30 121
3 50 221
4 90 376
5 70 361
6 60 224
colnames(toluca) <- c("lotSize", "hours")
#Look at the first 6 entries
head(toluca)
lotSize hours
1 80 399
2 30 121
3 50 221
4 90 376
5 70 361
6 60 224
dataframe
tolucaDF <- data.frame(toluca)
head(tolucaDF)
lotSize hours
1 80 399
2 30 121
3 50 221
4 90 376
5 70 361
6 60 224
The dot plot for the lot sizes in the Toluca Company example.
library(ggplot2)
ggplot(data = tolucaDF, aes(x = lotSize)) + geom_dotplot(color = "blue", fill = "orchid4", dotsize = .5) + theme_bw()
The Sequence plot for the lot sizes in the Toluca Company example.
plot(tolucaDF$lotSize, type = "b", lty = 2, xlab = "Run", ylab = "Lot Size")
title("Sequence Plot")
Stem-and-leaf plot of the lot sizes for the Toluca Company example. By displaying the last digits, this plot also indicates here that all lot sizes in the Toluca Company example were multiples of 10.
stem(tolucaDF$lotSize, scale = 3)
The decimal point is 1 digit(s) to the right of the |
2 | 0
3 | 000
4 | 00
5 | 000
6 | 0
7 | 000
8 | 000
9 | 0000
10 | 00
11 | 00
12 | 0
Box plot of the lot sizes for the Toluca Company example.
ggplot(data = tolucaDF, aes(x = lotSize)) + geom_boxplot(color = "steelblue", fill = "orchid4") + theme_bw()
# Recall from ch1
toluca_LS_model <- lm(hours ~ lotSize, data = toluca)
summary(toluca_LS_model)
Call:
lm(formula = hours ~ lotSize, data = toluca)
Residuals:
Min 1Q Median 3Q Max
-83.876 -34.088 -5.982 38.826 103.528
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.366 26.177 2.382 0.0259 *
lotSize 3.570 0.347 10.290 4.45e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 48.82 on 23 degrees of freedom
Multiple R-squared: 0.8215, Adjusted R-squared: 0.8138
F-statistic: 105.9 on 1 and 23 DF, p-value: 4.449e-10
library(ggplot2)
ggplot(toluca_LS_model, aes(x = lotSize, y = .resid)) + geom_point(color = "blue", dotsize = .5) + theme_bw() #no need to find residual seperately. Just use ".resid"
ggplot(toluca_LS_model, aes(x = .fitted, y = .resid)) + geom_point(color = "blue", dotsize = .5) + theme_bw() #Use the model as the data and use ".fitted" to get the fitted values for x axis
ggplot(toluca_LS_model, aes(x = .resid)) + geom_boxplot(color = "steelblue", fill = "orchid4") + theme_bw()
ggplot(toluca_LS_model, aes(sample = .resid)) + geom_qq(color = "blue") + geom_qq_line()+ theme_bw() #No x or y aesthetics. Just use sample =.resid
\(\mbox{Exp. Value under Normality} = \sqrt(MSE)[ z(\frac{k-0.375}{n+0.25})]\)
Note that MSE can be calculated below as the SSE / DF
. Here I am using deviance(toluca_LS_model)
# Expected value under normality comes from equation (3.6)
cbind(
"Residual" = round(resid(toluca_LS_model), 2), # rounding upresiduals to 2 decimal places
"Rank" = rank(resid(toluca_LS_model)), # this is where we find k value
"Exp. Value under Normality" = round(sqrt(deviance(toluca_LS_model) / df.residual(toluca_LS_model)) *
qnorm((rank(resid(toluca_LS_model)) - 0.375) / (nrow(tolucaDF) + 0.25)), 2)) # deviance funtion finds the SSE and to find MSE we need to divide it by d.f. Then qnorm givres us the z value of (k-.375/n+.25). Again we have use round funtion to get final results in 2 decimal places.
Residual Rank Exp. Value under Normality
1 51.02 22 51.97
2 -48.47 5 -44.10
3 -19.88 10 -14.76
4 -7.68 11 -9.76
5 48.72 21 44.10
6 -52.58 4 -51.97
7 55.21 23 61.48
8 4.02 15 9.76
9 -66.39 2 -74.17
10 -83.88 1 -95.90
11 -45.17 6 -37.25
12 -60.28 3 -61.48
13 5.32 16 14.76
14 -20.77 8 -25.33
15 -20.09 9 -19.93
16 0.61 14 4.85
17 42.53 20 37.25
18 27.12 18 25.33
19 -6.68 12 -4.85
20 -34.09 7 -31.05
21 103.53 25 95.90
22 84.32 24 74.17
23 38.83 19 31.05
24 -5.98 13 0.00
25 10.72 17 19.93
res <- resid(toluca_LS_model)
expRes <- sqrt(deviance(toluca_LS_model) / df.residual(toluca_LS_model)) *
qnorm((rank(resid(toluca_LS_model)) - 0.375) / (nrow(tolucaDF) + 0.25))
cor(res, expRes)
[1] 0.9915055
library(lmtest)
bptest(toluca_LS_model, student = FALSE)
Breusch-Pagan test
data: toluca_LS_model
BP = 0.82092, df = 1, p-value = 0.3649
bankdata <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%203%20Data%20Sets/CH03TA04.txt", sep ="" , header = FALSE)
#Look at the first 6 entries
#head(bankdata)
#Rename clumns
colnames(bankdata) <- c("minDeposit", "NoNewAccs") # minDeposit - Size of minimum deposit, NoNewAccs - Number of new accounts
#Look at the first 6 entries
(bankdata)
minDeposit NoNewAccs
1 125 160
2 100 112
3 200 124
4 75 28
5 150 152
6 175 156
7 75 42
8 175 124
9 125 150
10 200 104
11 100 136
#Model
bankModel <- lm(NoNewAccs~minDeposit, data = bankdata)
bankModel
Call:
lm(formula = NoNewAccs ~ minDeposit, data = bankdata)
Coefficients:
(Intercept) minDeposit
50.7225 0.4867
#ANOVA table
anova(bankModel)
Analysis of Variance Table
Response: NoNewAccs
Df Sum Sq Mean Sq F value Pr(>F)
minDeposit 1 5141.3 5141.3 3.1389 0.1102
Residuals 9 14741.6 1638.0
#Scatter Plot
library(ggplot2)
library(latex2exp)
ggplot(bankdata, aes(x = minDeposit, y = NoNewAccs)) +
geom_point() +
labs(x = "Size of minimum deposit", y = "Number of new accounts", title = "Bank example scatter plot") +
geom_smooth(method = "lm", se = FALSE) +
geom_text(aes(x=120, label="yhat = 50.7 + 0.49x", y=85), colour="steelblue", angle=0, text=element_text(size=11)) +
theme_bw()
library(dplyr)
dfbank <- bankdata %>%
group_by(minDeposit) %>%
summarize(NoNewAccs)
dfbank
# A tibble: 11 × 2
# Groups: minDeposit [6]
minDeposit NoNewAccs
<int> <int>
1 75 28
2 75 42
3 100 112
4 100 136
5 125 160
6 125 150
7 150 152
8 175 156
9 175 124
10 200 124
11 200 104
dfMeanbank <- bankdata %>%
group_by(minDeposit) %>%
summarize(MeanAccs = mean(NoNewAccs))
dfMeanbank
# A tibble: 6 × 2
minDeposit MeanAccs
<int> <dbl>
1 75 35
2 100 124
3 125 155
4 150 152
5 175 140
6 200 114
# F test for lack of fit
Reduced <- lm(NoNewAccs ~ minDeposit, data = bankdata) # This is our usual model
summary(Reduced)
##
## Call:
## lm(formula = NoNewAccs ~ minDeposit, data = bankdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.23 -34.06 12.61 32.44 48.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.7225 39.3979 1.287 0.23
## minDeposit 0.4867 0.2747 1.772 0.11
##
## Residual standard error: 40.47 on 9 degrees of freedom
## Multiple R-squared: 0.2586, Adjusted R-squared: 0.1762
## F-statistic: 3.139 on 1 and 9 DF, p-value: 0.1102
Full <- lm(NoNewAccs ~ 0 + as.factor(minDeposit), data = bankdata) # This is the full model
summary(Full)
##
## Call:
## lm(formula = NoNewAccs ~ 0 + as.factor(minDeposit), data = bankdata)
##
## Residuals:
## 1 2 3 4 5 6 7 8 9 10 11
## 5 -12 10 -7 0 16 7 -16 -5 -10 12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(minDeposit)75 35.00 10.71 3.267 0.022282 *
## as.factor(minDeposit)100 124.00 10.71 11.573 8.45e-05 ***
## as.factor(minDeposit)125 155.00 10.71 14.466 2.85e-05 ***
## as.factor(minDeposit)150 152.00 15.15 10.031 0.000168 ***
## as.factor(minDeposit)175 140.00 10.71 13.066 4.68e-05 ***
## as.factor(minDeposit)200 114.00 10.71 10.640 0.000127 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.15 on 5 degrees of freedom
## Multiple R-squared: 0.9933, Adjusted R-squared: 0.9852
## F-statistic: 123.1 on 6 and 5 DF, p-value: 2.893e-05
Testing the hypothesis:
\(H_0: \mbox{Reduced model is good}\) \(\quad\) \(H_a: \mbox{Full model is good}\)
anova (Reduced, Full)
## Analysis of Variance Table
##
## Model 1: NoNewAccs ~ minDeposit
## Model 2: NoNewAccs ~ 0 + as.factor(minDeposit)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9 14742
## 2 5 1148 4 13594 14.801 0.005594 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
salesTrain <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%203%20Data%20Sets/CH03TA07.txt", sep ="" , header = FALSE)
#head(salesTrain)
#Rename columns
colnames(salesTrain) <- c("daysTrain", "performance")
head(salesTrain)
daysTrain performance
1 0.5 42.5
2 0.5 50.6
3 1.0 68.5
4 1.0 80.7
5 1.5 89.0
6 1.5 99.6
# Create a scatter plot of these data in R
ggplot(salesTrain, aes(x = daysTrain, y = performance)) +
geom_point() +
labs(x = "Days", y = "Performance", title = "Sales Training Example scatter plot") +
theme_bw()
# Do you see a linear relationship?
# Clearly the regression relation appears to be curvilinear, so the simple linear regression model does not seem to be appropriate.
# What transformation might be appropriate for these data?
# We shall consider a transformation on X. We shall consider initially the square root transformation (or go with the log 10 transformation of X)
library(dplyr)
# Find the transformed X values
salesTrainTrans <- salesTrain %>%
mutate(daysTrainSqrt = sqrt(daysTrain))
salesTrainTrans
daysTrain performance daysTrainSqrt
1 0.5 42.5 0.7071068
2 0.5 50.6 0.7071068
3 1.0 68.5 1.0000000
4 1.0 80.7 1.0000000
5 1.5 89.0 1.2247449
6 1.5 99.6 1.2247449
7 2.0 105.3 1.4142136
8 2.0 111.8 1.4142136
9 2.5 112.3 1.5811388
10 2.5 125.7 1.5811388
# Create a new scatter plot of using transformed data in R
ggplot(salesTrainTrans, aes(x = daysTrainSqrt, y = performance)) +
geom_point(color = "blue") + geom_smooth(method = "lm", se = FALSE)
labs(x = "Square root of Days", y = "Performance", title = "Sales Training Example scatter plot") +
theme_bw()
NULL
# Fit the linear regression model using transformed data
modelTrans <- lm(performance ~ daysTrainSqrt, data = salesTrainTrans)
modelTrans
Call:
lm(formula = performance ~ daysTrainSqrt, data = salesTrainTrans)
Coefficients:
(Intercept) daysTrainSqrt
-10.33 83.45
# Make a NPP and a residual plot.
ggplot(modelTrans, aes(sample = .resid)) + geom_qq(color = "red") + geom_qq_line()+
labs(x = "Expected ", y = "Residuals", title = "NPP plot") +
theme_bw() #No x or y aesthetics. Just use sample =.resid
#residual plot
ggplot(modelTrans, aes(x = daysTrainSqrt, y = .resid)) + geom_point(color = "orchid4", dotsize = .5) + theme_bw() #Use the model as the data and use ".fitted" to get the fitted values for x axis
# Perform a correlation test for normality for this data
res <- resid(modelTrans)
expRes <- sqrt(deviance(modelTrans) / df.residual(modelTrans)) *
qnorm((rank(resid(modelTrans)) - 0.375) / (nrow(salesTrainTrans) + 0.25))
cor(res, expRes)
[1] 0.9789286
nrow(salesTrainTrans)
[1] 10
# Notice the correlation between residuals and expected residuals is 0.9789286.
# The critical value when alpha = 0.01, is 0.879. (read from the table in the notes)
# The correlation > critical value. Therefore, we conclude that the distribution of the error terms does not depart substantially from a normal distribution.
# Comment on lack of fit or of strong unequal error variances and normality
# By looking at the residual plot there is no evidence of lack of fit or of strongly unequal error variances.
# By looking at the NPP and above test, No strong indications of substantial departures from normality.
library(ggplot2)
library(dplyr)
plasmaData <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%203%20Data%20Sets/CH03TA08.txt", sep ="" , header = FALSE)
plasmaData
V1 V2 V3
1 0 13.44 1.1284
2 0 12.84 1.1086
3 0 11.91 1.0759
4 0 20.09 1.3030
5 0 15.60 1.1931
6 1 10.11 1.0048
7 1 11.38 1.0561
8 1 10.28 1.0120
9 1 8.96 0.9523
10 1 8.59 0.9340
11 2 9.83 0.9926
12 2 9.00 0.9542
13 2 8.65 0.9370
14 2 7.85 0.8949
15 2 8.88 0.9484
16 3 7.94 0.8998
17 3 6.01 0.7789
18 3 5.14 0.7110
19 3 6.90 0.8388
20 3 6.77 0.8306
21 4 4.86 0.6866
22 4 5.10 0.7076
23 4 5.67 0.7536
24 4 5.75 0.7597
25 4 6.23 0.7945
plasmaData <- plasmaData[,-3] # Removed the last column of the data for this example. I only took X and Y columns.
head(plasmaData)
V1 V2
1 0 13.44
2 0 12.84
3 0 11.91
4 0 20.09
5 0 15.60
6 1 10.11
#Rename columns
colnames(plasmaData) <- c("age", "plasmaLevel")
head(plasmaData)
age plasmaLevel
1 0 13.44
2 0 12.84
3 0 11.91
4 0 20.09
5 0 15.60
6 1 10.11
# Create a scatter plot of these data in R
ggplot(plasmaData, aes(x = age, y = plasmaLevel)) +
geom_point() +
labs(x = "Age", y = "Plasma Level", title = "Plasma Levels Example scatter plot") +
theme_bw()
# Do you see a linear relationship?
# Clearly the regression relation appears to be curvilinear, as well as the greater variability for younger children than for older ones. so the simple linear regression model does not seem to be appropriate.
# What transformation might be appropriate for these data?
# We shall consider a transformation on Y. We shall consider initially the log(Y) transformation
library(dplyr)
# Find the transformed Y values
plasmaDataTrans <- plasmaData %>%
mutate(plasmaLevelLog = log10(plasmaLevel)) # here I am creating a new column of data with logY values
plasmaDataTrans
age plasmaLevel plasmaLevelLog
1 0 13.44 1.1283993
2 0 12.84 1.1085650
3 0 11.91 1.0759118
4 0 20.09 1.3029799
5 0 15.60 1.1931246
6 1 10.11 1.0047512
7 1 11.38 1.0561423
8 1 10.28 1.0119931
9 1 8.96 0.9523080
10 1 8.59 0.9339932
11 2 9.83 0.9925535
12 2 9.00 0.9542425
13 2 8.65 0.9370161
14 2 7.85 0.8948697
15 2 8.88 0.9484130
16 3 7.94 0.8998205
17 3 6.01 0.7788745
18 3 5.14 0.7109631
19 3 6.90 0.8388491
20 3 6.77 0.8305887
21 4 4.86 0.6866363
22 4 5.10 0.7075702
23 4 5.67 0.7535831
24 4 5.75 0.7596678
25 4 6.23 0.7944880
# Create a new scatter plot of using transformed data in R
ggplot(plasmaDataTrans, aes(x = age, y = plasmaLevelLog)) +
geom_point(color = "blue") + geom_smooth(method = "lm", se = FALSE)
labs(x = "Age", y = "log(Y)", title = "Plasma Levels Example scatter plot with transformed Y") +
theme_bw()
NULL
# Fit the linear regression model using transformed data
modelPlasmaTrans <- lm(plasmaLevelLog ~ age, data = plasmaDataTrans)
modelPlasmaTrans
Call:
lm(formula = plasmaLevelLog ~ age, data = plasmaDataTrans)
Coefficients:
(Intercept) age
1.1348 -0.1023
# Make a NPP plot.
ggplot(modelPlasmaTrans, aes(sample = .resid)) + geom_qq(color = "red") + geom_qq_line()+ theme_bw() #No x or y aesthetics. Just use sample =.resid
#residual plot
ggplot(modelPlasmaTrans, aes(x = age, y = .resid)) + geom_point(color = "orchid4", dotsize = .5) + theme_bw() #Use the model as the data and use ".fitted" to get the fitted values for x axis
# Perform a correlation test for normality for this data
res <- resid(modelPlasmaTrans)
expRes <- sqrt(deviance(modelPlasmaTrans) / df.residual(modelPlasmaTrans)) *
qnorm((rank(resid(modelPlasmaTrans)) - 0.375) / (nrow(plasmaDataTrans) + 0.25))
cor(res, expRes)
[1] 0.9807112
nrow(plasmaDataTrans)
[1] 25
# Notice the correlation between residuals and expected residuals is .9807.
# The critical value when alpha = 0.01, is .939.
# Since the correlation > critical value. Therefore, we conclude that the distribution of the error terms does not depart substantially from a normal distribution.
# Comment on lack of fit or of strong unequal error variances and normality
# By looking at the residual plot there is no evidence of lack of fit or of strongly unequal error variances.
# By looking at the NPP and above test, No strong indications of substantial departures from normality are indicated.
library(ALSM)
obj <- boxcox.sse(plasmaData$age, plasmaData$plasmaLevel, l=seq(-2,2,.1))
obj
lambda SSE
1 -2.0 61.85640
2 -1.9 57.57287
3 -1.8 53.66783
4 -1.7 50.11904
5 -1.6 46.90689
6 -1.5 44.01425
7 -1.4 41.42638
8 -1.3 39.13089
9 -1.2 37.11767
10 -1.1 35.37886
11 -1.0 33.90887
12 -0.9 32.70442
13 -0.8 31.76453
14 -0.7 31.09066
15 -0.6 30.68680
16 -0.5 30.55961
17 -0.4 30.71859
18 -0.3 31.17631
19 -0.2 31.94867
20 -0.1 33.05520
41 0.0 34.51945
21 0.1 36.36939
22 0.2 38.63791
23 0.3 41.36342
24 0.4 44.59051
25 0.5 48.37072
26 0.6 52.76343
27 0.7 57.83686
28 0.8 63.66932
29 0.9 70.35050
30 1.0 77.98306
31 1.1 86.68443
32 1.2 96.58884
33 1.3 107.84974
34 1.4 120.64249
35 1.5 135.16751
36 1.6 151.65385
37 1.7 170.36340
38 1.8 191.59558
39 1.9 215.69290
40 2.0 243.04720
obj[which.min(obj$SSE),]
lambda SSE
16 -0.5 30.55961
toluca <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%201%20Data%20Sets/CH01TA01.txt", sep ="" , header = FALSE)
colnames(toluca) <- c("lotSize", "hours")
tolucaDF <- data.frame(toluca)
head(tolucaDF)
lotSize hours
1 80 399
2 30 121
3 50 221
4 90 376
5 70 361
6 60 224
library(cowplot)
p1 <- ggplot(data = tolucaDF, aes(x = lotSize, y=hours)) +
geom_point(color = "steelblue") +
geom_smooth(method = "loess", se = FALSE)+
theme_bw()
p2 <- ggplot(data = tolucaDF, aes(x = lotSize, y=hours)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, level = 0.99, color = "red")+
geom_smooth(method = "loess", se = FALSE)+
theme_bw()
plot_grid(p1, p2)