Read data from a URL

toluca <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%201%20Data%20Sets/CH01TA01.txt", sep ="" , header = FALSE)


#Look at the first 6 entries
head(toluca)
  V1  V2
1 80 399
2 30 121
3 50 221
4 90 376
5 70 361
6 60 224

Rename columns

colnames(toluca) <- c("lotSize", "hours")

#Look at the first 6 entries
head(toluca)
  lotSize hours
1      80   399
2      30   121
3      50   221
4      90   376
5      70   361
6      60   224

1 Convert your data into a data frame

tolucaDF <- data.frame(toluca)
head(tolucaDF)
  lotSize hours
1      80   399
2      30   121
3      50   221
4      90   376
5      70   361
6      60   224

2 Diagnostic Plots for the Predictor Variable: Toluca Company Example

2.1 Dot plot

The dot plot for the lot sizes in the Toluca Company example.

library(ggplot2)
ggplot(data = tolucaDF, aes(x = lotSize)) + geom_dotplot(color = "blue", fill = "orchid4", dotsize = .5) + theme_bw()

2.2 Sequence plot

The sequence plot for the lot sizes in the Toluca Company example.

plot(tolucaDF$lotSize, type = "b", lty = 2, xlab = "Run", ylab = "Lot Size")
title("Sequence Plot")

2.3 Stem-and-leaf plot

Stem-and-leaf plot of the lot sizes for the Toluca Company example. By displaying the last digits, the plot also shows that all lot sizes were multiples of 10.

stem(tolucaDF$lotSize, scale = 3)

  The decimal point is 1 digit(s) to the right of the |

   2 | 0
   3 | 000
   4 | 00
   5 | 000
   6 | 0
   7 | 000
   8 | 000
   9 | 0000
  10 | 00
  11 | 00
  12 | 0

2.4 Box plot

Box plot of the lot sizes for the Toluca Company example.

ggplot(data = tolucaDF, aes(x = lotSize)) + geom_boxplot(color = "steelblue", fill = "orchid4") + theme_bw()

3 Diagnostics for Residuals (Section 3.3 in the textbook)

3.1 Create the LS model

# Recall from ch1
toluca_LS_model <- lm(hours ~ lotSize, data = toluca)
summary(toluca_LS_model)

Call:
lm(formula = hours ~ lotSize, data = toluca)

Residuals:
    Min      1Q  Median      3Q     Max 
-83.876 -34.088  -5.982  38.826 103.528 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   62.366     26.177   2.382   0.0259 *  
lotSize        3.570      0.347  10.290 4.45e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 48.82 on 23 degrees of freedom
Multiple R-squared:  0.8215,    Adjusted R-squared:  0.8138 
F-statistic: 105.9 on 1 and 23 DF,  p-value: 4.449e-10

3.2 Plot of residuals against predictor variable

library(ggplot2)
ggplot(toluca_LS_model, aes(x = lotSize, y = .resid)) + geom_point(color = "blue") + theme_bw() # No need to compute the residuals separately; just use ".resid" from the fitted model

3.3 Plot of residuals against fitted values.

ggplot(toluca_LS_model, aes(x = .fitted, y = .resid)) + geom_point(color = "blue") + theme_bw() # Use the model as the data and ".fitted" to get the fitted values for the x axis

3.4 Box plot of residuals

ggplot(toluca_LS_model, aes(x = .resid)) + geom_boxplot(color = "steelblue", fill = "orchid4") + theme_bw()

3.5 Normal probability plot of residuals.

ggplot(toluca_LS_model, aes(sample = .resid)) + geom_qq(color = "blue") + geom_qq_line()+ theme_bw() #No x or y aesthetics. Just use sample =.resid

3.6 Nonnormality of Error Terms

\(\mbox{Exp. Value under Normality} = \sqrt{MSE}\left[z\!\left(\frac{k-0.375}{n+0.25}\right)\right]\)

Note that MSE is calculated as SSE / df. Below, deviance(toluca_LS_model) gives the SSE and df.residual(toluca_LS_model) gives the residual degrees of freedom.
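
As a quick check (my addition, not part of the textbook output), the square root of SSE/df reproduces the residual standard error of 48.82 reported in the model summary above:

sqrt(deviance(toluca_LS_model) / df.residual(toluca_LS_model)) # sqrt(MSE) = residual standard error = 48.82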

# Expected value under normality comes from equation (3.6)
cbind(
  "Residual"                   = round(resid(toluca_LS_model), 2), # residuals rounded to 2 decimal places
  "Rank"                       = rank(resid(toluca_LS_model)), # the rank of each residual is the k value
  "Exp. Value under Normality" = round(sqrt(deviance(toluca_LS_model) / df.residual(toluca_LS_model)) * 
                                       qnorm((rank(resid(toluca_LS_model)) - 0.375) / (nrow(tolucaDF) + 0.25)), 2)) # deviance() gives the SSE; dividing by the residual d.f. gives MSE. qnorm() gives the z value of (k - 0.375)/(n + 0.25), and round() keeps 2 decimal places.
   Residual Rank Exp. Value under Normality
1     51.02   22                      51.97
2    -48.47    5                     -44.10
3    -19.88   10                     -14.76
4     -7.68   11                      -9.76
5     48.72   21                      44.10
6    -52.58    4                     -51.97
7     55.21   23                      61.48
8      4.02   15                       9.76
9    -66.39    2                     -74.17
10   -83.88    1                     -95.90
11   -45.17    6                     -37.25
12   -60.28    3                     -61.48
13     5.32   16                      14.76
14   -20.77    8                     -25.33
15   -20.09    9                     -19.93
16     0.61   14                       4.85
17    42.53   20                      37.25
18    27.12   18                      25.33
19    -6.68   12                      -4.85
20   -34.09    7                     -31.05
21   103.53   25                      95.90
22    84.32   24                      74.17
23    38.83   19                      31.05
24    -5.98   13                       0.00
25    10.72   17                      19.93

3.7 Correlation Test for Normality

res <- resid(toluca_LS_model)
expRes <- sqrt(deviance(toluca_LS_model) / df.residual(toluca_LS_model)) * 
                                       qnorm((rank(resid(toluca_LS_model)) - 0.375) / (nrow(tolucaDF) + 0.25))
cor(res, expRes)
[1] 0.9915055
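
To interpret this correlation (a small addition, using the tabled critical value for n = 25 and alpha = 0.01 quoted later in these notes for the Plasma Levels example):

cor(res, expRes) > 0.939 # 0.9915 exceeds the critical value 0.939, supporting normality of the error terms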

3.8 Breusch-Pagan Test for Constancy of Error Variance

library(lmtest)
bptest(toluca_LS_model, studentize = FALSE) # studentize = FALSE gives the original (non-studentized) Breusch-Pagan statistic

    Breusch-Pagan test

data:  toluca_LS_model
BP = 0.82092, df = 1, p-value = 0.3649
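
As a sketch of what bptest(..., studentize = FALSE) computes (assuming the textbook's large-sample chi-square form of the statistic: half the regression sum of squares from regressing \(e_i^2\) on \(X_i\), divided by \((SSE/n)^2\)), the value can be reproduced by hand:

# Hand computation of the non-studentized BP statistic (a sketch)
e2  <- resid(toluca_LS_model)^2                 # squared residuals
aux <- lm(e2 ~ lotSize, data = toluca)          # auxiliary regression of e^2 on X
SSR_star <- sum((fitted(aux) - mean(e2))^2)     # regression sum of squares of the auxiliary fit
SSE <- deviance(toluca_LS_model)                # SSE of the original regression
n   <- nrow(toluca)
(SSR_star / 2) / (SSE / n)^2                    # should match BP = 0.82092 above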

3.9 Bank Example (Section 3.7 in the textbook)

bankdata <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%203%20Data%20Sets/CH03TA04.txt", sep ="" , header = FALSE)


#Look at the first 6 entries
#head(bankdata)

#Rename columns
colnames(bankdata) <- c("minDeposit", "NoNewAccs") # minDeposit - Size of minimum deposit, NoNewAccs - Number of new accounts

#Look at the first 6 entries
(bankdata)
   minDeposit NoNewAccs
1         125       160
2         100       112
3         200       124
4          75        28
5         150       152
6         175       156
7          75        42
8         175       124
9         125       150
10        200       104
11        100       136
#Model
bankModel <- lm(NoNewAccs~minDeposit, data = bankdata)
bankModel

Call:
lm(formula = NoNewAccs ~ minDeposit, data = bankdata)

Coefficients:
(Intercept)   minDeposit  
    50.7225       0.4867  
#ANOVA table
anova(bankModel)
Analysis of Variance Table

Response: NoNewAccs
           Df  Sum Sq Mean Sq F value Pr(>F)
minDeposit  1  5141.3  5141.3  3.1389 0.1102
Residuals   9 14741.6  1638.0               
#Scatter Plot
library(ggplot2)
library(latex2exp)

ggplot(bankdata, aes(x = minDeposit, y = NoNewAccs)) +
  geom_point() +
  labs(x = "Size of minimum deposit", y = "Number of new accounts", title = "Bank example scatter plot") +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = 120, y = 85, label = "yhat = 50.7 + 0.49x", colour = "steelblue") + # annotate() draws the fitted-equation label once instead of repeating it for every row
  theme_bw()

library(dplyr)

# List all observations grouped by deposit size (in dplyr >= 1.1, reframe() is the preferred way to return multiple rows per group)
dfbank <- bankdata %>%
  group_by(minDeposit) %>%
  summarize(NoNewAccs)
dfbank
# A tibble: 11 × 2
# Groups:   minDeposit [6]
   minDeposit NoNewAccs
        <int>     <int>
 1         75        28
 2         75        42
 3        100       112
 4        100       136
 5        125       160
 6        125       150
 7        150       152
 8        175       156
 9        175       124
10        200       124
11        200       104
dfMeanbank <- bankdata %>%
  group_by(minDeposit) %>%
  summarize(MeanAccs = mean(NoNewAccs))

dfMeanbank
# A tibble: 6 × 2
  minDeposit MeanAccs
       <int>    <dbl>
1         75       35
2        100      124
3        125      155
4        150      152
5        175      140
6        200      114

F test for lack of fit

# F test for lack of fit

Reduced <- lm(NoNewAccs ~ minDeposit, data = bankdata) # This is our usual model
summary(Reduced)
## 
## Call:
## lm(formula = NoNewAccs ~ minDeposit, data = bankdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -59.23 -34.06  12.61  32.44  48.44 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  50.7225    39.3979   1.287     0.23
## minDeposit    0.4867     0.2747   1.772     0.11
## 
## Residual standard error: 40.47 on 9 degrees of freedom
## Multiple R-squared:  0.2586, Adjusted R-squared:  0.1762 
## F-statistic: 3.139 on 1 and 9 DF,  p-value: 0.1102
Full <-  lm(NoNewAccs ~ 0 + as.factor(minDeposit), data = bankdata) # This is the full model
summary(Full)
## 
## Call:
## lm(formula = NoNewAccs ~ 0 + as.factor(minDeposit), data = bankdata)
## 
## Residuals:
##   1   2   3   4   5   6   7   8   9  10  11 
##   5 -12  10  -7   0  16   7 -16  -5 -10  12 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## as.factor(minDeposit)75     35.00      10.71   3.267 0.022282 *  
## as.factor(minDeposit)100   124.00      10.71  11.573 8.45e-05 ***
## as.factor(minDeposit)125   155.00      10.71  14.466 2.85e-05 ***
## as.factor(minDeposit)150   152.00      15.15  10.031 0.000168 ***
## as.factor(minDeposit)175   140.00      10.71  13.066 4.68e-05 ***
## as.factor(minDeposit)200   114.00      10.71  10.640 0.000127 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.15 on 5 degrees of freedom
## Multiple R-squared:  0.9933, Adjusted R-squared:  0.9852 
## F-statistic: 123.1 on 6 and 5 DF,  p-value: 2.893e-05

Testing the hypotheses:

\(H_0: \mbox{the reduced (simple linear regression) model is adequate}\) \(\quad\) \(H_a: \mbox{the reduced model is not adequate (the full model is needed)}\)
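
The anova() call below carries out this comparison with the general linear test statistic

\(F^* = \dfrac{SSE(R) - SSE(F)}{df_R - df_F} \Big/ \dfrac{SSE(F)}{df_F},\)

and we reject \(H_0\) for large values of \(F^*\).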

anova(Reduced, Full)
## Analysis of Variance Table
## 
## Model 1: NoNewAccs ~ minDeposit
## Model 2: NoNewAccs ~ 0 + as.factor(minDeposit)
##   Res.Df   RSS Df Sum of Sq      F   Pr(>F)   
## 1      9 14742                                
## 2      5  1148  4     13594 14.801 0.005594 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
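
As a check of the arithmetic, the same \(F^*\) can be computed directly from the two fitted models above:

# Lack-of-fit F statistic by hand, using the models fitted above
sse_R <- deviance(Reduced)   # SSE of the reduced model (about 14742)
sse_F <- deviance(Full)      # pure-error SSE of the full model (about 1148)
((sse_R - sse_F) / (df.residual(Reduced) - df.residual(Full))) / (sse_F / df.residual(Full))
# = ((14742 - 1148) / 4) / (1148 / 5) = 14.8 (approximately), matching the anova() output above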

Example on page 129: Sales training example

salesTrain <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%203%20Data%20Sets/CH03TA07.txt", sep ="" , header = FALSE)

#head(salesTrain)

#Rename columns
colnames(salesTrain) <- c("daysTrain", "performance") 
head(salesTrain)
  daysTrain performance
1       0.5        42.5
2       0.5        50.6
3       1.0        68.5
4       1.0        80.7
5       1.5        89.0
6       1.5        99.6
# Create a scatter plot of these data in R
ggplot(salesTrain, aes(x = daysTrain, y = performance)) +
  geom_point() +
  labs(x = "Days", y = "Performance", title = "Sales Training Example scatter plot") +
  theme_bw()

# Do you see a linear relationship?
    # No. The regression relation appears to be curvilinear, so the simple linear regression model does not seem appropriate.

# What transformation might be appropriate for these data?
    # A transformation on X. We try the square root transformation first (a log10 transformation of X would be another option; see the comparison at the end of this example).

library(dplyr)
# Find the transformed X values
    salesTrainTrans <- salesTrain %>%
      mutate(daysTrainSqrt  = sqrt(daysTrain))
    
    salesTrainTrans
   daysTrain performance daysTrainSqrt
1        0.5        42.5     0.7071068
2        0.5        50.6     0.7071068
3        1.0        68.5     1.0000000
4        1.0        80.7     1.0000000
5        1.5        89.0     1.2247449
6        1.5        99.6     1.2247449
7        2.0       105.3     1.4142136
8        2.0       111.8     1.4142136
9        2.5       112.3     1.5811388
10       2.5       125.7     1.5811388
# Create a new scatter plot using the transformed data in R
  ggplot(salesTrainTrans, aes(x = daysTrainSqrt, y = performance)) +
  geom_point(color = "blue") + geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Square root of Days", y = "Performance", title = "Sales Training Example scatter plot") +
  theme_bw()
# Fit the linear regression model using transformed data
  modelTrans <- lm(performance ~ daysTrainSqrt, data = salesTrainTrans)
  modelTrans

Call:
lm(formula = performance ~ daysTrainSqrt, data = salesTrainTrans)

Coefficients:
  (Intercept)  daysTrainSqrt  
       -10.33          83.45  
# Make an NPP plot and a residual plot.
  ggplot(modelTrans, aes(sample = .resid)) + geom_qq(color = "red") + geom_qq_line() +
    labs(x = "Expected", y = "Residuals", title = "NPP plot") +
  theme_bw() # No x or y aesthetics; just use sample = .resid

  # Residual plot
  ggplot(modelTrans, aes(x = daysTrainSqrt, y = .resid)) + geom_point(color = "orchid4") + theme_bw() # residuals plotted against the transformed predictor

# Perform a correlation test for normality for this data
  
  res <- resid(modelTrans)
  expRes <- sqrt(deviance(modelTrans) / df.residual(modelTrans)) * 
                                       qnorm((rank(resid(modelTrans)) - 0.375) / (nrow(salesTrainTrans) + 0.25))
  cor(res, expRes)
[1] 0.9789286
  nrow(salesTrainTrans)
[1] 10
  # The correlation between the residuals and the expected residuals under normality is 0.9789.
  # The critical value for alpha = 0.01 and n = 10 is 0.879 (read from the table in the notes).
  # Since the correlation exceeds the critical value, we conclude that the distribution of the error terms does not depart substantially from a normal distribution.
  
# Comment on lack of fit, strongly unequal error variances, and normality
  
  # The residual plot shows no evidence of lack of fit or of strongly unequal error variances.
  # The NPP and the test above show no strong indication of a substantial departure from normality.
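
Since a log10 transformation of X was mentioned as an alternative above, here is a quick comparison of the two fits; modelLog is my own object name, not from the textbook:

  # Compare the sqrt(X) fit with the log10(X) alternative (modelLog is a hypothetical name)
  modelLog <- lm(performance ~ log10(daysTrain), data = salesTrain)
  summary(modelTrans)$r.squared   # R-squared of the sqrt(X) fit
  summary(modelLog)$r.squared     # R-squared of the log10(X) fit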

Example on page 132: Plasma Levels Example

library(ggplot2)
library(dplyr)
plasmaData <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%203%20Data%20Sets/CH03TA08.txt", sep ="" , header = FALSE)

plasmaData
   V1    V2     V3
1   0 13.44 1.1284
2   0 12.84 1.1086
3   0 11.91 1.0759
4   0 20.09 1.3030
5   0 15.60 1.1931
6   1 10.11 1.0048
7   1 11.38 1.0561
8   1 10.28 1.0120
9   1  8.96 0.9523
10  1  8.59 0.9340
11  2  9.83 0.9926
12  2  9.00 0.9542
13  2  8.65 0.9370
14  2  7.85 0.8949
15  2  8.88 0.9484
16  3  7.94 0.8998
17  3  6.01 0.7789
18  3  5.14 0.7110
19  3  6.90 0.8388
20  3  6.77 0.8306
21  4  4.86 0.6866
22  4  5.10 0.7076
23  4  5.67 0.7536
24  4  5.75 0.7597
25  4  6.23 0.7945
plasmaData <- plasmaData[,-3] # Drop the third column; only the X (age) and Y (plasma level) columns are needed for this example.
head(plasmaData)
  V1    V2
1  0 13.44
2  0 12.84
3  0 11.91
4  0 20.09
5  0 15.60
6  1 10.11
#Rename columns
colnames(plasmaData) <- c("age", "plasmaLevel") 
head(plasmaData)
  age plasmaLevel
1   0       13.44
2   0       12.84
3   0       11.91
4   0       20.09
5   0       15.60
6   1       10.11
# Create a scatter plot of these data in R
ggplot(plasmaData, aes(x = age, y = plasmaLevel)) +
  geom_point() +
  labs(x = "Age", y = "Plasma Level", title = "Plasma Levels Example scatter plot") +
  theme_bw()

# Do you see a linear relationship?
    # No. The regression relation appears to be curvilinear, and there is greater variability for younger children than for older ones, so the simple linear regression model does not seem appropriate.

# What transformation might be appropriate for these data?
    # A transformation on Y. We try the log10(Y) transformation first.

library(dplyr)
# Find the transformed Y values
    plasmaDataTrans <- plasmaData %>%
      mutate(plasmaLevelLog  = log10(plasmaLevel)) # here I am creating a new column of data with logY values
    
    plasmaDataTrans
   age plasmaLevel plasmaLevelLog
1    0       13.44      1.1283993
2    0       12.84      1.1085650
3    0       11.91      1.0759118
4    0       20.09      1.3029799
5    0       15.60      1.1931246
6    1       10.11      1.0047512
7    1       11.38      1.0561423
8    1       10.28      1.0119931
9    1        8.96      0.9523080
10   1        8.59      0.9339932
11   2        9.83      0.9925535
12   2        9.00      0.9542425
13   2        8.65      0.9370161
14   2        7.85      0.8948697
15   2        8.88      0.9484130
16   3        7.94      0.8998205
17   3        6.01      0.7788745
18   3        5.14      0.7109631
19   3        6.90      0.8388491
20   3        6.77      0.8305887
21   4        4.86      0.6866363
22   4        5.10      0.7075702
23   4        5.67      0.7535831
24   4        5.75      0.7596678
25   4        6.23      0.7944880
# Create a new scatter plot using the transformed data in R
  ggplot(plasmaDataTrans, aes(x = age, y = plasmaLevelLog)) +
  geom_point(color = "blue") + geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Age", y = "log10(Y)", title = "Plasma Levels Example scatter plot with transformed Y") +
  theme_bw()
# Fit the linear regression model using transformed data
  modelPlasmaTrans <- lm(plasmaLevelLog ~ age, data = plasmaDataTrans)
  modelPlasmaTrans

Call:
lm(formula = plasmaLevelLog ~ age, data = plasmaDataTrans)

Coefficients:
(Intercept)          age  
     1.1348      -0.1023  
# Make an NPP plot.
  ggplot(modelPlasmaTrans, aes(sample = .resid)) + geom_qq(color = "red") + geom_qq_line() + theme_bw() # No x or y aesthetics; just use sample = .resid

  # Residual plot
  ggplot(modelPlasmaTrans, aes(x = age, y = .resid)) + geom_point(color = "orchid4") + theme_bw() # residuals plotted against the predictor (age)

# Perform a correlation test for normality for this data
  
  res <- resid(modelPlasmaTrans)
  expRes <- sqrt(deviance(modelPlasmaTrans) / df.residual(modelPlasmaTrans)) * 
                                       qnorm((rank(resid(modelPlasmaTrans)) - 0.375) / (nrow(plasmaDataTrans) + 0.25))
  cor(res, expRes)
[1] 0.9807112
  nrow(plasmaDataTrans)
[1] 25
  # The correlation between the residuals and the expected residuals under normality is 0.9807.
  # The critical value for alpha = 0.01 and n = 25 is 0.939.
  # Since the correlation exceeds the critical value, we conclude that the distribution of the error terms does not depart substantially from a normal distribution.
  
# Comment on lack of fit, strongly unequal error variances, and normality
  
  # The residual plot shows no evidence of lack of fit or of strongly unequal error variances.
  # The NPP and the test above show no strong indication of a substantial departure from normality.

Box-Cox transformations for the Plasma Levels Example

library(ALSM)
obj <- boxcox.sse(plasmaData$age, plasmaData$plasmaLevel, l=seq(-2,2,.1))

obj
   lambda       SSE
1    -2.0  61.85640
2    -1.9  57.57287
3    -1.8  53.66783
4    -1.7  50.11904
5    -1.6  46.90689
6    -1.5  44.01425
7    -1.4  41.42638
8    -1.3  39.13089
9    -1.2  37.11767
10   -1.1  35.37886
11   -1.0  33.90887
12   -0.9  32.70442
13   -0.8  31.76453
14   -0.7  31.09066
15   -0.6  30.68680
16   -0.5  30.55961
17   -0.4  30.71859
18   -0.3  31.17631
19   -0.2  31.94867
20   -0.1  33.05520
41    0.0  34.51945
21    0.1  36.36939
22    0.2  38.63791
23    0.3  41.36342
24    0.4  44.59051
25    0.5  48.37072
26    0.6  52.76343
27    0.7  57.83686
28    0.8  63.66932
29    0.9  70.35050
30    1.0  77.98306
31    1.1  86.68443
32    1.2  96.58884
33    1.3 107.84974
34    1.4 120.64249
35    1.5 135.16751
36    1.6 151.65385
37    1.7 170.36340
38    1.8 191.59558
39    1.9 215.69290
40    2.0 243.04720
obj[which.min(obj$SSE),]
   lambda      SSE
16   -0.5 30.55961
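
As a cross-check (not part of the ALSM workflow above), MASS::boxcox() profiles the log-likelihood over the same grid of lambda values; its maximum should fall in the same neighborhood as the SSE minimum found above (lambda near -0.5):

library(MASS)
bc <- boxcox(lm(plasmaLevel ~ age, data = plasmaData), lambda = seq(-2, 2, 0.1))
bc$x[which.max(bc$y)] # lambda with the highest profile log-likelihood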

3.9.1 lowess Method for Toluca example

toluca <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%201%20Data%20Sets/CH01TA01.txt", sep ="" , header = FALSE)


colnames(toluca) <- c("lotSize", "hours")

tolucaDF <- data.frame(toluca)
head(tolucaDF)
  lotSize hours
1      80   399
2      30   121
3      50   221
4      90   376
5      70   361
6      60   224
library(cowplot)
p1 <- ggplot(data = tolucaDF, aes(x = lotSize, y=hours)) + 
  geom_point(color = "steelblue") + 
  geom_smooth(method = "loess", se = FALSE)+
  theme_bw()

p2 <- ggplot(data = tolucaDF, aes(x = lotSize, y=hours)) + 
  geom_point(color = "steelblue") + 
  geom_smooth(method = "lm", se = TRUE, level = 0.99, color = "red")+
    geom_smooth(method = "loess",  se = FALSE)+
  theme_bw()

plot_grid(p1, p2)
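
The heading says lowess, while geom_smooth(method = "loess") uses the closely related loess() fitter; for reference, a base-graphics version with lowess() itself looks like this (a small addition, not from the original notes):

# Base R equivalent using lowess()
plot(hours ~ lotSize, data = tolucaDF, pch = 19, col = "steelblue",
     xlab = "Lot Size", ylab = "Hours", main = "Lowess smooth")
lines(lowess(tolucaDF$lotSize, tolucaDF$hours), col = "red", lwd = 2)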