Multiple Linear Regression
## Load data
prestige <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")
## Set occupation type as a factor variable
prestige$type <- as.factor(prestige$occ_type)
## Eliminate the job titles
prestige2 <- prestige %>%
dplyr::select(-title, -blue, -professional, -white)
## Create scatterplot matrix with histograms on diagonals
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(prestige2,
cex = 1, pch = 1, bg = "steelblue",
diag.panel = panel.hist, cex.labels = 1, font.labels = 2)
## Simple linear regression
## Prestige = f(income)
income.model <- lm(prestige ~ income, data=prestige)
summary(income.model)
##
## Call:
## lm(formula = prestige ~ income, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.007 -8.378 -2.378 8.432 32.084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.714e+01 2.268e+00 11.97 <2e-16 ***
## income 2.897e-03 2.833e-04 10.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.09 on 100 degrees of freedom
## Multiple R-squared: 0.5111, Adjusted R-squared: 0.5062
## F-statistic: 104.5 on 1 and 100 DF, p-value: < 2.2e-16
prestige %>%
ggvis(x=~income, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
layer_points() %>%
layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red") %>%
add_axis("x", title = "Income") %>%
scale_numeric("x", domain=c(0, 28000), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Prestige") %>%
scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)
## Prestige = f(education)
education.model <- lm(prestige ~ education, data=prestige)
summary(education.model)
##
## Call:
## lm(formula = prestige ~ education, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.0397 -6.5228 0.6611 6.7430 18.1636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.732 3.677 -2.919 0.00434 **
## education 5.361 0.332 16.148 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.103 on 100 degrees of freedom
## Multiple R-squared: 0.7228, Adjusted R-squared: 0.72
## F-statistic: 260.8 on 1 and 100 DF, p-value: < 2.2e-16
anova(income.model)
## Analysis of Variance Table
##
## Response: prestige
## Df Sum Sq Mean Sq F value Pr(>F)
## income 1 15279 15279.3 104.54 < 2.2e-16 ***
## Residuals 100 14616 146.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
prestige %>%
ggvis(x=~education, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
layer_points() %>%
layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red") %>%
add_axis("x", title = "Education") %>%
scale_numeric("x", domain=c(0, 20), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Prestige") %>%
scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)
## Prestige = f(% women)
percwomn.model <- lm(prestige ~ percwomn, data=prestige)
summary(percwomn.model)
##
## Call:
## lm(formula = prestige ~ percwomn, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.444 -12.391 -4.126 13.034 39.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.69300 2.30760 21.101 <2e-16 ***
## percwomn -0.06417 0.05385 -1.192 0.236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.17 on 100 degrees of freedom
## Multiple R-squared: 0.014, Adjusted R-squared: 0.004143
## F-statistic: 1.42 on 1 and 100 DF, p-value: 0.2362
anova(percwomn.model)
## Analysis of Variance Table
##
## Response: prestige
## Df Sum Sq Mean Sq F value Pr(>F)
## percwomn 1 418.6 418.63 1.4202 0.2362
## Residuals 100 29476.8 294.77
prestige %>%
ggvis(x=~percwomn, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
layer_points() %>%
layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red") %>%
add_axis("x", title = "Education") %>%
scale_numeric("x", domain=c(0, 100), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Prestige") %>%
scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)
## Prestige = f(income, education)
inc.ed.model <- lm(prestige ~ income + education, data=prestige)
summary(inc.ed.model)
##
## Call:
## lm(formula = prestige ~ income + education, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.4040 -5.3308 0.0154 4.9803 17.6889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.8477787 3.2189771 -2.127 0.0359 *
## income 0.0013612 0.0002242 6.071 2.36e-08 ***
## education 4.1374444 0.3489120 11.858 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.81 on 99 degrees of freedom
## Multiple R-squared: 0.798, Adjusted R-squared: 0.7939
## F-statistic: 195.6 on 2 and 99 DF, p-value: < 2.2e-16
coef(inc.ed.model)
## (Intercept) income education
## -6.847778720 0.001361166 4.137444384
anova(inc.ed.model)
## Analysis of Variance Table
##
## Response: prestige
## Df Sum Sq Mean Sq F value Pr(>F)
## income 1 15279.3 15279.3 250.49 < 2.2e-16 ***
## education 1 8577.3 8577.3 140.62 < 2.2e-16 ***
## Residuals 99 6038.9 61.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(inc.ed.model)
## Prestige = f(income, education, %women)
inc.ed.wmn.model <- lm(prestige ~ income + education + percwomn, data=prestige)
summary(inc.ed.wmn.model)
##
## Call:
## lm(formula = prestige ~ income + education + percwomn, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.8246 -5.3332 -0.1364 5.1587 17.5045
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.7943342 3.2390886 -2.098 0.0385 *
## income 0.0013136 0.0002778 4.729 7.58e-06 ***
## education 4.1866373 0.3887013 10.771 < 2e-16 ***
## percwomn -0.0089052 0.0304071 -0.293 0.7702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.846 on 98 degrees of freedom
## Multiple R-squared: 0.7982, Adjusted R-squared: 0.792
## F-statistic: 129.2 on 3 and 98 DF, p-value: < 2.2e-16
coef(inc.ed.wmn.model)
## (Intercept) income education percwomn
## -6.794334203 0.001313560 4.186637275 -0.008905157
anova(inc.ed.wmn.model)
## Analysis of Variance Table
##
## Response: prestige
## Df Sum Sq Mean Sq F value Pr(>F)
## income 1 15279.3 15279.3 248.1727 <2e-16 ***
## education 1 8577.3 8577.3 139.3167 <2e-16 ***
## percwomn 1 5.3 5.3 0.0858 0.7702
## Residuals 98 6033.6 61.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(inc.ed.wmn.model)
# Compare model with no predictors to model with income & education
no.model <- lm(prestige~1, data=prestige)
anova(income.model, no.model)
## Analysis of Variance Table
##
## Model 1: prestige ~ income
## Model 2: prestige ~ 1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 100 14616
## 2 101 29895 -1 -15279 104.54 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# adjusted R-squared manual calculation
0.7982 - ((1-0.7982)*(3/(102-3-1)))
## [1] 0.7920224
## Standardized beta coefficients
coef(lm(scale(prestige) ~ scale(income) + scale(education) + scale(percwomn), data=prestige))
## (Intercept) scale(income) scale(education) scale(percwomn)
## -2.462036e-17 3.241757e-01 6.639551e-01 -1.642104e-02
# Manual calculation for beta (for education predictor)
4.1866*(2.7284/17.204)
## [1] 0.6639572
## Assumptions check
par(mfrow=c(2,2)) # visualize four graphs at once
plot(inc.ed.wmn.model)
par(mfrow=c(1,1)) # reset the graphics defaults
## Compare model 1 vs model 2
anova(income.model, inc.ed.model)
## Analysis of Variance Table
##
## Model 1: prestige ~ income
## Model 2: prestige ~ income + education
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 100 14616.2
## 2 99 6038.9 1 8577.3 140.62 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Confidence interval for a prediction
predict(inc.ed.wmn.model, list(income=5000,education=10,percwomn=40), interval="conf")
## fit lwr upr
## 1 41.28363 39.57498 42.99229
## Prediction interval
predict(inc.ed.wmn.model, list(income=5000,education=10,percwomn=40), interval="pred")
## fit lwr upr
## 1 41.28363 25.61911 56.94816
###########################
###########################
## Height/Weight Example ##
###########################
###########################
## Load data
htwt <- read.csv("http://www.bradthiessen.com/html5/data/htwt.csv")
## Rename female variable to "gender"
htwt <- htwt %>%
rename(gender = female)
## Create scatterplot matrix with histograms on diagonals
pairs(htwt,
cex = 1, pch = 1, bg = "steelblue",
diag.panel = panel.hist, cex.labels = 1, font.labels = 2)
## Summary statistics
table(htwt$gender)
##
## female male
## 509 491
htwt %>%
summarize(mn_height=mean(height), sd_height=sd(height),
mn_weight=mean(weight), sd_weight=sd(weight),
mn_mal=mean(mal), sd_mal=sd(mal))
## mn_height sd_height mn_weight sd_weight mn_mal sd_mal
## 1 166.163 8.025138 57.17209 9.656277 2.591 2.842851
## Reduced model (no predictors)
null.model <- lm(weight ~ 1, data=htwt)
summary(null.model)
##
## Call:
## lm(formula = weight ~ 1, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.252 -6.372 -1.382 4.968 54.188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.1721 0.3054 187.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.656 on 999 degrees of freedom
## Full model
modelx1x2 <- lm(weight ~ height + gender, data=htwt)
coef(modelx1x2)
## (Intercept) height gendermale
## -53.7879605 0.6717494 -1.3438638
summary(modelx1x2)
##
## Call:
## lm(formula = weight ~ height + gender, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.936 -5.398 -1.351 3.172 47.475
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -53.78796 6.31386 -8.519 <2e-16 ***
## height 0.67175 0.03895 17.245 <2e-16 ***
## gendermale -1.34386 0.62501 -2.150 0.0318 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.238 on 997 degrees of freedom
## Multiple R-squared: 0.2736, Adjusted R-squared: 0.2721
## F-statistic: 187.8 on 2 and 997 DF, p-value: < 2.2e-16
anova(modelx1x2)
## Analysis of Variance Table
##
## Response: weight
## Df Sum Sq Mean Sq F value Pr(>F)
## height 1 25173 25172.9 370.9122 < 2e-16 ***
## gender 1 314 313.8 4.6231 0.03178 *
## Residuals 997 67664 67.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(null.model, modelx1x2)
## Analysis of Variance Table
##
## Model 1: weight ~ 1
## Model 2: weight ~ height + gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 999 93150
## 2 997 67664 2 25487 187.77 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Assumptions check
par(mfrow=c(2,2)) # visualize four graphs at once
plot(modelx1x2)
par(mfrow=c(1,1)) # reset the graphics defaults
## Other models (to get R-squared values)
modelx1 <- lm(weight ~ height, data=htwt)
summary(modelx1)
##
## Call:
## lm(formula = weight ~ height, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.405 -5.462 -1.446 3.306 46.691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -46.76375 5.41280 -8.639 <2e-16 ***
## height 0.62551 0.03254 19.224 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.253 on 998 degrees of freedom
## Multiple R-squared: 0.2702, Adjusted R-squared: 0.2695
## F-statistic: 369.6 on 1 and 998 DF, p-value: < 2.2e-16
modelx2 <- lm(weight ~ gender, data=htwt)
summary(modelx2)
##
## Call:
## lm(formula = weight ~ gender, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.597 -6.217 -0.930 4.510 51.843
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.9101 0.4159 132.043 < 2e-16 ***
## gendermale 4.6070 0.5935 7.763 2.05e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.382 on 998 degrees of freedom
## Multiple R-squared: 0.05694, Adjusted R-squared: 0.056
## F-statistic: 60.26 on 1 and 998 DF, p-value: 2.05e-14
modelx3 <- lm(weight ~ mal, data=htwt)
summary(modelx3)
##
## Call:
## lm(formula = weight ~ mal, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.360 -6.614 -1.530 4.976 55.355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.6439 0.4129 139.620 <2e-16 ***
## mal -0.1821 0.1074 -1.696 0.0902 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.647 on 998 degrees of freedom
## Multiple R-squared: 0.002874, Adjusted R-squared: 0.001875
## F-statistic: 2.877 on 1 and 998 DF, p-value: 0.09018
modelx1x3 <- lm(weight ~ height +mal, data=htwt)
summary(modelx1x3)
##
## Call:
## lm(formula = weight ~ height + mal, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.561 -5.397 -1.382 3.381 46.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -48.03581 5.52756 -8.690 <2e-16 ***
## height 0.63152 0.03296 19.158 <2e-16 ***
## mal 0.10530 0.09305 1.132 0.258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.252 on 997 degrees of freedom
## Multiple R-squared: 0.2712, Adjusted R-squared: 0.2697
## F-statistic: 185.5 on 2 and 997 DF, p-value: < 2.2e-16
modelx2x3 <- lm(weight ~ gender +mal, data=htwt)
summary(modelx2x3)
##
## Call:
## lm(formula = weight ~ gender + mal, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.597 -6.216 -0.933 4.503 51.860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.917679 0.539810 101.735 < 2e-16 ***
## gendermale 4.603995 0.608943 7.561 9.08e-14 ***
## mal -0.002374 0.107137 -0.022 0.982
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.387 on 997 degrees of freedom
## Multiple R-squared: 0.05694, Adjusted R-squared: 0.05505
## F-statistic: 30.1 on 2 and 997 DF, p-value: 2.027e-13
modelx1x2x3 <- lm(weight ~ height + gender + mal, data=htwt)
summary(modelx1x2x3)
##
## Call:
## lm(formula = weight ~ height + gender + mal, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.954 -5.452 -1.309 3.222 47.481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -54.26812 6.34372 -8.555 <2e-16 ***
## height 0.67323 0.03901 17.260 <2e-16 ***
## gendermale -1.26238 0.63344 -1.993 0.0465 *
## mal 0.07500 0.09415 0.797 0.4259
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.24 on 996 degrees of freedom
## Multiple R-squared: 0.2741, Adjusted R-squared: 0.2719
## F-statistic: 125.3 on 3 and 996 DF, p-value: < 2.2e-16
## Compare models to determine if gender matters
anova(modelx1, modelx1x2)
## Analysis of Variance Table
##
## Model 1: weight ~ height
## Model 2: weight ~ height + gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 998 67978
## 2 997 67664 1 313.76 4.6231 0.03178 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Split data and run separate regressions
males <- htwt %>%
filter(gender=="male")
females <- htwt %>%
filter(gender=="female")
male.model <- lm(weight ~ height, data=males)
coef(male.model)
## (Intercept) height
## -72.0137587 0.7706638
female.model <- lm(weight ~ height, data=females)
coef(female.model)
## (Intercept) height
## -33.7505460 0.5479189
## Full model including interaction
full.model <- lm(weight ~ height*gender, data=htwt)
coef(full.model)
## (Intercept) height gendermale height:gendermale
## -33.7505460 0.5479189 -38.2632127 0.2227448
anova(full.model)
## Analysis of Variance Table
##
## Response: weight
## Df Sum Sq Mean Sq F value Pr(>F)
## height 1 25173 25172.9 373.5646 < 2.2e-16 ***
## gender 1 314 313.8 4.6562 0.031181 *
## height:gender 1 548 547.8 8.1297 0.004445 **
## Residuals 996 67116 67.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(full.model)
##
## Call:
## lm(formula = weight ~ height * gender, data = htwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.622 -5.428 -1.428 3.263 48.135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.75055 9.43230 -3.578 0.000363 ***
## height 0.54792 0.05825 9.407 < 2e-16 ***
## gendermale -38.26321 12.96338 -2.952 0.003235 **
## height:gendermale 0.22274 0.07812 2.851 0.004445 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.209 on 996 degrees of freedom
## Multiple R-squared: 0.2795, Adjusted R-squared: 0.2773
## F-statistic: 128.8 on 3 and 996 DF, p-value: < 2.2e-16
## Compare full model (with interaction) to model without interaction
anova(modelx1x2, full.model)
## Analysis of Variance Table
##
## Model 1: weight ~ height + gender
## Model 2: weight ~ height * gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 997 67664
## 2 996 67116 1 547.83 8.1297 0.004445 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Manual calculation for omnibus F-test
(.2795-.2736)/((1-.2795)/(1000-3-1))
## [1] 8.156003
## Scatterplot with two regression lines
htwt %>%
ggvis(x=~height, y=~weight, fill := "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.3) %>%
layer_points(fill=~gender) %>%
group_by(gender) %>%
layer_model_predictions(model = "lm", formula=weight~height,
strokeWidth:=5, stroke=~gender, strokeOpacity:=1) %>%
add_axis("x", title = "Height (cm)") %>%
scale_numeric("x", domain=c(140, 190), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Weight (kg)") %>%
scale_numeric("y", domain=c(30, 115), nice=FALSE, clamp=TRUE)
###########################
###########################
####### GPA EXAMPLE #######
###########################
###########################
## Load data
gpadata <- read.csv("http://www.bradthiessen.com/html5/data/f13s14.csv")
## Select GPA, ACT, gender, and hours studied data
gpa <-
gpadata %>%
select(S1_P49FallTermGPA, S1_P49HSGPA, S1_P49HSPerRank,
S1_P49StudentAthlete, S1_P49ACTComposite, S1_NA150, S1_P49Gender)
# Rename variables
gpa <-
gpa %>%
rename(sauGPA = S1_P49FallTermGPA, hsGPA = S1_P49HSGPA,
hsRANK = S1_P49HSPerRank, athlete = S1_P49StudentAthlete,
ACTscore = S1_P49ACTComposite, hoursSTUDY = S1_NA150,
gender = S1_P49Gender)
# Create factor variables for athlete and gender
gpa$gender <- factor(gpa$gender, labels = c("male", "female"))
gpa$athlete <- factor(gpa$athlete, labels = c("not athlete", "athlete"))
# Complete cases only (no missing data)
gpa.data <- gpa[complete.cases(gpa),]
# Save this data (final format for website)
write.table(gpa.data, file="~/Desktop/gpadata.csv", sep=",")
## Load data from website
gpa <- read.csv("http://www.bradthiessen.com/html5/data/gpadata.csv")
# Look at data
head(gpa)
## sauGPA hsGPA hsRANK athlete ACTscore hoursSTUDY gender
## 1 2.87 2.82 43 not athlete 24 5 male
## 2 3.16 3.49 76 not athlete 32 7 male
## 3 2.67 2.97 72 not athlete 22 15 female
## 4 3.00 3.29 63 not athlete 20 15 male
## 5 3.00 3.41 65 not athlete 21 8 female
## 6 1.69 3.26 70 not athlete 21 4 male
tail(gpa)
## sauGPA hsGPA hsRANK athlete ACTscore hoursSTUDY gender
## 250 1.71 3.09 54 athlete 27 14 male
## 251 3.08 3.70 78 not athlete 23 10 female
## 252 0.19 2.86 30 not athlete 20 5 male
## 253 3.06 3.49 68 not athlete 20 5 female
## 254 2.62 2.52 31 not athlete 18 3 male
## 255 2.33 3.12 45 not athlete 23 2 female
# Summarize data
summary(gpa)
## sauGPA hsGPA hsRANK athlete
## Min. :0.00 Min. :1.960 Min. : 6.00 athlete : 89
## 1st Qu.:2.27 1st Qu.:2.900 1st Qu.: 44.00 not athlete:166
## Median :2.80 Median :3.350 Median : 66.00
## Mean :2.65 Mean :3.275 Mean : 63.27
## 3rd Qu.:3.20 3rd Qu.:3.680 3rd Qu.: 83.00
## Max. :3.78 Max. :4.000 Max. :100.00
## ACTscore hoursSTUDY gender
## Min. :17.00 Min. : 0.00 female:144
## 1st Qu.:20.00 1st Qu.: 4.00 male :111
## Median :22.00 Median : 8.00
## Mean :22.96 Mean :10.62
## 3rd Qu.:25.00 3rd Qu.:15.00
## Max. :34.00 Max. :50.00
## Here's a package that gives more detailed summary statistics
## with the describe() command
library(psych)
describe(gpa)
## vars n mean sd median trimmed mad min max range
## sauGPA 1 255 2.65 0.75 2.80 2.73 0.67 0.00 3.78 3.78
## hsGPA 2 255 3.27 0.52 3.35 3.30 0.52 1.96 4.00 2.04
## hsRANK 3 255 63.27 24.48 66.00 64.33 28.17 6.00 100.00 94.00
## athlete* 4 255 1.65 0.48 2.00 1.69 0.00 1.00 2.00 1.00
## ACTscore 5 255 22.96 3.66 22.00 22.62 2.97 17.00 34.00 17.00
## hoursSTUDY 6 255 10.62 8.90 8.00 9.17 5.93 0.00 50.00 50.00
## gender* 7 255 1.44 0.50 1.00 1.42 0.00 1.00 2.00 1.00
## skew kurtosis se
## sauGPA -1.11 1.28 0.05
## hsGPA -0.40 -0.86 0.03
## hsRANK -0.33 -0.93 1.53
## athlete* -0.63 -1.61 0.03
## ACTscore 0.76 0.06 0.23
## hoursSTUDY 1.69 3.16 0.56
## gender* 0.26 -1.94 0.03
## Scatterplot matrix
pairs(gpa,
cex = 1, pch = 1, bg = "steelblue",
diag.panel = panel.hist, cex.labels = 1, font.labels = 2)
## Null model
null.model <- lm(sauGPA ~ 1, data=gpa)
## Full model
# Collinearity problem
cor(hsGPA, hsRANK, data=gpa)
## [1] 0.903263
# Fit model
full.model <- lm(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender, data=gpa)
coef(full.model)
## (Intercept) hsGPA athletenot athlete
## -0.54890739 0.68848215 -0.02926881
## ACTscore hoursSTUDY gendermale
## 0.04286319 0.00927878 -0.27532973
confint(full.model)
## 2.5 % 97.5 %
## (Intercept) -1.043328143 -0.05448664
## hsGPA 0.525462014 0.85150228
## athletenot athlete -0.178247137 0.11970951
## ACTscore 0.020047281 0.06567909
## hoursSTUDY 0.001558211 0.01699935
## gendermale -0.422525385 -0.12813407
summary(full.model)
##
## Call:
## lm(formula = sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY +
## gender, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.97642 -0.24947 0.04454 0.33731 1.07124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.548907 0.251034 -2.187 0.029704 *
## hsGPA 0.688482 0.082771 8.318 5.91e-15 ***
## athletenot athlete -0.029269 0.075641 -0.387 0.699129
## ACTscore 0.042863 0.011584 3.700 0.000265 ***
## hoursSTUDY 0.009279 0.003920 2.367 0.018697 *
## gendermale -0.275330 0.074736 -3.684 0.000282 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5248 on 249 degrees of freedom
## Multiple R-squared: 0.5184, Adjusted R-squared: 0.5088
## F-statistic: 53.61 on 5 and 249 DF, p-value: < 2.2e-16
# Compare null and full models
anova(null.model, full.model)
## Analysis of Variance Table
##
## Model 1: sauGPA ~ 1
## Model 2: sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 254 142.414
## 2 249 68.583 5 73.832 53.612 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Assumptions check
par(mfrow=c(2,2)) # visualize four graphs at once
plot(full.model)
par(mfrow=c(1,1)) # reset the graphics defaults
## Include all interaction terms
interact.model <- lm(sauGPA ~ hsGPA*athlete*ACTscore*hoursSTUDY*gender, data=gpa)
summary(interact.model)
##
## Call:
## lm(formula = sauGPA ~ hsGPA * athlete * ACTscore * hoursSTUDY *
## gender, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.86303 -0.23787 0.06885 0.32812 1.16645
##
## Coefficients:
## Estimate
## (Intercept) -2.858533
## hsGPA 1.426448
## athletenot athlete -3.421158
## ACTscore 0.187316
## hoursSTUDY -0.189176
## gendermale -2.068469
## hsGPA:athletenot athlete 0.782668
## hsGPA:ACTscore -0.043748
## athletenot athlete:ACTscore 0.114351
## hsGPA:hoursSTUDY 0.059479
## athletenot athlete:hoursSTUDY 0.322663
## ACTscore:hoursSTUDY 0.004147
## hsGPA:gendermale 0.371719
## athletenot athlete:gendermale 12.750745
## ACTscore:gendermale 0.057158
## hoursSTUDY:gendermale 0.189588
## hsGPA:athletenot athlete:ACTscore -0.024624
## hsGPA:athletenot athlete:hoursSTUDY -0.098374
## hsGPA:ACTscore:hoursSTUDY -0.001345
## athletenot athlete:ACTscore:hoursSTUDY -0.007738
## hsGPA:athletenot athlete:gendermale -3.125614
## hsGPA:ACTscore:gendermale -0.010055
## athletenot athlete:ACTscore:gendermale -0.633812
## hsGPA:hoursSTUDY:gendermale -0.042746
## athletenot athlete:hoursSTUDY:gendermale -0.507255
## ACTscore:hoursSTUDY:gendermale -0.005725
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY 0.002488
## hsGPA:athletenot athlete:ACTscore:gendermale 0.155965
## hsGPA:athletenot athlete:hoursSTUDY:gendermale 0.112163
## hsGPA:ACTscore:hoursSTUDY:gendermale 0.001181
## athletenot athlete:ACTscore:hoursSTUDY:gendermale 0.028069
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY:gendermale -0.006461
## Std. Error t value
## (Intercept) 8.896733 -0.321
## hsGPA 2.616383 0.545
## athletenot athlete 10.001927 -0.342
## ACTscore 0.413151 0.453
## hoursSTUDY 0.984171 -0.192
## gendermale 10.249080 -0.202
## hsGPA:athletenot athlete 2.923433 0.268
## hsGPA:ACTscore 0.117887 -0.371
## athletenot athlete:ACTscore 0.467675 0.245
## hsGPA:hoursSTUDY 0.280278 0.212
## athletenot athlete:hoursSTUDY 1.043032 0.309
## ACTscore:hoursSTUDY 0.041278 0.100
## hsGPA:gendermale 3.018346 0.123
## athletenot athlete:gendermale 12.524280 1.018
## ACTscore:gendermale 0.479190 0.119
## hoursSTUDY:gendermale 1.092736 0.173
## hsGPA:athletenot athlete:ACTscore 0.132866 -0.185
## hsGPA:athletenot athlete:hoursSTUDY 0.295006 -0.333
## hsGPA:ACTscore:hoursSTUDY 0.011449 -0.117
## athletenot athlete:ACTscore:hoursSTUDY 0.044410 -0.174
## hsGPA:athletenot athlete:gendermale 3.679553 -0.849
## hsGPA:ACTscore:gendermale 0.136975 -0.073
## athletenot athlete:ACTscore:gendermale 0.584537 -1.084
## hsGPA:hoursSTUDY:gendermale 0.316015 -0.135
## athletenot athlete:hoursSTUDY:gendermale 1.302175 -0.390
## ACTscore:hoursSTUDY:gendermale 0.045755 -0.125
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY 0.012228 0.203
## hsGPA:athletenot athlete:ACTscore:gendermale 0.165924 0.940
## hsGPA:athletenot athlete:hoursSTUDY:gendermale 0.372868 0.301
## hsGPA:ACTscore:hoursSTUDY:gendermale 0.012903 0.092
## athletenot athlete:ACTscore:hoursSTUDY:gendermale 0.055376 0.507
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY:gendermale 0.015409 -0.419
## Pr(>|t|)
## (Intercept) 0.748
## hsGPA 0.586
## athletenot athlete 0.733
## ACTscore 0.651
## hoursSTUDY 0.848
## gendermale 0.840
## hsGPA:athletenot athlete 0.789
## hsGPA:ACTscore 0.711
## athletenot athlete:ACTscore 0.807
## hsGPA:hoursSTUDY 0.832
## athletenot athlete:hoursSTUDY 0.757
## ACTscore:hoursSTUDY 0.920
## hsGPA:gendermale 0.902
## athletenot athlete:gendermale 0.310
## ACTscore:gendermale 0.905
## hoursSTUDY:gendermale 0.862
## hsGPA:athletenot athlete:ACTscore 0.853
## hsGPA:athletenot athlete:hoursSTUDY 0.739
## hsGPA:ACTscore:hoursSTUDY 0.907
## athletenot athlete:ACTscore:hoursSTUDY 0.862
## hsGPA:athletenot athlete:gendermale 0.397
## hsGPA:ACTscore:gendermale 0.942
## athletenot athlete:ACTscore:gendermale 0.279
## hsGPA:hoursSTUDY:gendermale 0.893
## athletenot athlete:hoursSTUDY:gendermale 0.697
## ACTscore:hoursSTUDY:gendermale 0.901
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY 0.839
## hsGPA:athletenot athlete:ACTscore:gendermale 0.348
## hsGPA:athletenot athlete:hoursSTUDY:gendermale 0.764
## hsGPA:ACTscore:hoursSTUDY:gendermale 0.927
## athletenot athlete:ACTscore:hoursSTUDY:gendermale 0.613
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY:gendermale 0.675
##
## Residual standard error: 0.5361 on 223 degrees of freedom
## Multiple R-squared: 0.55, Adjusted R-squared: 0.4874
## F-statistic: 8.791 on 31 and 223 DF, p-value: < 2.2e-16
# Compare to model with no interaction terms
anova(full.model, interact.model)
## Analysis of Variance Table
##
## Model 1: sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender
## Model 2: sauGPA ~ hsGPA * athlete * ACTscore * hoursSTUDY * gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 249 68.583
## 2 223 64.091 26 4.492 0.6011 0.9384
# No athlete model
no.athlete.model <- lm(sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender, data=gpa)
coef(no.athlete.model)
## (Intercept) hsGPA ACTscore hoursSTUDY gendermale
## -0.566182830 0.688931448 0.042495854 0.009302746 -0.264003535
confint(no.athlete.model)
## 2.5 % 97.5 %
## (Intercept) -1.051883884 -0.08048178
## hsGPA 0.526207969 0.85165493
## ACTscore 0.019795834 0.06519587
## hoursSTUDY 0.001596431 0.01700906
## gendermale -0.399206710 -0.12880036
summary(no.athlete.model)
##
## Call:
## lm(formula = sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender,
## data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.99086 -0.24437 0.05333 0.33249 1.06572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.566183 0.246612 -2.296 0.022513 *
## hsGPA 0.688931 0.082622 8.338 5.09e-15 ***
## ACTscore 0.042496 0.011526 3.687 0.000278 ***
## hoursSTUDY 0.009303 0.003913 2.377 0.018183 *
## gendermale -0.264004 0.068649 -3.846 0.000153 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5239 on 250 degrees of freedom
## Multiple R-squared: 0.5181, Adjusted R-squared: 0.5104
## F-statistic: 67.21 on 4 and 250 DF, p-value: < 2.2e-16
# Compare null and full models
anova(no.athlete.model, full.model)
## Analysis of Variance Table
##
## Model 1: sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender
## Model 2: sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 250 68.624
## 2 249 68.583 1 0.041239 0.1497 0.6991
# No athlete or hours studying model
no.hours.model <- lm(sauGPA ~ hsGPA + ACTscore + gender, data=gpa)
coef(no.hours.model)
## (Intercept) hsGPA ACTscore gendermale
## -0.64166123 0.70510522 0.04799384 -0.27525872
confint(no.hours.model)
## 2.5 % 97.5 %
## (Intercept) -1.1277550 -0.15556748
## hsGPA 0.5414414 0.86876909
## ACTscore 0.0255507 0.07043699
## gendermale -0.4113817 -0.13913572
summary(no.hours.model)
##
## Call:
## lm(formula = sauGPA ~ hsGPA + ACTscore + gender, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.07674 -0.24683 0.06869 0.34865 1.00116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.64166 0.24682 -2.600 0.00988 **
## hsGPA 0.70511 0.08310 8.485 1.89e-15 ***
## ACTscore 0.04799 0.01140 4.212 3.54e-05 ***
## gendermale -0.27526 0.06912 -3.983 8.94e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5288 on 251 degrees of freedom
## Multiple R-squared: 0.5072, Adjusted R-squared: 0.5014
## F-statistic: 86.13 on 3 and 251 DF, p-value: < 2.2e-16
# Compare to our previous model
anova(no.hours.model, no.athlete.model)
## Analysis of Variance Table
##
## Model 1: sauGPA ~ hsGPA + ACTscore + gender
## Model 2: sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 251 70.175
## 2 250 68.624 1 1.5516 5.6525 0.01818 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#######################
#######################
## Questions 34a-34c ##
#######################
#######################
## a) ACT scores
act.model <- lm(sauGPA ~ ACTscore, data=gpa)
summary(act.model)
##
## Call:
## lm(formula = sauGPA ~ ACTscore, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3123 -0.2870 0.0755 0.4415 1.1649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13044 0.25245 0.517 0.606
## ACTscore 0.10972 0.01086 10.105 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6333 on 253 degrees of freedom
## Multiple R-squared: 0.2875, Adjusted R-squared: 0.2847
## F-statistic: 102.1 on 1 and 253 DF, p-value: < 2.2e-16
anova(act.model)
## Analysis of Variance Table
##
## Response: sauGPA
## Df Sum Sq Mean Sq F value Pr(>F)
## ACTscore 1 40.948 40.948 102.1 < 2.2e-16 ***
## Residuals 253 101.466 0.401
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## b) ACT scores on top of GPA
gpa.model <- lm(sauGPA ~ hsGPA, data=gpa)
gpa.act.model <- lm(sauGPA ~ hsGPA + ACTscore, data=gpa)
anova(gpa.model, gpa.act.model)
## Analysis of Variance Table
##
## Model 1: sauGPA ~ hsGPA
## Model 2: sauGPA ~ hsGPA + ACTscore
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 253 78.66
## 2 252 74.61 1 4.05 13.679 0.0002662 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## c) Add hours studied per week
gpa.act.hours.model <- lm(sauGPA ~ hsGPA + ACTscore + hoursSTUDY, data=gpa)
anova(gpa.model, gpa.act.model, gpa.act.hours.model)
## Analysis of Variance Table
##
## Model 1: sauGPA ~ hsGPA
## Model 2: sauGPA ~ hsGPA + ACTscore
## Model 3: sauGPA ~ hsGPA + ACTscore + hoursSTUDY
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 253 78.660
## 2 252 74.610 1 4.0500 13.9860 0.0002282 ***
## 3 251 72.683 1 1.9262 6.6518 0.0104758 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#######################
#######################
### Questions 35-37 ###
#######################
#######################
## T-test
t.test(sauGPA ~ athlete, var.equal=TRUE, sig.level=0.05, data=gpa)
##
## Two Sample t-test
##
## data: sauGPA by athlete
## t = -2.3323, df = 253, p-value = 0.02047
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.41952830 -0.03539793
## sample estimates:
## mean in group athlete mean in group not athlete
## 2.501573 2.729036
t.test(sauGPA ~ athlete, var.equal=TRUE, conf.level=0.95, data=gpa)$conf.int
## [1] -0.41952830 -0.03539793
## attr(,"conf.level")
## [1] 0.95
## Athlete alone
ath.model <- lm(sauGPA ~ athlete, data=gpa)
summary(ath.model)
##
## Call:
## lm(formula = sauGPA ~ athlete, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7290 -0.3453 0.1410 0.5410 1.2784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.50157 0.07869 31.792 <2e-16 ***
## athletenot athlete 0.22746 0.09753 2.332 0.0205 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7423 on 253 degrees of freedom
## Multiple R-squared: 0.02105, Adjusted R-squared: 0.01718
## F-statistic: 5.44 on 1 and 253 DF, p-value: 0.02047
(.02105)/((1-.02105)/(255-1-1))
## [1] 5.440165
## ACT
act.model <- lm(sauGPA ~ ACTscore, data=gpa)
summary(act.model)
##
## Call:
## lm(formula = sauGPA ~ ACTscore, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3123 -0.2870 0.0755 0.4415 1.1649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13044 0.25245 0.517 0.606
## ACTscore 0.10972 0.01086 10.105 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6333 on 253 degrees of freedom
## Multiple R-squared: 0.2875, Adjusted R-squared: 0.2847
## F-statistic: 102.1 on 1 and 253 DF, p-value: < 2.2e-16
## Athlete + ACT
ath.act.model <- lm(sauGPA ~ ACTscore + athlete, data=gpa)
summary(ath.act.model)
##
## Call:
## lm(formula = sauGPA ~ ACTscore + athlete, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3498 -0.2638 0.1023 0.4435 1.1081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.08294 0.25309 0.328 0.7434
## ACTscore 0.10779 0.01088 9.909 <2e-16 ***
## athletenot athlete 0.14093 0.08335 1.691 0.0921 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.631 on 252 degrees of freedom
## Multiple R-squared: 0.2955, Adjusted R-squared: 0.2899
## F-statistic: 52.86 on 2 and 252 DF, p-value: < 2.2e-16
## Model comparison
anova(act.model, ath.act.model)
## Analysis of Variance Table
##
## Model 1: sauGPA ~ ACTscore
## Model 2: sauGPA ~ ACTscore + athlete
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 253 101.47
## 2 252 100.33 1 1.1381 2.8587 0.09212 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####################
####################
# Cross-validation #
####################
####################
library(DAAG)
cross.validated.model <- lm(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender, data=gpa)
CVlm(gpa, cross.validated.model, m=10)
## Analysis of Variance Table
##
## Response: sauGPA
## Df Sum Sq Mean Sq F value Pr(>F)
## hsGPA 1 63.8 63.8 231.47 < 2e-16 ***
## athlete 1 0.5 0.5 1.70 0.19340
## ACTscore 1 4.0 4.0 14.40 0.00019 ***
## hoursSTUDY 1 1.9 1.9 6.92 0.00907 **
## gender 1 3.7 3.7 13.57 0.00028 ***
## Residuals 249 68.6 0.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in CVlm(gpa, cross.validated.model, m = 10):
##
## As there is >1 explanatory variable, cross-validation
## predicted values for a fold are not a linear function
## of corresponding overall predicted values. Lines that
## are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 25
## 9 48 65 71 86 92 99 100 101
## Predicted 2.764 2.026 3.376 2.91 3.050 3.285 2.41 2.040 3.127
## cvpred 2.814 2.024 3.347 2.90 3.025 3.235 2.43 2.068 3.129
## sauGPA 2.400 1.210 3.470 3.29 3.290 3.730 2.18 1.420 3.470
## CV residual -0.414 -0.814 0.123 0.39 0.265 0.495 -0.25 -0.648 0.341
## 106 109 119 122 130 150 151 175 187
## Predicted 3.4890 3.0162 3.69 3.088 3.038 2.266 2.419 2.411 2.4800
## cvpred 3.4773 2.9837 3.64 3.116 3.036 2.242 2.422 2.461 2.5609
## sauGPA 3.4500 3.0600 3.77 2.730 3.590 3.000 2.710 1.830 2.5000
## CV residual -0.0273 0.0763 0.13 -0.386 0.554 0.758 0.288 -0.631 -0.0609
## 191 215 217 227 229 236 240
## Predicted 1.751 2.4334 2.428 2.149 2.21 2.982 1.791
## cvpred 1.699 2.4568 2.474 2.172 2.27 2.962 1.826
## sauGPA 2.220 2.4000 2.790 2.290 0.92 2.860 1.670
## CV residual 0.521 -0.0568 0.316 0.118 -1.35 -0.102 -0.156
##
## Sum of squares = 5.69 Mean square = 0.23 n = 25
##
## fold 2
## Observations in test set: 26
## 11 26 34 42 43 47 53 55 59
## Predicted 3.277 2.684 3.187 3.409 2.844 1.964 3.1188 1.772 3.165
## cvpred 3.287 2.712 3.218 3.421 2.852 1.998 3.1302 1.783 3.176
## sauGPA 3.130 2.500 2.820 3.030 3.250 1.210 3.1200 1.440 3.330
## CV residual -0.157 -0.212 -0.398 -0.391 0.398 -0.788 -0.0102 -0.343 0.154
## 62 63 68 78 83 104 107 127 146
## Predicted 3.627 2.20 3.0608 2.351 3.309 3.519 2.68 2.228 2.603
## cvpred 3.638 2.24 3.0823 2.363 3.311 3.525 2.71 2.252 2.589
## sauGPA 3.490 0.58 3.0400 2.490 3.200 3.750 3.56 2.470 2.360
## CV residual -0.148 -1.66 -0.0423 0.127 -0.111 0.225 0.85 0.218 -0.229
## 149 196 204 207 218 235 245 246
## Predicted 2.8837 2.007 2.92 2.301 3.094 3.436 2.843 2.174
## cvpred 2.9025 2.004 2.94 2.316 3.051 3.392 2.864 2.207
## sauGPA 3.0000 2.870 1.67 2.130 3.500 3.110 3.300 1.860
## CV residual 0.0975 0.866 -1.27 -0.186 0.449 -0.282 0.436 -0.347
##
## Sum of squares = 7.98 Mean square = 0.31 n = 26
##
## fold 3
## Observations in test set: 26
## 2 27 31 36 37 39 41 61 76
## Predicted 2.986 3.099 2.690 3.059 2.339 3.3850 2.792 3.205 2.769
## cvpred 2.935 3.137 2.666 3.044 2.374 3.3653 2.817 3.158 2.755
## sauGPA 3.160 2.630 3.060 2.930 1.860 3.3800 2.210 3.730 3.290
## CV residual 0.225 -0.507 0.394 -0.114 -0.514 0.0147 -0.607 0.572 0.535
## 77 79 105 155 163 164 168 177 181
## Predicted 3.031 2.803 2.889 2.39 3.2956 1.737 2.190 2.6837 2.188
## cvpred 3.037 2.806 2.897 2.38 3.2299 1.744 2.181 2.6696 2.192
## sauGPA 3.470 2.500 3.000 3.39 3.1800 1.420 2.930 2.5800 2.380
## CV residual 0.433 -0.306 0.103 1.01 -0.0499 -0.324 0.749 -0.0896 0.188
## 183 192 206 219 241 243 252 255
## Predicted 1.742 2.692 1.5063 1.619 2.161 2.82 2.02 2.574
## cvpred 1.761 2.693 1.5203 1.645 2.168 2.85 2.03 2.589
## sauGPA 1.000 2.420 1.4300 1.920 2.600 3.38 0.19 2.330
## CV residual -0.761 -0.273 -0.0903 0.275 0.432 0.53 -1.84 -0.259
##
## Sum of squares = 8.42 Mean square = 0.32 n = 26
##
## fold 4
## Observations in test set: 26
## 4 14 15 46 57 87 89 91 113
## Predicted 2.408 3.0860 3.110 2.330 2.1444 3.058 3.3833 3.34 3.04
## cvpred 2.398 3.0747 3.081 2.324 2.1181 3.043 3.3711 3.35 3.02
## sauGPA 3.000 3.0600 3.400 2.190 2.0700 3.500 3.4100 3.50 3.17
## CV residual 0.602 -0.0147 0.319 -0.134 -0.0481 0.457 0.0389 0.15 0.15
## 131 136 143 157 170 173 174 194 210 213
## Predicted 3.52 2.803 3.341 2.577 1.721 1.755 2.18 1.788 2.338 1.93
## cvpred 3.50 2.791 3.322 2.558 1.696 1.724 2.15 1.754 2.312 1.90
## sauGPA 3.20 3.200 3.460 2.360 2.030 2.270 3.25 2.270 2.430 2.50
## CV residual -0.30 0.409 0.138 -0.198 0.334 0.546 1.10 0.516 0.118 0.60
## 216 222 223 228 234 242 247
## Predicted 1.670 2.456 1.6917 1.62 2.4865 2.176 2.58
## cvpred 1.639 2.427 1.6558 1.60 2.4697 2.159 2.56
## sauGPA 2.160 3.070 1.5600 0.21 2.5100 1.880 3.00
## CV residual 0.521 0.643 -0.0958 -1.39 0.0403 -0.279 0.44
##
## Sum of squares = 6.22 Mean square = 0.24 n = 26
##
## fold 5
## Observations in test set: 26
## 6 13 25 44 50 58 66 82 84
## Predicted 2.328 2.6402 3.156 2.722 2.1290 3.686 3.58 3.2844 2.375
## cvpred 2.323 2.7101 3.147 2.706 2.1383 3.721 3.60 3.2764 2.387
## sauGPA 1.690 2.6700 3.500 3.020 2.0800 3.140 3.47 3.2500 2.800
## CV residual -0.633 -0.0401 0.353 0.314 -0.0583 -0.581 -0.13 -0.0264 0.413
## 102 103 115 121 123 125 153 160 161
## Predicted 3.6079 3.207 3.010 3.1638 2.681 2.657 2.257 2.950 1.796
## cvpred 3.6355 3.238 3.002 3.1571 2.719 2.641 2.243 2.953 1.767
## sauGPA 3.6700 3.130 3.440 3.2100 2.130 2.790 2.570 3.060 2.080
## CV residual 0.0345 -0.108 0.438 0.0529 -0.589 0.149 0.327 0.107 0.313
## 179 200 205 211 221 225 250 253
## Predicted 3.212 2.3787 2.463 2.697 2.517 2.582 2.590 2.728
## cvpred 3.212 2.3961 2.453 2.677 2.529 2.588 2.615 2.715
## sauGPA 2.230 2.3800 2.860 2.370 2.360 2.790 1.710 3.060
## CV residual -0.982 -0.0161 0.407 -0.307 -0.169 0.202 -0.905 0.345
##
## Sum of squares = 4.18 Mean square = 0.16 n = 26
##
## fold 6
## Observations in test set: 26
## 8 22 29 40 49 51 70 72 75
## Predicted 3.202 3.601 2.887 3.214 2.414 2.022 2.185 2.85 3.181
## cvpred 3.262 3.608 2.881 3.227 2.421 2.001 2.153 2.81 3.201
## sauGPA 2.890 3.780 3.380 2.670 2.950 2.460 2.540 2.63 2.870
## CV residual -0.372 0.172 0.499 -0.557 0.529 0.459 0.387 -0.18 -0.331
## 95 96 110 118 139 145 159 162 165
## Predicted 2.9407 1.86 3.297 2.534 2.217 2.5178 2.751 1.8508 1.4683
## cvpred 2.9617 1.90 3.303 2.534 2.212 2.4752 2.721 1.8763 1.4592
## sauGPA 2.8700 1.08 3.140 2.390 2.670 2.5600 3.000 1.7900 1.4000
## CV residual -0.0917 -0.82 -0.163 -0.144 0.458 0.0848 0.279 -0.0863 -0.0592
## 176 182 186 199 201 208 212 239
## Predicted 2.588 3.103 2.2307 3.068 2.153 2.145 1.889 2.149
## cvpred 2.593 3.104 2.2027 3.077 2.177 2.175 1.904 2.166
## sauGPA 2.330 2.620 2.2500 3.400 2.270 2.000 2.190 1.280
## CV residual -0.263 -0.484 0.0473 0.323 0.093 -0.175 0.286 -0.886
##
## Sum of squares = 3.86 Mean square = 0.15 n = 26
##
## fold 7
## Observations in test set: 25
## 1 12 19 20 45 52 54 73 74 81
## Predicted 2.163 2.425 2.776 2.923 2.83 2.859 2.90 2.919 2.8078 2.51
## cvpred 2.112 2.376 2.755 2.957 2.85 2.881 2.89 2.923 2.8108 2.52
## sauGPA 2.870 2.470 3.380 2.570 3.05 2.220 2.67 2.420 2.8400 2.42
## CV residual 0.758 0.094 0.625 -0.387 0.20 -0.661 -0.22 -0.503 0.0292 -0.10
## 85 94 97 112 138 147 156 158 167
## Predicted 2.128 3.163 2.881 2.781 2.736 3.039 3.2923 2.886 2.576
## cvpred 2.104 3.199 2.895 2.768 2.741 3.034 3.3075 2.895 2.545
## sauGPA 1.730 3.400 3.270 3.080 2.470 3.310 3.2500 3.070 2.200
## CV residual -0.374 0.201 0.375 0.312 -0.271 0.276 -0.0575 0.175 -0.345
## 172 185 198 230 251 254
## Predicted 2.58 1.690 1.59 1.5666 3.0478 1.681
## cvpred 2.54 1.646 1.54 1.5002 3.0518 1.623
## sauGPA 2.98 2.360 2.20 1.5800 3.0800 2.620
## CV residual 0.44 0.714 0.66 0.0798 0.0282 0.997
##
## Sum of squares = 4.78 Mean square = 0.19 n = 25
##
## fold 8
## Observations in test set: 25
## 3 5 7 18 21 23 28 30 32
## Predicted 2.549 2.744 3.150 3.628666 3.300 1.900 2.915 3.647 3.958
## cvpred 2.541 2.737 3.173 3.630152 3.297 1.879 2.911 3.686 3.971
## sauGPA 2.670 3.000 2.930 3.630000 3.590 1.070 3.400 3.570 3.670
## CV residual 0.129 0.263 -0.243 -0.000152 0.293 -0.809 0.489 -0.116 -0.301
## 33 98 111 114 120 124 126 132 133
## Predicted 3.0907 3.604 2.460 3.074 2.769 2.9925 2.908 2.105 2.453
## cvpred 3.0865 3.609 2.461 3.124 2.769 3.0159 2.907 2.098 2.442
## sauGPA 3.0200 3.240 2.760 2.600 3.250 2.9200 3.140 1.850 2.000
## CV residual -0.0665 -0.369 0.299 -0.524 0.481 -0.0959 0.233 -0.248 -0.442
## 135 137 148 152 180 189 248
## Predicted 3.249 2.755 2.966 2.175 2.813 2.683 2.831
## cvpred 3.248 2.783 2.961 2.195 2.841 2.663 2.833
## sauGPA 3.400 1.830 2.650 2.670 2.470 3.470 2.980
## CV residual 0.152 -0.953 -0.311 0.475 -0.371 0.807 0.147
##
## Sum of squares = 4.35 Mean square = 0.17 n = 25
##
## fold 9
## Observations in test set: 25
## 10 24 64 67 90 93 108 116 134 140
## Predicted 2.506 1.9830 1.83 3.3982 3.032 1.945 3.06 2.055 1.790 2.001
## cvpred 2.501 1.9711 1.83 3.3633 3.035 1.942 3.06 2.019 1.788 2.001
## sauGPA 3.330 2.0000 0.00 3.4000 3.270 1.670 2.93 2.470 2.670 2.670
## CV residual 0.829 0.0289 -1.83 0.0367 0.235 -0.272 -0.13 0.451 0.882 0.669
## 144 166 169 171 178 184 195 202 203 224
## Predicted 2.702 1.622 2.155 2.78 2.419 2.38 2.101 2.85 2.481 2.594
## cvpred 2.699 1.602 2.136 2.78 2.404 2.37 2.057 2.85 2.484 2.575
## sauGPA 2.920 1.870 2.400 3.15 3.190 3.33 2.260 3.07 1.600 2.730
## CV residual 0.221 0.268 0.264 0.37 0.786 0.96 0.203 0.22 -0.884 0.155
## 231 232 237 238 249
## Predicted 3.327 2.00 2.31 2.511 2.9773
## cvpred 3.314 2.01 2.31 2.508 2.9757
## sauGPA 3.450 0.83 2.54 2.800 3.0600
## CV residual 0.136 -1.18 0.23 0.292 0.0843
##
## Sum of squares = 9.91 Mean square = 0.4 n = 25
##
## fold 10
## Observations in test set: 25
## 16 17 35 38 56 60 69 80 88
## Predicted 3.58 2.98 1.708 3.759 2.7714 2.354 3.228 3.1268 2.805
## cvpred 3.63 3.07 1.732 3.779 2.7754 2.284 3.176 3.1396 2.849
## sauGPA 3.47 0.00 1.000 3.500 2.6900 2.870 3.610 3.2000 3.220
## CV residual -0.16 -3.07 -0.732 -0.279 -0.0854 0.586 0.434 0.0604 0.371
## 117 128 129 141 142 154 188 190 193
## Predicted 2.410 2.954 1.79 3.433 2.645 2.326 2.882 2.760 3.3891
## cvpred 2.404 2.959 1.84 3.485 2.659 2.333 2.949 2.823 3.4945
## sauGPA 1.440 2.270 0.27 3.610 3.060 2.540 2.880 2.930 3.4000
## CV residual -0.964 -0.689 -1.57 0.125 0.401 0.207 -0.069 0.107 -0.0945
## 197 209 214 220 226 233 244
## Predicted 2.332 2.82 2.857 1.907 2.849 2.684 1.833
## cvpred 2.383 2.83 2.854 1.941 2.857 2.712 1.863
## sauGPA 3.060 3.20 2.640 1.290 3.270 3.000 1.980
## CV residual 0.677 0.37 -0.214 -0.651 0.413 0.288 0.117
##
## Sum of squares = 16.2 Mean square = 0.65 n = 25
##
## Overall (Sum over all 25 folds)
## ms
## 0.281
## I'm going to make these comments so they won't print out.
## These are the commands I used to get the other avg. mse values
# cross.validated.model <- lm(sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)
# cross.validated.model <- lm(sauGPA ~ hsGPA + ACTscore + gender, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)
# cross.validated.model <- lm(sauGPA ~ hsGPA + ACTscore, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)
# cross.validated.model <- lm(sauGPA ~ hsGPA, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)
# cross.validated.model <- lm(sauGPA ~ hsGPA*athlete*ACTscore*hoursSTUDY*gender, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)
####################
####################
## Best subsets ###
####################
####################
# All Subsets Regression
library(leaps)
leaps<-regsubsets(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender,
data=gpa, nbest=10)
# view results
summary(leaps)
## Subset selection object
## Call: regsubsets.formula(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY +
## gender, data = gpa, nbest = 10)
## 5 Variables (and intercept)
## Forced in Forced out
## hsGPA FALSE FALSE
## athletenot athlete FALSE FALSE
## ACTscore FALSE FALSE
## hoursSTUDY FALSE FALSE
## gendermale FALSE FALSE
## 10 subsets of each size up to 5
## Selection Algorithm: exhaustive
## hsGPA athletenot athlete ACTscore hoursSTUDY gendermale
## 1 ( 1 ) "*" " " " " " " " "
## 1 ( 2 ) " " " " "*" " " " "
## 1 ( 3 ) " " " " " " "*" " "
## 1 ( 4 ) " " " " " " " " "*"
## 1 ( 5 ) " " "*" " " " " " "
## 2 ( 1 ) "*" " " "*" " " " "
## 2 ( 2 ) "*" " " " " " " "*"
## 2 ( 3 ) "*" " " " " "*" " "
## 2 ( 4 ) "*" "*" " " " " " "
## 2 ( 5 ) " " " " "*" " " "*"
## 2 ( 6 ) " " " " "*" "*" " "
## 2 ( 7 ) " " "*" "*" " " " "
## 2 ( 8 ) " " " " " " "*" "*"
## 2 ( 9 ) " " "*" " " "*" " "
## 2 ( 10 ) " " "*" " " " " "*"
## 3 ( 1 ) "*" " " "*" " " "*"
## 3 ( 2 ) "*" " " " " "*" "*"
## 3 ( 3 ) "*" " " "*" "*" " "
## 3 ( 4 ) "*" "*" "*" " " " "
## 3 ( 5 ) "*" "*" " " " " "*"
## 3 ( 6 ) "*" "*" " " "*" " "
## 3 ( 7 ) " " " " "*" "*" "*"
## 3 ( 8 ) " " "*" "*" " " "*"
## 3 ( 9 ) " " "*" "*" "*" " "
## 3 ( 10 ) " " "*" " " "*" "*"
## 4 ( 1 ) "*" " " "*" "*" "*"
## 4 ( 2 ) "*" "*" "*" " " "*"
## 4 ( 3 ) "*" "*" "*" "*" " "
## 4 ( 4 ) "*" "*" " " "*" "*"
## 4 ( 5 ) " " "*" "*" "*" "*"
## 5 ( 1 ) "*" "*" "*" "*" "*"
# plot a table of models showing variables in each model.
plot(leaps,scale="r2")
# plot statistic by subset size
library(car)
#subsets(leaps, statistic="rsq")
####################
####################
#### Stepwise ####
####################
####################
# Using AIC
library(MASS)
model <- lm(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender, data=gpa)
stepAIC(model, scale = 0,
direction = c("both"),
trace = 0, use.start = FALSE,
k = 2)
##
## Call:
## lm(formula = sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender,
## data = gpa)
##
## Coefficients:
## (Intercept) hsGPA ACTscore hoursSTUDY gendermale
## -0.5662 0.6889 0.0425 0.0093 -0.2640
# Note: Change to trace = 1 above to see more
# detailed output
####################
####################
#### Ridge Reg ###
####################
####################
# Find coefficients of linear model
coef(lm(sauGPA ~ hsGPA + hsRANK + athlete + ACTscore + hoursSTUDY + gender,
data=gpa))
## (Intercept) hsGPA hsRANK
## -0.66883 0.75144 -0.00156
## athletenot athlete ACTscore hoursSTUDY
## -0.03159 0.04356 0.00933
## gendermale
## -0.28076
# Run the ridge regression
ridge <- lm.ridge(sauGPA ~ hsGPA + hsRANK + athlete + ACTscore + hoursSTUDY + gender,
data=gpa, lambda = seq(0, 50, .1))
# Find the value of lambda
select(lm.ridge(sauGPA ~ hsGPA + hsRANK + athlete + ACTscore + hoursSTUDY + gender,
data=gpa, lambda = seq(0, 50, .01)))
## modified HKB estimator is 5.45
## modified L-W estimator is 3.81
## smallest value of GCV at 6.43
# Get coefficients for ridge regression using lambda = 6.43
lm.ridge(sauGPA ~ hsGPA + hsRANK + athlete + ACTscore + hoursSTUDY + gender,
data=gpa, lambda = 6.43)
## hsGPA hsRANK
## -0.488398 0.662211 0.000206
## athletenot athlete ACTscore hoursSTUDY
## -0.024268 0.043171 0.009273
## gendermale
## -0.270204
# Load a library to plot the coefficients
library(genridge)
# Plot the coefficients
traceplot(ridge)
abline(v=6.43, lty=1, lw=3)