# Install and load all necessary packages
list.of.packages <- c("tidyverse", "broom", "mosaic", "ggExtra")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(tidyverse)
library(mosaic)
library(broom)
library(modelr)
library(ggExtra)
The class will choose the criteria we use to declare the winner.
Celebrity | Guess | Actual | ??? |
---|---|---|---|
1. Oprah | |||
2. Tom Hanks | |||
3. Betty White | |||
4. Pelé | |||
5. Michael Jordan | |||
6. Taylor Swift | |||
7. Will Smith | |||
8. George Washington | |||
9. (students’ choice) | |||
10. (students’ choice) | |||
score = __________ |
Suppose you work in the admissions office and are tasked with admitting students who are predicted to be successful at St. Ambrose.
# Read data
sau <- read.csv("http://www.bradthiessen.com/html5/data/sau.csv")
# Convert female and minority variables to factors
sau <- sau %>%
mutate(female = factor(female, labels = c("male", "female")),
minority = factor(minority, labels = c("white", "minority")))
p <- ggplot(sau, aes(x = hsgpa, y=fallgpa)) +
geom_point(alpha=.3, color="steelblue") +
geom_smooth(method = "lm", se=FALSE, color="black") +
scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(limits = c(0.5,4), breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
theme(legend.position="none") +
labs(
x = "High School GPA",
y = "Fall semester GPA at St. Ambrose"
) +
annotate("text", x=1.1, y=1.5, label="y = 0.058 + 0.826x")
# Use library(ggExtra) to plot marginal histograms
ggMarginal(p, type="histogram", fill="steelblue", color="white")
# Warning: Removed 26 rows containing non-finite values (stat_smooth).
# Warning: Removed 26 rows containing non-finite values (stat_smooth).
# Warning: Removed 26 rows containing missing values (geom_point).
# Warning: Removed 26 rows containing non-finite values (stat_bin).
p <- ggplot(sau, aes(x = jitter(ACTtotal), y=fallgpa)) +
geom_point(alpha=.3, color="steelblue") +
geom_smooth(method = "lm", se=FALSE, color="black") +
scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(limits = c(15,36), breaks=seq(16, 36, 2), minor_breaks=seq(15, 35, 2)) +
theme(legend.position="none") +
labs(
x = "ACT Composite",
y = "Fall semester GPA at St. Ambrose"
)
# Use library(ggExtra) to plot marginal histograms
ggMarginal(p, type="histogram", fill="steelblue", color="white")
# Warning: Removed 26 rows containing non-finite values (stat_smooth).
# Warning: Removed 26 rows containing non-finite values (stat_smooth).
# Warning: Removed 26 rows containing missing values (geom_point).
# Warning: Removed 26 rows containing non-finite values (stat_bin).
p <- ggplot(sau, aes(x = jitter(ACTmath), y=fallgpa)) +
geom_point(alpha=.3, color="steelblue") +
geom_smooth(method = "lm", se=FALSE, color="black") +
scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(limits = c(14,36), breaks=seq(14, 36, 2), minor_breaks=seq(15, 35, 2)) +
theme(legend.position="none") +
labs(
x = "ACT Math",
y = "Fall semester GPA at St. Ambrose"
)
ggMarginal(p, type="histogram", fill="steelblue", color="white")
# Warning: Removed 31 rows containing non-finite values (stat_smooth).
# Warning: Removed 3 rows containing non-finite values (stat_bin).
# Warning: Removed 30 rows containing non-finite values (stat_smooth).
# Warning: Removed 27 rows containing missing values (geom_point).
# Warning: Removed 4 rows containing non-finite values (stat_bin).
# Warning: Removed 26 rows containing non-finite values (stat_bin).
ggplot(sau, aes(x = hsgpa, y=jitter(retention, .1))) +
geom_point(alpha=.3, color="steelblue") +
geom_smooth(method = "lm", se=FALSE, color="black") +
scale_y_continuous(limits = c(0,1), breaks=seq(0, 1, 1), minor_breaks=NULL) +
theme(legend.position="none") +
labs(
x = "high school GPA",
y = "1 = student returned for sophomore year"
)
# Warning: Removed 963 rows containing non-finite values (stat_smooth).
# Warning: Removed 949 rows containing missing values (geom_point).
Here’s some simulated data showing the relationship between ACT scores and 1st-semester college GPAs.
# Simulate ACT and first-semester GPA data
set.seed(123)
firstyear <- tibble(
act = c(rep(c(14, 16, 18, 20, 22, 24, 26, 28, 30, 32),3)),
gpa = round((act/9) + rnorm(30, 0, .3),2),
act2 = act + rnorm(30,0,.1)
)
# Plot the data
ggplot(data = firstyear, aes(x = act, y = gpa)) +
geom_point() +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
Let’s randomly generate 300 lines of the form \(y = b_0+b_1x\) to display on our data:
# Looks linear. Randomly choose 250 slopes and y-intercepts.
set.seed(123)
models <- tibble(
b0 = runif(300, -2, 3.5),
b1 = runif(300, -.2, .2)
)
# Plot all 300 models on top of the data
ggplot(data = firstyear, aes(x = act, y = gpa)) +
geom_abline(aes(intercept = b0, slope = b1), data = models, alpha = 0.25) +
geom_point() +
geom_abline(aes(intercept = 0.0638, slope = .1077), data = models, alpha = 0.25) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
# Copy our data to new data frame
withpredictions <- firstyear
# Choose a slope and y-intercept
b00 <- 42/24
b10 <- 1/24
# Predict GPAs with this slope and y-intercept
withpredictions$pred2 <- b00 + (b10 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid2 <- withpredictions$gpa - ( b00 + (b10 * withpredictions$act) )
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b00, slope = b10), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend=withpredictions$act2,
y=withpredictions$gpa, yend=withpredictions$pred2, color="red") +
annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =",
round( sqrt(mean(withpredictions$resid2^2)), 3))) +
annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =",
round( sum(abs(withpredictions$resid2)), 3))) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
# Choose a slope and y-intercept
b01 <- 3/50
b11 <- 1/9
# Predict GPAs with this slope and y-intercept
withpredictions$pred1 <- b01 + (b11 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid1 <- withpredictions$gpa - ( b01 + (b11 * withpredictions$act) )
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b01, slope = b11), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend=withpredictions$act2,
y=withpredictions$gpa, yend=withpredictions$pred1, color="red") +
annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =",
round( sqrt(mean(withpredictions$resid1^2)), 3))) +
annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =",
round( sum(abs(withpredictions$resid1)), 3))) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
# Choose a slope and y-intercept
b02 <- 3.5
b12 <- -1/42
# Predict GPAs with this slope and y-intercept
withpredictions$pred3 <- b02 + (b12 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid3 <- withpredictions$gpa - ( b02 + (b12 * withpredictions$act) )
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b02, slope = b12), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend=withpredictions$act2,
y=withpredictions$gpa, yend=withpredictions$pred3, color="red") +
annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =",
round( sqrt(mean(withpredictions$resid3^2)), 3))) +
annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =",
round( sum(abs(withpredictions$resid3)), 3))) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
We can calculate this root mean square deviation for all 300 lines we plotted earlier. Let’s display the ten best models (with the smallest RMSE).
# Don't worry about this code.
# Function to calculate predictions based on slope and y-intercept
model1 <- function(a, data) {
a[1] + data$act * a[2]
}
# Function to calculate vertical error distances
measure_distance <- function(mod, data) {
diff <- data$gpa - model1(mod, data)
sqrt(mean(diff ^ 2))
}
# Compute distance for all 300 models
sim1_dist <- function(b0, b1) {
measure_distance(c(b0, b1), firstyear)
}
# Add distances to the models data frame
models <- models %>%
mutate(dist = purrr::map2_dbl(b0, b1, sim1_dist))
# Find 10 best models (with smallest rmse) and plot on data
ggplot(firstyear, aes(act, gpa)) +
geom_point(size = 2, colour = "black") +
geom_abline(
aes(intercept = b0, slope = b1, colour = -dist),
data = filter(models, rank(dist) <= 10)
) +
labs(title = "10 best models (with smallest RMSE)")
Each of the ten lines displayed above has a y-intercept and a slope, so we can think of each model as a point: \((b_0, \ b_1)\). Let’s plot these “points” for all 300 linear models:
# Plot the slopes and y-intercepts of all 300 models
# Highlight best combinations of slope and intercept with red circles
ggplot(models, aes(b0, b1)) +
geom_point(data = filter(models, rank(dist) <= 10), size = 4, colour = "red") +
geom_point(aes(colour = -dist))
The points with lighter shades of blue represent models with smaller amounts of error. The red circles highlight the 10 best models that were displayed earlier.
Let’s try a grid search to find the best combination of slope and y-intercept.
# Create the grid by selecting ranges for b0 and b1
grid <- expand.grid(
b0 = seq(-1, 1, length = 25),
b1 = seq(0.05, .15, length = 25)
) %>%
mutate(dist = purrr::map2_dbl(b0, b1, sim1_dist))
# Zoom and enhance
grid %>%
ggplot(aes(b0, b1)) +
geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
geom_point(aes(colour = -dist))
Let’s take these 10 best models and display them back on the data.
ggplot(firstyear, aes(act, gpa)) +
geom_point(size = 2, colour = "grey30") +
geom_abline(
aes(intercept = b0, slope = b1, colour = -dist),
data = filter(grid, rank(dist) <= 10)
)
We could continue this zoom-and-enhance method until we really narrowed down our best y-intercept and slope. Thankfully, there’s an easier way.
# Choose a slope and y-intercept
b0 <- 0.06384
b1 <- 0.10772
# Predict GPAs with this slope and y-intercept
withpredictions$pred <- b0 + (b1 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid <- withpredictions$gpa - ( b0 + (b1 * withpredictions$act) )
# Plot vertical errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend=withpredictions$act2,
y=withpredictions$gpa, yend=withpredictions$pred, color="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
# Plot horizontal errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend= ( (withpredictions$gpa - b0) / b1),
y=withpredictions$gpa, yend=withpredictions$gpa, color="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
# Plot horizontal errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend= ((withpredictions$act2 + b1 * withpredictions$gpa - b0 * b1) / (1 + b1^2)),
y=withpredictions$gpa, yend = (b0 + b1 * withpredictions$act2), color="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
geom_abline(aes(intercept = 200, slope=-3)) + coord_fixed(ratio = 1, xlim=c(18,26)) +
labs(x = "ACT", y=NULL)
# Plot squared errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - withpredictions$pred),
ymin=withpredictions$gpa, ymax=withpredictions$pred, alpha=.3, fill="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
labs(x = "ACT", y=NULL)
ggplot(data = withpredictions, aes(x = act, y = gpa)) +
geom_point() +
geom_line(color="blue", size=1) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
ggplot(data = withpredictions, aes(x = act, y = gpa)) +
geom_point() +
geom_smooth(method="loess", se=FALSE, span=.4) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
ggplot(data = withpredictions, aes(x = act, y = gpa)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
We want to minimize the squared vertical distances from predictions made by our model, \(\hat{y}_i=b_0+b_1x_i\) to each observed data point, \((x_i, y_i)\).
Each of these vertical distances can be written as: \(e_i = y_i-(b_0+b_1x_i)=y_i-b_0-b_1x_i\)
Our goal is to find values for the y-intercept \((b_0)\) and slope \((b_1)\) that minimize: \(Q =\sum_{i=1}^{n}(y_i-b_0-b_1x_i)^2\).
To minimize a function, we can set its first derivative equal to zero and solve (and verify the second derivative is positive at that value). Since we have two variables in \(Q\), we’ll need to take partial derivatives:
\(\frac{\partial Q }{\partial b_0}=\frac{\partial \sum(y_i-b_0-b_1x_i)^2 }{\partial b_0}=2\sum(y_i-b_0-b_1x_i)\frac{\partial \sum(y_i-b_0-b_1x_i) }{\partial b_0}=-2\sum(y_i-b_0-b_1x_i)\)
Set this partial derivative equal to zero:
\(-2\sum(y_i-b_0-b_1x_i)=0\)
\(\sum(y_i-b_0-b_1x_i)=0\)
\(\sum{y_i}=nb_0+b_1\sum{x_i}\)
\(\frac{\partial Q }{\partial b_1}=\frac{\partial \sum(y_i-b_0-b_1x_i)^2 }{\partial b_1}=2\sum(y_i-b_0-b_1x_i)\frac{\partial \sum(y_i-b_0-b_1x_i) }{\partial b_0}=-2\sum{x_i}(y_i-b_0-b_1x_i)\)
Set this partial derivative equal to zero:
\(-2\sum{x_i}(y_i-b_0-b_1x_i)=0\)
\(\sum{x_i}(y_i-b_0-b_1x_i)=0\)
\(\sum{x_iy_i}-b_0\sum{x_i}-b_1\sum{x_i^2}=0\)
\(\sum{x_iy_i}=b_0\sum{x_i}+b_1\sum{x_i^2}=0\)
\(\sum{y_i}=nb_0+b_1\sum{x_i}\)
\(\sum{x_iy_i}=b_0\sum{x_i}+b_1\sum{x_i^2}\)
We can solve this system to get:
\(b_1=\frac{n\sum{x_iy_i}-\sum{x_i}\sum{y_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}\)
and
\(b_0=\frac{\sum{x_i^2}\sum{y_i}-\sum{x_i}\sum{x_iy_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}=\frac{\sum{y_i}}{n}-b_1\frac{\sum{x_i}}{n}=\overline{y}-b_1\overline{x}\)
Using formulas for variance and covariance, we can rewrite \(b_1\) as:
\(b_1=\frac{n\sum{x_iy_i}-\sum{x_i}\sum{y_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}=\frac{s_{xy}}{s_x^2}=r\frac{s_y}{s_x}\)
# Get summary statistics
firstyear %>%
summarize(meanx = mean(act), sdx = sd(act),
meany = mean(gpa), sdy = sd(gpa),
n = n(), cor = cor(act, gpa))
# # A tibble: 1 x 6
# meanx sdx meany sdy n cor
# <dbl> <dbl> <dbl> <dbl> <int> <dbl>
# 1 23 5.842767 2.541333 0.6944397 30 0.9062938
# Calculate b1
firstyear %>%
summarize(b1 = cor(gpa, act) * sd(gpa) / sd(act),
b0 = mean(gpa) - (b1 * mean(act)) )
# # A tibble: 1 x 2
# b1 b0
# <dbl> <dbl>
# 1 0.1077172 0.06383838
In R, we can use the lm()
function to fit linear models. Expand the code –>
# lm = linear model
# lm(y ~ x, data)
lm(gpa ~ act, data = firstyear)
#
# Call:
# lm(formula = gpa ~ act, data = firstyear)
#
# Coefficients:
# (Intercept) act
# 0.06384 0.10772
#
# The output shows coefficients
# It's typically more useful to store the model
model1 <- lm(gpa ~ act, data = firstyear)
#
# We can then investigate this model
# Summary statistics
summary(model1)
#
# Call:
# lm(formula = gpa ~ act, data = firstyear)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.55992 -0.17981 -0.02775 0.17302 0.55095
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.063838 0.225053 0.284 0.779
# act 0.107717 0.009493 11.347 5.51e-12 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.2987 on 28 degrees of freedom
# Multiple R-squared: 0.8214, Adjusted R-squared: 0.815
# F-statistic: 128.7 on 1 and 28 DF, p-value: 5.515e-12
#
# Coefficients
coef(model1)
# (Intercept) act
# 0.06383838 0.10771717
#
# Broom package: tidy(), glance(), augment() functions
# Tidy the model coefficients, standard errors, and p-values
tidy(model1)
# # A tibble: 2 x 5
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 0.06383838 0.225052612 0.2836598 7.787580e-01
# 2 act 0.10771717 0.009493271 11.3466861 5.514804e-12
#
# Glance at summary statistics for the model
glance(model1)
# # A tibble: 1 x 11
# r.squared adj.r.squared sigma statistic p.value df
# <dbl> <dbl> <dbl> <dbl> <dbl> <int>
# 1 0.8213685 0.8149888 0.2986988 128.7473 5.514804e-12 2
# # ... with 5 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
# # deviance <dbl>, df.residual <int>
#
# Augment the data with predictions and residuals
head(augment(model1))
# # A tibble: 6 x 9
# gpa act .fitted .se.fit .resid .hat .sigma
# * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1.39 14 1.571879 0.10136040 -0.18187879 0.11515152 0.3018954
# 2 1.71 16 1.787313 0.08596523 -0.07731313 0.08282828 0.3037830
# 3 2.47 18 2.002747 0.07229860 0.46725253 0.05858586 0.2897173
# 4 2.24 20 2.218182 0.06152343 0.02181818 0.04242424 0.3041497
# 5 2.48 22 2.433616 0.05535481 0.04638384 0.03434343 0.3040443
# 6 3.18 24 2.649051 0.05535481 0.53094949 0.03434343 0.2858551
# # ... with 2 more variables: .cooksd <dbl>, .std.resid <dbl>
#
#
# Modelr package: add_predictions() function
# Add predictions to our dataset
firstyear <- firstyear %>%
add_predictions(model1)
#
# Now we can plot these predictions as a line
ggplot(data = firstyear, aes(x = act, y = gpa)) +
geom_point() +
geom_line(aes(y = pred), color="blue", size=1) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
Just because we’re able to calculate the best line, it doesn’t mean that line is meaningful. For example, here’s the line that best fits this data:
# Simulate data forming a parabola
sim_data <- tibble(
x = c(-5:5),
y = x^2
)
# Plot the data
# geom_smooth(method = "lm") plots the line of best fit
ggplot(data = sim_data, aes(x = x, y = y)) +
geom_point(size=2) +
geom_smooth(method="lm", se=FALSE)
It looks like we’ll need some way of measuring just how well a model (a line) fits a given dataset. Let’s derive some of these measures for three different datasets:
We’ll complete the following table to see the value of each measure in each case:
# Later, we'll see what this ANOVA table represents
# For now, it gives us SSE (Sum Sq Residuals)
anova(model1)
# # A tibble: 2 x 5
# Df `Sum Sq` `Mean Sq` `F value` `Pr(>F)`
# * <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 11.486959 11.48695919 128.7473 5.514804e-12
# 2 28 2.498187 0.08922098 NA NA
# We can also calculate this directly with the residuals
sum(residuals(model1)^2)
# [1] 2.498187
Perhaps a better measure of fit would be the average squared error (the average squared distance from observation to the prediction line). One simple estimate of this average squared error would be MSE:
\(s_{y|x}^2=\frac{SS_E}{df_E}=\frac{\Sigma(y_i-\hat{y})^2}{n-2}\)
# MSE is displayed in the ANOVA summary table. Calculate directly with:
sum(residuals(model1)^2) / model1$df.residual
# [1] 0.08922098
# We can calculate it directly with...
sqrt( sum(residuals(model1)^2) / model1$df.residual )
# [1] 0.2986988
# We can also glance at it (sigma) with the broom package
glance(model1)
# # A tibble: 1 x 11
# r.squared adj.r.squared sigma statistic p.value df
# <dbl> <dbl> <dbl> <dbl> <dbl> <int>
# 1 0.8213685 0.8149888 0.2986988 128.7473 5.514804e-12 2
# # ... with 5 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
# # deviance <dbl>, df.residual <int>
# The modelr package has a rmse() function
rmse(model1, firstyear)
# [1] 0.2885705
# Calculate it directly. length = sample size
sqrt( sum(residuals(model1)^2) / length(firstyear$act) )
# [1] 0.2885705
# The modelr package has an rsquare() function
1 - rsquare(model1, firstyear)
# [1] 0.1786315
# The modelr package has an rsquare() function
rsquare(model1, firstyear)
# [1] 0.8213685
# We can glance at it with the broom package
glance(model1)
# # A tibble: 1 x 11
# r.squared adj.r.squared sigma statistic p.value df
# <dbl> <dbl> <dbl> <dbl> <dbl> <int>
# 1 0.8213685 0.8149888 0.2986988 128.7473 5.514804e-12 2
# # ... with 5 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
# # deviance <dbl>, df.residual <int>
# It also appears in the summary
summary(model1)
#
# Call:
# lm(formula = gpa ~ act, data = firstyear)
#
# Residuals:
# Min 1Q Median 3Q Max
# -0.55992 -0.17981 -0.02775 0.17302 0.55095
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.063838 0.225053 0.284 0.779
# act 0.107717 0.009493 11.347 5.51e-12 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.2987 on 28 degrees of freedom
# Multiple R-squared: 0.8214, Adjusted R-squared: 0.815
# F-statistic: 128.7 on 1 and 28 DF, p-value: 5.515e-12
# We can also get it from the ANOVA table
anova(model1)
# # A tibble: 2 x 5
# Df `Sum Sq` `Mean Sq` `F value` `Pr(>F)`
# * <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 11.486959 11.48695919 128.7473 5.514804e-12
# 2 28 2.498187 0.08922098 NA NA
11.487/(11.487+2.4982)
# [1] 0.8213683
Validity: The data we’re analyzing maps to the research question we are trying to answer
Diagnosis: Look at plots of observed vs predicted or residuals vs predicted values. The points should be symmetrically distributed around a diagonal line in the former plot or around horizontal line in the latter plot, with a roughly constant variance.
Diagnosis: If you have time series data, be careful that consecutive errors are not related.
Diagnosis: Look at the plot of residuals vs predicted values. If the residuals grow larger as a function of X, you have a problem.
Diagnosis: Look at a P-P or Q-Q plot of the residuals. The residuals should fall near the diagonal line. You could also run a test for normality, like the Shapiro-Wilk or Kologorov-Smirnov tests. Note that the dependent and independent variables in a regression model do not need to be normally distributed by themselves–only the prediction errors need to be normally distributed
plot()
function displays some diagnostic plots for our model. Evaluate the conditions listed above based on our dataset, question of interest, and the diagnostic plots.plot(model1)
Let’s finally use our real dataset. To change things up a bit, let’s regress Fall semester GPAs on high school GPAs. Make sure you can interpret all this output:
full_model <- lm(fallgpa ~ hsgpa, data = sau)
summary(full_model)
#
# Call:
# lm(formula = fallgpa ~ hsgpa, data = sau)
#
# Residuals:
# Min 1Q Median 3Q Max
# -3.12108 -0.30133 0.07284 0.36865 1.74310
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.05767 0.07984 0.722 0.47
# hsgpa 0.82572 0.02425 34.046 <2e-16 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.5832 on 1902 degrees of freedom
# Multiple R-squared: 0.3787, Adjusted R-squared: 0.3783
# F-statistic: 1159 on 1 and 1902 DF, p-value: < 2.2e-16
rmse(full_model, data = sau)
# [1] 0.5828609
plot(full_model)
Earlier, we learned that we always expect to get a non-zero correlation in any sample of data. Similarly, we expect linear models with non-zero slopes in any sample of data (even if we have reason to believe the variables are uncorrelated).
Our model to predict fall semester GPAs is: FallGPA = 0.058 + 0.926(hsGPA)
Does the magnitude of this slope (0.926) imply high school and college GPAs have a relationship for our population of interest or could we have obtained a slope of this magnitude even if the variables are uncorrelated?
# Store our observed slope of 0826
observed_slope <- full_model$coefficients[2]
# Run 10,000 randomizations, shuffling the high school GPA
rand_slopes <- Do(10000) * lm(fallgpa ~ shuffle(hsgpa), data = sau)
# The slopes are stored in the "hsgpa" variable. Let's plot and estimate the p-value.
ggplot(data = rand_slopes, aes(x = hsgpa)) +
geom_histogram(fill="lightblue", color="white", alpha = 0.8) +
labs(
title = "Randomized slopes",
x = "Slope"
) +
scale_x_continuous(breaks=seq(-.1, .1, .025), minor_breaks=NULL) +
theme(
axis.text.x = element_text(size = 11, color="grey10"),
legend.position = "none",
panel.grid.major.y = element_line(colour = "white"),
panel.grid.major.x = element_line(colour = "white", size=.15),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "grey93")
) +
annotate("text", x = .075, y = 600, label = paste("p = 0"))
# `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
boot <- Do(10000) * lm(fallgpa ~ hsgpa, data = sample(sau, replace=TRUE))
bootstrapCI <- confint(boot$hsgpa, level = 0.95, method = "quantile")
lower <- as.numeric(bootstrapCI[1]) # Store lower CI bound
upper <- as.numeric(bootstrapCI[2]) # Store upper CI bound
# Density plot
ggplot(data = boot, aes(x = hsgpa)) +
geom_density(fill="lightblue", color="white", alpha = 0.8) +
labs(
title = "Bootstrap distribution of correlations",
x = "bootstrap correlations"
) +
scale_x_continuous(breaks=seq(.7, 1, 0.05), minor_breaks=NULL) +
theme(
axis.text.x = element_text(size = 11, color="grey10"),
legend.position = "none",
panel.grid.major.y = element_line(colour = "white"),
panel.grid.major.x = element_line(colour = "white", size=.15),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "grey93")
) +
annotate("text", x = lower, y = 5, label = round(lower,3)) +
annotate("text", x = upper, y = 5, label = round(upper,3)) +
annotate("text", x = median(boot$hsgpa), y = 6, label = paste("95%")) +
annotate("segment", x = lower, xend = upper, y = 4, yend = 4, color = "red")
In the 14th century, William of Ockham wrote Entia non sunt multiplicanda praeter necessitatem, which translates to: More things should not be used than are necessary.
When comparing two competing theories, we might use Ockham’s Razor and prefer theories that are simpler (more parsimonious in that they require fewer assumptions or are easier to explain). On the other hand, we might prefer more complex theories if they more accurately predict future events.
When we compare competing statistical models, we’ll have to trade-off accuracy and simplicity. We’ll have to learn to navigate between:
and
If we’re interested in the best prediction of fall semester GPAs, one approach we might take is:
We could also start with a complicated model, remove a predictor, and determine if the fit worsens noticeably.
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = mean(gpa), slope = 0), color="blue", size=1) +
annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - mean(withpredictions$gpa)),
ymin=mean(withpredictions$gpa), ymax=withpredictions$gpa, alpha=.3, fill="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
labs(x = "ACT", y=NULL) +
labs(title = paste("Reduced model SSE =", round(anova(lm(gpa ~ act, data = firstyear))[2,2]+anova(lm(gpa ~ act, data = firstyear))[1,2],5)))
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - withpredictions$pred),
ymin=withpredictions$gpa, ymax=withpredictions$pred, alpha=.3, fill="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
labs(x = "ACT", y=NULL) +
labs(title = paste("Full model SSE =", round(anova(lm(gpa ~ act, data = firstyear))[2,2],5)))
anova(lm(gpa ~ act, data = firstyear))
# # A tibble: 2 x 5
# Df `Sum Sq` `Mean Sq` `F value` `Pr(>F)`
# * <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 11.486959 11.48695919 128.7473 5.514804e-12
# 2 28 2.498187 0.08922098 NA NA
The only difference between our full and reduced models is the \(b_1\) coefficient (the slope). If \(b_1=0\), the full model would be the same as our reduced model. Another way, then, to compare our full and reduced models would be to test the hypothesis: \(H_0:b_1=0\). We can test this hypothesis with a t-test.
t <- cor(firstyear$gpa, firstyear$act) / sqrt( (1-cor(firstyear$gpa, firstyear$act)^2) / (length(firstyear$act)-2))
t
# [1] 11.34669
t^2
# [1] 128.7473
anova()
function.# Manual calculation
Rf <- cor(firstyear$gpa, firstyear$act)
Rr <- 0
N <- length(firstyear$gpa)
kf <- 1
kr <- 0
F <- ( ((Rf^2 - Rr^2) / (kf - kr)) / ((1-Rf^2)/(N-kf-1)) )
F
# [1] 128.7473
#
# ANOVA function
fullmodel <- lm(gpa ~ act, data=firstyear)
reducedmodel <- lm(gpa ~ 1, data=firstyear)
anova(reducedmodel, fullmodel)
# # A tibble: 2 x 6
# Res.Df RSS Df `Sum of Sq` F `Pr(>F)`
# * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 29 13.985147 NA NA NA NA
# 2 28 2.498187 1 11.48696 128.7473 5.514804e-12
As we’ll soon see, R-squared and p-values resulting from the omnibus F-test aren’t the best measures to use when comparing models. Here are a couple alternatives we’ll build upon later:
The likelihood of a model is the probability it produces our data given its parameter estimates. If we assume all our observations are independent, then we can write our likelihood function as:
We want to calculate \(P(\textrm{observation 1} \ \cap \ \textrm{observation 2} \ \cap \ ...)\). When we assume independence, we can calculate this as: \(P(\textrm{observation 1})P(\textrm{observation 2})...\).
Given our model with \(b_0\) and \(b_1\) – along with our assumption that errors are normally distributed – we can use a normal distribution to find \(P(\textrm{observation 1})\). We then simply have to multiply these probabilities for each observation in our dataset.
As an example, our first observation is: ACT = 14
with GPA = 1.39
. Recall our model has \(b_0=0.06384\), \(b_1=0.10772\), and \(RSE=\sqrt{MS_E}=\sqrt{0.0892}=0.2987\).
For all students with an ACT score of 14, our model expects a GPA of: \(\hat{y}=0.06384+0.10772(14)=1.5719\).
Given that expectation, we can find the probability of actually observing a GPA of 1.39:
\(P[y=1.39 \ | \ y \sim \textrm{ND}(\mu=1.5719, \sigma=0.2987)]\)
Expand the code to see how to calculate this manually in R.
# We'll use dnorm to calculate the likelihood of each observation
# Here's the likelihood of the first observation:
dnorm(1.39, mean = (b0 + (b1 * 14)), sd = 0.2987)
# [1] 1.109504
# Let's calculate the likelihood for ALL observations
# and multiply those together to get the model likelihood
# We'll also calculate the log-likelihood (which we'll see in a bit)
firstyear %>%
mutate(likelihood = dnorm(gpa, mean = (b0 + (b1 * act)), sd = 0.2987)) %>%
summarize(L = round(prod(likelihood),3 ),
logL = round( log(L),2 ))
# # A tibble: 1 x 2
# L logL
# <dbl> <dbl>
# 1 0.005 -5.3
Typically, it’s easier to work with the natural log of the likelihood. This log-likelihood will always be negative, with values closer to zero indicating better model fit.
We can use log-likelihood to compare full and reduced models (by calculating the likelihood ratio):
If the full model is much better than the reduced model, we would expect:
To estimate a p-value from this likelihood ratio, we can use a chi-square distribution with \(df = df_{\textrm{full}} - df_{\textrm{reduced}}\)
Expand the code to see this calculation in R:
# Let's first calculate log-likelihood
# We can glance at the with the broom package
glance(fullmodel)
# # A tibble: 1 x 11
# r.squared adj.r.squared sigma statistic p.value df
# <dbl> <dbl> <dbl> <dbl> <dbl> <int>
# 1 0.8213685 0.8149888 0.2986988 128.7473 5.514804e-12 2
# # ... with 5 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
# # deviance <dbl>, df.residual <int>
# We can calculate it directly with logLik()
logLik(fullmodel)
# 'log Lik.' -5.283677 (df=3)
logLik(reducedmodel)
# 'log Lik.' -31.12013 (df=2)
# Calculate the likelihood ratio
LR <- 2 * (logLik(fullmodel) - logLik(reducedmodel))
LR
# 'log Lik.' 51.67291 (df=3)
# Calculate the p-value
pchisq(LR, df=1, lower.tail=FALSE)
# 'log Lik.' 6.556111e-13 (df=3)
# If you install the lmtest package, you can use lrtest
library(lmtest)
# Loading required package: zoo
#
# Attaching package: 'zoo'
# The following objects are masked from 'package:base':
#
# as.Date, as.Date.numeric
lrtest(reducedmodel, fullmodel)
# # A tibble: 2 x 5
# `#Df` LogLik Df Chisq `Pr(>Chisq)`
# * <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2 -31.120132 NA NA NA
# 2 3 -5.283677 1 51.67291 6.556111e-13
Akaike’s Information Criterion is another measure to compare competing models. AIC seeks to find the model that has a good fit to the data with relatively few parameters (predictors). It’s defined as:
where b = the number of coefficients estimated in our model (slope(s) and intercept)
and p = the number of predictors in our model
. When comparing models, we prefer the model that produces the smaller AIC value.
Expand the code to see the AIC() function in action:
AIC(reducedmodel, fullmodel)
# # A tibble: 2 x 2
# df AIC
# * <dbl> <dbl>
# 1 2 66.24026
# 2 3 16.56735
anova(fullmodel)
# # A tibble: 2 x 5
# Df `Sum Sq` `Mean Sq` `F value` `Pr(>F)`
# * <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 11.486959 11.48695919 128.7473 5.514804e-12
# 2 28 2.498187 0.08922098 NA NA
neuron <- read.csv("http://www.bradthiessen.com/html5/data/violin.csv")
# Convert years played to a factor variable. I'll create a new variable.
neuron <- neuron %>%
mutate(years_group = case_when(
.$years == 0 ~ "a) 0 years",
.$years > 0 & .$years < 10 ~ "b) <10 years",
.$years > 10 ~ "c) >10 years"),
years_group = as.factor(years_group))
anova(aov(neural ~ years_group, data = neuron))
# # A tibble: 2 x 5
# Df `Sum Sq` `Mean Sq` `F value` `Pr(>F)`
# * <int> <dbl> <dbl> <dbl> <dbl>
# 1 2 800.36548 400.182738 92.93477 1.287295e-07
# 2 11 47.36667 4.306061 NA NA
# Create the models
full_neuron <- lm(neural ~ years, data = neuron)
reduced_neuron <- lm(neural ~ 1, data = neuron)
# Summarize the full model
summary(full_neuron)
#
# Call:
# lm(formula = neural ~ years, data = neuron)
#
# Residuals:
# Min 1Q Median 3Q Max
# -4.8644 -2.3730 0.1614 2.3713 4.6471
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 8.3873 1.1149 7.523 4.35e-06 ***
# years 0.9971 0.1110 8.980 6.18e-07 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 3.009 on 13 degrees of freedom
# Multiple R-squared: 0.8612, Adjusted R-squared: 0.8505
# F-statistic: 80.63 on 1 and 13 DF, p-value: 6.178e-07
# ANOVA summary table
anova(full_neuron)
# # A tibble: 2 x 5
# Df `Sum Sq` `Mean Sq` `F value` `Pr(>F)`
# * <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 730.2060 730.206005 80.63275 6.178311e-07
# 2 13 117.7273 9.055948 NA NA
# Model comparison
AIC(full_neuron, reduced_neuron)
# # A tibble: 2 x 2
# df AIC
# * <dbl> <dbl>
# 1 3 79.47297
# 2 2 107.08943
lrtest(reducedmodel, fullmodel)
# # A tibble: 2 x 5
# `#Df` LogLik Df Chisq `Pr(>Chisq)`
# * <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2 -31.120132 NA NA NA
# 2 3 -5.283677 1 51.67291 6.556111e-13
# Get predicted values and residuals
neuron <- neuron %>% add_predictions(full_neuron)
neuron <- neuron %>% add_residuals(full_neuron)
# Plot model on data
ggplot(data = neuron, aes(x = years, y = neural)) +
geom_point() +
geom_line(aes(y = pred), color="blue", size=1) +
scale_y_continuous(breaks=seq(0, 30, 10), minor_breaks=seq(5, 25, 5)) +
scale_x_continuous(breaks=seq(0, 20, 5), minor_breaks=NULL)
# Plot residuals vs. predicted values
ggplot(data = neuron, aes(x = pred, y = resid)) +
geom_point() +
geom_hline(yintercept = 0)
prestige_data <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")
ggplot(data = prestige_data, aes(x = income, y = prestige)) +
geom_point(color="steelblue") +
geom_smooth(method="lm", se=FALSE) +
annotate("text", x=15000, y=35, label="R-squared = 0.511") +
annotate("text", x=15000, y=28, label="y = 27.141 + 0.0029x")
prestige_model <- lm(prestige ~ income, data=prestige_data)
tidy(prestige_model)
# # A tibble: 2 x 5
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) 27.141176368 2.2677036186 11.96857 5.135229e-21
# 2 income 0.002896799 0.0002833245 10.22432 3.192004e-17
tidy(anova(prestige_model))
# # A tibble: 2 x 6
# term df sumsq meansq statistic p.value
# <chr> <int> <dbl> <dbl> <dbl> <dbl>
# 1 income 1 15279.26 15279.2567 104.5367 3.192004e-17
# 2 Residuals 100 14616.17 146.1617 NA NA
Confidence interval for predictions: \(\hat{y} \ \pm \ t_{n-2}^{\alpha/2}s_{y|x}\sqrt{\frac{1}{n}+\frac{(x_0-\overline{X})^2}{(n-1)s_x^2}}\) where \(s_{y|x}=\sqrt{MS_E}\)
A 95% confidence interval for our prediction of 7000 is, then: \(7000 \ \pm \ 2\sqrt{146.16}\sqrt{\frac{1}{102}+\frac{(7000-6797.9)^2}{(102-1)(s_x^2)4245.92^2}}=47.4 \ \pm \ 2.38\)
ggplot(data = prestige_data, aes(x = income, y = prestige)) +
geom_point(color="steelblue") +
geom_smooth(method = "lm", se=TRUE, color="red")
Prediction Interval: \(\hat{y} \ \pm \ t_{n-2}^{\alpha/2}s_{y|x}\sqrt{1+\frac{1}{n}+\frac{(x_0-\overline{X})^2}{(n-1)s_x^2}}\)
A 95% prediction interval for our prediction of 7000 is, then: \(7000 \ \pm \ 2\sqrt{146.16}\sqrt{1+\frac{1}{102}+\frac{(7000-6797.9)^2}{(102-1)(s_x^2)4245.92^2}}=47.4 \ \pm \ 24.10\)
We can get a rough estimate of this interval by simply taking: \(\hat{y} \ \pm \ 2\sqrt{MS_E}\)
# Get predictions (actual and lower/upper confidence)
predictions <- predict(prestige_model, interval="prediction")
# Warning in predict.lm(prestige_model, interval = "prediction"): predictions on current data refer to _future_ responses
# Add them to data frame
prestige_data <- cbind(prestige_data, predictions)
# Plot
ggplot(data = prestige_data, aes(x = income, y = prestige)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), fill="grey70", alpha=.5) +
geom_point(color="steelblue") +
geom_smooth(method="lm", se=FALSE)
Lowess regression
Robust regression
Box-Cox transforms
Entropy
Sources
This document is released under a Creative Commons Attribution-ShareAlike 3.0 Unported license.