# Install and load all necessary packages
list.of.packages <- c("tidyverse", "broom", "mosaic", "ggExtra")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(tidyverse)
library(mosaic)
library(broom)
library(modelr)
library(ggExtra)
The class will choose the criteria we use to declare the winner.
Celebrity | Guess | Actual | ??? |
---|---|---|---|
1. Oprah | |||
2. Tom Hanks | |||
3. Betty White | |||
4. Pelé | |||
5. Michael Jordan | |||
6. Taylor Swift | |||
7. Will Smith | |||
8. George Washington | |||
9. (students’ choice) | |||
10. (students’ choice) | |||
score = __________ |
Suppose you work in the admissions office and are tasked with admitting students who are predicted to be successful at St. Ambrose.
# Read data
sau <- read.csv("http://www.bradthiessen.com/html5/data/sau.csv")
# Convert female and minority variables to factors
sau <- sau %>%
mutate(female = factor(female, labels = c("male", "female")),
minority = factor(minority, labels = c("white", "minority")))
p <- ggplot(sau, aes(x = hsgpa, y=fallgpa)) +
geom_point(alpha=.3, color="steelblue") +
geom_smooth(method = "lm", se=FALSE, color="black") +
scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(limits = c(0.5,4), breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
theme(legend.position="none") +
labs(
x = "High School GPA",
y = "Fall semester GPA at St. Ambrose"
) +
annotate("text", x=1.1, y=1.5, label="y = 0.058 + 0.826x")
# Use library(ggExtra) to plot marginal histograms
ggMarginal(p, type="histogram", fill="steelblue", color="white")
p <- ggplot(sau, aes(x = jitter(ACTtotal), y=fallgpa)) +
geom_point(alpha=.3, color="steelblue") +
geom_smooth(method = "lm", se=FALSE, color="black") +
scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(limits = c(15,36), breaks=seq(16, 36, 2), minor_breaks=seq(15, 35, 2)) +
theme(legend.position="none") +
labs(
x = "ACT Composite",
y = "Fall semester GPA at St. Ambrose"
)
# Use library(ggExtra) to plot marginal histograms
ggMarginal(p, type="histogram", fill="steelblue", color="white")
p <- ggplot(sau, aes(x = jitter(ACTmath), y=fallgpa)) +
geom_point(alpha=.3, color="steelblue") +
geom_smooth(method = "lm", se=FALSE, color="black") +
scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(limits = c(14,36), breaks=seq(14, 36, 2), minor_breaks=seq(15, 35, 2)) +
theme(legend.position="none") +
labs(
x = "ACT Math",
y = "Fall semester GPA at St. Ambrose"
)
ggMarginal(p, type="histogram", fill="steelblue", color="white")
ggplot(sau, aes(x = hsgpa, y=jitter(retention, .1))) +
geom_point(alpha=.3, color="steelblue") +
geom_smooth(method = "lm", se=FALSE, color="black") +
scale_y_continuous(limits = c(0,1), breaks=seq(0, 1, 1), minor_breaks=NULL) +
theme(legend.position="none") +
labs(
x = "high school GPA",
y = "1 = student returned for sophomore year"
)
Here’s some simulated data showing the relationship between ACT scores and 1st-semester college GPAs.
# Simulate ACT and first-semester GPA data
set.seed(123)
firstyear <- tibble(
act = c(rep(c(14, 16, 18, 20, 22, 24, 26, 28, 30, 32),3)),
gpa = round((act/9) + rnorm(30, 0, .3),2),
act2 = act + rnorm(30,0,.1)
)
# Plot the data
ggplot(data = firstyear, aes(x = act, y = gpa)) +
geom_point() +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
Let’s randomly generate 300 lines of the form \(y = b_0+b_1x\) to display on our data:
# Looks linear. Randomly choose 250 slopes and y-intercepts.
set.seed(123)
models <- tibble(
b0 = runif(300, -2, 3.5),
b1 = runif(300, -.2, .2)
)
# Plot all 300 models on top of the data
ggplot(data = firstyear, aes(x = act, y = gpa)) +
geom_abline(aes(intercept = b0, slope = b1), data = models, alpha = 0.25) +
geom_point() +
geom_abline(aes(intercept = 0.0638, slope = .1077), data = models, alpha = 0.25) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
# Copy our data to new data frame
withpredictions <- firstyear
# Choose a slope and y-intercept
b00 <- 42/24
b10 <- 1/24
# Predict GPAs with this slope and y-intercept
withpredictions$pred2 <- b00 + (b10 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid2 <- withpredictions$gpa - ( b00 + (b10 * withpredictions$act) )
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b00, slope = b10), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend=withpredictions$act2,
y=withpredictions$gpa, yend=withpredictions$pred2, color="red") +
annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =",
round( sqrt(mean(withpredictions$resid2^2)), 3))) +
annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =",
round( sum(abs(withpredictions$resid2)), 3))) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
# Choose a slope and y-intercept
b01 <- 3/50
b11 <- 1/9
# Predict GPAs with this slope and y-intercept
withpredictions$pred1 <- b01 + (b11 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid1 <- withpredictions$gpa - ( b01 + (b11 * withpredictions$act) )
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b01, slope = b11), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend=withpredictions$act2,
y=withpredictions$gpa, yend=withpredictions$pred1, color="red") +
annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =",
round( sqrt(mean(withpredictions$resid1^2)), 3))) +
annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =",
round( sum(abs(withpredictions$resid1)), 3))) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
# Choose a slope and y-intercept
b02 <- 3.5
b12 <- -1/42
# Predict GPAs with this slope and y-intercept
withpredictions$pred3 <- b02 + (b12 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid3 <- withpredictions$gpa - ( b02 + (b12 * withpredictions$act) )
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b02, slope = b12), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend=withpredictions$act2,
y=withpredictions$gpa, yend=withpredictions$pred3, color="red") +
annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =",
round( sqrt(mean(withpredictions$resid3^2)), 3))) +
annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =",
round( sum(abs(withpredictions$resid3)), 3))) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
We can calculate this root mean square deviation for all 300 lines we plotted earlier. Let’s display the ten best models (with the smallest RMSE).
# Don't worry about this code.
# Function to calculate predictions based on slope and y-intercept
model1 <- function(a, data) {
a[1] + data$act * a[2]
}
# Function to calculate vertical error distances
measure_distance <- function(mod, data) {
diff <- data$gpa - model1(mod, data)
sqrt(mean(diff ^ 2))
}
# Compute distance for all 300 models
sim1_dist <- function(b0, b1) {
measure_distance(c(b0, b1), firstyear)
}
# Add distances to the models data frame
models <- models %>%
mutate(dist = purrr::map2_dbl(b0, b1, sim1_dist))
# Find 10 best models (with smallest rmse) and plot on data
ggplot(firstyear, aes(act, gpa)) +
geom_point(size = 2, colour = "black") +
geom_abline(
aes(intercept = b0, slope = b1, colour = -dist),
data = filter(models, rank(dist) <= 10)
) +
labs(title = "10 best models (with smallest RMSE)")
Each of the ten lines displayed above has a y-intercept and a slope, so we can think of each model as a point: \((b_0, \ b_1)\). Let’s plot these “points” for all 300 linear models:
# Plot the slopes and y-intercepts of all 300 models
# Highlight best combinations of slope and intercept with red circles
ggplot(models, aes(b0, b1)) +
geom_point(data = filter(models, rank(dist) <= 10), size = 4, colour = "red") +
geom_point(aes(colour = -dist))
The points with lighter shades of blue represent models with smaller amounts of error. The red circles highlight the 10 best models that were displayed earlier.
Let’s try a grid search to find the best combination of slope and y-intercept.
# Create the grid by selecting ranges for b0 and b1
grid <- expand.grid(
b0 = seq(-1, 1, length = 25),
b1 = seq(0.05, .15, length = 25)
) %>%
mutate(dist = purrr::map2_dbl(b0, b1, sim1_dist))
# Zoom and enhance
grid %>%
ggplot(aes(b0, b1)) +
geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
geom_point(aes(colour = -dist))
Let’s take these 10 best models and display them back on the data.
ggplot(firstyear, aes(act, gpa)) +
geom_point(size = 2, colour = "grey30") +
geom_abline(
aes(intercept = b0, slope = b1, colour = -dist),
data = filter(grid, rank(dist) <= 10)
)
We could continue this zoom-and-enhance method until we really narrowed down our best y-intercept and slope. Thankfully, there’s an easier way.
# Choose a slope and y-intercept
b0 <- 0.06384
b1 <- 0.10772
# Predict GPAs with this slope and y-intercept
withpredictions$pred <- b0 + (b1 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid <- withpredictions$gpa - ( b0 + (b1 * withpredictions$act) )
# Plot vertical errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend=withpredictions$act2,
y=withpredictions$gpa, yend=withpredictions$pred, color="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
# Plot horizontal errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend= ( (withpredictions$gpa - b0) / b1),
y=withpredictions$gpa, yend=withpredictions$gpa, color="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
# Plot horizontal errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("segment", x=withpredictions$act2, xend= ((withpredictions$act2 + b1 * withpredictions$gpa - b0 * b1) / (1 + b1^2)),
y=withpredictions$gpa, yend = (b0 + b1 * withpredictions$act2), color="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
geom_abline(aes(intercept = 200, slope=-3)) + coord_fixed(ratio = 1, xlim=c(18,26)) +
labs(x = "ACT", y=NULL)
# Plot squared errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - withpredictions$pred),
ymin=withpredictions$gpa, ymax=withpredictions$pred, alpha=.3, fill="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
labs(x = "ACT", y=NULL)
ggplot(data = withpredictions, aes(x = act, y = gpa)) +
geom_point() +
geom_line(color="blue", size=1) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
ggplot(data = withpredictions, aes(x = act, y = gpa)) +
geom_point() +
geom_smooth(method="loess", se=FALSE, span=.4) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
ggplot(data = withpredictions, aes(x = act, y = gpa)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
labs(x = "ACT", y = "GPA")
We want to minimize the squared vertical distances from predictions made by our model, \(\hat{y}_i=b_0+b_1x_i\) to each observed data point, \((x_i, y_i)\).
Each of these vertical distances can be written as: \(e_i = y_i-(b_0+b_1x_i)=y_i-b_0-b_1x_i\)
Our goal is to find values for the y-intercept \((b_0)\) and slope \((b_1)\) that minimize: \(Q =\sum_{i=1}^{n}(y_i-b_0-b_1x_i)^2\).
To minimize a function, we can set its first derivative equal to zero and solve (and verify the second derivative is positive at that value). Since we have two variables in \(Q\), we’ll need to take partial derivatives:
\(\frac{\partial Q }{\partial b_0}=\frac{\partial \sum(y_i-b_0-b_1x_i)^2 }{\partial b_0}=2\sum(y_i-b_0-b_1x_i)\frac{\partial \sum(y_i-b_0-b_1x_i) }{\partial b_0}=-2\sum(y_i-b_0-b_1x_i)\)
Set this partial derivative equal to zero:
\(-2\sum(y_i-b_0-b_1x_i)=0\)
\(\sum(y_i-b_0-b_1x_i)=0\)
\(\sum{y_i}=nb_0+b_1\sum{x_i}\)
\(\frac{\partial Q }{\partial b_1}=\frac{\partial \sum(y_i-b_0-b_1x_i)^2 }{\partial b_1}=2\sum(y_i-b_0-b_1x_i)\frac{\partial \sum(y_i-b_0-b_1x_i) }{\partial b_0}=-2\sum{x_i}(y_i-b_0-b_1x_i)\)
Set this partial derivative equal to zero:
\(-2\sum{x_i}(y_i-b_0-b_1x_i)=0\)
\(\sum{x_i}(y_i-b_0-b_1x_i)=0\)
\(\sum{x_iy_i}-b_0\sum{x_i}-b_1\sum{x_i^2}=0\)
\(\sum{x_iy_i}=b_0\sum{x_i}+b_1\sum{x_i^2}=0\)
\(\sum{y_i}=nb_0+b_1\sum{x_i}\)
\(\sum{x_iy_i}=b_0\sum{x_i}+b_1\sum{x_i^2}\)
We can solve this system to get:
\(b_1=\frac{n\sum{x_iy_i}-\sum{x_i}\sum{y_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}\)
and
\(b_0=\frac{\sum{x_i^2}\sum{y_i}-\sum{x_i}\sum{x_iy_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}=\frac{\sum{y_i}}{n}-b_1\frac{\sum{x_i}}{n}=\overline{y}-b_1\overline{x}\)
Using formulas for variance and covariance, we can rewrite \(b_1\) as:
\(b_1=\frac{n\sum{x_iy_i}-\sum{x_i}\sum{y_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}=\frac{s_{xy}}{s_x^2}=r\frac{s_y}{s_x}\)
# Get summary statistics
firstyear %>%
summarize(meanx = mean(act), sdx = sd(act),
meany = mean(gpa), sdy = sd(gpa),
n = n(), cor = cor(act, gpa))
# Calculate b1
firstyear %>%
summarize(b1 = cor(gpa, act) * sd(gpa) / sd(act),
b0 = mean(gpa) - (b1 * mean(act)) )
In R, we can use the lm()
function to fit linear models. Expand the code –>
# lm = linear model
# lm(y ~ x, data)
lm(gpa ~ act, data = firstyear)
#
# The output shows coefficients
# It's typically more useful to store the model
model1 <- lm(gpa ~ act, data = firstyear)
#
# We can then investigate this model
# Summary statistics
summary(model1)
#
# Coefficients
coef(model1)
#
# Broom package: tidy(), glance(), augment() functions
# Tidy the model coefficients, standard errors, and p-values
tidy(model1)
#
# Glance at summary statistics for the model
glance(model1)
#
# Augment the data with predictions and residuals
head(augment(model1))
#
#
# Modelr package: add_predictions() function
# Add predictions to our dataset
firstyear <- firstyear %>%
add_predictions(model1)
#
# Now we can plot these predictions as a line
ggplot(data = firstyear, aes(x = act, y = gpa)) +
geom_point() +
geom_line(aes(y = pred), color="blue", size=1) +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)
Just because we’re able to calculate the best line, it doesn’t mean that line is meaningful. For example, here’s the line that best fits this data:
# Simulate data forming a parabola
sim_data <- tibble(
x = c(-5:5),
y = x^2
)
# Plot the data
# geom_smooth(method = "lm") plots the line of best fit
ggplot(data = sim_data, aes(x = x, y = y)) +
geom_point(size=2) +
geom_smooth(method="lm", se=FALSE)
It looks like we’ll need some way of measuring just how well a model (a line) fits a given dataset. Let’s derive some of these measures for three different datasets:
We’ll complete the following table to see the value of each measure in each case:
# Later, we'll see what this ANOVA table represents
# For now, it gives us SSE (Sum Sq Residuals)
anova(model1)
# We can also calculate this directly with the residuals
sum(residuals(model1)^2)
Perhaps a better measure of fit would be the average squared error (the average squared distance from observation to the prediction line). One simple estimate of this average squared error would be MSE:
\(s_{y|x}^2=\frac{SS_E}{df_E}=\frac{\Sigma(y_i-\hat{y})^2}{n-2}\)
# MSE is displayed in the ANOVA summary table. Calculate directly with:
sum(residuals(model1)^2) / model1$df.residual
# We can calculate it directly with...
sqrt( sum(residuals(model1)^2) / model1$df.residual )
# We can also glance at it (sigma) with the broom package
glance(model1)
# The modelr package has a rmse() function
rmse(model1, firstyear)
# Calculate it directly. length = sample size
sqrt( sum(residuals(model1)^2) / length(firstyear$act) )
# The modelr package has an rsquare() function
1 - rsquare(model1, firstyear)
# The modelr package has an rsquare() function
rsquare(model1, firstyear)
# We can glance at it with the broom package
glance(model1)
# It also appears in the summary
summary(model1)
# We can also get it from the ANOVA table
anova(model1)
11.487/(11.487+2.4982)
Validity: The data we’re analyzing maps to the research question we are trying to answer
Diagnosis: Look at plots of observed vs predicted or residuals vs predicted values. The points should be symmetrically distributed around a diagonal line in the former plot or around horizontal line in the latter plot, with a roughly constant variance.
Diagnosis: If you have time series data, be careful that consecutive errors are not related.
Diagnosis: Look at the plot of residuals vs predicted values. If the residuals grow larger as a function of X, you have a problem.
Diagnosis: Look at a P-P or Q-Q plot of the residuals. The residuals should fall near the diagonal line. You could also run a test for normality, like the Shapiro-Wilk or Kologorov-Smirnov tests. Note that the dependent and independent variables in a regression model do not need to be normally distributed by themselves–only the prediction errors need to be normally distributed
plot()
function displays some diagnostic plots for our model. Evaluate the conditions listed above based on our dataset, question of interest, and the diagnostic plots.plot(model1)
Let’s finally use our real dataset. To change things up a bit, let’s regress Fall semester GPAs on high school GPAs. Make sure you can interpret all this output:
full_model <- lm(fallgpa ~ hsgpa, data = sau)
summary(full_model)
rmse(full_model, data = sau)
plot(full_model)
Earlier, we learned that we always expect to get a non-zero correlation in any sample of data. Similarly, we expect linear models with non-zero slopes in any sample of data (even if we have reason to believe the variables are uncorrelated).
Our model to predict fall semester GPAs is: FallGPA = 0.058 + 0.926(hsGPA)
Does the magnitude of this slope (0.926) imply high school and college GPAs have a relationship for our population of interest or could we have obtained a slope of this magnitude even if the variables are uncorrelated?
# Store our observed slope of 0826
observed_slope <- full_model$coefficients[2]
# Run 10,000 randomizations, shuffling the high school GPA
rand_slopes <- Do(10000) * lm(fallgpa ~ shuffle(hsgpa), data = sau)
# The slopes are stored in the "hsgpa" variable. Let's plot and estimate the p-value.
ggplot(data = rand_slopes, aes(x = hsgpa)) +
geom_histogram(fill="lightblue", color="white", alpha = 0.8) +
labs(
title = "Randomized slopes",
x = "Slope"
) +
scale_x_continuous(breaks=seq(-.1, .1, .025), minor_breaks=NULL) +
theme(
axis.text.x = element_text(size = 11, color="grey10"),
legend.position = "none",
panel.grid.major.y = element_line(colour = "white"),
panel.grid.major.x = element_line(colour = "white", size=.15),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "grey93")
) +
annotate("text", x = .075, y = 600, label = paste("p = 0"))
boot <- Do(10000) * lm(fallgpa ~ hsgpa, data = sample(sau, replace=TRUE))
bootstrapCI <- confint(boot$hsgpa, level = 0.95, method = "quantile")
lower <- as.numeric(bootstrapCI[1]) # Store lower CI bound
upper <- as.numeric(bootstrapCI[2]) # Store upper CI bound
# Density plot
ggplot(data = boot, aes(x = hsgpa)) +
geom_density(fill="lightblue", color="white", alpha = 0.8) +
labs(
title = "Bootstrap distribution of correlations",
x = "bootstrap correlations"
) +
scale_x_continuous(breaks=seq(.7, 1, 0.05), minor_breaks=NULL) +
theme(
axis.text.x = element_text(size = 11, color="grey10"),
legend.position = "none",
panel.grid.major.y = element_line(colour = "white"),
panel.grid.major.x = element_line(colour = "white", size=.15),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "grey93")
) +
annotate("text", x = lower, y = 5, label = round(lower,3)) +
annotate("text", x = upper, y = 5, label = round(upper,3)) +
annotate("text", x = median(boot$hsgpa), y = 6, label = paste("95%")) +
annotate("segment", x = lower, xend = upper, y = 4, yend = 4, color = "red")
In the 14th century, William of Ockham wrote Entia non sunt multiplicanda praeter necessitatem, which translates to: More things should not be used than are necessary.
When comparing two competing theories, we might use Ockham’s Razor and prefer theories that are simpler (more parsimonious in that they require fewer assumptions or are easier to explain). On the other hand, we might prefer more complex theories if they more accurately predict future events.
When we compare competing statistical models, we’ll have to trade-off accuracy and simplicity. We’ll have to learn to navigate between:
and
If we’re interested in the best prediction of fall semester GPAs, one approach we might take is:
We could also start with a complicated model, remove a predictor, and determine if the fit worsens noticeably.
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = mean(gpa), slope = 0), color="blue", size=1) +
annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - mean(withpredictions$gpa)),
ymin=mean(withpredictions$gpa), ymax=withpredictions$gpa, alpha=.3, fill="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
labs(x = "ACT", y=NULL) +
labs(title = paste("Reduced model SSE =", round(anova(lm(gpa ~ act, data = firstyear))[2,2]+anova(lm(gpa ~ act, data = firstyear))[1,2],5)))
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
geom_point() +
geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) +
annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - withpredictions$pred),
ymin=withpredictions$gpa, ymax=withpredictions$pred, alpha=.3, fill="red") +
scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
labs(x = "ACT", y=NULL) +
labs(title = paste("Full model SSE =", round(anova(lm(gpa ~ act, data = firstyear))[2,2],5)))
anova(lm(gpa ~ act, data = firstyear))
The only difference between our full and reduced models is the \(b_1\) coefficient (the slope). If \(b_1=0\), the full model would be the same as our reduced model. Another way, then, to compare our full and reduced models would be to test the hypothesis: \(H_0:b_1=0\). We can test this hypothesis with a t-test.
t <- cor(firstyear$gpa, firstyear$act) / sqrt( (1-cor(firstyear$gpa, firstyear$act)^2) / (length(firstyear$act)-2))
t
t^2
anova()
function.# Manual calculation
Rf <- cor(firstyear$gpa, firstyear$act)
Rr <- 0
N <- length(firstyear$gpa)
kf <- 1
kr <- 0
F <- ( ((Rf^2 - Rr^2) / (kf - kr)) / ((1-Rf^2)/(N-kf-1)) )
F
#
# ANOVA function
fullmodel <- lm(gpa ~ act, data=firstyear)
reducedmodel <- lm(gpa ~ 1, data=firstyear)
anova(reducedmodel, fullmodel)
As we’ll soon see, R-squared and p-values resulting from the omnibus F-test aren’t the best measures to use when comparing models. Here are a couple alternatives we’ll build upon later:
The likelihood of a model is the probability it produces our data given its parameter estimates. If we assume all our observations are independent, then we can write our likelihood function as:
We want to calculate \(P(\textrm{observation 1} \ \cap \ \textrm{observation 2} \ \cap \ ...)\). When we assume independence, we can calculate this as: \(P(\textrm{observation 1})P(\textrm{observation 2})...\).
Given our model with \(b_0\) and \(b_1\) – along with our assumption that errors are normally distributed – we can use a normal distribution to find \(P(\textrm{observation 1})\). We then simply have to multiply these probabilities for each observation in our dataset.
As an example, our first observation is: ACT = 14
with GPA = 1.39
. Recall our model has \(b_0=0.06384\), \(b_1=0.10772\), and \(RSE=\sqrt{MS_E}=\sqrt{0.0892}=0.2987\).
For all students with an ACT score of 14, our model expects a GPA of: \(\hat{y}=0.06384+0.10772(14)=1.5719\).
Given that expectation, we can find the probability of actually observing a GPA of 1.39:
\(P[y=1.39 \ | \ y \sim \textrm{ND}(\mu=1.5719, \sigma=0.2987)]\)
Expand the code to see how to calculate this manually in R.
# We'll use dnorm to calculate the likelihood of each observation
# Here's the likelihood of the first observation:
dnorm(1.39, mean = (b0 + (b1 * 14)), sd = 0.2987)
# Let's calculate the likelihood for ALL observations
# and multiply those together to get the model likelihood
# We'll also calculate the log-likelihood (which we'll see in a bit)
firstyear %>%
mutate(likelihood = dnorm(gpa, mean = (b0 + (b1 * act)), sd = 0.2987)) %>%
summarize(L = round(prod(likelihood),3 ),
logL = round( log(L),2 ))
Typically, it’s easier to work with the natural log of the likelihood. This log-likelihood will always be negative, with values closer to zero indicating better model fit.
We can use log-likelihood to compare full and reduced models (by calculating the likelihood ratio):
If the full model is much better than the reduced model, we would expect:
To estimate a p-value from this likelihood ratio, we can use a chi-square distribution with \(df = df_{\textrm{full}} - df_{\textrm{reduced}}\)
Expand the code to see this calculation in R:
# Let's first calculate log-likelihood
# We can glance at the with the broom package
glance(fullmodel)
# We can calculate it directly with logLik()
logLik(fullmodel)
logLik(reducedmodel)
# Calculate the likelihood ratio
LR <- 2 * (logLik(fullmodel) - logLik(reducedmodel))
LR
# Calculate the p-value
pchisq(LR, df=1, lower.tail=FALSE)
# If you install the lmtest package, you can use lrtest
library(lmtest)
lrtest(reducedmodel, fullmodel)
Akaike’s Information Criterion is another measure to compare competing models. AIC seeks to find the model that has a good fit to the data with relatively few parameters (predictors). It’s defined as:
where b = the number of coefficients estimated in our model (slope(s) and intercept)
and p = the number of predictors in our model
. When comparing models, we prefer the model that produces the smaller AIC value.
Expand the code to see the AIC() function in action:
AIC(reducedmodel, fullmodel)
anova(fullmodel)
neuron <- read.csv("http://www.bradthiessen.com/html5/data/violin.csv")
# Convert years played to a factor variable. I'll create a new variable.
neuron <- neuron %>%
mutate(years_group = case_when(
.$years == 0 ~ "a) 0 years",
.$years > 0 & .$years < 10 ~ "b) <10 years",
.$years > 10 ~ "c) >10 years"),
years_group = as.factor(years_group))
anova(aov(neural ~ years_group, data = neuron))
prestige_data <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")
ggplot(data = prestige_data, aes(x = income, y = prestige)) +
geom_point(color="steelblue") +
geom_smooth(method="lm", se=FALSE) +
annotate("text", x=15000, y=35, label="R-squared = 0.511") +
annotate("text", x=15000, y=28, label="y = 27.141 + 0.0029x")
prestige_model <- lm(prestige ~ income, data=prestige_data)
tidy(prestige_model)
tidy(anova(prestige_model))
Confidence interval for predictions: \(\hat{y} \ \pm \ t_{n-2}^{\alpha/2}s_{y|x}\sqrt{\frac{1}{n}+\frac{(x_0-\overline{X})^2}{(n-1)s_x^2}}\) where \(s_{y|x}=\sqrt{MS_E}\)
A 95% confidence interval for our prediction of 7000 is, then: \(7000 \ \pm \ 2\sqrt{146.16}\sqrt{\frac{1}{102}+\frac{(7000-6797.9)^2}{(102-1)(s_x^2)4245.92^2}}=47.4 \ \pm \ 2.38\)
ggplot(data = prestige_data, aes(x = income, y = prestige)) +
geom_point(color="steelblue") +
geom_smooth(method = "lm", se=TRUE, color="red")
Prediction Interval: \(\hat{y} \ \pm \ t_{n-2}^{\alpha/2}s_{y|x}\sqrt{1+\frac{1}{n}+\frac{(x_0-\overline{X})^2}{(n-1)s_x^2}}\)
A 95% prediction interval for our prediction of 7000 is, then: \(7000 \ \pm \ 2\sqrt{146.16}\sqrt{1+\frac{1}{102}+\frac{(7000-6797.9)^2}{(102-1)(s_x^2)4245.92^2}}=47.4 \ \pm \ 24.10\)
We can get a rough estimate of this interval by simply taking: \(\hat{y} \ \pm \ 2\sqrt{MS_E}\)
# Get predictions (actual and lower/upper confidence)
predictions <- predict(prestige_model, interval="prediction")
# Add them to data frame
prestige_data <- cbind(prestige_data, predictions)
# Plot
ggplot(data = prestige_data, aes(x = income, y = prestige)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), fill="grey70", alpha=.5) +
geom_point(color="steelblue") +
geom_smooth(method="lm", se=FALSE)
Lowess regression
Robust regression
Box-Cox transforms
Entropy
Sources
This document is released under a Creative Commons Attribution-ShareAlike 3.0 Unported license.