## Load packages
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)
########################
########################
### Polynomial Reg. ###
########################
########################
## Load data
geiger <- read.csv("http://www.bradthiessen.com/html5/data/geiger.csv")
head(geiger)
## time counts
## 1 0 126.6
## 2 1 101.8
## 3 2 71.6
## 4 3 85.1
## 5 4 101.6
## 6 5 67.5
tail(geiger)
## time counts
## 26 25 13.4
## 27 26 26.8
## 28 27 9.8
## 29 28 18.8
## 30 29 25.9
## 31 30 19.3
cor(counts, time, data=geiger)
## [1] -0.8767747
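## For a one-predictor regression, R-squared is just this correlation squared:
(-0.8767747)^2
## [1] 0.7687339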
## Fit linear model
lin.mod <- lm(counts ~ time, data=geiger)
summary(lin.mod)
##
## Call:
## lm(formula = counts ~ time, data = geiger)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.045 -10.021 -2.083 5.126 40.461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.1389 5.0272 17.134 < 2e-16 ***
## time -2.8262 0.2879 -9.818 9.99e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.34 on 29 degrees of freedom
## Multiple R-squared: 0.7687, Adjusted R-squared: 0.7608
## F-statistic: 96.4 on 1 and 29 DF, p-value: 9.991e-11
anova(lin.mod)
## Analysis of Variance Table
##
## Response: counts
## Df Sum Sq Mean Sq F value Pr(>F)
## time 1 19809.5 19809.5 96.397 9.991e-11 ***
## Residuals 29 5959.5 205.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## R-squared is 0.7687 (looks good)
## Check assumptions
par(mfrow=c(2,2)) # visualize four graphs at once
plot(lin.mod)
par(mfrow=c(1,1)) # reset the graphics defaults
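## The four diagnostic plots are a visual check; as a quick numeric
## supplement, one option (among many) is to test the residuals for
## normality:
shapiro.test(resid(lin.mod))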
## Show linear model
geiger %>%
ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
layer_points() %>%
layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
add_axis("x", title = "time") %>%
scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "counts") %>%
scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)
## Show lowess
geiger %>%
ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
layer_points() %>%
layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
add_axis("x", title = "time") %>%
scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "counts") %>%
scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)
## LOWESS with slider for span
library(shiny)
geiger %>%
ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
layer_points() %>%
layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red",
span = input_slider(min=0.1, max=1, value=.5, step=.1,
label="smoothing span")) %>%
add_axis("x", title = "time") %>%
scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "counts") %>%
scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
## Add a quadratic term
quad.mod = lm(counts ~ time + I(time^2), data=geiger)
summary(quad.mod)
##
## Call:
## lm(formula = counts ~ time + I(time^2), data = geiger)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.287 -6.047 -1.605 4.231 20.593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 107.97130 4.65611 23.189 < 2e-16 ***
## time -7.34330 0.71841 -10.222 5.92e-11 ***
## I(time^2) 0.15057 0.02314 6.507 4.74e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.205 on 28 degrees of freedom
## Multiple R-squared: 0.9079, Adjusted R-squared: 0.9014
## F-statistic: 138.1 on 2 and 28 DF, p-value: 3.143e-15
## R-squared = 0.9079
## Check assumptions
par(mfrow=c(2,2)) # visualize four graphs at once
plot(quad.mod)
par(mfrow=c(1,1)) # reset the graphics defaults
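## An equivalent, numerically stabler way to specify polynomial terms is
## poly(), which uses orthogonal polynomials by default. The fit (and
## R-squared) matches quad.mod even though the coefficients differ:
quad.mod.poly <- lm(counts ~ poly(time, 2), data=geiger)
summary(quad.mod.poly)$r.squared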
## Plot model
plot(counts~time, data=geiger)
curve(107.97130 - 7.3433*x + 0.15057*x^2, add=T, lwd=4, col=2)
## Compare to the null and linear models
null.mod <- lm(counts ~ 1, data=geiger)
anova(null.mod, lin.mod, quad.mod)
## Analysis of Variance Table
##
## Model 1: counts ~ 1
## Model 2: counts ~ time
## Model 3: counts ~ time + I(time^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 30 25769.0
## 2 29 5959.5 1 19809.5 233.797 4.037e-15 ***
## 3 28 2372.4 1 3587.1 42.335 4.736e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
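## The sequential F-tests above compare the nested models directly; as a
## complementary check, information criteria penalize model complexity
## (lower is better):
AIC(null.mod, lin.mod, quad.mod)
BIC(null.mod, lin.mod, quad.mod)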
## Add a cubic term
cub.mod = lm(counts ~ time + I(time^2) + I(time^3), data=geiger)
summary(cub.mod)
##
## Call:
## lm(formula = counts ~ time + I(time^2) + I(time^3), data = geiger)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.707 -5.615 -1.108 6.758 21.680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.251350 5.760812 19.659 < 2e-16 ***
## time -9.646057 1.690640 -5.706 4.61e-06 ***
## I(time^2) 0.345644 0.132206 2.614 0.0144 *
## I(time^3) -0.004335 0.002894 -1.498 0.1458
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.007 on 27 degrees of freedom
## Multiple R-squared: 0.915, Adjusted R-squared: 0.9056
## F-statistic: 96.88 on 3 and 27 DF, p-value: 1.442e-14
## R-squared = 0.9150
## Check assumptions
par(mfrow=c(2,2)) # visualize four graphs at once
plot(cub.mod)
par(mfrow=c(1,1)) # reset the graphics defaults
## Plot model
plot(counts~time, data=geiger)
curve(113.25135 - 9.646057*x + 0.345644*x^2 - 0.004335*x^3, add=T, lwd=4, col=2)
## Compare to the other models
anova(null.mod, lin.mod, quad.mod, cub.mod)
## Analysis of Variance Table
##
## Model 1: counts ~ 1
## Model 2: counts ~ time
## Model 3: counts ~ time + I(time^2)
## Model 4: counts ~ time + I(time^2) + I(time^3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 30 25769.0
## 2 29 5959.5 1 19809.5 244.176 4.763e-15 ***
## 3 28 2372.4 1 3587.1 44.215 3.897e-07 ***
## 4 27 2190.5 1 182.0 2.243 0.1458
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Let's jump to a 7th-degree model
seven.mod = lm(counts ~ time + I(time^2) + I(time^3) +
I(time^4) + I(time^5) + I(time^6) + I(time^7), data=geiger)
summary(seven.mod)
##
## Call:
## lm(formula = counts ~ time + I(time^2) + I(time^3) + I(time^4) +
## I(time^5) + I(time^6) + I(time^7), data = geiger)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.9967 -5.0016 -0.3222 4.3678 23.4455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.238e+02 8.224e+00 15.057 2.11e-13 ***
## time -3.164e+01 1.112e+01 -2.845 0.00917 **
## I(time^2) 9.881e+00 4.686e+00 2.109 0.04606 *
## I(time^3) -1.712e+00 8.409e-01 -2.036 0.05345 .
## I(time^4) 1.524e-01 7.576e-02 2.011 0.05614 .
## I(time^5) -7.139e-03 3.601e-03 -1.982 0.05951 .
## I(time^6) 1.677e-04 8.623e-05 1.945 0.06408 .
## I(time^7) -1.558e-06 8.191e-07 -1.902 0.06979 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.766 on 23 degrees of freedom
## Multiple R-squared: 0.9314, Adjusted R-squared: 0.9105
## F-statistic: 44.63 on 7 and 23 DF, p-value: 6.731e-12
## R-squared = 0.9314
## Check assumptions
par(mfrow=c(2,2)) # visualize four graphs at once
plot(seven.mod)
par(mfrow=c(1,1)) # reset the graphics defaults
## Plot model
coef(seven.mod)
## (Intercept) time I(time^2) I(time^3) I(time^4)
## 1.238349e+02 -3.164351e+01 9.881051e+00 -1.711959e+00 1.523853e-01
## I(time^5) I(time^6) I(time^7)
## -7.138673e-03 1.677408e-04 -1.557870e-06
plot(counts~time, data=geiger)
curve(123.8349 - 31.64351*x + 9.881051*x^2 - 1.711959*x^3 +
        0.1523853*x^4 - 0.00713867*x^5 + 0.0001677408*x^6 -
        0.00000155787*x^7, add=T, lwd=4, col=2)
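## Typing eight coefficients by hand invites transcription errors. As an
## alternative sketch, predict() on a fine grid draws the same curve
## straight from the fitted model:
grid <- data.frame(time = seq(0, 30, length.out = 200))
lines(grid$time, predict(seven.mod, grid), col=4, lwd=2, lty=2)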
## Compare to the quadratic model
anova(quad.mod, seven.mod)
## Analysis of Variance Table
##
## Model 1: counts ~ time + I(time^2)
## Model 2: counts ~ time + I(time^2) + I(time^3) + I(time^4) + I(time^5) +
## I(time^6) + I(time^7)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 28 2372.4
## 2 23 1767.2 5 605.22 1.5754 0.2066
## The F-test is not significant: the 7th-degree model just overfits the data
## Cross-validate to choose the best model
library(DAAG)
geiger$time2 <- geiger$time^2
geiger$time3 <- geiger$time^3
geiger$time4 <- geiger$time^4
cross.validated.model <- lm(counts ~ time + time2 + time3 + time4, data=geiger)
## This command yields ms = 110 (ms = cross-validated mean squared prediction error)
# CVlm(geiger, cross.validated.model, m=10)
cross.validated.model <- lm(counts ~ time + time2 + time3, data=geiger)
## This command yields ms = 105
# CVlm(geiger, cross.validated.model, m=10)
cross.validated.model <- lm(counts ~ time + time2, data=geiger)
## This command yields ms = 103
# CVlm(geiger, cross.validated.model, m=10)
cross.validated.model <- lm(counts ~ time, data=geiger)
## This command yields ms = 219
# CVlm(geiger, cross.validated.model, m=10)
## The model with the squared term has the lowest average mean square error
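## If DAAG is unavailable, 10-fold cross-validation is easy to code by
## hand. A minimal sketch (fold assignment is random, so the mean squared
## errors will not exactly match the CVlm values above):
set.seed(42)
folds <- sample(rep(1:10, length.out = nrow(geiger)))
cv.mse <- function(formula, data, folds) {
  mean(sapply(1:10, function(k) {
    fit <- lm(formula, data = data[folds != k, ])
    mean((data$counts[folds == k] - predict(fit, data[folds == k, ]))^2)
  }))
}
cv.mse(counts ~ time, geiger, folds)          # linear
cv.mse(counts ~ time + time2, geiger, folds)  # quadratic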
### Another example - speed vs stopping distance
data(cars)
head(cars)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
## Scatterplot with lowess
cars %>%
ggvis(x=~speed, y=~dist, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
layer_points() %>%
layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
add_axis("x", title = "speed") %>%
scale_numeric("x", domain=c(0,26), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "stopping distance") %>%
scale_numeric("y", domain=c(0, 125), nice=FALSE, clamp=TRUE)
## Set up the models
null.model <- lm(dist ~ 1, data=cars)
linear.model <- lm(dist ~ speed, data=cars)
quadratic.model <- lm(dist ~ speed + I(speed^2), data=cars)
cubic.model <- lm(dist ~ speed + I(speed^2) + I(speed^3), data=cars)
## Test models
anova(null.model, linear.model, quadratic.model, cubic.model)
## Analysis of Variance Table
##
## Model 1: dist ~ 1
## Model 2: dist ~ speed
## Model 3: dist ~ speed + I(speed^2)
## Model 4: dist ~ speed + I(speed^2) + I(speed^3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 49 32539
## 2 48 11354 1 21185.5 91.6398 1.601e-12 ***
## 3 47 10825 1 528.8 2.2874 0.1373
## 4 46 10634 1 190.4 0.8234 0.3689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(cars, main="lowess cars")
lines(lowess(cars), col=4, lwd=3)  # add the lowess smooth the title refers to
plot(dist~speed, data=cars)
curve(-17.579095 + 3.932409*x, add=T, lwd=3)
curve(2.4701378 + 0.9132876*x + 0.0999593*x^2, add=T, col=2, lwd=3)
## Load data
prestige <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")
## Set up the models
mod0 <- lm(prestige ~ 1, data=prestige)
mod1 <- lm(prestige ~ income, data=prestige)
mod2 <- lm(prestige ~ income + I(income^2), data=prestige)
mod3 <- lm(prestige ~ income + I(income^2) + I(income^3), data=prestige)
## Test models
anova(mod0, mod1, mod2, mod3)
## Analysis of Variance Table
##
## Model 1: prestige ~ 1
## Model 2: prestige ~ income
## Model 3: prestige ~ income + I(income^2)
## Model 4: prestige ~ income + I(income^2) + I(income^3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 101 29895
## 2 100 14616 1 15279.3 124.0617 < 2.2e-16 ***
## 3 99 12077 1 2539.3 20.6179 1.597e-05 ***
## 4 98 12070 1 7.4 0.0598 0.8073
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(prestige~income, data=prestige)
curve(27.141176368 + 0.002896799*x, add=T, lwd=3)
curve(14.18319 + 0.006153506*x - 0.0000001433097*x^2, add=T, col=2, lwd=3)
## Cross validate
prestige$income2 <- prestige$income^2
prestige$income3 <- prestige$income^3
prestige$income4 <- prestige$income^4
cross.validated.model <- lm(prestige ~ income + income2 + income3 + income4, data=prestige)
## This command yields ms = 134
# CVlm(prestige, cross.validated.model, m=10)
cross.validated.model <- lm(prestige ~ income + income2 + income3, data=prestige)
## This command yields ms = 131
# CVlm(prestige, cross.validated.model, m=10)
cross.validated.model <- lm(prestige ~ income + income2, data=prestige)
## This command yields ms = 130
# CVlm(prestige, cross.validated.model, m=10)
cross.validated.model <- lm(prestige ~ income, data=prestige)
## This command yields ms = 154
# CVlm(prestige, cross.validated.model, m=10)
### Cross-validation again favors including the quadratic term
## Check assumptions
par(mfrow=c(2,2)) # visualize four graphs at once
plot(mod2)
par(mfrow=c(1,1)) # reset the graphics defaults
########################
########################
#### Nonlinear Reg. #### -- Skip?
########################
########################
## Temperature and vapor pressure of mercury (mm of mercury)
# Handbook of Chemistry and Physics, CRC Press (1973)
data(pressure)
head(pressure)
## temperature pressure
## 1 0 0.0002
## 2 20 0.0012
## 3 40 0.0060
## 4 60 0.0300
## 5 80 0.0900
## 6 100 0.2700
# Change temperature to Kelvin
pressure$temperature = pressure$temperature + 273.15
# Change pressure to kilopascals (kPa)
pressure$pressure = pressure$pressure * .1333
# Source: http://ww2.coastal.edu/kingw/statistics/R-tutorials/simplenonlinear.html
# When a liquid is placed in an evacuated, closed chamber
# (i.e., in a "vacuum"), it begins to evaporate. This
# vaporization will continue until the vapor and the liquid
# reach a dynamic equilibrium (the number of particles of
# liquid that go into the vapor is equal to the number of
# particles of vapor that go into the liquid). At this point,
# the pressure exerted by the vapor on the liquid is called
# the vapor pressure (or equilibrium vapor pressure) of the
# substance. Vapor pressure has a complex relationship with
# temperature, and so far as I know, there is no law in theory
# that specifies this relationship (at least not in the case of
# mercury). Since we are dealing with a liquid/vapor phase
# transition, and not just a gas, the simple gas laws you
# learned back in high school chemistry don't apply. The vapor
# pressure of mercury and its relationship to temperature has
# been an important question, for example, in the calibration
# of scientific instruments like mercury thermometers and barometers.
plot(pressure)
summary(pressure)
## temperature pressure
## Min. :273.1 Min. : 0.00003
## 1st Qu.:363.1 1st Qu.: 0.02399
## Median :453.1 Median : 1.17304
## Mean :453.1 Mean : 16.57408
## 3rd Qu.:543.1 3rd Qu.: 16.86245
## Max. :633.1 Max. :107.43980
# Plot the data under various log transformations
## Axes: identity, log(y), log(xy), log(x)
## Implied models: identity, exponential, power, logarithmic
par(mfrow=c(1,4)) # one row of four graphs
plot(pressure ~ temperature, data=pressure,
     main="Vapor Pressure\nof Mercury",
     xlab="Temperature (K)", ylab="Pressure (kPa)")
plot(pressure ~ temperature, data=pressure,
     main="Vapor Pressure\nof Mercury",
     xlab="Temperature (K)", ylab="Pressure (kPa)", log="y")
plot(pressure ~ temperature, data=pressure,
     main="Vapor Pressure\nof Mercury",
     xlab="Temperature (K)", ylab="Pressure (kPa)", log="xy")
plot(pressure ~ temperature, data=pressure,
     main="Vapor Pressure\nof Mercury",
     xlab="Temperature (K)", ylab="Pressure (kPa)", log="x")
par(mfrow=c(1,1)) # reset the graphics defaults
## Plot power model
linear.model <- lm(pressure ~ temperature, data=pressure)
summary(linear.model)
##
## Call:
## lm(formula = pressure ~ temperature, data = pressure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.072 -15.604 -4.378 9.637 54.577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -74.7835 19.6282 -3.810 0.001400 **
## temperature 0.2016 0.0421 4.788 0.000171 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.1 on 17 degrees of freedom
## Multiple R-squared: 0.5742, Adjusted R-squared: 0.5492
## F-statistic: 22.93 on 1 and 17 DF, p-value: 0.000171
quad.model <- lm(pressure ~ temperature + I(temperature^2), data=pressure)
summary(quad.model)
##
## Call:
## lm(formula = pressure ~ temperature + I(temperature^2), data = pressure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.6824 -7.2504 -0.1803 6.4301 22.7109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.272e+02 4.229e+01 5.373 6.22e-05 ***
## temperature -1.214e+00 1.941e-01 -6.255 1.15e-05 ***
## I(temperature^2) 1.562e-03 2.129e-04 7.336 1.67e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.92 on 16 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8902
## F-statistic: 74 on 2 and 16 DF, p-value: 8.209e-09
power.model <- lm(log(pressure) ~ log(temperature), data=pressure)
summary(power.model)
##
## Call:
## lm(formula = log(pressure) ~ log(temperature), data = pressure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2390 -0.3594 0.2142 0.4625 0.6045
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -108.1138 3.1744 -34.06 <2e-16 ***
## log(temperature) 17.6150 0.5212 33.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5736 on 17 degrees of freedom
## Multiple R-squared: 0.9853, Adjusted R-squared: 0.9845
## F-statistic: 1142 on 1 and 17 DF, p-value: < 2.2e-16
newdf <- data.frame(temperature=seq(min(pressure$temperature),
max(pressure$temperature), len=19))
plot(pressure ~ temperature, data=pressure)
lines(newdf$temperature, exp(predict(power.model, newdf)), col=2, lwd=4)
lines(newdf$temperature, predict(linear.model, newdf), col=3, lwd=4)
lines(newdf$temperature, predict(quad.model, newdf), col=4, lwd=4)
text(600, .75, substitute(plain("R-square: ") * r2, list(r2=summary(power.model)$r.squared)))
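## Note the R-squared values above are not directly comparable: the power
## model was fit to log(pressure), not pressure. One remedy (a sketch) is
## to back-transform and compare root-mean-square error on the original
## kPa scale:
rmse <- function(obs, pred) sqrt(mean((obs - pred)^2))
rmse(pressure$pressure, predict(linear.model))        # linear fit
rmse(pressure$pressure, predict(quad.model))          # quadratic fit
rmse(pressure$pressure, exp(predict(power.model)))    # back-transformed power fit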
## Try with prestige
par(mfrow=c(1,4)) # one row of four graphs
plot(prestige ~ income, data=prestige,
xlab="Income", ylab="Prestige")
plot(prestige ~ income, data=prestige,
xlab="Income", ylab="Prestige", log="x")
plot(prestige ~ income, data=prestige,
xlab="Income", ylab="Prestige", log="y")
plot(prestige ~ income, data=prestige,
xlab="Income", ylab="Prestige", log="xy")
par(mfrow=c(1,1)) # reset the graphics defaults
linear.model <- lm(prestige ~ income, data=prestige)
summary(linear.model)
##
## Call:
## lm(formula = prestige ~ income, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.007 -8.378 -2.378 8.432 32.084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.714e+01 2.268e+00 11.97 <2e-16 ***
## income 2.897e-03 2.833e-04 10.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.09 on 100 degrees of freedom
## Multiple R-squared: 0.5111, Adjusted R-squared: 0.5062
## F-statistic: 104.5 on 1 and 100 DF, p-value: < 2.2e-16
power.model <- lm(log(prestige) ~ log(income), data=prestige)
summary(power.model)
##
## Call:
## lm(formula = log(prestige) ~ log(income), data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67077 -0.16653 -0.02778 0.18921 0.61675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.54292 0.37880 -1.433 0.155
## log(income) 0.49855 0.04364 11.425 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2594 on 100 degrees of freedom
## Multiple R-squared: 0.5662, Adjusted R-squared: 0.5619
## F-statistic: 130.5 on 1 and 100 DF, p-value: < 2.2e-16
logx.model <- lm(prestige ~ log(income), data=prestige)
summary(logx.model)
##
## Call:
## lm(formula = prestige ~ log(income), data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.114 -9.342 -1.218 8.101 30.454
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -139.856 16.954 -8.249 6.6e-13 ***
## log(income) 21.556 1.953 11.037 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.61 on 100 degrees of freedom
## Multiple R-squared: 0.5492, Adjusted R-squared: 0.5447
## F-statistic: 121.8 on 1 and 100 DF, p-value: < 2.2e-16
logy.model <- lm(log(prestige) ~ income, data=prestige)
summary(logy.model)
##
## Call:
## lm(formula = log(prestige) ~ income, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72380 -0.16408 -0.01177 0.22431 0.64408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.353e+00 5.466e-02 61.338 < 2e-16 ***
## income 6.208e-05 6.829e-06 9.091 9.73e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2914 on 100 degrees of freedom
## Multiple R-squared: 0.4525, Adjusted R-squared: 0.447
## F-statistic: 82.64 on 1 and 100 DF, p-value: 9.727e-15
newdf <- data.frame(income=seq(min(prestige$income),
max(prestige$income), len=100))
plot(prestige ~ income, data=prestige)
lines(newdf$income, exp(predict(power.model, newdf)), col=2, lwd=4)
lines(newdf$income, predict(linear.model, newdf), col=3, lwd=4)
lines(newdf$income, predict(logx.model, newdf), col=4, lwd=4)
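## As with the pressure data, the R-squared of the log(prestige) models
## cannot be compared directly to the others. Reusing the rmse() helper
## defined above to compare all fits on the prestige scale:
rmse(prestige$prestige, predict(linear.model))        # linear
rmse(prestige$prestige, exp(predict(power.model)))    # back-transformed power
rmse(prestige$prestige, predict(logx.model))          # log(x)
rmse(prestige$prestige, exp(predict(logy.model)))     # back-transformed log(y)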
## Lowess for prestige
prestige %>%
ggvis(x=~income, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
layer_points() %>%
layer_smooths(stroke:="red", strokeWidth:=5) %>%
add_axis("x", title = "Income") %>%
scale_numeric("x", domain=c(0,25000), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Prestige") %>%
scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)
########################
########################
##### Robust Reg. ######
########################
########################
# First, fit linear model
linear.model <- lm(prestige ~ income + education + percwomn, data=prestige)
summary(linear.model)
##
## Call:
## lm(formula = prestige ~ income + education + percwomn, data = prestige)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.8246 -5.3332 -0.1364 5.1587 17.5045
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.7943342 3.2390886 -2.098 0.0385 *
## income 0.0013136 0.0002778 4.729 7.58e-06 ***
## education 4.1866373 0.3887013 10.771 < 2e-16 ***
## percwomn -0.0089052 0.0304071 -0.293 0.7702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.846 on 98 degrees of freedom
## Multiple R-squared: 0.7982, Adjusted R-squared: 0.792
## F-statistic: 129.2 on 3 and 98 DF, p-value: < 2.2e-16
# Now, take 5000 bootstrap samples using the residual approach
# (described in the activity)
library(car)
linear.boot <- Boot(linear.model, R=5000, method="residual")
summary(linear.boot)
## R original bootBias bootSE bootMed
## (Intercept) 5000 -6.7943342 -1.6143e-02 3.26057222 -6.8127110
## income 5000 0.0013136 -4.9195e-06 0.00027123 0.0013092
## education 5000 4.1866373 4.3619e-03 0.38286896 4.1985342
## percwomn 5000 -0.0089052 -6.1002e-05 0.03058230 -0.0090783
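## Confidence intervals can be read off the bootstrap replicates; car
## supplies a confint method for Boot objects (percentile intervals shown
## here as one option):
confint(linear.boot, type="perc")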
## Fictitious data with outlier
x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 25)
y <- c(1.1, 1.9, 3.1, 3.9, 5.1, 5.9, 7.1, 7.9, 9.1, 0)
df <- data.frame(x,y)
# Create linear model
linear.model <- lm(y ~ x, df)
summary(linear.model)
##
## Call:
## lm(formula = y ~ x, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8414 -2.6515 -0.1898 2.2720 4.7338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0133 1.4767 3.395 0.00943 **
## x -0.0719 0.1548 -0.464 0.65467
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.172 on 8 degrees of freedom
## Multiple R-squared: 0.02626, Adjusted R-squared: -0.09546
## F-statistic: 0.2158 on 1 and 8 DF, p-value: 0.6547
# Plot linear model
newdf <- data.frame(x=seq(min(df$x), max(df$x), len=100))
plot(y ~ x, data=df)
lines(newdf$x, predict(linear.model, newdf), col=2, lwd=4)
# Create bootstrap model with randomly sampled cases
boot2 <- Boot(linear.model, R = 10000, method="case")
summary(boot2)
## R original bootBias bootSE bootMed
## (Intercept) 10000 5.013333 -1.49143 2.73935 4.403217
## x 10000 -0.071905 0.33049 0.53793 -0.071905
# Plot bootstrap model (the coefficient adjustments below came from an
# earlier bootstrap run; with resampling, the bias estimates vary from
# run to run, so they differ slightly from the summary above)
plot(y ~ x, data=df)
curve((5.0133-1.586) + ((-0.0719 + 0.347)*x), add=T, lwd=3, col=2)
# Get predicted values from the bootstrap regression
df$pred <- 3.43 + 0.275*x
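## Bootstrapping quantifies uncertainty but does not remove the outlier's
## pull on the fit. For an actually robust line, one option (a sketch, not
## part of the original analysis) is MM-estimation via MASS::rlm, which
## resists high-leverage outliers like the point at x = 25:
library(MASS)  # note: MASS masks dplyr::select()
robust.model <- rlm(y ~ x, data=df, method="MM")
coef(robust.model)  # should land near intercept 0, slope 1, as the nine clean points suggest
lines(newdf$x, predict(robust.model, newdf), col=4, lwd=3)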
## Fit lowess
df %>%
ggvis(x=~x, y=~y, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
layer_points() %>%
layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
add_axis("x", title = "x") %>%
scale_numeric("x", domain=c(0,25), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "y") %>%
scale_numeric("y", domain=c(0, 10), nice=FALSE, clamp=TRUE)
# Create bootstrap model for prestige data
linear.model <- lm(prestige ~ income + education + percwomn, data=prestige)
boot3 <- Boot(linear.model, R = 5000, method="case")
summary(boot3)
## R original bootBias bootSE bootMed
## (Intercept) 5000 -6.7943342 -4.3558e-02 3.16021675 -6.7595832
## income 5000 0.0013136 9.7042e-05 0.00043847 0.0013219
## education 5000 4.1866373 -6.3514e-02 0.46671924 4.1713369
## percwomn 5000 -0.0089052 4.1023e-03 0.03737987 -0.0067788
########################
########################
##### Quantile Reg #####
########################
########################
## Load data
hsb <- read.csv("http://www.bradthiessen.com/html5/data/hsb.csv")
head(hsb)
## schid minority female ses mathach size schtype meanses
## 1 1224 0 1 -1.528 5.876 842 0 -0.428
## 2 1224 0 1 -0.588 19.708 842 0 -0.428
## 3 1224 0 0 -0.528 20.349 842 0 -0.428
## 4 1224 0 0 -0.668 8.781 842 0 -0.428
## 5 1224 0 0 -0.158 17.898 842 0 -0.428
## 6 1224 0 0 0.022 4.583 842 0 -0.428
hsb %>%
group_by(schid) %>%
summarize(students=n(), school.type=mean(schtype), mean.mathach=mean(mathach))
## Source: local data frame [160 x 4]
##
## schid students school.type mean.mathach
## 1 1224 47 0 9.715447
## 2 1288 25 0 13.510800
## 3 1296 48 0 7.635958
## 4 1308 20 1 16.255500
## 5 1317 48 1 13.177687
## 6 1358 30 0 11.206233
## 7 1374 28 0 9.728464
## 8 1433 35 1 19.719143
## 9 1436 44 1 18.111614
## 10 1461 33 0 16.842636
## .. ... ... ... ...
## Create scatterplot matrix
hsb2 <- data.frame(hsb$minority, hsb$female, scale(hsb$ses), scale(hsb$mathach), hsb$size)
pairs(hsb2, col=rgb(0, 0, 0, 0.01))
# Is math achievement significantly related to SES?
linear.model <- lm(scale(mathach) ~ scale(ses), data=hsb)
summary(linear.model)
##
## Call:
## lm(formula = scale(mathach) ~ scale(ses), data = hsb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.82605 -0.69174 0.03393 0.73636 2.31174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.116e-15 1.100e-02 0.00 1
## scale(ses) 3.608e-01 1.100e-02 32.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9327 on 7183 degrees of freedom
## Multiple R-squared: 0.1301, Adjusted R-squared: 0.13
## F-statistic: 1075 on 1 and 7183 DF, p-value: < 2.2e-16
anova(linear.model)
## Analysis of Variance Table
##
## Response: scale(mathach)
## Df Sum Sq Mean Sq F value Pr(>F)
## scale(ses) 1 935 934.96 1074.7 < 2.2e-16 ***
## Residuals 7183 6249 0.87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Show linear model
hsb %>%
ggvis(x=~scale(ses), y=~scale(mathach), fill := "steelblue", stroke:="steelblue", fillOpacity:=.05, strokeOpacity:=.1) %>%
layer_points() %>%
layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7, stroke:="red", se=T) %>%
add_axis("x", title = "SES") %>%
scale_numeric("x", domain=c(-4,3), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Math Achievement") %>%
scale_numeric("y", domain=c(-3, 2), nice=FALSE, clamp=TRUE)
# Quantile regression asks whether a relation
# changes across the distribution of the outcome.
# Is the relation of SES with math achievement
# different across levels of math achievement?
library(quantreg)
# Median regression
q50 <- rq(scale(mathach) ~ scale(ses), tau=0.50, data=hsb)
summary(q50)
##
## Call: rq(formula = scale(mathach) ~ scale(ses), tau = 0.5, data = hsb)
##
## tau: [1] 0.5
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) 0.02945 0.01528 1.92685 0.05404
## scale(ses) 0.44536 0.01539 28.93027 0.00000
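## summary.rq supports several standard-error methods (the default depends
## on sample size); a bootstrap alternative, for instance:
summary(q50, se="boot")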
# Graph median regression
plot(scale(hsb$ses), scale(hsb$mathach),cex=.25,type="n",xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach),cex=.5,col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach)~scale(hsb$ses),tau=.5),col=rgb(0, 0, 1, 1), lwd=4)
abline(lm(scale(hsb$mathach)~scale(hsb$ses)),lty=2,col=rgb(1, 0, 0, 1), lwd=4) #the dreaded ols line
# 10th-90th percentiles
quantregtens <- rq(scale(mathach) ~ scale(ses), data=hsb,
tau=c(0.10, 0.20, 0.30, 0.40, 0.50,
0.60, 0.70, 0.80, 0.90))
quantreg.plot <- summary(quantregtens)
## Warning in summary.rq(xi, ...): 1 non-positive fis
## Warning in summary.rq(xi, ...): 1 non-positive fis
plot(quantreg.plot, mfrow=c(1,2))
plot(scale(hsb$ses), scale(hsb$mathach),cex=.25,type="n",xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach),cex=.5,col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach)~scale(hsb$ses),tau=.5),col=rgb(0, 0, 1, 1), lwd=4)
abline(lm(scale(hsb$mathach)~scale(hsb$ses)),lty=2,col=rgb(1, 0, 0, 1), lwd=4) #the dreaded ols line
taus <- c(.1,.3,.5,.7,.9)
for (i in seq_along(taus)) {
  abline(rq(scale(hsb$mathach)~scale(hsb$ses),tau=taus[i]),col=rgb(0.5, 0.5, 0.5, 1), lwd=2)
}
## Test difference in slopes at 25th and 50th percentiles
q25 <- rq(scale(mathach) ~ scale(ses), tau=0.25, data=hsb)
summary(q25)
##
## Call: rq(formula = scale(mathach) ~ scale(ses), tau = 0.25, data = hsb)
##
## tau: [1] 0.25
##
## Coefficients:
## Value Std. Error t value Pr(>|t|)
## (Intercept) -0.68503 0.01659 -41.29596 0.00000
## scale(ses) 0.40577 0.01590 25.52552 0.00000
anova(q25,q50)
## Quantile Regression Analysis of Deviance Table
##
## Model: scale(mathach) ~ scale(ses)
## Joint Test of Equality of Slopes: tau in { 0.25 0.5 }
##
## Df Resid Df F value Pr(>F)
## 1 1 14369 7.4503 0.00635 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Minority achievement gap
# 10th-90th percentiles
quantregtens <- rq(scale(mathach) ~ minority, data=hsb,
tau=c(0.10, 0.20, 0.30, 0.40, 0.50,
0.60, 0.70, 0.80, 0.90))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
quantreg.plot <- summary(quantregtens)
plot(quantreg.plot, mfrow=c(1,2))
plot(hsb$minority, scale(hsb$mathach),cex=.25,type="n",xlab="Minority", ylab="Math Achievement")
points(hsb$minority, scale(hsb$mathach),cex=.5,col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach)~hsb$minority,tau=.5),col=rgb(0, 0, 1, 1), lwd=4)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
abline(lm(scale(hsb$mathach)~hsb$minority),lty=2,col=rgb(1, 0, 0, 1), lwd=4) #the dreaded ols line
taus <- c(.1,.3,.5,.7,.9)
for (i in seq_along(taus)) {
  abline(rq(scale(hsb$mathach)~hsb$minority,tau=taus[i]),col=rgb(0.5, 0.5, 0.5, 1), lwd=2)
}
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
# Multiple Quantile Regression
quantregtens <- rq(scale(mathach) ~ minority + scale(ses), data=hsb,
tau=c(0.10, 0.20, 0.30, 0.40, 0.50,
0.60, 0.70, 0.80, 0.90))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
quantreg.plot <- summary(quantregtens)
## Warning in summary.rq(xi, ...): 1 non-positive fis
plot(quantreg.plot, mfrow=c(1,3))
# Compare 25th and 50th percentiles
q25<-rq(scale(mathach)~scale(ses)+minority, tau=.25,data=hsb)
q50<-rq(scale(mathach)~scale(ses)+minority, tau=.50,data=hsb)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
anova(q25, q50)
## Quantile Regression Analysis of Deviance Table
##
## Model: scale(mathach) ~ scale(ses) + minority
## Joint Test of Equality of Slopes: tau in { 0.25 0.5 }
##
## Df Resid Df F value Pr(>F)
## 1 2 14368 5.4878 0.004145 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot quantile lines
plot(scale(hsb$ses), scale(hsb$mathach),cex=.25,type="n",xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach),cex=.5,col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach)~scale(hsb$ses)+hsb$minority,tau=.5),col=rgb(0, 0, 1, 1), lwd=4)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
abline(lm(scale(hsb$mathach)~scale(hsb$ses)+hsb$minority),lty=2,col=rgb(1, 0, 0, 1), lwd=4) #the dreaded ols line
## Warning in abline(lm(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority),
## : only using the first two of 3 regression coefficients
taus <- c(.1,.3,.5,.7,.9)
for (i in seq_along(taus)) {
  abline(rq(scale(hsb$mathach)~scale(hsb$ses)+hsb$minority,tau=taus[i]),col=rgb(0.5, 0.5, 0.5, 1), lwd=2)
}
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
########################
########################
##### ANOVA as Reg #####
########################
########################
#### Ambiguous prose
#### Load the data from the web
ambiguous <- read.csv("http://www.bradthiessen.com/html5/data/ambiguousprose.csv")
str(ambiguous)
## 'data.frame': 57 obs. of 2 variables:
## $ Condition : Factor w/ 3 levels "After","Before",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Comprehension: int 6 5 4 2 1 3 5 3 3 2 ...
dotPlot( ~ Comprehension | Condition, data = ambiguous, nint = 17,
endpoints = c(-2, 10), layout = c(1,3), aspect = .35, cex=.7,
xlab = "Comprehension score (0-7)")
#### Run an ANOVA modeling comprehension as a function of condition (treatments)
mod1 <- aov(Comprehension ~ Condition, data=ambiguous)
anova(mod1)
## Analysis of Variance Table
##
## Response: Comprehension
## Df Sum Sq Mean Sq F value Pr(>F)
## Condition 2 35.053 17.5263 10.012 0.0002002 ***
## Residuals 54 94.526 1.7505
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mean(Comprehension ~ Condition, data=ambiguous)
## After Before None
## 3.210526 4.947368 3.368421
## Create one possible coding (an ordinal code for condition)
## Note that this is a bad idea!
ambiguous$condcode[ambiguous$Condition=="After"] <- 1
ambiguous$condcode[ambiguous$Condition=="Before"] <- 2
ambiguous$condcode[ambiguous$Condition=="None"] <- 3
## Run a linear regression with this condition code
## Still a bad idea!
cond.code <- lm(Comprehension ~ condcode, data=ambiguous)
summary(cond.code)
##
## Call:
## lm(formula = Comprehension ~ condcode, data = ambiguous)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.92105 -0.92105 0.07895 1.15789 3.15789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.68421 0.53740 6.856 6.5e-09 ***
## condcode 0.07895 0.24877 0.317 0.752
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.534 on 55 degrees of freedom
## Multiple R-squared: 0.001828, Adjusted R-squared: -0.01632
## F-statistic: 0.1007 on 1 and 55 DF, p-value: 0.7522
## Show linear model
ambiguous %>%
ggvis(x=~condcode, y=~Comprehension, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
layer_points() %>%
layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
add_axis("x", title = "time") %>%
scale_numeric("x", domain=c(1,3), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "counts") %>%
scale_numeric("y", domain=c(0, 7), nice=FALSE, clamp=TRUE)
## Create dummy variables x1, x2, x3
## Note that creating the 3rd dummy variable is a bad idea!
ambiguous$x1 <- ifelse(ambiguous$Condition =="After", 1, 0)
ambiguous$x2 <- ifelse(ambiguous$Condition =="Before", 1, 0)
ambiguous$x3 <- ifelse(ambiguous$Condition =="None", 1, 0)
## Run linear regression with all 3 dummy variables
## Bad idea!
lin.mod <- lm(Comprehension ~ x1 + x2 + x3, data=ambiguous)
anova(lin.mod)
## Analysis of Variance Table
##
## Response: Comprehension
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 11.368 11.3684 6.4944 0.0136980 *
## x2 1 23.684 23.6842 13.5301 0.0005422 ***
## Residuals 54 94.526 1.7505
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lin.mod)
##
## Call:
## lm(formula = Comprehension ~ x1 + x2 + x3, data = ambiguous)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.94737 -0.94737 0.05263 1.05263 2.78947
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3684 0.3035 11.097 1.51e-15 ***
## x1 -0.1579 0.4293 -0.368 0.714436
## x2 1.5789 0.4293 3.678 0.000542 ***
## x3 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.323 on 54 degrees of freedom
## Multiple R-squared: 0.2705, Adjusted R-squared: 0.2435
## F-statistic: 10.01 on 2 and 54 DF, p-value: 0.0002002
## Run linear regression with the first two dummy variables
## Bad idea!
lin.mod <- lm(Comprehension ~ x1 + x2, data=ambiguous)
anova(lin.mod)
## Analysis of Variance Table
##
## Response: Comprehension
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 11.368 11.3684 6.4944 0.0136980 *
## x2 1 23.684 23.6842 13.5301 0.0005422 ***
## Residuals 54 94.526 1.7505
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lin.mod)
##
## Call:
## lm(formula = Comprehension ~ x1 + x2, data = ambiguous)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.94737 -0.94737 0.05263 1.05263 2.78947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3684 0.3035 11.097 1.51e-15 ***
## x1 -0.1579 0.4293 -0.368 0.714436
## x2 1.5789 0.4293 3.678 0.000542 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.323 on 54 degrees of freedom
## Multiple R-squared: 0.2705, Adjusted R-squared: 0.2435
## F-statistic: 10.01 on 2 and 54 DF, p-value: 0.0002002
## Run linear regression with factor variable
## Same output as above
lin.mod <- lm(Comprehension ~ Condition, data=ambiguous)
anova(lin.mod)
## Analysis of Variance Table
##
## Response: Comprehension
## Df Sum Sq Mean Sq F value Pr(>F)
## Condition 2 35.053 17.5263 10.012 0.0002002 ***
## Residuals 54 94.526 1.7505
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lin.mod)
##
## Call:
## lm(formula = Comprehension ~ Condition, data = ambiguous)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.94737 -0.94737 0.05263 1.05263 2.78947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2105 0.3035 10.577 9.03e-15 ***
## ConditionBefore 1.7368 0.4293 4.046 0.000167 ***
## ConditionNone 0.1579 0.4293 0.368 0.714436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.323 on 54 degrees of freedom
## Multiple R-squared: 0.2705, Adjusted R-squared: 0.2435
## F-statistic: 10.01 on 2 and 54 DF, p-value: 0.0002002
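## R constructs the dummy coding above automatically. To see the design
## matrix it actually uses (treatment coding, with "After" as the
## reference level), one can inspect model.matrix():
head(model.matrix(~ Condition, data=ambiguous))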
##############
## AxB ANOVA #
##############
data(ToothGrowth)
str(ToothGrowth)
## 'data.frame': 60 obs. of 3 variables:
## $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
## $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
## $ dose: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
# AxB ANOVA
aov.out = aov(len ~ supp * dose, data=ToothGrowth)
summary(aov.out)
## Df Sum Sq Mean Sq F value Pr(>F)
## supp 1 205.4 205.4 15.572 0.000231 ***
## dose 2 2426.4 1213.2 92.000 < 2e-16 ***
## supp:dose 2 108.3 54.2 4.107 0.021860 *
## Residuals 54 712.1 13.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# As regression
tooth.model <- lm(len ~ supp * dose, data=ToothGrowth)
anova(tooth.model)
## Analysis of Variance Table
##
## Response: len
## Df Sum Sq Mean Sq F value Pr(>F)
## supp 1 205.35 205.35 15.572 0.0002312 ***
## dose 2 2426.43 1213.22 92.000 < 2.2e-16 ***
## supp:dose 2 108.32 54.16 4.107 0.0218603 *
## Residuals 54 712.11 13.19
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(tooth.model)
##
## Call:
## lm(formula = len ~ supp * dose, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.20 -2.72 -0.27 2.65 8.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.230 1.148 11.521 3.60e-16 ***
## suppVC -5.250 1.624 -3.233 0.00209 **
## dose1 9.470 1.624 5.831 3.18e-07 ***
## dose2 12.830 1.624 7.900 1.43e-10 ***
## suppVC:dose1 -0.680 2.297 -0.296 0.76831
## suppVC:dose2 5.330 2.297 2.321 0.02411 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.631 on 54 degrees of freedom
## Multiple R-squared: 0.7937, Adjusted R-squared: 0.7746
## F-statistic: 41.56 on 5 and 54 DF, p-value: < 2.2e-16
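## A significant supp:dose interaction is easiest to read from an
## interaction plot; a quick base-R sketch:
with(ToothGrowth, interaction.plot(dose, supp, len,
                                   xlab="Dose", ylab="Mean tooth length",
                                   trace.label="Supplement"))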