library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)
I’m going to use a dataset that comes with a standard R installation. The dataset is from a study on the effect of vitamin C on tooth growth in guinea pigs
len
= tooth length
supp
= the way the supplement was administered (OJ = orange juice) (VC = absorbic acid)
dose
= amount of supplement in mg (levels are 0.5, 1.0, 2.0)
## Load data
data(ToothGrowth)
## The data has 60 observations on 3 variables:
head(ToothGrowth)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
## This shows that the "dose" variable is numeric
## We want to change it to a factor variable
## where 0.5 = low, 1.0 = med, 2.0 = high
str(ToothGrowth)
## 'data.frame': 60 obs. of 3 variables:
## $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
## $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
## $ dose: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
ToothGrowth$dose = factor(ToothGrowth$dose, levels=c(0.5,1.0,2.0),
labels=c("low","med","high"))
## Let's look at the sample data per cell
ToothGrowth %>%
group_by(supp, dose) %>%
summarize(n=n(), mean=mean(len), sd=sd(len))
## Source: local data frame [6 x 5]
## Groups: supp
##
## supp dose n mean sd
## 1 OJ low 10 13.23 4.459709
## 2 OJ med 10 22.70 3.910953
## 3 OJ high 10 26.06 2.655058
## 4 VC low 10 7.98 2.746634
## 5 VC med 10 16.77 2.515309
## 6 VC high 10 26.14 4.797731
Here are the means and (standard deviations) for each cell. Note that each cell has a sample size of n=10.
…… | Orange Juice | .AbsorbAcid. | Total
—— | ——————- | —————– | ————————-
Low | 13.23 (4.460) | 07.98 (2.747) | 10.605 (4.4998)
Med | 22.70 (3.911) | 16.77 (2.515) | 19.735 (4.4154)
High | 26.06 (2.655) | 26.14 (4.798) | 26.100 (3.7742)
—— | ——————- | —————– | ————————-
Total | 20.66 (6.606) | 16.96 (8.266) | N=60, 18.813 (7.6493)
There’s an easier way to get these summary statistics, but it requires running the ANOVA first.
### Let's take a look at some visualizations of this data
## Density plots to investigate normality
densityplot(~len|dose*supp, data=ToothGrowth,
main="Tooth Growth by Dose and Supplement",
xlab="Tooth Growth",
layout=c(3,2))
## Let's also run a test to check for homogeneity of variances
## Bartlett's test (run separately for dose and supplement groups)
bartlett.test(len~dose, data=ToothGrowth)
##
## Bartlett test of homogeneity of variances
##
## data: len by dose
## Bartlett's K-squared = 0.6655, df = 2, p-value = 0.717
bartlett.test(len~supp, data=ToothGrowth)
##
## Bartlett test of homogeneity of variances
##
## data: len by supp
## Bartlett's K-squared = 1.4217, df = 1, p-value = 0.2331
## What do those relatively high p-values tell us?
## Boxplots to investigate group differences
bwplot(supp~len|dose, data=ToothGrowth,
ylab="supplement", xlab="Tooth Growth",
main="Tooth Growth by Dose and Supplement",
layout=(c(1,3)))
## Let's take a look at a means plot to check for interaction
with(ToothGrowth, interaction.plot(x.factor=dose, trace.factor=supp,
response=len, fun=mean, type="b", legend=T,
ylab="Tooth Length", main="Interaction Plot",
pch=c(1,19)))
** We’re now ready to run the AxB ANOVA**
## AxB ANOVA
# Note that it models the LEN as a function of
# supp * dose. This means our model is:
# len ~ f(supp + dose + suppXdose)
aov.out = aov(len ~ supp * dose, data=ToothGrowth)
## Let's get a summary of the model
summary(aov.out)
## Df Sum Sq Mean Sq F value Pr(>F)
## supp 1 205.4 205.4 15.572 0.000231 ***
## dose 2 2426.4 1213.2 92.000 < 2e-16 ***
## supp:dose 2 108.3 54.2 4.107 0.021860 *
## Residuals 54 712.1 13.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Assuming we're using an alpha of 0.05, we have a significant
## interaction. We can now split the data by dose and compare
## supplements using a Tukey test
TukeyHSD(aov.out, which=c("supp"), conf.level=.99)
## Tukey multiple comparisons of means
## 99% family-wise confidence level
##
## Fit: aov(formula = len ~ supp * dose, data = ToothGrowth)
##
## $supp
## diff lwr upr p adj
## VC-OJ -3.7 -6.203448 -1.196552 0.0002312
## The p-value indicates the difference is statistically significant
## The confidence interval for the difference ranges from -1.2 to -6.2.
## We can plot this confidence interval
plot(TukeyHSD(aov.out, which=c("supp")))
## We could have also split our data by supplement and compare doses
TukeyHSD(aov.out, which=c("dose"), conf.level=.99)
## Tukey multiple comparisons of means
## 99% family-wise confidence level
##
## Fit: aov(formula = len ~ supp * dose, data = ToothGrowth)
##
## $dose
## diff lwr upr p adj
## med-low 9.130 5.637681 12.622319 0.0e+00
## high-low 15.495 12.002681 18.987319 0.0e+00
## high-med 6.365 2.872681 9.857319 2.7e-06
plot(TukeyHSD(aov.out, which=c("dose")))
## Finally, we could have also run Tukey tests to compare all cell means
TukeyHSD(aov.out, conf.level=.99)
## Tukey multiple comparisons of means
## 99% family-wise confidence level
##
## Fit: aov(formula = len ~ supp * dose, data = ToothGrowth)
##
## $supp
## diff lwr upr p adj
## VC-OJ -3.7 -6.203448 -1.196552 0.0002312
##
## $dose
## diff lwr upr p adj
## med-low 9.130 5.637681 12.622319 0.0e+00
## high-low 15.495 12.002681 18.987319 0.0e+00
## high-med 6.365 2.872681 9.857319 2.7e-06
##
## $`supp:dose`
## diff lwr upr p adj
## VC:low-OJ:low -5.25 -11.012826 0.5128257 0.0242521
## OJ:med-OJ:low 9.47 3.707174 15.2328257 0.0000046
## VC:med-OJ:low 3.54 -2.222826 9.3028257 0.2640208
## OJ:high-OJ:low 12.83 7.067174 18.5928257 0.0000000
## VC:high-OJ:low 12.91 7.147174 18.6728257 0.0000000
## OJ:med-VC:low 14.72 8.957174 20.4828257 0.0000000
## VC:med-VC:low 8.79 3.027174 14.5528257 0.0000210
## OJ:high-VC:low 18.08 12.317174 23.8428257 0.0000000
## VC:high-VC:low 18.16 12.397174 23.9228257 0.0000000
## VC:med-OJ:med -5.93 -11.692826 -0.1671743 0.0073930
## OJ:high-OJ:med 3.36 -2.402826 9.1228257 0.3187361
## VC:high-OJ:med 3.44 -2.322826 9.2028257 0.2936430
## OJ:high-VC:med 9.29 3.527174 15.0528257 0.0000069
## VC:high-VC:med 9.37 3.607174 15.1328257 0.0000058
## VC:high-OJ:high 0.08 -5.682826 5.8428257 1.0000000
plot(TukeyHSD(aov.out))
## We could also use t-tests with the Bonferroni adjustment
## to compare pairs of means
# This compares tooth lengths by dose levels. The table reports p-values:
with(ToothGrowth, pairwise.t.test(len, dose, p.adjust.method="bonferroni"))
##
## Pairwise comparisons using t tests with pooled SD
##
## data: len and dose
##
## low med
## med 2.0e-08 -
## high 4.4e-16 4.3e-05
##
## P value adjustment method: bonferroni
# This compares tooth lengths by supplements. The table reports p-values:
with(ToothGrowth, pairwise.t.test(len, supp, p.adjust.method="bonferroni"))
##
## Pairwise comparisons using t tests with pooled SD
##
## data: len and supp
##
## OJ
## VC 0.06
##
## P value adjustment method: bonferroni
## Let's take another look at our ANOVA summary table
summary(aov.out)
## Df Sum Sq Mean Sq F value Pr(>F)
## supp 1 205.4 205.4 15.572 0.000231 ***
## dose 2 2426.4 1213.2 92.000 < 2e-16 ***
## supp:dose 2 108.3 54.2 4.107 0.021860 *
## Residuals 54 712.1 13.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Let's calculate an eta-squared for our model
## SStotal = 205 + 2426 + 108 + 712 = 3451
## SSerror = 712
## SSmodel = SS explained by our model = SSsupp + SSdose + SSinteraction
### SSmodel = 205 + 2426 + 108 = 2739
#### eta-squared = 2739 / 3451 = 0.79
### This eta-squared will be reported as R-squared in
### the following table. We'll understand everything
### about these tables once we get to the next unit.
summary.lm(aov.out)
##
## Call:
## aov(formula = len ~ supp * dose, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.20 -2.72 -0.27 2.65 8.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.230 1.148 11.521 3.60e-16 ***
## suppVC -5.250 1.624 -3.233 0.00209 **
## dosemed 9.470 1.624 5.831 3.18e-07 ***
## dosehigh 12.830 1.624 7.900 1.43e-10 ***
## suppVC:dosemed -0.680 2.297 -0.296 0.76831
## suppVC:dosehigh 5.330 2.297 2.321 0.02411 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.631 on 54 degrees of freedom
## Multiple R-squared: 0.7937, Adjusted R-squared: 0.7746
## F-statistic: 41.56 on 5 and 54 DF, p-value: < 2.2e-16
## Now that we've conducted our AxB ANOVA, let's
## evaluate the assumptions
## This command creates a series of plots...
plot(aov.out)
## The first plot shows residuals (errors) as a function of
## fitted (predicted) values. We want to see equal variances
## in our residuals across all levels of the fitted values.
## This plot looks fine.
#### The second plot shows a Q-Q plot of the residuals. We
#### assume they follow a normal distribution. This assumption
#### is a bit off, but that's to be expected with such low
#### sample sizes. We might want to run a different analysis
#### to check our conclusions
## The third and fourth plots go beyond what we'll study in this class.
##### Finally, let's take a look at a plot of our model effects
plot.design(len~supp*dose, data=ToothGrowth)
Let’s analyze the cholesterol dataset (from class) using a randomization-based approach.
## Cholesterol example from activity
chol <- read.csv("http://www.bradthiessen.com/html5/data/cholesterol.csv")
# Run the AxB ANOVA
chol_model <- lm(cholest ~ drug*obese, data=chol)
anova(chol_model)
## Analysis of Variance Table
##
## Response: cholest
## Df Sum Sq Mean Sq F value Pr(>F)
## drug 1 46.34 46.34 0.6322 0.431743
## obese 1 752.82 752.82 10.2704 0.002831 **
## drug:obese 1 355.24 355.24 4.8465 0.034197 *
## Residuals 36 2638.77 73.30
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(cholest ~ drug*obese, data=chol))
## Analysis of Variance Table
##
## Response: cholest
## Df Sum Sq Mean Sq F value Pr(>F)
## drug 1 46.34 46.34 0.6322 0.431743
## obese 1 752.82 752.82 10.2704 0.002831 **
## drug:obese 1 355.24 355.24 4.8465 0.034197 *
## Residuals 36 2638.77 73.30
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run it again but collect the eta-squared value
chol_aov <- do(1) * lm(cholest ~ drug*obese, data=chol)
observed_r_squared <- chol_aov$r.squared
observed_r_squared
## [1] 0.304337
# Shuffle interaction term 10,000 times and record eta-squared
random_interact <- do(10000) *
lm(cholest ~ drug + obese + shuffle(drug):shuffle(obese), data=chol)
# Plot those 10,000 eta-squared values
histogram(~r.squared, data = random_interact, col="grey", xlim=c(.2, .45), xlab = "eta-squared", width=.005)
# Find how unlikely our observed eta-squared was
tally(~ (r.squared >= observed_r_squared), format="prop", data=random_interact)
##
## TRUE FALSE
## 0.0328 0.9672
## That p-value is similar to what we got from our AxB ANOVA.
# Shuffle drug
random_drug <- do(10000) * lm(cholest ~ shuffle(drug) + obese + drug:obese, data=chol)
histogram(~r.squared, data = random_drug, col="grey", xlim=c(.2, .45), xlab = "eta-squared", width=.005)
tally(~ (r.squared >= observed_r_squared), format="prop", data=random_drug)
##
## TRUE FALSE
## 0.3214 0.6786
# Shuffle obese
random_obese <- do(10000) * lm(cholest ~ drug + shuffle(obese) + drug:obese, data=chol)
histogram(~r.squared, data = random_obese, col="grey", xlim=c(0, .35), xlab = "eta-squared", width=.005)
tally(~ (r.squared >= observed_r_squared), format="prop", data=random_obese)
##
## TRUE FALSE
## 0.0003 0.9997
# Run the AxB ANOVA
tooth_model = lm(len ~ supp + dose + supp:dose, data=ToothGrowth)
anova(tooth_model)
## Analysis of Variance Table
##
## Response: len
## Df Sum Sq Mean Sq F value Pr(>F)
## supp 1 205.35 205.35 15.572 0.0002312 ***
## dose 2 2426.43 1213.22 92.000 < 2.2e-16 ***
## supp:dose 2 108.32 54.16 4.107 0.0218603 *
## Residuals 54 712.11 13.19
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run it again but collect the eta-squared value
tooth_aov <- do(1) * lm(len ~ supp + dose + supp:dose, data=ToothGrowth)
observed_r_squared <- tooth_aov$r.squared
observed_r_squared
## [1] 0.7937246
# Shuffle interaction term 10,000 times and record eta-squared
random_interact <- do(10000) *
lm(len ~ supp + dose + shuffle(supp):shuffle(dose), data=ToothGrowth)
# Plot those 10,000 eta-squared values
histogram(~r.squared, data = random_interact, col="grey", xlim=c(.7, .9), xlab = "eta-squared", width=.005)
# Find how unlikely our observed eta-squared was
tally(~ (r.squared >= observed_r_squared), format="prop", data=random_interact)
##
## TRUE FALSE
## 0.1906 0.8094
# Shuffle supplement
random_supp <- do(10000) * lm(len ~ shuffle(supp) + dose + supp:dose, data=ToothGrowth)
histogram(~r.squared, data = random_supp, col="grey", xlim=c(.75, .85), xlab = "eta-squared", width=.003)
tally(~ (r.squared >= observed_r_squared), format="prop", data=random_supp)
##
## TRUE FALSE
## 0.9997 0.0003
# Shuffle dose
random_dose <- do(10000) * lm(len ~ supp + shuffle(dose) + supp:dose, data=ToothGrowth)
histogram(~r.squared, data = random_dose, col="grey", xlim=c(.75, .9), xlab = "eta-squared", width=.005)
tally(~ (r.squared >= observed_r_squared), format="prop", data=random_dose)
##
## TRUE FALSE
## 1 0