# Load the tidyverse and mosaic packages
library(tidyverse)
library(mosaic)
What effect does a disability have on perceived job interview performance?
Two male actors recorded five videos of a job interview. Each video followed the same script (to show an applicant of average qualifications). The videos differed only in how the applicant was portrayed:

- with no disability
- as an amputee
- on crutches
- with a hearing impairment
- in a wheelchair

70 undergraduates were randomly split into five groups of n = 14, with each group viewing one video. The students were then asked to complete a 10-item Applicant Qualification Scale to assess the applicant's job qualifications. Each item was scored on a 9-point scale, and the overall average rating was recorded for each student.
Suppose we're interested in two questions:

1. Does portraying any disability (averaged across the four disability conditions) change the mean rating relative to portraying no disability?
2. Do the crutches and no-disability conditions, taken together, receive different ratings than the amputee, hearing-impairment, and wheelchair conditions?
Here are the interview scores from all 70 students:
# Download data from my website and store it as "interviews"
interviews <- read.csv("http://www.bradthiessen.com/html5/data/disability.csv")
# See the first several rows of data to get a sense of the contents
head(interviews)
# # A tibble: 6 × 2
# disability score
# * <fctr> <dbl>
# 1 none 1.9
# 2 none 2.5
# 3 none 3.0
# 4 none 3.6
# 5 none 4.1
# 6 none 4.2
interviews %>%
ggplot(aes(x = disability, y = score)) +
geom_dotplot(aes(fill=disability), binaxis = "y", binpositions="all", color = "white") +
scale_y_continuous(breaks=seq(0, 9, 1), minor_breaks=NULL) +
theme(axis.title.x = element_text(size = 12, color = "#777777")) +
theme(axis.text.x = element_text(size = 12)) +
theme(axis.title.y = element_text(size = 12, color="#777777")) +
theme(axis.text.y = element_text(size = 12)) +
theme(legend.position="none") +
labs(
title = "Qualifications by Disability"
) +
coord_flip(xlim = NULL, ylim = NULL, expand = TRUE)
interviews %>%
ggplot(aes(x = disability, y = score)) +
geom_dotplot(aes(fill=disability), binaxis = "y", binpositions="all", color = "white", alpha=.3) +
geom_boxplot(fill = "white", color = "black", alpha = 0.6) +
scale_y_continuous(breaks=seq(0, 9, 1), minor_breaks=NULL) +
theme(axis.title.x = element_text(size = 12, color = "#777777")) +
theme(axis.text.x = element_text(size = 12)) +
theme(axis.title.y = element_text(size = 12, color="#777777")) +
theme(axis.text.y = element_text(size = 12)) +
theme(legend.position="none") +
labs(
title = "Qualifications by Disability"
) +
coord_flip(xlim = NULL, ylim = NULL, expand = TRUE)
interviews %>%
ggplot(aes(x = disability, y = score)) +
stat_summary(fun.y = mean, geom = "point") +
stat_summary(fun.y = mean, geom = "line", aes(group = 1),
colour = "Blue", linetype = "dashed") +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
labs(x = "Disability portrayed", y = "Mean qualification rating")
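Note that mean_cl_boot computes bootstrapped confidence limits by calling the Hmisc package behind the scenes, so you may need to install Hmisc for the error bars to appear.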
# Install car package if it's not already installed
list.of.packages <- c("car")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(car)
# Levene Test
leveneTest(score ~ disability, data = interviews, center = mean)
# # A tibble: 2 × 3
# Df `F value` `Pr(>F)`
# * <int> <dbl> <dbl>
# 1 4 0.1960975 0.9395723
# 2 65 NA NA
# Shapiro-Wilk test
tapply(interviews$score, interviews$disability, shapiro.test)
# $Amputee
#
# Shapiro-Wilk normality test
#
# data: X[[i]]
# W = 0.96508, p-value = 0.805
#
#
# $Crutches
#
# Shapiro-Wilk normality test
#
# data: X[[i]]
# W = 0.94916, p-value = 0.5478
#
#
# $Hearing
#
# Shapiro-Wilk normality test
#
# data: X[[i]]
# W = 0.97464, p-value = 0.9318
#
#
# $none
#
# Shapiro-Wilk normality test
#
# data: X[[i]]
# W = 0.97932, p-value = 0.9707
#
#
# $Wheelchair
#
# Shapiro-Wilk normality test
#
# data: X[[i]]
# W = 0.94052, p-value = 0.4251
model <- aov(score ~ disability, data = interviews)
anova(model)
# # A tibble: 2 × 5
# Df `Sum Sq` `Mean Sq` `F value` `Pr(>F)`
# * <int> <dbl> <dbl> <dbl> <dbl>
# 1 4 30.52143 7.630357 2.86158 0.03012686
# 2 65 173.32143 2.666484 NA NA
# Store some values
SSa <- 30.521
SSt <- 30.521 + 173.321
DFa <- 4
MSe <- 2.6665
# Eta-squared
SSa / SSt
# [1] 0.1497287
# Omega-squared
( SSa - (DFa * MSe) ) / (SSt + MSe)
# [1] 0.09614616
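If you'd rather not retype rounded values, you can pull the sums of squares directly from the anova() table. Here's a sketch (the column names match the ANOVA output above):
# Pull the sums of squares straight from the anova() table
tab <- anova(aov(score ~ disability, data = interviews))
SSa <- tab$`Sum Sq`[1]     # between-groups sum of squares
SSe <- tab$`Sum Sq`[2]     # error (within-groups) sum of squares
DFa <- tab$Df[1]           # between-groups degrees of freedom
MSe <- tab$`Mean Sq`[2]    # mean square error
# Eta-squared (same value as above, without rounding error)
SSa / (SSa + SSe)
# Omega-squared
( SSa - (DFa * MSe) ) / (SSa + SSe + MSe)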
Our research questions require that we compare specific group means. Unfortunately, ANOVA results do not allow us to identify which group means differ from one another. To do this, we'll derive two types of post hoc methods: pairwise comparisons (with corrections for running multiple tests) and comparisons of linear combinations (contrasts) of group means.
# With 5 groups, we'll need 10 tests
choose(5, 2)
# [1] 10
# Family-wise alpha
1 - (.95^10)
# [1] 0.4012631
If we ran all 10 pairwise tests at α = 0.05, the chance of making at least one Type I error would balloon to about 0.40. To keep the overall (family-wise) error rate at 0.05, we can run each individual test at a smaller significance level. That's the idea behind the Bonferroni correction:
Bonferroni correction for alpha: \(\alpha_{\textrm{for each test}}=\frac{\alpha_{\textrm{overall}}}{\textrm{number of tests}}\)
or
Bonferroni correction for p-values: \(p_{\textrm{adjusted}}=p_{\textrm{unadjusted}}*\textrm{number of tests}\)
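For example, with \(\alpha_{\textrm{overall}}=0.05\) and our 10 pairwise tests, each individual test would be run at \(\alpha = 0.05/10 = 0.005\). Equivalently, we can multiply each p-value by 10 and compare it to the usual \(\alpha = 0.05\).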
Let's try the Bonferroni correction. Remember, the significant p-value from our ANOVA implies that at least one linear combination of group means differs from zero. Let's conduct all possible pairwise t-tests twice: once without correcting the p-values and once with the Bonferroni correction.
# Pairwise t-tests with no adjustment
pairwise.t.test(interviews$score, interviews$disability, p.adjust.method="none")
#
# Pairwise comparisons using t tests with pooled SD
#
# data: interviews$score and interviews$disability
#
# Amputee Crutches Hearing none
# Crutches 0.0184 - - -
# Hearing 0.5418 0.0035 - -
# none 0.4477 0.1028 0.1732 -
# Wheelchair 0.1433 0.3520 0.0401 0.4756
#
# P value adjustment method: none
# Pairwise t-tests with Bonferroni correction on the p-values
# Verify the p-values increase by a factor of 10 (in this example), capped at 1
pairwise.t.test(interviews$score, interviews$disability, p.adjust.method="bonferroni")
#
# Pairwise comparisons using t tests with pooled SD
#
# data: interviews$score and interviews$disability
#
# Amputee Crutches Hearing none
# Crutches 0.184 - - -
# Hearing 1.000 0.035 - -
# none 1.000 1.000 1.000 -
# Wheelchair 1.000 1.000 0.401 1.000
#
# P value adjustment method: bonferroni
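As a quick sanity check, we can reproduce these adjusted values with base R's p.adjust() function. Here's a sketch using two of the uncorrected p-values from the first table:
# Bonferroni adjustment by hand: multiply by the number of tests (n = 10)
p.unadj <- c(0.0035, 0.0184)   # Hearing-Crutches and Crutches-Amputee
p.adjust(p.unadj, method = "bonferroni", n = 10)
# [1] 0.035 0.184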
Because Bonferroni’s correction is conservative (and can struggle to find mean differences that actually exist), several other post-hoc strategies have been developed.
To use Holm's method, you first calculate p-values from all pairwise t-tests. Then, you rank-order those p-values from largest to smallest. For our data, the ranked order of our (uncorrected) p-values is: 0.5418, 0.4756, 0.4477, 0.3520, 0.1732, 0.1433, 0.1028, 0.0401, 0.0184, 0.0035 (ranks 1 through 10).
We then correct each of those p-values by multiplying the p-value by its rank:
Notice that for the smallest p-value, this method is exactly the same as the Bonferroni correction. In subsequent comparisons, however, the p-value is corrected only for the remaining number of comparisons. This makes it more likely to find a difference between a pair of group means.
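Here's a sketch of Holm's correction applied by hand to our 10 uncorrected p-values (R's p.adjust() additionally forces the adjusted values to never decrease as the p-values grow):
# Holm's correction by hand, using the uncorrected p-values from above
p <- c(0.0184, 0.5418, 0.0035, 0.4477, 0.1028,
       0.1732, 0.1433, 0.3520, 0.0401, 0.4756)
# Multiply the smallest p-value by 10, the next smallest by 9, and so on
adj <- sort(p) * (length(p):1)
# Cap at 1 and force the sequence to be non-decreasing
pmin(1, cummax(adj))
# [1] 0.0350 0.1656 0.3208 0.7196 0.8598 0.8660 1.0000 1.0000 1.0000 1.0000
# Same values as R's built-in adjustment:
sort(p.adjust(p, method = "holm"))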
Let’s have R calculate the pairwise tests using Holm’s method:
# Holm's method
pairwise.t.test(interviews$score, interviews$disability, p.adjust.method="holm")
#
# Pairwise comparisons using t tests with pooled SD
#
# data: interviews$score and interviews$disability
#
# Amputee Crutches Hearing none
# Crutches 0.165 - - -
# Hearing 1.000 0.035 - -
# none 1.000 0.719 0.866 -
# Wheelchair 0.860 1.000 0.321 1.000
#
# P value adjustment method: holm
We don’t have time to investigate more of these pairwise post-hoc methods (e.g., Hochberg’s False Discovery Rate, Tukey’s Honestly Significant Difference, Dunnett’s method). You should get a sense, though, for which methods perform best.
As you learned in a previous statistics class, the Type I error rate is linked to the statistical power of a test. Because of this, it’s important that post-hoc methods control the overall Type I error rate without sacrificing too much power.
Bonferroni’s correction and Tukey’s HSD control the Type I error rate well, but they lack statistical power. Bonferroni’s correction has more power when testing a small number of comparisons, while Tukey’s HSD performs better with a larger number of comparisons. Holm’s method is more powerful than both Bonferroni and Tukey (and Hochberg’s method is more powerful still).
Each of these methods, however, performs poorly if the population group variances differ and the sample sizes are unequal. If you have unequal sample sizes and unequal variances, you should probably use a robust post-hoc comparison method.
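One simple option is to drop the pooled standard deviation and run pairwise Welch t-tests (still with a p-value correction). Here's a sketch; this is not the only robust approach:
# Pairwise Welch t-tests (no pooled SD) with Holm's correction
pairwise.t.test(interviews$score, interviews$disability,
                pool.sd = FALSE, p.adjust.method = "holm")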
In case you’re interested, here’s the code to conduct a Tukey’s HSD:
# Conduct the post-hoc test on our ANOVA "model"
TukeyHSD(model, conf.level=.95)
# Tukey multiple comparisons of means
# 95% family-wise confidence level
#
# Fit: aov(formula = score ~ disability, data = interviews)
#
# $disability
# diff lwr upr p adj
# Crutches-Amputee 1.4928571 -0.2388756 3.2245899 0.1232819
# Hearing-Amputee -0.3785714 -2.1103042 1.3531613 0.9724743
# none-Amputee 0.4714286 -1.2603042 2.2031613 0.9399911
# Wheelchair-Amputee 0.9142857 -0.8174470 2.6460185 0.5781165
# Hearing-Crutches -1.8714286 -3.6031613 -0.1396958 0.0277842
# none-Crutches -1.0214286 -2.7531613 0.7103042 0.4686233
# Wheelchair-Crutches -0.5785714 -2.3103042 1.1531613 0.8812293
# none-Hearing 0.8500000 -0.8817328 2.5817328 0.6442517
# Wheelchair-Hearing 1.2928571 -0.4388756 3.0245899 0.2348141
# Wheelchair-none 0.4428571 -1.2888756 2.1745899 0.9517374
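Notice that only the Hearing-Crutches comparison is significant (adjusted p = 0.028), the same pair flagged by Holm's method.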
Recall the key questions in this study:

1. Does portraying any disability (averaged across the four disability conditions) change the mean rating relative to portraying no disability?
2. Do the crutches and no-disability conditions, taken together, differ from the other three conditions?

Pairwise comparisons won't address those questions; we need to compare linear combinations of group means. To do this, we can use the Scheffé method.
With the Scheffé method, we first need to calculate our contrast of interest by choosing weights in our linear combination:
Contrast (linear combination of means): \(\psi =\sum c_{a}\mu _{a}=c_{1}\mu _{1}+c_{2}\mu _{2}+c_{3}\mu _{3}+c_{4}\mu _{4}+c_{5}\mu _{5}, \ \ \ \textrm{where} \ \sum c_{a}=0\)
# Let's see the order of our disability groups
levels(interviews$disability)
# [1] "Amputee" "Crutches" "Hearing" "none" "Wheelchair"
# The no disabilities group is group #4 (out of 5)
# Create vector of contrast coefficients
coeff <- c(-1/4, -1/4, -1/4, 1, -1/4)
# Store group means as mm
mm <- mean(score ~ disability, data = interviews)
# Store group sample sizes as nn
nn <- tapply(interviews$score, interviews$disability, length)
# Calculate contrast (psi)
psi <- sum(coeff*mm)
psi
# [1] -0.03571429
We wouldn’t expect our observed contrast (calculated from sample data) to equal zero (even if the group population means were all identical).
Our task is to determine the likelihood of observing a contrast as or more extreme than the one we calculated (assuming the null hypothesis is true). To do this, we need to derive the sampling distribution of our contrast.
Let’s see if the general form of a t-test can be used to determine the likelihood of our contrast:
\(t=\frac{\textrm{observed}-\textrm{hypothesized}}{\textrm{standard error}}=\frac{\hat{\psi }-0}{\textrm{SE}_{\psi }}\)
If this works, we only need to derive the standard error of a contrast. Then, we could use the t-distribution to estimate our likelihood.
To derive the standard error of a contrast, we need to remember two properties of variances:
\(\textrm{var}\left ( ax \right )=a^{2}\textrm{var}\left ( x \right )\).
If X and Y are independent, then: \(\textrm{var}\left ( X+Y \right )=\textrm{var}\left ( X \right )+\textrm{var}\left ( Y \right )\)
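These two properties are easy to verify numerically. Here's a quick simulation sketch (not part of the interview analysis):
# Numerical check of the two variance properties
set.seed(1)
x <- rnorm(100000)
y <- rnorm(100000)
var(3 * x) - 9 * var(x)         # exactly zero: var(aX) = a^2 var(X)
var(x + y) - (var(x) + var(y))  # near zero when X and Y are independent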
With these two facts, we can calculate the variance of our contrast:
\(\textrm{var}\left ( \hat{\psi} \right )=\textrm{var}\left ( c_{1}\overline{X}_{1} \right )+\textrm{var}\left ( c_{2}\overline{X}_{2} \right )+\textrm{var}\left ( c_{3}\overline{X}_{3} \right )+\textrm{var}\left ( c_{4}\overline{X}_{4} \right )+\textrm{var}\left ( c_{5}\overline{X}_{5} \right )\)
\(\textrm{var}\left ( \hat{\psi} \right )=c_{1}^{2}\textrm{var}\left ( \overline{X}_{1} \right )+c_{2}^{2}\textrm{var}\left ( \overline{X}_{2} \right )+c_{3}^{2}\textrm{var}\left ( \overline{X}_{3} \right )+c_{4}^{2}\textrm{var}\left ( \overline{X}_{4} \right )+c_{5}^{2}\textrm{var}\left ( \overline{X}_{5} \right )\)
We now need one other fact about the variance of a mean: \(\textrm{var}\left ( \overline{X} \right )=\frac{\sigma _{x}^{2}}{n}\)
\(\textrm{var}\left ( \hat{\psi} \right )=c_{1}^{2}\frac{\sigma _{1}^{2}}{n_{1}}+c_{2}^{2}\frac{\sigma _{2}^{2}}{n_{2}}+c_{3}^{2}\frac{\sigma _{3}^{2}}{n_{3}}+c_{4}^{2}\frac{\sigma _{4}^{2}}{n_{4}}+c_{5}^{2}\frac{\sigma _{5}^{2}}{n_{5}}\)
If we’re going to assume the group variances are equal (which we already assumed in our ANOVA), we have:
\(\textrm{var}\left ( \hat{\psi} \right )=\sigma _{x}^{2}\sum \frac{c_{a}^{2}}{n_{a}}\)
Finally, we know MSE is our best estimate of the pooled variance (\(\sigma _{x}^{2}\)) of our groups, so we can substitute:
\(\textrm{var}\left ( \hat{\psi} \right )=\textrm{MS}_{\textrm{E}}\sum \frac{c_{a}^{2}}{n_{a}}\)
Taking the square root of that variance gives the standard error of our contrast. We can now substitute it into our t-statistic:

\(t=\frac{\textrm{observed}-\textrm{hypothesized}}{\textrm{standard error}}=\frac{\hat{\psi }-0}{\textrm{SE}_{\psi }}=\frac{\hat{\psi }-0}{\sqrt{\textrm{MS}_{\textrm{E}}\sum \frac{c_{a}^{2}}{n_{a}}}}\)

For the Scheffé method, we compare this statistic to the critical value \(\sqrt{\left ( a-1 \right )F_{\alpha ;\ a-1,\ N-a}}\), where \(a\) is the number of groups.
# Calculate standard error of contrast
# We need the value of MSE. We already stored it earlier as 'MSe'
# but here's another way to store that result
# First, run the ANOVA to get the mean squares
omnianova <- anova(aov(score ~ disability, data=interviews))
# Then, store mean square error as MSE
MSE <- omnianova$`Mean Sq`[2]
# Finally, calculate the SE of our psi
SEpsi <- sqrt(MSE) * sqrt( sum((coeff^2)/nn) )
# Calculate our test statistic (psi / std. error)
PSIoverSE <- psi / SEpsi
# Find critical value
# Choose alpha
alpha = 0.05
# sqrt(a-1) x sqrt(F)
cv <- sqrt(omnianova$Df[1]) * sqrt(qf(1-alpha, omnianova$Df[1], omnianova$Df[2]))
# Paste results
paste("observed test.stat =", round(PSIoverSE,4), ". Critical value =", round(cv,4))
# [1] "observed test.stat = -0.0732 . Critical value = 3.1705"
# We get the same test statistic with a rescaled version of the same contrast
coeff <- c(-1,-1,-1,4,-1)
mm <- mean(score ~ disability, data = interviews)
nn <- tapply(interviews$score, interviews$disability, length)
psi <- sum(coeff*mm)
omnianova <- anova(aov(score ~ disability, data=interviews))
MSE <- omnianova$`Mean Sq`[2]
SEpsi <- sqrt(MSE) * sqrt( sum((coeff^2)/nn) )
PSIoverSE <- psi / SEpsi
alpha = 0.05
cv <- sqrt(omnianova$Df[1]) * sqrt(qf(1-alpha, omnianova$Df[1], omnianova$Df[2]))
paste("observed test.stat =", round(PSIoverSE,4), ". Critical value =", round(cv,4))
# [1] "observed test.stat = -0.0732 . Critical value = 3.1705"
coeff <- c(-2,3,-2,3,-2)
mm <- mean(score ~ disability, data = interviews)
nn <- tapply(interviews$score, interviews$disability, length)
psi <- sum(coeff*mm)
omnianova <- anova(aov(score ~ disability, data=interviews))
MSE <- omnianova$`Mean Sq`[2]
SEpsi <- sqrt(MSE) * sqrt( sum((coeff^2)/nn) )
PSIoverSE <- psi / SEpsi
alpha = 0.05
cv <- sqrt(omnianova$Df[1]) * sqrt(qf(1-alpha, omnianova$Df[1], omnianova$Df[2]))
paste("observed test.stat =", round(PSIoverSE,4), ". Critical value =", round(cv,4))
# [1] "observed test.stat = 2.017 . Critical value = 3.1705"
# Load data
music <- read.csv("http://www.bradthiessen.com/html5/data/music.csv")
# Means plot with standard error bars
ggplot(music, aes(MusicType, Amount)) +
stat_summary(fun.y = mean, geom = "point") +
stat_summary(fun.y = mean, geom = "line", aes(group = 1),
colour = "Blue", linetype = "dashed") +
stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
labs(x = "Music Type", y = "Money Spent")
# Levene's test for equal variances
leveneTest(Amount ~ MusicType, data=music)
# # A tibble: 2 × 3
# Df `F value` `Pr(>F)`
# * <int> <dbl> <dbl>
# 1 2 6.256873 0.002115235
# 2 390 NA NA
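Levene's test indicates the group variances are unequal (p = 0.002), so keep the earlier discussion of robust methods in mind as you analyze these data.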
Once you're satisfied with your solutions, email your knitted HTML file to thiessenbradleya@sau.edu or give me a printed copy.
If you run into any problems, email me or work with other students in the class.
Sources
Study: Cesare, Tannenbaum, & Dalessio (1990). Interviewers' decisions related to applicant handicap type and rater empathy. *Human Performance, 3*(3).
This document is released under a Creative Commons Attribution-ShareAlike 3.0 Unported license.