# Load the tidyverse and mosaic packages
library(tidyverse)
library(mosaic)

Scenario: Job interviews

What affect does a disability have on perceived job interview performance?


Two male actors recorded five videos of a job interview. Each video followed the same script (to show an applicant of average qualifications). The videos only differed in how the applicant was portrayed:

  • Wheelchair video (the applicant appeared in a wheelchair)
  • Crutches video (the applicant used canadian crutches)
  • Hearing impaired video
  • Amputee video (the applicant appeared to have one leg amputated)
  • No disability video


70 undergraduates were randomly split into five groups of n=14, with each group viewing one video. The students were asked to complete a 10-item Applicant Qualification Scale to assess the applicant’s job qualifications. Each item was scored on a 9-point scale and the overall average rating was recorded from each student.

Suppose we’re interested in two questions:

  • Q1: Are disabilities associated with higher or lower qualifications scores?
  • Q2: Are permanent disabilities (amputee, hearing, wheelchair) associated with lower scores than temporary disabilities (crutches)?



Data Exploration

Here are the interview scores from all 70 students:

# Download data from my website and store it as "interviews"
interviews <- read.csv("http://www.bradthiessen.com/html5/data/disability.csv")

# See the first several rows of data to get a sense of the contents
head(interviews)
#    # A tibble: 6 × 2
#      disability score
#    *     <fctr> <dbl>
#    1       none   1.9
#    2       none   2.5
#    3       none   3.0
#    4       none   3.6
#    5       none   4.1
#    6       none   4.2



Dotplot


interviews %>%
  ggplot(aes(x = disability, y = score)) +
  geom_dotplot(aes(fill=disability), binaxis = "y", binpositions="all", color = "white") +
  scale_y_continuous(breaks=seq(0, 9, 1), minor_breaks=NULL) +
  theme(axis.title.x = element_text(size = 12, color = "#777777")) +
  theme(axis.text.x = element_text(size = 12)) +
  theme(axis.title.y = element_text(size = 12, color="#777777")) +
  theme(axis.text.y = element_text(size = 12)) +
  theme(legend.position="none") +
  labs(
    title = "Qualifications by Disability"
  ) +
  coord_flip(xlim = NULL, ylim = NULL, expand = TRUE)

Boxplot


interviews %>%
  ggplot(aes(x = disability, y = score)) +
  geom_dotplot(aes(fill=disability), binaxis = "y", binpositions="all", color = "white", alpha=.3) +
  geom_boxplot(fill = "white", color = "black", alpha = 0.6) +
  scale_y_continuous(breaks=seq(0, 9, 1), minor_breaks=NULL) +
  theme(axis.title.x = element_text(size = 12, color = "#777777")) +
  theme(axis.text.x = element_text(size = 12)) +
  theme(axis.title.y = element_text(size = 12, color="#777777")) +
  theme(axis.text.y = element_text(size = 12)) +
  theme(legend.position="none") +
  labs(
    title = "Qualifications by Disability"
  ) +
  coord_flip(xlim = NULL, ylim = NULL, expand = TRUE)

Means with standard errors


interviews %>%
  ggplot(aes(x = disability, y = score)) +
  stat_summary(fun.y = mean, geom = "point") + 
  stat_summary(fun.y = mean, geom = "line", aes(group = 1),
               colour = "Blue", linetype = "dashed") +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
  labs(x = "Treatment", y = "Mean Comprehension Score")

Conditions for ANOVA

  1. Based on the visualizations, what can you conclude? What are the conditions necessary to conduct an ANOVA? Expand the code and interpret the output.
# Install car package if it's not already installed
list.of.packages <- c("car")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(car)

# Levene Test
leveneTest(score ~ disability, data = interviews, center = mean)
#    # A tibble: 2 × 3
#         Df `F value`  `Pr(>F)`
#    * <int>     <dbl>     <dbl>
#    1     4 0.1960975 0.9395723
#    2    65        NA        NA

# Shapiro-Wilk test
tapply(interviews$score, interviews$disability, shapiro.test)
#    $Amputee
#    
#       Shapiro-Wilk normality test
#    
#    data:  X[[i]]
#    W = 0.96508, p-value = 0.805
#    
#    
#    $Crutches
#    
#       Shapiro-Wilk normality test
#    
#    data:  X[[i]]
#    W = 0.94916, p-value = 0.5478
#    
#    
#    $Hearing
#    
#       Shapiro-Wilk normality test
#    
#    data:  X[[i]]
#    W = 0.97464, p-value = 0.9318
#    
#    
#    $none
#    
#       Shapiro-Wilk normality test
#    
#    data:  X[[i]]
#    W = 0.97932, p-value = 0.9707
#    
#    
#    $Wheelchair
#    
#       Shapiro-Wilk normality test
#    
#    data:  X[[i]]
#    W = 0.94052, p-value = 0.4251

ANOVA

  1. Expand the code to see output from an ANOVA. Verify the degrees of freedom and interpret what the SS values represent. Interpret the p-value and effect size estimates. From this ANOVA, which disability is associated with the lowest qualifications score?
model <- aov(score ~ disability, data = interviews)
anova(model)
#    # A tibble: 2 × 5
#         Df  `Sum Sq` `Mean Sq` `F value`   `Pr(>F)`
#    * <int>     <dbl>     <dbl>     <dbl>      <dbl>
#    1     4  30.52143  7.630357   2.86158 0.03012686
#    2    65 173.32143  2.666484        NA         NA

# Store some values
SSa <- 30.521
SSt <- 30.521 + 173.321
DFa <- 4
MSe <- 2.6665

# Eta-squared
SSa / SSt
#    [1] 0.1497287

# Omega-squared
( SSa - (DFa * MSe) ) / (SSt + MSe)
#    [1] 0.09614616

Limited conclusions

Our research questions require that we compare specific group means. Unforunately, ANOVA results do not allow us to identify which group means differ from one another. To do this, we’ll derive two types of post hoc methods:

  • Pairwise comparison methods
  • Flexible comparison methods (contrasts)



Pairwise Comparisons

  1. Why shouldn’t we simply conduct t-tests to compare each pair of group means? Support your answer with a calculation.
# With 5 groups, we'll need 10 tests
choose(5, 2)
#    [1] 10

# Family-wise alpha
1 - (.95^10)
#    [1] 0.4012631
  1. If we wanted to control the overall Type I error rate (for all pairwise t-tests combined) to be \(\alpha=0.05\), what could we do?

Bonferroni correction

That’s the idea behind the Bonferroni correction:


Bonferroni correction for alpha: \(\alpha_{\textrm{for each test}}=\frac{\alpha_{\textrm{overall}}}{\textrm{number of tests}}\)

or

Bonferroni correction for p-values: \(p_{\textrm{adjusted}}=p_{\textrm{unadjusted}}*\textrm{number of tests}\)


  1. Assuming we wanted to conduct 10 t-tests to compare all five group means – and assuming we want \(\alpha_\textrm{overall}=0.05\) – at what level should we set \(\alpha\) for each test?

Let’s try the Bonferroni correction. Remember, the p-value from our ANOVA implies at least one linear combination of group means will differ from zero. Let’s conduct all possible pairwise t-tests twice: once without correcting the p-values and once with the Bonferroni correction.


# Pairwise t-tests with no adjustment
pairwise.t.test(interviews$score, interviews$disability, p.adjust.method="none")
#    
#       Pairwise comparisons using t tests with pooled SD 
#    
#    data:  interviews$score and interviews$disability 
#    
#               Amputee Crutches Hearing none  
#    Crutches   0.0184  -        -       -     
#    Hearing    0.5418  0.0035   -       -     
#    none       0.4477  0.1028   0.1732  -     
#    Wheelchair 0.1433  0.3520   0.0401  0.4756
#    
#    P value adjustment method: none

# Pairwise t-tests with Bonferroni correction on the p-values
# Verify the p-values increase by a factor of 10 (in this example)
pairwise.t.test(interviews$score, interviews$disability, p.adjust.method="bonferroni")
#    
#       Pairwise comparisons using t tests with pooled SD 
#    
#    data:  interviews$score and interviews$disability 
#    
#               Amputee Crutches Hearing none 
#    Crutches   0.184   -        -       -    
#    Hearing    1.000   0.035    -       -    
#    none       1.000   1.000    1.000   -    
#    Wheelchair 1.000   1.000    0.401   1.000
#    
#    P value adjustment method: bonferroni
  1. From the unadjusted (and Bonferroni adjusted) p-values, what conclusions can you make?
  1. The Bonferroni correction increases the magnitude of our p-values (by lowering the Type I error rate). This makes it less likely to reject a null hypothesis and conclude a pair of group means differs from one another. What does that do to the power of our statistical tests? Identify a drawback with the Bonferroni correction.

Because Bonferroni’s correction is conservative (and can struggle to find mean differences that actually exist), several other post-hoc strategies have been developed.


Holm’s method

To use Holm’s method, you first calculate p-values from all pairwise t-tests. Then, you rank order those p-values from largest to smallest. For our data, the ranked order of our (uncorrected) p-values is:

  • Tenth: Crutches vs Hearing (p=0.0035)
  • Ninth: Crutches vs Amputee (p=0.0184)
  • Eighth: Hearing vs Wheelchair (p=0.0401)
  • Seventh: None vs Crutches (p=0.1028)
  • Sixth: Wheelchair vs Amputee (p=0.1433)
  • (I’ll skip fifth through first)

We then correct each of those p-values by multiplying the p-value by its rank:

  • 10: Crutches vs Hearing (p = 0.0035 multiplied by 10 = 0.035)
  • 9: Crutches vs Amputee (p = 0.0184 mutiplied by 9 = 0.165)
  • 8: Hearing vs Wheelchair (p = 0.0401 multiplied by 8 = 0.321)
  • 7: None vs Crutches (p = 0.1028 multiplied by 7 = 0.719)
  • 6: Wheelchair vs Amputee (p = 0.1433 multiplied by 6 = 0.860)
  • (I’ll skip fifth through first)

Notice that for the smallest p-value, this method is exactly the same as the Bonferroni correction. In subsequent comparisons, however, the p-value is corrected only for the remaining number of comparisons. This makes it more likely to find a difference between a pair of group means.


Let’s have R calculate the pairwise tests using Holm’s method:

# Holm's method
pairwise.t.test(interviews$score, interviews$disability, p.adjust.method="holm")
#    
#       Pairwise comparisons using t tests with pooled SD 
#    
#    data:  interviews$score and interviews$disability 
#    
#               Amputee Crutches Hearing none 
#    Crutches   0.165   -        -       -    
#    Hearing    1.000   0.035    -       -    
#    none       1.000   0.719    0.866   -    
#    Wheelchair 0.860   1.000    0.321   1.000
#    
#    P value adjustment method: holm


Other pairwise post-hoc methods

We don’t have time to investigate more of these pairwise post-hoc methods (e.g., Hochberg’s False Discovery Rate, Tukey’s Honestly Significant Difference, Dunnett’s method). You should get a sense, though, for which methods perform best.

As you learned in a previous statistics class, the Type I error rate is linked to the statistical power of a test. Because of this, it’s important that post-hoc methods control the overall Type I error rate without sacrificing too much power.

Bonferroni’s correction and Tukey’s HSD control the Type I error rate well, but they lack statistical power. Bonferroni’s correction has more power when testing a small number of comparisons, while Tukey’s HSD performs better with a larger number of comparisons. Holm’s method is more powerful than both Bonferroni and Tukey (and Hochberg’s method is more powerful still).

Each of these methods, however, performs poorly if the population group variances differ and the sample sizes are unequal. If you have an unequal sample sizes and unequal variances, you should probably use a robust post-hoc comparison method.


In case you’re interested, here’s the code to conduct a Tukey’s HSD:

# Conduct the post-hoc test on our ANOVA "model"
TukeyHSD(model, conf.level=.95)
#      Tukey multiple comparisons of means
#        95% family-wise confidence level
#    
#    Fit: aov(formula = score ~ disability, data = interviews)
#    
#    $disability
#                              diff        lwr        upr     p adj
#    Crutches-Amputee     1.4928571 -0.2388756  3.2245899 0.1232819
#    Hearing-Amputee     -0.3785714 -2.1103042  1.3531613 0.9724743
#    none-Amputee         0.4714286 -1.2603042  2.2031613 0.9399911
#    Wheelchair-Amputee   0.9142857 -0.8174470  2.6460185 0.5781165
#    Hearing-Crutches    -1.8714286 -3.6031613 -0.1396958 0.0277842
#    none-Crutches       -1.0214286 -2.7531613  0.7103042 0.4686233
#    Wheelchair-Crutches -0.5785714 -2.3103042  1.1531613 0.8812293
#    none-Hearing         0.8500000 -0.8817328  2.5817328 0.6442517
#    Wheelchair-Hearing   1.2928571 -0.4388756  3.0245899 0.2348141
#    Wheelchair-none      0.4428571 -1.2888756  2.1745899 0.9517374



Flexible comparisons

Scheffe

Recall the key questions in this study:

  • Q1: Are disabilities associated with higher or lower qualifications scores?
  • Q2: Are permanent disabilities (amputee, hearing, wheelchair) associated with lower scores than temporary disabilities (crutches)?

Pairwise comparisons won’t address those questions; we need to compare linear combinations of group means:

  • Q1: The no disability group vs. the 4 other groups.
  • Q2: Permanent disabilities (amputee, hearing, wheelchair) vs. temporary (crutches, none).


To compare linear combination of means, we can use the Scheffe method.


With the Scheffe method, we first need to calculate our contrast of interest by choosing weights in our linear combination:

Contrast (linear combination of means): \(\psi =\sum_{a=1}^{a}c_{a}\mu _{a}=c_{1}\mu _{1}+c_{2}\mu _{2}+c_{3}\mu _{3}+c_{4}\mu _{4}+c_{5}\mu _{5}, \ \ \ \textrm{where} \ \sum c_{a}=0\)


  1. Calculate the contrast to compare the none group mean with a combination of the other 4 group means. Under a true null hypothesis, the expected value of this contrast would equal what?
# Let's see the order of our disability groups
levels(interviews$disability)
#    [1] "Amputee"    "Crutches"   "Hearing"    "none"       "Wheelchair"

# The no disabilities group is group #4 (out of 5)
# Create vector of contrast coefficients
coeff <- c(-1/4, -1/4, -1/4, 1, -1/4)

# Store group means as mm
mm <-mean(score ~ disability, data=interviews)

# Store group sample sizes as nn
nn <- tapply(interviews$score, interviews$disability, length)

# Calculate contrast (psi)
psi <- sum(coeff*mm)
psi
#    [1] -0.03571429

We wouldn’t expect our observed contrast (calculated from sample data) to equal zero (even if the group population means were all identical).

Our task is to determine the likelihood of observing a contrast as or more extreme than the one we calculated (assuming the null hypothesis is true). To do this, we need to derive the sampling distribution of our contrast.


Let’s see if the general form of a t-test can be used to determine the likelihood of our contrast:

      \(t=\frac{\textrm{observed}-\textrm{hypothesized}}{\textrm{standard error}}=\frac{\hat{\psi }-0}{\textrm{SE}_{\psi }}\)


If this works, we only need to derive the standard error of a contrast. Then, we could use the t-distribution to estimate our likelihood.

To derive the standard error of a contrast, we need to remember two properties of variances:

  • \(\textrm{var}\left ( ax \right )=a^{2}\textrm{var}\left ( x \right )\).

  • If X and Y are independent, then:   \(\textrm{var}\left ( X+Y \right )=\textrm{var}\left ( X \right )+\textrm{var}\left ( Y \right )\)


With these two facts, we can calculate the variance of our contrast:

      \(\textrm{var}\left ( \hat{\psi} \right )=\textrm{var}\left ( c_{1}\overline{X}_{1} \right )+\textrm{var}\left ( c_{2}\overline{X}_{2} \right )+\textrm{var}\left ( c_{3}\overline{X}_{3} \right )+\textrm{var}\left ( c_{4}\overline{X}_{4} \right )+\textrm{var}\left ( c_{5}\overline{X}_{5} \right )\)

      \(\textrm{var}\left ( \hat{\psi} \right )=c_{1}^{2}\textrm{var}\left ( \overline{X}_{1} \right )+c_{2}^{2}\textrm{var}\left ( \overline{X}_{2} \right )+c_{3}^{2}\textrm{var}\left ( \overline{X}_{3} \right )+c_{4}^{2}\textrm{var}\left ( \overline{X}_{4} \right )+c_{5}^{2}\textrm{var}\left ( \overline{X}_{5} \right )\)


We now need one other fact about the variance of a mean:   \(\textrm{var}\left ( \overline{X} \right )=\frac{\sigma _{x}^{2}}{n}\)


      \(\textrm{var}\left ( \hat{\psi} \right )=c_{1}^{2}\frac{\sigma _{1}^{2}}{n_{1}}+c_{2}^{2}\frac{\sigma _{2}^{2}}{n_{2}}+c_{3}^{2}\frac{\sigma _{3}^{2}}{n_{3}}+c_{4}^{2}\frac{\sigma _{4}^{2}}{n_{4}}+c_{5}^{2}\frac{\sigma _{5}^{2}}{n_{5}}\)


If we’re going to assume the group variances are equal (which we already assumed in our ANOVA), we have:


      \(\textrm{var}\left ( \hat{\psi} \right )=\sum \frac{c_{a}^{2}}{n_{a}}\sigma _{x}^{2}\)


Finally, we know MSE is our best estimate of the pooled standard deviation of our groups, so we can substitute:


      \(\textrm{var}\left ( \hat{\psi} \right )=\textrm{MS}_{\textrm{E}}\sum \frac{c_{a}^{2}}{n_{a}}\)


That’s the standard error of our contrast. We can now substitute this into our t-statistic:


      \(t=\frac{\textrm{observed}-\textrm{hypothesized}}{\textrm{standard error}}=\frac{\hat{\psi }-0}{\textrm{SE}_{\psi }}=\frac{\hat{\psi }-0}{\sqrt{\textrm{MS}_{\textrm{E}}\sum \frac{c_{a}^{2}}{n_{a}}}} \sim\sqrt{\left ( a-1 \right )F}\)


  1. Use what was just derived to estimate the p-value of our contrast of interest. What conclusions can we make?
# Calculate standard error of contrast
# We need the value of MSE.  We already stored it earlier as 'MSe'
# but here's another way to store that result

# First, run the ANOVA to get the mean squares
omnianova <- anova(aov(score ~ disability, data=interviews))
# Then, store mean square error as MSE
MSE <- omnianova$`Mean Sq`[2]
# Finally, calculate the SE of our psi
SEpsi <- sqrt(MSE) * sqrt( sum((coeff^2)/nn) )

# Calculate our test statistic (psi / std. error)
PSIoverSE <- psi / SEpsi

# Find critical value
# Choose alpha
alpha = 0.05
# sqrt(a-1) x sqrt(F)
cv <- sqrt(omnianova$Df[1]) * sqrt(qf(1-alpha, omnianova$Df[1], omnianova$Df[2]))
# Paste results
paste("observed test.stat =", round(PSIoverSE,4), ". Critical value =", round(cv,4))
#    [1] "observed test.stat = -0.0732 . Critical value = 3.1705"


# We get the same result with a different contrast
coeff <- c(-1,-1,-1,4,-1)
mm <-mean(score ~ disability, data=interviews)
nn <- tapply(interviews$score, interviews$disability, length)
psi <- sum(coeff*mm)
omnianova <- anova(aov(score ~ disability, data=interviews))
MSE <- omnianova$`Mean Sq`[2]
SEpsi <- sqrt(MSE) * sqrt( sum((coeff^2)/nn) )
SEpsi <- sqrt(MSe) * sqrt( sum((coeff^2)/nn) )
PSIoverSE <- psi / SEpsi
alpha = 0.05
cv <- sqrt(omnianova$Df[1]) * sqrt(qf(1-alpha, omnianova$Df[1], omnianova$Df[2]))
paste("observed test.stat =", round(PSIoverSE,4), ". Critical value =", round(cv,4))
#    [1] "observed test.stat = -0.0732 . Critical value = 3.1705"
  1. Calculate the contrast to compare the temporary disabilities (none and crutches) to the permanent disabilities (amputee, hearing, and wheelchair)? What conclusions can you draw?
coeff <- c(-2,3,-2,3,-2)
mm <-mean(score ~ disability, data=interviews)
nn <- tapply(interviews$score, interviews$disability, length)
psi <- sum(coeff*mm)
omnianova <- anova(aov(score ~ disability, data=interviews))
MSE <- omnianova$`Mean Sq`[2]
SEpsi <- sqrt(MSE) * sqrt( sum((coeff^2)/nn) )
PSIoverSE <- psi / SEpsi
alpha = 0.05
cv <- sqrt(omnianova$Df[1]) * sqrt(qf(1-alpha, omnianova$Df[1], omnianova$Df[2]))
paste("observed test.stat =", round(PSIoverSE,4), ". Critical value =", round(cv,4))
#    [1] "observed test.stat = 2.017 . Critical value = 3.1705"



Your turn

  1. A 2003 study investigated the effect of background music on the amount of money diners spend in restaurants. A British restaurant alternated among 3 types of background music – None (silence), Pop music, and Classical music – over the course of 18 days. Each type of music was played for 6 nights, with the order of the music being randomly determined.

    The data from N = 393 diners have been loaded in a data.frame called music. The variables are MusicType and Amount (the amount of money spent per person).

    (a) The variances in amount spent for each type of music range from 5.269 (for Classical music) to 11.445 (for None). That means the largest variance of a treatment is only a little more than twice as big as the smallest variance. When I conduct a Levene’s Test for equal variances (as displayed below), the p-value is 0.002. Explain why the p-value for this test is so low for this dataset if the sample variances only differ by a factor of 2.

    (b) Conduct an ANOVA to compare the average amount spent for the 3 types of music.

    (c) Use the Bonferroni correction to test all pairs of group means.

    (d) Use Holm’s method to compare all pairs of group means.

    (e) Write out all conclusions you can make about the effect of background music on the amount of money diners spend.

    (f) Use Scheffe’s method to compare no music (None) against the combination of Classical and Pop music.

    (g) Write a conclusion.
# Load data
music <- read.csv("http://www.bradthiessen.com/html5/data/music.csv")

# Means plot with standard error bars
ggplot(music, aes(MusicType, Amount)) + 
  stat_summary(fun.y = mean, geom = "point") + 
  stat_summary(fun.y = mean, geom = "line", aes(group = 1),
               colour = "Blue", linetype = "dashed") +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
  labs(x = "Music Type", y = "Money Spent")


# Levene's test for equal variances
leveneTest(Amount ~ MusicType, data=music)
#    # A tibble: 2 × 3
#         Df `F value`    `Pr(>F)`
#    * <int>     <dbl>       <dbl>
#    1     2  6.256873 0.002115235
#    2   390        NA          NA

Publishing Your Turn solutions

  • Download the Your Turn Solutions template
    • RStudio will automatically open it if it downloads as a .Rmd file.
    • If it downloads as a .txt file, then you can:
      • open the file in a text editor and copy its contents
      • open RStudio and create a new R Markdown file (html file)
      • delete everything and paste the contents of the template
  • Type your solutions into the Your Turn Solutions template
    • I’ve tried to show you where to type each of your solutions/answers
    • You can run the code as you type it.
  • When you’ve answered every question, click the Knit HTML button located at the top of RStudio: Drawing
    • RStudio will begin working to create a .html file of your solutions.
    • It may take a few minutes to compile everything (if you’ve conducted any simulations)
    • While the file is compiling, you may see some red text in the console.
    • When the file is finished, it will open automatically (or it will give you an error message).
      • If the file does not open automatically, you’ll see an error message in the console.
  • Once you’re satisfied with your solutions, email that html file to thiessenbradleya@sau.edu or give me a printed copy.

  • If you run into any problems, email me or work with other students in the class.


Sources

Study: Cesare, Tannenbaum, Dalessio (1990). Interviewers’ Decisions Related to Applicant Handicap Type and Rater Empathy. Human Performance, 3(3)

This document is released under a Creative Commons Attribution-ShareAlike 3.0 Unported license.