Correlation: Malevolent Uniforms
## Load data
NFL <- read.csv("http://www.bradthiessen.com/html5/data/NFLuniforms.csv")
head(NFL)
## NFLTeam NFL_Malevolence ZPenYds
## 1 LA Raiders 5.10 1.19
## 2 Pittsburgh 5.00 0.48
## 3 Cincinnati 4.97 0.27
## 4 New Orleans 4.83 0.10
## 5 Chicago 4.68 0.29
## 6 Kansas City 4.58 -0.19
## Scatterplot
NFL %>%
ggvis(x=~NFL_Malevolence, y=~ZPenYds, fill := "steelblue", stroke:="steelblue", fillOpacity:=.6, strokeOpacity:=.6) %>%
layer_points() %>%
add_axis("x", title = "Malevolence") %>%
scale_numeric("x", domain=c(2.5, 5.5), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Penalty yards (z-score)") %>%
scale_numeric("y", domain=c(-2, 2), nice=FALSE, clamp=TRUE)
## Scatterplot with trend
NFL %>%
ggvis(x=~NFL_Malevolence, y=~ZPenYds, fill := "steelblue", stroke:="steelblue", fillOpacity:=.6, strokeOpacity:=.6) %>%
layer_points() %>%
layer_smooths() %>%
add_axis("x", title = "Malevolence") %>%
scale_numeric("x", domain=c(2.5, 5.5), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Penalty yards (z-score)") %>%
scale_numeric("y", domain=c(-2, 2), nice=FALSE, clamp=TRUE)
## Calculate Pearson's r
cor(NFL_Malevolence, ZPenYds, data=NFL)
## [1] 0.429796
## Test correlation (null: r=0)
cor.test(NFL$NFL_Malevolence, NFL$ZPenYds, alternative="greater")
##
## Pearson's product-moment correlation
##
## data: NFL$NFL_Malevolence and NFL$ZPenYds
## t = 2.4272, df = 26, p-value = 0.01122
## alternative hypothesis: true correlation is greater than 0
## 95 percent confidence interval:
## 0.1299369 1.0000000
## sample estimates:
## cor
## 0.429796
## Calculate Spearman's rho
cor(NFL_Malevolence, ZPenYds, data=NFL, method="spearman")
## [1] 0.2634173
cor.test(NFL$NFL_Malevolence, NFL$ZPenYds, method="spearman", alternative="greater")
## Warning in cor.test.default(NFL$NFL_Malevolence, NFL$ZPenYds, method =
## "spearman", : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: NFL$NFL_Malevolence and NFL$ZPenYds
## S = 2691.473, p-value = 0.08781
## alternative hypothesis: true rho is greater than 0
## sample estimates:
## rho
## 0.2634173
## Calculate Kendall's tau
cor(NFL_Malevolence, ZPenYds, data=NFL, method="kendall")
## [1] 0.1861702
cor.test(NFL$NFL_Malevolence, NFL$ZPenYds, method="kendall", alternative="greater")
## Warning in cor.test.default(NFL$NFL_Malevolence, NFL$ZPenYds, method =
## "kendall", : Cannot compute exact p-value with ties
##
## Kendall's rank correlation tau
##
## data: NFL$NFL_Malevolence and NFL$ZPenYds
## z = 1.384, p-value = 0.08317
## alternative hypothesis: true tau is greater than 0
## sample estimates:
## tau
## 0.1861702
## Randomization-based test for r
test.stat <- cor(NFL_Malevolence, ZPenYds, data=NFL)
randomize <- do(10000) * cor(NFL_Malevolence, shuffle(ZPenYds), data=NFL)
histogram(~result, data = randomize, col="grey", xlim=c(-1, 1), xlab = "Randomization - Correlation", width=.01,
main=paste("Observed r =", cor(NFL_Malevolence, ZPenYds, data=NFL)))
tally((~result >= test.stat), data=randomize, format="prop")
##
## TRUE FALSE
## 0.0101 0.9899
## Bootstrap estimate for r
boot.cor = function(x, y, n=5000, p=0.9, method="pearson"){
w = length(x); x.r = x; y.r = y
sm = 1:w; cor.b = 1:n
for (k in 1:n){
s = sample(sm, w, replace = T)
for (i in 1:w)
{
x.r[i] = x[s[i]]
y.r[i] = y[s[i]]
}
cor.b[k] = cor(x.r, y.r, use = "pairwise", method)
}
cor.b = sort(cor.b); a = round(n*(1-p)/2,0); b = round(n*(p+1)/2,0)
vec = c(cor.b[a+1], cor(x, y, use = "pairwise", method), cor.b[b-1])
n.r = c("value"); n.c = c("lower_bound", "correlation", "upper_bound")
matrix(vec,1,3,dimnames = list(n.r,n.c))
}
boot.cor(NFL$NFL_Malevolence, NFL$ZPenYds)
## lower_bound correlation upper_bound
## value 0.008809799 0.429796 0.685738
### Load a built-in dataset about Midwest states
data(midwest)
midwest <- midwest
names(midwest)
## [1] "PID" "county" "state"
## [4] "area" "poptotal" "popdensity"
## [7] "popwhite" "popblack" "popamerindian"
## [10] "popasian" "popother" "percwhite"
## [13] "percblack" "percamerindan" "percasian"
## [16] "percother" "popadults" "perchsd"
## [19] "percollege" "percprof" "poppovertyknown"
## [22] "percpovertyknown" "percbelowpoverty" "percchildbelowpovert"
## [25] "percadultpoverty" "percelderlypoverty" "inmetro"
## [28] "category"
midwest %>%
ggvis(x=~percollege, y=~percbelowpoverty, fill := "steelblue", stroke:="steelblue", fillOpacity:=.1, strokeOpacity:=.3) %>%
layer_points() %>%
add_axis("x", title = "% attending college") %>%
scale_numeric("x", domain=c(0, 50), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "% below poverty") %>%
scale_numeric("y", domain=c(0, 50), nice=FALSE, clamp=TRUE)
## We'll look at the correlation between
##### percbelowpoverty (% of population below poverty)
##### percollege (% of population who attended college)
## First let's calculate Pearson's r
cor(percbelowpoverty, percollege, data=midwest)
## [1] -0.2814064
## Now, let's run a test for that correlation
cor.test(midwest$percbelowpoverty, midwest$percollege, alternative="less")
##
## Pearson's product-moment correlation
##
## data: midwest$percbelowpoverty and midwest$percollege
## t = -6.1164, df = 435, p-value = 1.067e-09
## alternative hypothesis: true correlation is less than 0
## 95 percent confidence interval:
## -1.0000000 -0.2072089
## sample estimates:
## cor
## -0.2814064
## We get a p-value of 0.000000001067
## Let's try a randomization-based test
test.stat <- cor(midwest$percbelowpoverty, midwest$percollege)
randomize <- do(10000) * cor(midwest$percbelowpoverty, shuffle(midwest$percollege))
histogram(~result, data = randomize, col="grey", xlim=c(-.5, .5),
xlab = "Randomization - Correlation", width=.01,
main=paste("Observed r =", cor(midwest$percbelowpoverty, midwest$percollege)))
tally((~result <= test.stat), data=randomize, format="prop")
##
## TRUE FALSE
## 0 1
## Let's try Spearman's rho
cor(midwest$percbelowpoverty, midwest$percollege, method="spearman")
## [1] -0.3334071
cor.test(midwest$percbelowpoverty, midwest$percollege, method="spearman", alternative="less")
##
## Spearman's rank correlation rho
##
## data: midwest$percbelowpoverty and midwest$percollege
## S = 18546140, p-value = 5.626e-13
## alternative hypothesis: true rho is less than 0
## sample estimates:
## rho
## -0.3334071
## ... and Kendall's tau
cor(midwest$percbelowpoverty, midwest$percollege, method="kendall")
## [1] -0.2334516
cor.test(midwest$percbelowpoverty, midwest$percollege, method="kendall", alternative="less")
##
## Kendall's rank correlation tau
##
## data: midwest$percbelowpoverty and midwest$percollege
## z = -7.2911, p-value = 1.537e-13
## alternative hypothesis: true tau is less than 0
## sample estimates:
## tau
## -0.2334516