library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)
library(aod)
## Binge drinking data
male <- c(rep(1,7152), rep(0,9944))
binge <- c(rep(1,1624), rep(0,5528), rep(1,1690), rep(0,8254))
drink <- data.frame(male, binge)
table(male, binge)
## binge
## male 0 1
## 0 8254 1690
## 1 5528 1624
## Logistic regression
drink.logmodel <- glm(binge ~ male, data=drink, family=binomial(link="logit"))
summary(drink.logmodel)
##
## Call:
## glm(formula = binge ~ male, family = binomial(link = "logit"),
## data = drink)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7177 -0.7177 -0.6104 -0.6104 1.8827
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.58597 0.02670 -59.400 <2e-16 ***
## male 0.36104 0.03885 9.292 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16814 on 17095 degrees of freedom
## Residual deviance: 16728 on 17094 degrees of freedom
## AIC: 16732
##
## Number of Fisher Scoring iterations: 4
# Plot result
ggplot(drink, aes(x=male, y=binge)) +
geom_point(shape=1, alpha=.01, position=position_jitter(width=.05,height=.05)) +
stat_smooth(method="glm", family="binomial", se=FALSE)
### Plot logistic curve through proportions
## Create data containing proportion of binge drinkers by gender
binge.drink.proportions <- drink %>%
group_by(male) %>%
summarize(drinkprop=mean(binge))
## Find predicted values from our logistic model
binge.drink.proportions$predicted <- predict(drink.logmodel, binge.drink.proportions)
plot(drinkprop ~ male, data=binge.drink.proportions)
lines(binge.drink.proportions$male, (exp(binge.drink.proportions$predicted)/(1+exp(binge.drink.proportions$predicted))), type="l", col="red", lw=3)
## Grad school admissions
## Example 2. A researcher is interested in how variables,
## such as GRE (Graduate Record Exam scores),
## GPA (grade point average) and prestige of the undergraduate
## institution, affect admission into graduate school.
## The response variable, admit/don't admit, is a binary variable.
# Load data
admit <- read.csv("http://www.bradthiessen.com/html5/data/admit.csv")
head(admit)
## admit gre gpa rank
## 1 0 380 3.61 3
## 2 1 660 3.67 3
## 3 1 800 4.00 1
## 4 1 640 3.19 4
## 5 0 520 2.93 4
## 6 1 760 3.00 2
# Convert rank to a
str(admit)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
admit$rank <- factor(admit$rank)
# Calculate means and standard deviations
sapply(admit, sd)
## admit gre gpa rank
## 0.4660867 115.5165364 0.3805668 0.9444602
sapply(admit, mean)
## Warning in mean.default(evalF$right[, 1], ...): argument is not numeric or
## logical: returning NA
## admit gre gpa rank
## 0.3175 587.7000 3.3899 NA
# Plot the scatterplot matrix
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(admit,
cex = 1, pch = 1, bg = "steelblue",
diag.panel = panel.hist, cex.labels = 1, font.labels = 2)
## Predict admission from gre scores
admit.gre <- glm(admit ~ gre, data=admit, family="binomial")
summary(admit.gre)
##
## Call:
## glm(formula = admit ~ gre, family = "binomial", data = admit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1623 -0.9052 -0.7547 1.3486 1.9879
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.901344 0.606038 -4.787 1.69e-06 ***
## gre 0.003582 0.000986 3.633 0.00028 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 486.06 on 398 degrees of freedom
## AIC: 490.06
##
## Number of Fisher Scoring iterations: 4
## CIs using profiled log-likelihood
confint(admit.gre)
## 2.5 % 97.5 %
## (Intercept) -4.119988259 -1.739756286
## gre 0.001679963 0.005552748
## CIs using standard errors
confint.default(admit.gre)
## 2.5 % 97.5 %
## (Intercept) -4.089156159 -1.713532380
## gre 0.001649677 0.005514746
library(aod)
wald.test(b = coef(admit.gre), Sigma = vcov(admit.gre), Terms = 2)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 13.2, df = 1, P(> X2) = 0.00028
## odds ratios only
exp(coef(admit.gre))
## (Intercept) gre
## 0.0549493 1.0035886
## odds ratios and 95% CI
exp(cbind(OR = coef(admit.gre), confint(admit.gre)))
## OR 2.5 % 97.5 %
## (Intercept) 0.0549493 0.01624471 0.1755632
## gre 1.0035886 1.00168137 1.0055682
# Plot logistic curve through proportions
admitprop <- admit %>%
group_by(gre) %>%
summarize(admitprop=mean(admit), meangre=mean(gre), admittotal=sum(admit), count=n(),
notadmit=count-admittotal)
admitprop$predicted <- predict(admit.gre, admitprop)
plot(admitprop ~ gre, data=admitprop)
lines(admitprop$gre, (exp(admitprop$predicted)/(1+exp(admitprop$predicted))), type="l", col="red", lw=3)
## Challenger example
temp <- c(66, 69, 68, 72, 73, 70, 78, 67, 79, 70, 67, 57, 81,
76, 76, 63, 70, 53, 67, 75, 70, 75, 58, 80, 31)
damage <- c(rep(0,9), rep(1,14), rep(NA,2))
shuttle <- data.frame(temp, damage)
head(shuttle)
## temp damage
## 1 66 0
## 2 69 0
## 3 68 0
## 4 72 0
## 5 73 0
## 6 70 0
## Predict damage from temp
shuttle.temp <- glm(damage ~ temp, data=shuttle, family="binomial")
summary(shuttle.temp)
##
## Call:
## glm(formula = damage ~ temp, family = "binomial", data = shuttle)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4859 -1.2959 0.7172 0.9978 1.3002
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.03663 4.81697 1.046 0.296
## temp -0.06569 0.06819 -0.963 0.335
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 30.789 on 22 degrees of freedom
## Residual deviance: 29.778 on 21 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 33.778
##
## Number of Fisher Scoring iterations: 4
## CIs using profiled log-likelihood
confint(shuttle.temp)
## 2.5 % 97.5 %
## (Intercept) -3.7413959 15.89616057
## temp -0.2179992 0.05973434
## CIs using standard errors
confint.default(shuttle.temp)
## 2.5 % 97.5 %
## (Intercept) -4.4044608 14.47771468
## temp -0.1993375 0.06795919
## odds ratios only
exp(coef(shuttle.temp))
## (Intercept) temp
## 153.9498532 0.9364219
## odds ratios and 95% CI
exp(cbind(OR = coef(shuttle.temp), confint(shuttle.temp)))
## OR 2.5 % 97.5 %
## (Intercept) 153.9498532 0.02372097 8.009674e+06
## temp 0.9364219 0.80412610 1.061554e+00
# Plot proportions and logistic function
shuttleprob <- shuttle %>%
group_by(temp) %>%
summarize(damageprop=mean(damage), temperature=mean(temp))
shuttleprob$predicted <- predict(shuttle.temp, shuttleprob)
plot(damageprop ~ temperature, data=shuttleprob)
lines(shuttleprob$temperature, (exp(shuttleprob$predicted)/(1+exp(shuttleprob$predicted))), type="l", col="red", lw=3)
#######################
## Toenail infection ##
#######################
toenail <- read.csv("http://www.bradthiessen.com/html5/data/toenail.csv")
# Plot proportions over time
toenailprop <- toenail %>%
group_by(treatment, visit) %>%
summarize(propinfect=mean(outcome), meanmonth=mean(month))
ggplot(toenailprop, aes(meanmonth, propinfect)) +
geom_point() + geom_line(size=2) +
facet_grid(. ~ treatment)
## Fit model
toenail.model <- glm(outcome ~ treatment*month, data=toenail, family="binomial")
summary(toenail.model)
##
## Call:
## glm(formula = outcome ~ treatment * month, family = "binomial",
## data = toenail)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9519 -0.7892 -0.5080 -0.2532 2.7339
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5566273 0.1089628 -5.108 3.25e-07 ***
## treatmentTerbinafine -0.0005817 0.1561463 -0.004 0.9970
## month -0.1703078 0.0236199 -7.210 5.58e-13 ***
## treatmentTerbinafine:month -0.0672216 0.0375235 -1.791 0.0732 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1980.5 on 1907 degrees of freedom
## Residual deviance: 1816.0 on 1904 degrees of freedom
## AIC: 1824
##
## Number of Fisher Scoring iterations: 5
## CIs using profiled log-likelihood
confint(toenail.model)
## 2.5 % 97.5 %
## (Intercept) -0.7719213 -0.344475861
## treatmentTerbinafine -0.3067395 0.305615999
## month -0.2182556 -0.125489310
## treatmentTerbinafine:month -0.1419292 0.005563446
## CIs using standard errors
confint.default(toenail.model)
## 2.5 % 97.5 %
## (Intercept) -0.7701903 -0.343064177
## treatmentTerbinafine -0.3066229 0.305459549
## month -0.2166020 -0.124013574
## treatmentTerbinafine:month -0.1407662 0.006323002
## odds ratios only
exp(coef(toenail.model))
## (Intercept) treatmentTerbinafine
## 0.5731389 0.9994185
## month treatmentTerbinafine:month
## 0.8434052 0.9349880
## odds ratios and 95% CI
exp(cbind(OR = coef(toenail.model), confint(toenail.model)))
## OR 2.5 % 97.5 %
## (Intercept) 0.5731389 0.4621244 0.7085917
## treatmentTerbinafine 0.9994185 0.7358422 1.3574609
## month 0.8434052 0.8039199 0.8820652
## treatmentTerbinafine:month 0.9349880 0.8676827 1.0055790
# Plot logistic curves through proportions
# I need to rename a variable back to "month"
toenailprop$month <- toenailprop$meanmonth
toenailprop$predicted <- predict(toenail.model, toenailprop)
ggplot(toenailprop, aes(month, propinfect)) +
geom_point() +
geom_line(aes(month, (exp(predicted)/(1+exp(predicted)))),
size=2, col="red", linetype="dashed") +
geom_line(aes(month, propinfect), size=2, col="black") +
facet_grid(. ~ treatment)
#######################
## Multiple Logistic ##
## Regression ##
#######################
### Multiple logistic regression
xtabs(~admit + rank, data = admit)
## rank
## admit 1 2 3 4
## 0 28 97 93 55
## 1 33 54 28 12
admitlogit <- glm(admit ~ gre + gpa + rank, data = admit, family = "binomial")
summary(admitlogit)
##
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial",
## data = admit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6268 -0.8662 -0.6388 1.1490 2.0790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.989979 1.139951 -3.500 0.000465 ***
## gre 0.002264 0.001094 2.070 0.038465 *
## gpa 0.804038 0.331819 2.423 0.015388 *
## rank2 -0.675443 0.316490 -2.134 0.032829 *
## rank3 -1.340204 0.345306 -3.881 0.000104 ***
## rank4 -1.551464 0.417832 -3.713 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 458.52 on 394 degrees of freedom
## AIC: 470.52
##
## Number of Fisher Scoring iterations: 4
## CIs using profiled log-likelihood
confint(admitlogit)
## 2.5 % 97.5 %
## (Intercept) -6.2716202334 -1.792547080
## gre 0.0001375921 0.004435874
## gpa 0.1602959439 1.464142727
## rank2 -1.3008888002 -0.056745722
## rank3 -2.0276713127 -0.670372346
## rank4 -2.4000265384 -0.753542605
## CIs using standard errors
confint.default(admitlogit)
## 2.5 % 97.5 %
## (Intercept) -6.2242418514 -1.755716295
## gre 0.0001202298 0.004408622
## gpa 0.1536836760 1.454391423
## rank2 -1.2957512650 -0.055134591
## rank3 -2.0169920597 -0.663415773
## rank4 -2.3703986294 -0.732528724
library(aod)
wald.test(b = coef(admitlogit), Sigma = vcov(admitlogit), Terms = 4:6)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 20.9, df = 3, P(> X2) = 0.00011
l <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(admitlogit), Sigma = vcov(admitlogit), L = l)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 5.5, df = 1, P(> X2) = 0.019
## odds ratios only
exp(coef(admitlogit))
## (Intercept) gre gpa rank2 rank3 rank4
## 0.0185001 1.0022670 2.2345448 0.5089310 0.2617923 0.2119375
## odds ratios and 95% CI
exp(cbind(OR = coef(admitlogit), confint(admitlogit)))
## OR 2.5 % 97.5 %
## (Intercept) 0.0185001 0.001889165 0.1665354
## gre 1.0022670 1.000137602 1.0044457
## gpa 2.2345448 1.173858216 4.3238349
## rank2 0.5089310 0.272289674 0.9448343
## rank3 0.2617923 0.131641717 0.5115181
## rank4 0.2119375 0.090715546 0.4706961
newdata1 <- with(admit, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
newdata1$rankP <- predict(admitlogit, newdata = newdata1, type = "response")
newdata1
## gre gpa rank rankP
## 1 587.7 3.3899 1 0.5166016
## 2 587.7 3.3899 2 0.3522846
## 3 587.7 3.3899 3 0.2186120
## 4 587.7 3.3899 4 0.1846684
newdata2 <- with(admit, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))
newdata3 <- cbind(newdata2, predict(admitlogit, newdata = newdata2, type = "link",
se = TRUE))
newdata3 <- within(newdata3, {
PredictedProb <- plogis(fit)
LL <- plogis(fit - (1.96 * se.fit))
UL <- plogis(fit + (1.96 * se.fit))
})
## view first few rows of final dataset
head(newdata3)
## gre gpa rank fit se.fit residual.scale UL
## 1 200.0000 3.3899 1 -0.8114870 0.5147714 1 0.5492064
## 2 206.0606 3.3899 1 -0.7977632 0.5090986 1 0.5498513
## 3 212.1212 3.3899 1 -0.7840394 0.5034491 1 0.5505074
## 4 218.1818 3.3899 1 -0.7703156 0.4978239 1 0.5511750
## 5 224.2424 3.3899 1 -0.7565919 0.4922237 1 0.5518545
## 6 230.3030 3.3899 1 -0.7428681 0.4866494 1 0.5525464
## LL PredictedProb
## 1 0.1393812 0.3075737
## 2 0.1423880 0.3105042
## 3 0.1454429 0.3134499
## 4 0.1485460 0.3164108
## 5 0.1516973 0.3193867
## 6 0.1548966 0.3223773
ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,
ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank),
size = 1)
with(admitlogit, null.deviance - deviance)
## [1] 41.45903
with(admitlogit, df.null - df.residual)
## [1] 5
with(admitlogit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 7.578194e-08
logLik(admitlogit)
## 'log Lik.' -229.2587 (df=6)
ggplot(drink, aes(x=male, y=binge)) + geom_point() +
stat_smooth(method="glm", family="binomial", se=FALSE)
########################
########################
## Poisson Regression ##
########################
########################
## Overdispersion
## Poisson: mean = variance.
## When variance > mean, we have overdispersion.
# Use a negative binomial distribution
# Use quasi-likelihood models with dispersion estimated from data
# Let's load a dataset on the mating of elephants
## Elephants reach maturity at about 14 years of age.
## But they have to compete with all adult males for mating
## opportunity. Females are generally more receptive to larger males.
## Size of an elephant increases as age increases.
## Hence it is expected that generally the number of matings should
## increase with age. Is there an optimal age after which the success
## rate does not rise further? Mating is a rare event and hence may
## follow a Poisson distribution.
library(gpk)
data(elephant)
head(elephant)
## Age_in_Years Number_of_Matings
## 1 27 0
## 2 28 1
## 3 28 1
## 4 28 1
## 5 28 3
## 6 29 0
elephant$Age <- elephant$Age_in_Years
elephant$Matings <- elephant$Number_of_Matings
plot(elephant$Age,jitter(elephant$Matings))
# It looks like the number of mates tends to be higher
# for older elephants, AND there seems to be more variability
# in the number of mates as age increases.
mean(elephant$Matings)
## [1] 2.682927
var(elephant$Matings)
## [1] 5.071951
mean(elephant$Matings)/var(elephant$Matings)
## [1] 0.5289733
eg <- elephant %>%
group_by(Age) %>%
summarize(mean=mean(Matings), var=var(Matings))
plot(log(eg$mean), log(eg$var), xlab="log(Mean matings)", ylab="log(Variance of matings)",
main="Figure 4.1. Mean-Variance Relationship")
mtext("matings by age", padj=-0.5)
abline(0,1)
## Null model
null.model <- glm(Matings ~ 1, family=poisson, data=elephant)
summary(null.model)
##
## Call:
## glm(formula = Matings ~ 1, family = poisson, data = elephant)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3164 -1.1799 -0.4368 0.1899 3.0252
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.98691 0.09535 10.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 75.372 on 40 degrees of freedom
## Residual deviance: 75.372 on 40 degrees of freedom
## AIC: 178.82
##
## Number of Fisher Scoring iterations: 5
exp(coef(null.model))
## (Intercept)
## 2.682927
mean(elephant$Matings)
## [1] 2.682927
## Show predictions
elephant$null.pred <- exp(predict(null.model, elephant))
plot(elephant$Age,jitter(elephant$Matings))
lines(elephant$Age,elephant$null.pred, lw=3, col=1)
## Deviance
deviance(null.model)
## [1] 75.37174
pchisq(0.95, df.residual(null.model))
## [1] 8.940942e-26
## Model with age
model <- glm(Matings ~ Age, family=poisson, data=elephant)
summary(model)
##
## Call:
## glm(formula = Matings ~ Age, family = poisson, data = elephant)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.80798 -0.86137 -0.08629 0.60087 2.17777
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.58201 0.54462 -2.905 0.00368 **
## Age 0.06869 0.01375 4.997 5.81e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 75.372 on 40 degrees of freedom
## Residual deviance: 51.012 on 39 degrees of freedom
## AIC: 156.46
##
## Number of Fisher Scoring iterations: 5
exp(coef(model))
## (Intercept) Age
## 0.2055619 1.0711071
## Predict matings from age
elephant$predicted <- exp(predict(model, elephant))
lines(elephant$Age,elephant$predicted, lw=3, col=2)
## Prediction for 30 year-old elephant
exp(coef(model)[1]+coef(model)[2]*30)
## (Intercept)
## 1.614098
## Test model fit
deviance(null.model)
## [1] 75.37174
deviance(model)
## [1] 51.01163
pchisq(0.95, df.residual(model))
## [1] 5.841306e-25
1-pchisq(deviance(null.model)-deviance(model), df.residual(null.model)-df.residual(model))
## [1] 7.99062e-07
# Since the p-value is small,
# there is evidence that the addition of age
# explains a significant amount (more) of the deviance
### Next Example ###
### Publications ###
# Load data on publications by PhD Biochemists, Long (1990)
ab <- read.csv("http://www.bradthiessen.com/html5/data/biochem.csv")
tbl_df(ab)
## Source: local data frame [915 x 6]
##
## art fem mar kid5 phd ment
## 1 0 Men Married 0 2.52 7
## 2 0 Women Single 0 2.05 6
## 3 0 Women Single 0 3.75 6
## 4 0 Men Married 1 1.18 3
## 5 0 Women Single 0 3.75 26
## 6 0 Women Married 2 3.59 2
## 7 0 Women Single 0 3.19 3
## 8 0 Men Married 2 2.96 4
## 9 0 Men Single 0 4.62 6
## 10 0 Women Married 0 1.25 0
## .. ... ... ... ... ... ...
## art: articles in last three years of Ph.D.
## fem: coded one for females
## mar: coded one if married
## kid5: number of children under age six
## phd: prestige of Ph.D. program
## ment: articles by mentor in last three years
mean(ab$art)
## [1] 1.692896
var(ab$art)
## [1] 3.709742
mean(ab$art)/var(ab$art)
## [1] 0.456338
# The mean number of articles is 1.69 and the variance is 3.71,
# a bit more than twice the mean. The data are over-dispersed,
# but of course we haven't considered any covariates yet.
# Null model
null.model <- glm(art ~ 1, family=poisson, data=ab)
summary(null.model)
##
## Call:
## glm(formula = art ~ 1, family = poisson, data = ab)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8401 -1.8401 -0.5770 0.2294 7.5677
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.52644 0.02541 20.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1817.4 on 914 degrees of freedom
## Residual deviance: 1817.4 on 914 degrees of freedom
## AIC: 3487.1
##
## Number of Fisher Scoring iterations: 5
exp(coef(null.model))
## (Intercept)
## 1.692896
1-pchisq(deviance(null.model), df.residual(null.model))
## [1] 0
# Predict with gender
model1 <- glm(art ~ fem, family=poisson, data=ab)
summary(model1)
##
## Call:
## glm(formula = art ~ fem, family = poisson, data = ab)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9404 -1.7148 -0.4119 0.4139 7.3221
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.63265 0.03279 19.293 < 2e-16 ***
## femWomen -0.24718 0.05187 -4.765 1.89e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1817.4 on 914 degrees of freedom
## Residual deviance: 1794.4 on 913 degrees of freedom
## AIC: 3466.1
##
## Number of Fisher Scoring iterations: 5
exp(coef(model1))
## (Intercept) femWomen
## 1.8825911 0.7810027
1-pchisq(deviance(null.model)-deviance(model1),
df.residual(null.model)-df.residual(model1))
## [1] 1.596071e-06
# Predict with gender and marriage
model2 <- glm(art ~ fem + mar, family=poisson, data=ab)
summary(model2)
##
## Call:
## glm(formula = art ~ fem + mar, family = poisson, data = ab)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9466 -1.7019 -0.4268 0.4332 7.3072
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.63902 0.03513 18.190 < 2e-16 ***
## femWomen -0.24054 0.05353 -4.493 7.01e-06 ***
## marSingle -0.02815 0.05632 -0.500 0.617
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1817.4 on 914 degrees of freedom
## Residual deviance: 1794.1 on 912 degrees of freedom
## AIC: 3467.9
##
## Number of Fisher Scoring iterations: 5
exp(coef(model2))
## (Intercept) femWomen marSingle
## 1.8946194 0.7862031 0.9722457
1-pchisq(deviance(model1)-deviance(model2),
df.residual(model1)-df.residual(model2))
## [1] 0.6167193
# Predict with gender and marriage and interaction
model3 <- glm(art ~ fem*mar, family=poisson, data=ab)
summary(model3)
##
## Call:
## glm(formula = art ~ fem * mar, family = poisson, data = ab)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9733 -1.6660 -0.4669 0.4872 7.3459
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.62247 0.03753 16.586 < 2e-16 ***
## femWomen -0.18924 0.06550 -2.889 0.00386 **
## marSingle 0.04377 0.07716 0.567 0.57050
## femWomen:marSingle -0.14931 0.11186 -1.335 0.18193
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1817.4 on 914 degrees of freedom
## Residual deviance: 1792.4 on 911 degrees of freedom
## AIC: 3468.1
##
## Number of Fisher Scoring iterations: 5
exp(coef(model3))
## (Intercept) femWomen marSingle
## 1.8635171 0.8275869 1.0447464
## femWomen:marSingle
## 0.8613011
1-pchisq(deviance(model1)-deviance(model3),
df.residual(model1)-df.residual(model3))
## [1] 0.363465
## Predict with number of kids
model4 <- glm(art ~ kid5, family=poisson, data=ab)
summary(model4)
##
## Call:
## glm(formula = art ~ kid5, family = poisson, data = ab)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8708 -1.8067 -0.5333 0.3694 7.4916
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.55960 0.02988 18.728 <2e-16 ***
## kid5 -0.06978 0.03450 -2.023 0.0431 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1817.4 on 914 degrees of freedom
## Residual deviance: 1813.2 on 913 degrees of freedom
## AIC: 3485
##
## Number of Fisher Scoring iterations: 5
exp(coef(model4))
## (Intercept) kid5
## 1.7499725 0.9325973
# Show predictions
ab$predicted <- exp(predict(model4, ab))
plot(jitter(ab$kid5), jitter(ab$art))
lines(ab$kid5, ab$predicted, lw=3, col=2)
# Let's try a model with all the predictors
full.model <- glm(art ~ fem + mar + kid5 + phd + ment, family=poisson, data=ab)
summary(full.model)
##
## Call:
## glm(formula = art ~ fem + mar + kid5 + phd + ment, family = poisson,
## data = ab)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5672 -1.5398 -0.3660 0.5722 5.4467
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.459860 0.093335 4.927 8.35e-07 ***
## femWomen -0.224594 0.054613 -4.112 3.92e-05 ***
## marSingle -0.155243 0.061374 -2.529 0.0114 *
## kid5 -0.184883 0.040127 -4.607 4.08e-06 ***
## phd 0.012823 0.026397 0.486 0.6271
## ment 0.025543 0.002006 12.733 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1817.4 on 914 degrees of freedom
## Residual deviance: 1634.4 on 909 degrees of freedom
## AIC: 3314.1
##
## Number of Fisher Scoring iterations: 5
exp(coef(full.model))
## (Intercept) femWomen marSingle kid5 phd ment
## 1.5838526 0.7988403 0.8562068 0.8312018 1.0129051 1.0258718
1-pchisq(deviance(model1)-deviance(full.model),
df.residual(model1)-df.residual(full.model))
## [1] 0
## The fit is still terrible, but it's better than a model with no predictors
## Prediction for a single female with 1 kids under the age of 6
exp(coef(full.model)[1]+coef(full.model)[2]+coef(full.model)[3]+(2*coef(full.model)[4])+(mean(ab$phd)*coef(full.model)[5])+(mean(ab$ment)*coef(full.model)[6]))
## (Intercept)
## 0.9743212
## Prediction for a married male with no kids under the age of 6
exp(coef(full.model)[1]+(mean(ab$phd)*coef(full.model)[5])+(mean(ab$ment)*coef(full.model)[6]))
## (Intercept)
## 2.061819
##################
## Next example ##
#### Absences ####
##################
# Load data
absences <- read.csv("http://www.bradthiessen.com/html5/data/absences.csv")
absences$prog <- factor(absences$prog, levels = 1:3,
labels = c("General", "Academic", "Vocational"))
head(absences)
## id gender math daysabs prog
## 1 1001 male 63 4 Academic
## 2 1002 male 27 4 Academic
## 3 1003 female 20 2 Academic
## 4 1004 female 16 3 Academic
## 5 1005 female 2 3 Academic
## 6 1006 female 71 13 Academic
## Summarize
summary(absences)
## id gender math daysabs
## Min. :1001 female:160 Min. : 1.00 Min. : 0.000
## 1st Qu.:1079 male :154 1st Qu.:28.00 1st Qu.: 1.000
## Median :1158 Median :48.00 Median : 4.000
## Mean :1576 Mean :48.27 Mean : 5.955
## 3rd Qu.:2078 3rd Qu.:70.00 3rd Qu.: 8.000
## Max. :2157 Max. :99.00 Max. :35.000
## prog
## General : 40
## Academic :167
## Vocational:107
##
##
##
ggplot(absences, aes(daysabs, fill = prog)) +
geom_histogram(binwidth = 1, alpha=.5) +
facet_grid(prog ~ ., margins = TRUE, scales = "free")
plot(absences$math, absences$daysabs)
# Let's try a model with all the predictors
pois.model <- glm(daysabs ~ gender + math + prog, family=poisson, data=absences)
summary(pois.model)
##
## Call:
## glm(formula = daysabs ~ gender + math + prog, family = poisson,
## data = absences)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2218 -2.1892 -0.9238 0.7218 7.0048
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7594786 0.0637731 43.270 < 2e-16 ***
## gendermale -0.2424762 0.0467765 -5.184 2.18e-07 ***
## math -0.0069561 0.0009354 -7.437 1.03e-13 ***
## progAcademic -0.4260327 0.0567308 -7.510 5.92e-14 ***
## progVocational -1.2707199 0.0779143 -16.309 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2217.7 on 313 degrees of freedom
## Residual deviance: 1746.8 on 309 degrees of freedom
## AIC: 2640.2
##
## Number of Fisher Scoring iterations: 5
exp(coef(pois.model))
## (Intercept) gendermale math progAcademic progVocational
## 15.7916072 0.7846824 0.9930680 0.6530950 0.2806295
## Fit is terrible
## We're assuming mean = variance
mean(absences$daysabs ~ absences$prog)
## General Academic Vocational
## 10.650000 6.934132 2.672897
var(absences$daysabs ~ absences$prog)
## General Academic Vocational
## 67.25897 55.44744 13.93916
## What can we do with this overdispersion?
## We could try a negative binomial regression
## It has an extra parameter to model overdispersion
library(MASS)
neg.bin.model <- glm.nb(daysabs ~ gender + math + prog, data=absences)
summary(neg.bin.model)
##
## Call:
## glm.nb(formula = daysabs ~ gender + math + prog, data = absences,
## init.theta = 1.047288915, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1567 -1.0761 -0.3810 0.2856 2.7235
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.707484 0.204275 13.254 < 2e-16 ***
## gendermale -0.211086 0.121989 -1.730 0.0836 .
## math -0.006236 0.002492 -2.502 0.0124 *
## progAcademic -0.424540 0.181725 -2.336 0.0195 *
## progVocational -1.252615 0.199699 -6.273 3.55e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.0473) family taken to be 1)
##
## Null deviance: 431.67 on 313 degrees of freedom
## Residual deviance: 358.87 on 309 degrees of freedom
## AIC: 1740.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.047
## Std. Err.: 0.108
##
## 2 x log-likelihood: -1728.307
exp(coef(neg.bin.model))
## (Intercept) gendermale math progAcademic progVocational
## 14.9915148 0.8097046 0.9937839 0.6540708 0.2857565
1-pchisq(deviance(neg.bin.model),
df.residual(neg.bin.model))
## [1] 0.02656535
# R first displays the call and the deviance residuals. Next, we see the regression coefficients for each of the variables, along with standard errors, z-scores, and p-values. The variable math has a coefficient of -0.006, which is statistically significant. This means that for each one-unit increase in math, the expected log count of the number of days absent decreases by 0.006. The indicator variable shown as progAcademic is the expected difference in log count between group 2 and the reference group (prog=1). The expected log count for level 2 of prog is 0.44 lower than the expected log count for level 1. The indicator variable for progVocational is the expected difference in log count between group 3 and the reference group.The expected log count for level 3 of prog is 1.28 lower than the expected log count for level 1. To determine if prog itself, overall, is statistically significant, we can compare a model with and without prog. The reason it is important to fit separate models, is that unless we do, the overdispersion parameter is held constant.
neg.bin.no.prog <- update(neg.bin.model, . ~ . - prog)
anova(neg.bin.model, neg.bin.no.prog)
## Likelihood ratio tests of Negative Binomial Models
##
## Response: daysabs
## Model theta Resid. df 2 x log-lik. Test df
## 1 gender + math 0.8705939 311 -1772.074
## 2 gender + math + prog 1.0472889 309 -1728.307 1 vs 2 2
## LR stat. Pr(Chi)
## 1
## 2 43.76737 3.133546e-10
neg.bin.model <- glm.nb(daysabs ~ math + prog, data=absences)
newdata1 <- data.frame(math = mean(absences$math), prog = factor(1:3, levels = 1:3,
labels = levels(absences$prog)))
newdata1$phat <- predict(neg.bin.model, newdata1, type = "response")
newdata1
## math prog phat
## 1 48.26752 General 10.236899
## 2 48.26752 Academic 6.587927
## 3 48.26752 Vocational 2.850083
newdata2 <- data.frame(
math = rep(seq(from = min(absences$math), to = max(absences$math), length.out = 100), 3),
prog = factor(rep(1:3, each = 100), levels = 1:3, labels =
levels(absences$prog)))
newdata2 <- cbind(newdata2, predict(neg.bin.model, newdata2, type = "link", se.fit=TRUE))
newdata2 <- within(newdata2, {
DaysAbsent <- exp(fit)
LL <- exp(fit - 1.96 * se.fit)
UL <- exp(fit + 1.96 * se.fit)
})
ggplot(newdata2, aes(math, DaysAbsent)) +
geom_ribbon(aes(ymin = LL, ymax = UL, fill = prog), alpha = .25) +
geom_line(aes(colour = prog), size = 2) +
labs(x = "Math Score", y = "Predicted Days Absent")
###################
## Final example ##
###################
# Load data
medicare <- read.csv("http://www.bradthiessen.com/html5/data/medicare.csv")
tbl_df(medicare)
## Source: local data frame [4,406 x 19]
##
## ofp ofnp opp opnp emer hosp health numchron adldiff region age black
## 1 5 0 0 0 0 1 average 2 no other 6.9 yes
## 2 1 0 2 0 2 0 average 2 no other 7.4 no
## 3 13 0 0 0 3 3 poor 4 yes other 6.6 yes
## 4 16 0 5 0 1 1 poor 2 yes other 7.6 no
## 5 3 0 0 0 0 0 average 2 yes other 7.9 no
## 6 17 0 0 0 0 0 poor 5 yes other 6.6 no
## 7 9 0 0 0 0 0 average 0 no midwest 7.5 no
## 8 3 0 0 0 0 0 average 0 no midwest 8.7 no
## 9 1 0 0 0 0 0 average 0 no midwest 7.3 no
## 10 0 0 0 0 0 0 average 0 no midwest 7.8 no
## .. ... ... ... ... ... ... ... ... ... ... ... ...
## Variables not shown: gender (fctr), married (fctr), school (int), faminc
## (dbl), employed (fctr), privins (fctr), medicaid (fctr)
## Source: US National Medical Expenditure Survey (NMES) for 1987/88
## ofp = number of physician office visits
## hosp = number of hospital stays
## health = self-perceived health status
## numchron = number of chronic conditions
## gender
## school = years of education
## privins = private insurance?
## Select only these variables
med.data <- medicare[, c(1, 6:8, 13, 15, 18)]
## Plot number of doctor office visits
plot(table(med.data$ofp))
hist(med.data$ofp, breaks = 0:90 - 0.5)
## Bad plot
plot(ofp ~ numchron, data = med.data)
## Continuity-corrected log transform function
clog <- function(x) log(x + 0.5)
## Function to transform count to factor variable
cfac <- function(x, breaks = NULL) {
if(is.null(breaks)) breaks <- unique(quantile(x, 0:10/10))
x <- cut(x, breaks, include.lowest = TRUE, right = FALSE)
levels(x) <- paste(breaks[-length(breaks)], ifelse(diff(breaks) > 1,
c(paste("-", breaks[-c(1, length(breaks))] - 1, sep = ""), "+"), ""),
sep = "")
return(x)}
## Good plots
plot(clog(ofp) ~ cfac(numchron), data = med.data)
plot(clog(ofp) ~ health, data = med.data, varwidth = TRUE)
plot(clog(ofp) ~ cfac(numchron), data = med.data)
plot(clog(ofp) ~ privins, data = med.data, varwidth = TRUE)
plot(clog(ofp) ~ cfac(hosp, c(0:2, 8)), data = med.data)
plot(clog(ofp) ~ gender, data = med.data, varwidth = TRUE)
plot(cfac(ofp, c(0:2, 4, 6, 10, 100)) ~ school, data = med.data, breaks = 9)
## Poisson regression
fm_pois <- glm(ofp ~ ., data = med.data, family = poisson)
summary(fm_pois)
##
## Call:
## glm(formula = ofp ~ ., family = poisson, data = med.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4055 -1.9962 -0.6737 0.7049 16.3620
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.028874 0.023785 43.258 <2e-16 ***
## hosp 0.164797 0.005997 27.478 <2e-16 ***
## healthexcellent -0.361993 0.030304 -11.945 <2e-16 ***
## healthpoor 0.248307 0.017845 13.915 <2e-16 ***
## numchron 0.146639 0.004580 32.020 <2e-16 ***
## gendermale -0.112320 0.012945 -8.677 <2e-16 ***
## school 0.026143 0.001843 14.182 <2e-16 ***
## privinsyes 0.201687 0.016860 11.963 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 26943 on 4405 degrees of freedom
## Residual deviance: 23168 on 4398 degrees of freedom
## AIC: 35959
##
## Number of Fisher Scoring iterations: 5
## Sandwich standard errors
## As the exploratory analysis suggested that
## over-dispersion is present in this data set,
## we re-compute the Wald tests using sandwich
## standard errors
library(sandwich)
library(lmtest)
coeftest(fm_pois, vcov = sandwich)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.028874 0.064530 15.9442 < 2.2e-16 ***
## hosp 0.164797 0.021945 7.5095 5.935e-14 ***
## healthexcellent -0.361993 0.077449 -4.6740 2.954e-06 ***
## healthpoor 0.248307 0.054022 4.5964 4.298e-06 ***
## numchron 0.146639 0.012908 11.3605 < 2.2e-16 ***
## gendermale -0.112320 0.035343 -3.1780 0.001483 **
## school 0.026143 0.005084 5.1422 2.715e-07 ***
## privinsyes 0.201687 0.043128 4.6765 2.919e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Quasi-poisson regression
fm_qpois <- glm(ofp ~ ., data = med.data, family = quasipoisson)
summary(fm_qpois)
##
## Call:
## glm(formula = ofp ~ ., family = quasipoisson, data = med.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4055 -1.9962 -0.6737 0.7049 16.3620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.028874 0.061594 16.704 < 2e-16 ***
## hosp 0.164797 0.015531 10.611 < 2e-16 ***
## healthexcellent -0.361993 0.078476 -4.613 4.09e-06 ***
## healthpoor 0.248307 0.046211 5.373 8.13e-08 ***
## numchron 0.146639 0.011860 12.364 < 2e-16 ***
## gendermale -0.112320 0.033523 -3.351 0.000813 ***
## school 0.026143 0.004774 5.477 4.58e-08 ***
## privinsyes 0.201687 0.043661 4.619 3.96e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 6.706254)
##
## Null deviance: 26943 on 4405 degrees of freedom
## Residual deviance: 23168 on 4398 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
## estimated dispersion of φˆ = 6.706 is larger than 1
## confirming that over-dispersion is present in the data
## Negative binomial regression
library(MASS)
fm_nbin <- glm.nb(ofp ~ ., data = med.data)
summary(fm_nbin)
##
## Call:
## glm.nb(formula = ofp ~ ., data = med.data, init.theta = 1.206603534,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0469 -0.9955 -0.2948 0.2961 5.8185
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.929257 0.054591 17.022 < 2e-16 ***
## hosp 0.217772 0.020176 10.793 < 2e-16 ***
## healthexcellent -0.341807 0.060924 -5.610 2.02e-08 ***
## healthpoor 0.305013 0.048511 6.288 3.23e-10 ***
## numchron 0.174916 0.012092 14.466 < 2e-16 ***
## gendermale -0.126488 0.031216 -4.052 5.08e-05 ***
## school 0.026815 0.004394 6.103 1.04e-09 ***
## privinsyes 0.224402 0.039464 5.686 1.30e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.2066) family taken to be 1)
##
## Null deviance: 5743.7 on 4405 degrees of freedom
## Residual deviance: 5044.5 on 4398 degrees of freedom
## AIC: 24359
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.2066
## Std. Err.: 0.0336
##
## 2 x log-likelihood: -24341.1070
## both regression coefficients and standard errors
## are similar to the quasi-Poisson and
## the sandwich-adjusted Poisson results
## Compare estimated coefficients
fm <- list("ML-Pois" = fm_pois, "Quasi-Pois" = fm_qpois, "NB" = fm_nbin)
sapply(fm, function(x) coef(x)[1:8])
## ML-Pois Quasi-Pois NB
## (Intercept) 1.02887420 1.02887420 0.92925658
## hosp 0.16479739 0.16479739 0.21777223
## healthexcellent -0.36199320 -0.36199320 -0.34180660
## healthpoor 0.24830697 0.24830697 0.30501303
## numchron 0.14663928 0.14663928 0.17491552
## gendermale -0.11231992 -0.11231992 -0.12648813
## school 0.02614299 0.02614299 0.02681508
## privinsyes 0.20168688 0.20168688 0.22440187
## Compare standard errors
cbind("ML-Pois" = sqrt(diag(vcov(fm_pois))),
"Adj-Pois" = sqrt(diag(sandwich(fm_pois))),
sapply(fm[-1], function(x) sqrt(diag(vcov(x)))[1:8]))
## ML-Pois Adj-Pois Quasi-Pois NB
## (Intercept) 0.023784601 0.064529808 0.061593641 0.054591271
## hosp 0.005997367 0.021945186 0.015531043 0.020176492
## healthexcellent 0.030303905 0.077448586 0.078476316 0.060923623
## healthpoor 0.017844531 0.054021990 0.046210977 0.048510797
## numchron 0.004579677 0.012907865 0.011859732 0.012091749
## gendermale 0.012945146 0.035343487 0.033523316 0.031215523
## school 0.001843329 0.005084002 0.004773565 0.004393971
## privinsyes 0.016859826 0.043128006 0.043660942 0.039463744
## Log-likelihood
rbind(logLik = sapply(fm, function(x) round(logLik(x), digits = 0)),
Df = sapply(fm, function(x) attr(logLik(x), "df")))
## ML-Pois Quasi-Pois NB
## logLik -17972 NA -12171
## Df 8 8 9
## The ML Poisson model is clearly inferior to all other fits.
## Expected vs observed zero counts
round(c("Obs" = sum(med.data$ofp < 1),
"ML-Pois" = sum(dpois(0, fitted(fm_pois))),
"NB" = sum(dnbinom(0, mu = fitted(fm_nbin), size = fm_nbin$theta))))
## Obs ML-Pois NB
## 683 47 608
retention <- read.csv(“http://www.bradthiessen.com/html5/data/retention1113.csv”) students2014 <- read.csv(“http://www.bradthiessen.com/html5/data/retention14.csv”)
retention2 <- retention %>% filter(!is.na(retained))
str(retention2) retention2\(female <- as.factor(retention2\)female) retention2\(race <- as.factor(retention2\)race) retention2\(mom_education <- as.factor(retention2\)mom_education) retention2\(dad_education <- as.factor(retention2\)dad_education) retention2\(residence <- as.factor(retention2\)residence) retention2\(athlete <- as.factor(retention2\)athlete)
xtabs(~retained + year, data = retention) table(retention2\(dropped, retention2\)year)
table(students2014$year)
resample <- do(100000) * mean(sample(retention2$retained, 533, replace=T)) densityplot(~result, data=resample, plot.points = FALSE, col=“darkblue”, lwd=4) confint(resample, level = 0.95, method = “quantile”) mean(resample)
logitmodel <- glm(formula = dropped ~ act_comp + hsgpa + female*athlete, data=retention2, family=binomial)
before.enroll.model <- glm(formula = dropped ~ act_math + act_read + hsgpa + female + race + athlete + residence + mom_education + dad_education, data=retention2, family=binomial) summary(before.enroll.model) wald.test(b = coef(before.enroll.model), Sigma = vcov(before.enroll.model), Terms = 3:8)
reduced.enroll.model <- glm(formula = dropped ~ hsgpa + athlete + residence, data=retention2, family=binomial) summary(reduced.enroll.model)
with(reduced.enroll.model, null.deviance - deviance) with(reduced.enroll.model, df.null - df.residual) with(reduced.enroll.model, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE)) logLik(admitlogit)
september.model <- glm(formula = dropped ~ risk_rating + act_math + hsgpa + female + race + factor_commit + factor_finance + factor_basic + athlete + residence, data=retention2, family=binomial) summary(september.model)
risk.model <- glm(formula = dropped ~ risk_rating, data=retention2, family=binomial) summary(risk.model)
summary(logitmodel) # display results confint(logitmodel) # 95% CI for the coefficients exp(coef(logitmodel)) # exponentiated coefficients exp(confint(logitmodel)) # 95% CI for exponentiated coefficients exp(cbind(OR = coef(logitmodel), confint(logitmodel)))
anova(logitmodel, test=“Chisq”) 1 - pchisq(16.467, df=1)
wald.test(b = coef(logitmodel), Sigma = vcov(logitmodel), Terms = 2:3)
predictions <- data.frame(matrix(nrow=1568)) predictions\(first <- predict(logitmodel, type="response") # predicted values predictions\)residuals <- residuals(logitmodel, type=“deviance”) # residuals
densityplot(~first, data=predictions, plot.points = FALSE, col=“darkblue”, lwd=4) densityplot(~residuals, data=predictions, plot.points = FALSE, col=“darkblue”, lwd=4)
cdplot(dropped~act_comp, data=retention2) cdplot(retained~act_comp, data=retention2)
newdata = data.frame(act_comp=18, hsgpa=2.50, falltermgpa=2.00) predict(logitmodel, newdata, type=“response”)
newdata1 <- with(retention2, data.frame(act_comp = mean(act_comp, na.rm=T), hsgpa = mean(hsgpa, na.rm=T), athlete=factor(c(0,1,1,0)), female = factor(0:1)))
newdata1$rankP <- predict(logitmodel, newdata = newdata1, type = “response”)
newdata2 <- with(mydata, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100), 4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))
lines(retention2\(act_comp, logitmodel\)fitted, type=“l”, col=“red”)
titanic <- read.csv(“http://www.bradthiessen.com/html5/data/titanic.csv”) tbl_df(titanic)
survived.model <- glm(survived ~ sex + age + fare, data=titanic, family=“binomial”) summary(survived.model)
confint(survived.model) ## CIs using standard errors confint.default(survived.model)
library(aod) wald.test(b = coef(survived.model), Sigma = vcov(survived.model), Terms = 2)
exp(coef(survived.model)) ## odds ratios and 95% CI exp(cbind(OR = coef(survived.model), confint(survived.model)))
survived <- titanic %>% group_by(fare) %>% summarize(admitprop=mean(admit), meangre=mean(gre), admittotal=sum(admit), count=n(), notadmit=count-admittotal)
admitprop\(predicted <- predict(admit.gre, admitprop) plot(admitprop ~ gre, data=admitprop) lines(admitprop\)gre, (exp(admitprop\(predicted)/(1+exp(admitprop\)predicted))), type=“l”, col=“red”, lw=3)