Load packages

library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)
library(aod)

Logistic Regression

## Binge drinking data
male <- c(rep(1,7152), rep(0,9944))
binge <- c(rep(1,1624), rep(0,5528), rep(1,1690), rep(0,8254))
drink <- data.frame(male, binge)
table(male, binge)

##     binge
## male    0    1
##    0 8254 1690
##    1 5528 1624

## Logistic regression
drink.logmodel <- glm(binge ~ male, data=drink, family=binomial(link="logit"))
summary(drink.logmodel)

## 
## Call:
## glm(formula = binge ~ male, family = binomial(link = "logit"), 
##     data = drink)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7177  -0.7177  -0.6104  -0.6104   1.8827  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.58597    0.02670 -59.400   <2e-16 ***
## male         0.36104    0.03885   9.292   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16814  on 17095  degrees of freedom
## Residual deviance: 16728  on 17094  degrees of freedom
## AIC: 16732
## 
## Number of Fisher Scoring iterations: 4

# Plot result
ggplot(drink, aes(x=male, y=binge)) + 
  geom_point(shape=1, alpha=.01, position=position_jitter(width=.05,height=.05)) + 
  stat_smooth(method="glm", family="binomial", se=FALSE)

### Plot logistic curve through proportions
## Create data containing proportion of binge drinkers by gender
binge.drink.proportions <- drink %>%
  group_by(male) %>%
  summarize(drinkprop=mean(binge))
## Find predicted values from our logistic model
binge.drink.proportions$predicted <- predict(drink.logmodel, binge.drink.proportions)
plot(drinkprop ~ male, data=binge.drink.proportions)
lines(binge.drink.proportions$male, (exp(binge.drink.proportions$predicted)/(1+exp(binge.drink.proportions$predicted))), type="l", col="red", lw=3)

## Grad school admissions
## Example 2. A researcher is interested in how variables,
## such as GRE (Graduate Record Exam scores), 
## GPA (grade point average) and prestige of the undergraduate
## institution, affect admission into graduate school. 
## The response variable, admit/don't admit, is a binary variable.

# Load data
admit <- read.csv("http://www.bradthiessen.com/html5/data/admit.csv")
head(admit)

##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2

# Convert rank to a
str(admit)

## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...

admit$rank <- factor(admit$rank)

# Calculate means and standard deviations
sapply(admit, sd)

##       admit         gre         gpa        rank 
##   0.4660867 115.5165364   0.3805668   0.9444602

sapply(admit, mean)

## Warning in mean.default(evalF$right[, 1], ...): argument is not numeric or
## logical: returning NA

##    admit      gre      gpa     rank 
##   0.3175 587.7000   3.3899       NA

# Plot the scatterplot matrix
panel.hist <- function(x, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(admit,
      cex = 1, pch = 1, bg = "steelblue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

## Predict admission from gre scores
admit.gre <- glm(admit ~ gre, data=admit, family="binomial")
summary(admit.gre)

## 
## Call:
## glm(formula = admit ~ gre, family = "binomial", data = admit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1623  -0.9052  -0.7547   1.3486   1.9879  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.901344   0.606038  -4.787 1.69e-06 ***
## gre          0.003582   0.000986   3.633  0.00028 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 486.06  on 398  degrees of freedom
## AIC: 490.06
## 
## Number of Fisher Scoring iterations: 4

## CIs using profiled log-likelihood
confint(admit.gre)

##                    2.5 %       97.5 %
## (Intercept) -4.119988259 -1.739756286
## gre          0.001679963  0.005552748

## CIs using standard errors
confint.default(admit.gre)

##                    2.5 %       97.5 %
## (Intercept) -4.089156159 -1.713532380
## gre          0.001649677  0.005514746

library(aod)
wald.test(b = coef(admit.gre), Sigma = vcov(admit.gre), Terms = 2)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 13.2, df = 1, P(> X2) = 0.00028

## odds ratios only
exp(coef(admit.gre))

## (Intercept)         gre 
##   0.0549493   1.0035886

## odds ratios and 95% CI
exp(cbind(OR = coef(admit.gre), confint(admit.gre)))

##                    OR      2.5 %    97.5 %
## (Intercept) 0.0549493 0.01624471 0.1755632
## gre         1.0035886 1.00168137 1.0055682

# Plot logistic curve through proportions
admitprop <- admit %>%
  group_by(gre) %>%
  summarize(admitprop=mean(admit), meangre=mean(gre), admittotal=sum(admit), count=n(),
            notadmit=count-admittotal)

admitprop$predicted <- predict(admit.gre, admitprop)
plot(admitprop ~ gre, data=admitprop)
lines(admitprop$gre, (exp(admitprop$predicted)/(1+exp(admitprop$predicted))), type="l", col="red", lw=3)

## Challenger example
temp <- c(66, 69, 68, 72, 73, 70, 78, 67, 79, 70, 67, 57, 81,
          76, 76, 63, 70, 53, 67, 75, 70, 75, 58, 80, 31)
damage <- c(rep(0,9), rep(1,14), rep(NA,2))
shuttle <- data.frame(temp, damage)
head(shuttle)

##   temp damage
## 1   66      0
## 2   69      0
## 3   68      0
## 4   72      0
## 5   73      0
## 6   70      0

## Predict damage from temp
shuttle.temp <- glm(damage ~ temp, data=shuttle, family="binomial")
summary(shuttle.temp)

## 
## Call:
## glm(formula = damage ~ temp, family = "binomial", data = shuttle)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4859  -1.2959   0.7172   0.9978   1.3002  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  5.03663    4.81697   1.046    0.296
## temp        -0.06569    0.06819  -0.963    0.335
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 30.789  on 22  degrees of freedom
## Residual deviance: 29.778  on 21  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 33.778
## 
## Number of Fisher Scoring iterations: 4

## CIs using profiled log-likelihood
confint(shuttle.temp)

##                  2.5 %      97.5 %
## (Intercept) -3.7413959 15.89616057
## temp        -0.2179992  0.05973434

## CIs using standard errors
confint.default(shuttle.temp)

##                  2.5 %      97.5 %
## (Intercept) -4.4044608 14.47771468
## temp        -0.1993375  0.06795919

## odds ratios only
exp(coef(shuttle.temp))

## (Intercept)        temp 
## 153.9498532   0.9364219

## odds ratios and 95% CI
exp(cbind(OR = coef(shuttle.temp), confint(shuttle.temp)))

##                      OR      2.5 %       97.5 %
## (Intercept) 153.9498532 0.02372097 8.009674e+06
## temp          0.9364219 0.80412610 1.061554e+00

# Plot proportions and logistic function
shuttleprob <- shuttle %>%
  group_by(temp) %>%
  summarize(damageprop=mean(damage), temperature=mean(temp))

shuttleprob$predicted <- predict(shuttle.temp, shuttleprob)
plot(damageprop ~ temperature, data=shuttleprob)
lines(shuttleprob$temperature, (exp(shuttleprob$predicted)/(1+exp(shuttleprob$predicted))), type="l", col="red", lw=3)

#######################
## Toenail infection ##
#######################
toenail <- read.csv("http://www.bradthiessen.com/html5/data/toenail.csv")

# Plot proportions over time
toenailprop <- toenail %>%
  group_by(treatment, visit) %>%
  summarize(propinfect=mean(outcome), meanmonth=mean(month))

ggplot(toenailprop, aes(meanmonth, propinfect)) +
  geom_point() + geom_line(size=2) +
  facet_grid(. ~ treatment)

## Fit model
toenail.model <- glm(outcome ~ treatment*month, data=toenail, family="binomial")
summary(toenail.model)

## 
## Call:
## glm(formula = outcome ~ treatment * month, family = "binomial", 
##     data = toenail)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9519  -0.7892  -0.5080  -0.2532   2.7339  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -0.5566273  0.1089628  -5.108 3.25e-07 ***
## treatmentTerbinafine       -0.0005817  0.1561463  -0.004   0.9970    
## month                      -0.1703078  0.0236199  -7.210 5.58e-13 ***
## treatmentTerbinafine:month -0.0672216  0.0375235  -1.791   0.0732 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1980.5  on 1907  degrees of freedom
## Residual deviance: 1816.0  on 1904  degrees of freedom
## AIC: 1824
## 
## Number of Fisher Scoring iterations: 5

## CIs using profiled log-likelihood
confint(toenail.model)

##                                 2.5 %       97.5 %
## (Intercept)                -0.7719213 -0.344475861
## treatmentTerbinafine       -0.3067395  0.305615999
## month                      -0.2182556 -0.125489310
## treatmentTerbinafine:month -0.1419292  0.005563446

## CIs using standard errors
confint.default(toenail.model)

##                                 2.5 %       97.5 %
## (Intercept)                -0.7701903 -0.343064177
## treatmentTerbinafine       -0.3066229  0.305459549
## month                      -0.2166020 -0.124013574
## treatmentTerbinafine:month -0.1407662  0.006323002

## odds ratios only
exp(coef(toenail.model))

##                (Intercept)       treatmentTerbinafine 
##                  0.5731389                  0.9994185 
##                      month treatmentTerbinafine:month 
##                  0.8434052                  0.9349880

## odds ratios and 95% CI
exp(cbind(OR = coef(toenail.model), confint(toenail.model)))

##                                   OR     2.5 %    97.5 %
## (Intercept)                0.5731389 0.4621244 0.7085917
## treatmentTerbinafine       0.9994185 0.7358422 1.3574609
## month                      0.8434052 0.8039199 0.8820652
## treatmentTerbinafine:month 0.9349880 0.8676827 1.0055790

# Plot logistic curves through proportions
# I need to rename a variable back to "month"
toenailprop$month <- toenailprop$meanmonth
toenailprop$predicted <- predict(toenail.model, toenailprop)
ggplot(toenailprop, aes(month, propinfect)) +
  geom_point() +
  geom_line(aes(month, (exp(predicted)/(1+exp(predicted)))), 
            size=2, col="red", linetype="dashed") +
  geom_line(aes(month, propinfect), size=2, col="black") +
  facet_grid(. ~ treatment)

#######################
## Multiple Logistic ##
##     Regression    ##
#######################


### Multiple logistic regression
xtabs(~admit + rank, data = admit)

##      rank
## admit  1  2  3  4
##     0 28 97 93 55
##     1 33 54 28 12

admitlogit <- glm(admit ~ gre + gpa + rank, data = admit, family = "binomial")
summary(admitlogit)

## 
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
##     data = admit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6268  -0.8662  -0.6388   1.1490   2.0790  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *  
## gpa          0.804038   0.331819   2.423 0.015388 *  
## rank2       -0.675443   0.316490  -2.134 0.032829 *  
## rank3       -1.340204   0.345306  -3.881 0.000104 ***
## rank4       -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 458.52  on 394  degrees of freedom
## AIC: 470.52
## 
## Number of Fisher Scoring iterations: 4

## CIs using profiled log-likelihood
confint(admitlogit)

##                     2.5 %       97.5 %
## (Intercept) -6.2716202334 -1.792547080
## gre          0.0001375921  0.004435874
## gpa          0.1602959439  1.464142727
## rank2       -1.3008888002 -0.056745722
## rank3       -2.0276713127 -0.670372346
## rank4       -2.4000265384 -0.753542605

## CIs using standard errors
confint.default(admitlogit)

##                     2.5 %       97.5 %
## (Intercept) -6.2242418514 -1.755716295
## gre          0.0001202298  0.004408622
## gpa          0.1536836760  1.454391423
## rank2       -1.2957512650 -0.055134591
## rank3       -2.0169920597 -0.663415773
## rank4       -2.3703986294 -0.732528724

library(aod)
wald.test(b = coef(admitlogit), Sigma = vcov(admitlogit), Terms = 4:6)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 20.9, df = 3, P(> X2) = 0.00011

l <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(admitlogit), Sigma = vcov(admitlogit), L = l)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 5.5, df = 1, P(> X2) = 0.019

## odds ratios only
exp(coef(admitlogit))

## (Intercept)         gre         gpa       rank2       rank3       rank4 
##   0.0185001   1.0022670   2.2345448   0.5089310   0.2617923   0.2119375

## odds ratios and 95% CI
exp(cbind(OR = coef(admitlogit), confint(admitlogit)))

##                    OR       2.5 %    97.5 %
## (Intercept) 0.0185001 0.001889165 0.1665354
## gre         1.0022670 1.000137602 1.0044457
## gpa         2.2345448 1.173858216 4.3238349
## rank2       0.5089310 0.272289674 0.9448343
## rank3       0.2617923 0.131641717 0.5115181
## rank4       0.2119375 0.090715546 0.4706961

newdata1 <- with(admit, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
newdata1$rankP <- predict(admitlogit, newdata = newdata1, type = "response")
newdata1

##     gre    gpa rank     rankP
## 1 587.7 3.3899    1 0.5166016
## 2 587.7 3.3899    2 0.3522846
## 3 587.7 3.3899    3 0.2186120
## 4 587.7 3.3899    4 0.1846684

newdata2 <- with(admit, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
    4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))
newdata3 <- cbind(newdata2, predict(admitlogit, newdata = newdata2, type = "link",
    se = TRUE))
newdata3 <- within(newdata3, {
    PredictedProb <- plogis(fit)
    LL <- plogis(fit - (1.96 * se.fit))
    UL <- plogis(fit + (1.96 * se.fit))
})

## view first few rows of final dataset
head(newdata3)

##        gre    gpa rank        fit    se.fit residual.scale        UL
## 1 200.0000 3.3899    1 -0.8114870 0.5147714              1 0.5492064
## 2 206.0606 3.3899    1 -0.7977632 0.5090986              1 0.5498513
## 3 212.1212 3.3899    1 -0.7840394 0.5034491              1 0.5505074
## 4 218.1818 3.3899    1 -0.7703156 0.4978239              1 0.5511750
## 5 224.2424 3.3899    1 -0.7565919 0.4922237              1 0.5518545
## 6 230.3030 3.3899    1 -0.7428681 0.4866494              1 0.5525464
##          LL PredictedProb
## 1 0.1393812     0.3075737
## 2 0.1423880     0.3105042
## 3 0.1454429     0.3134499
## 4 0.1485460     0.3164108
## 5 0.1516973     0.3193867
## 6 0.1548966     0.3223773

ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,
    ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank),
    size = 1)

with(admitlogit, null.deviance - deviance)

## [1] 41.45903

with(admitlogit, df.null - df.residual)

## [1] 5

with(admitlogit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))

## [1] 7.578194e-08

logLik(admitlogit)

## 'log Lik.' -229.2587 (df=6)

ggplot(drink, aes(x=male, y=binge)) + geom_point() + 
  stat_smooth(method="glm", family="binomial", se=FALSE)

########################
########################
## Poisson Regression ##
########################
########################

## Overdispersion
## Poisson: mean = variance.
## When variance > mean, we have overdispersion.
# Use a negative binomial distribution
# Use quasi-likelihood models with dispersion estimated from data


# Let's load a dataset on the mating of elephants
## Elephants reach maturity at about 14 years of age. 
## But they have to compete with all adult males for mating 
## opportunity. Females are generally more receptive to larger males. 
## Size of an elephant increases as age increases. 
## Hence it is expected that generally the number of matings should 
## increase with age. Is there an optimal age after which the success 
## rate does not rise further? Mating is a rare event and hence may
## follow a Poisson distribution.

library(gpk)
data(elephant)
head(elephant)

##   Age_in_Years Number_of_Matings
## 1           27                 0
## 2           28                 1
## 3           28                 1
## 4           28                 1
## 5           28                 3
## 6           29                 0

elephant$Age <- elephant$Age_in_Years
elephant$Matings <- elephant$Number_of_Matings

plot(elephant$Age,jitter(elephant$Matings))

# It looks like the number of mates tends to be higher
# for older elephants, AND there seems to be more variability 
# in the number of mates as age increases.

mean(elephant$Matings)

## [1] 2.682927

var(elephant$Matings)

## [1] 5.071951

mean(elephant$Matings)/var(elephant$Matings)

## [1] 0.5289733

eg <- elephant %>%
  group_by(Age) %>%
  summarize(mean=mean(Matings), var=var(Matings))

plot(log(eg$mean), log(eg$var), xlab="log(Mean matings)",  ylab="log(Variance of matings)",
     main="Figure 4.1. Mean-Variance Relationship")
mtext("matings by age", padj=-0.5)
abline(0,1)

## Null model
null.model <- glm(Matings ~ 1, family=poisson, data=elephant)
summary(null.model)

## 
## Call:
## glm(formula = Matings ~ 1, family = poisson, data = elephant)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3164  -1.1799  -0.4368   0.1899   3.0252  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.98691    0.09535   10.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 75.372  on 40  degrees of freedom
## Residual deviance: 75.372  on 40  degrees of freedom
## AIC: 178.82
## 
## Number of Fisher Scoring iterations: 5

exp(coef(null.model))

## (Intercept) 
##    2.682927

mean(elephant$Matings)

## [1] 2.682927

## Show predictions
elephant$null.pred <- exp(predict(null.model, elephant))
plot(elephant$Age,jitter(elephant$Matings))
lines(elephant$Age,elephant$null.pred, lw=3, col=1)

## Deviance
deviance(null.model)

## [1] 75.37174

pchisq(0.95, df.residual(null.model))

## [1] 8.940942e-26

## Model with age
model <- glm(Matings ~ Age, family=poisson, data=elephant)
summary(model)

## 
## Call:
## glm(formula = Matings ~ Age, family = poisson, data = elephant)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.80798  -0.86137  -0.08629   0.60087   2.17777  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.58201    0.54462  -2.905  0.00368 ** 
## Age          0.06869    0.01375   4.997 5.81e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 75.372  on 40  degrees of freedom
## Residual deviance: 51.012  on 39  degrees of freedom
## AIC: 156.46
## 
## Number of Fisher Scoring iterations: 5

exp(coef(model))

## (Intercept)         Age 
##   0.2055619   1.0711071

## Predict matings from age
elephant$predicted <- exp(predict(model, elephant))
lines(elephant$Age,elephant$predicted, lw=3, col=2)

## Prediction for 30 year-old elephant
exp(coef(model)[1]+coef(model)[2]*30)

## (Intercept) 
##    1.614098

## Test model fit
deviance(null.model)

## [1] 75.37174

deviance(model)

## [1] 51.01163

pchisq(0.95, df.residual(model))

## [1] 5.841306e-25

1-pchisq(deviance(null.model)-deviance(model), df.residual(null.model)-df.residual(model))

## [1] 7.99062e-07

# Since the p-value is small, 
# there is evidence that the addition of age 
# explains a significant amount (more) of the deviance




### Next Example ###
### Publications ###

# Load data on publications by PhD Biochemists, Long (1990)
ab <- read.csv("http://www.bradthiessen.com/html5/data/biochem.csv")
tbl_df(ab)

## Source: local data frame [915 x 6]
## 
##    art   fem     mar kid5  phd ment
## 1    0   Men Married    0 2.52    7
## 2    0 Women  Single    0 2.05    6
## 3    0 Women  Single    0 3.75    6
## 4    0   Men Married    1 1.18    3
## 5    0 Women  Single    0 3.75   26
## 6    0 Women Married    2 3.59    2
## 7    0 Women  Single    0 3.19    3
## 8    0   Men Married    2 2.96    4
## 9    0   Men  Single    0 4.62    6
## 10   0 Women Married    0 1.25    0
## .. ...   ...     ...  ...  ...  ...

## art: articles in last three years of Ph.D.
## fem: coded one for females
## mar: coded one if married
## kid5: number of children under age six
## phd: prestige of Ph.D. program
## ment: articles by mentor in last three years
mean(ab$art)

## [1] 1.692896

var(ab$art)

## [1] 3.709742

mean(ab$art)/var(ab$art)

## [1] 0.456338

# The mean number of articles is 1.69 and the variance is 3.71, 
# a bit more than twice the mean. The data are over-dispersed, 
# but of course we haven't considered any covariates yet.

# Null model
null.model <- glm(art ~ 1, family=poisson, data=ab)
summary(null.model)

## 
## Call:
## glm(formula = art ~ 1, family = poisson, data = ab)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8401  -1.8401  -0.5770   0.2294   7.5677  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.52644    0.02541   20.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1817.4  on 914  degrees of freedom
## Residual deviance: 1817.4  on 914  degrees of freedom
## AIC: 3487.1
## 
## Number of Fisher Scoring iterations: 5

exp(coef(null.model))

## (Intercept) 
##    1.692896

1-pchisq(deviance(null.model), df.residual(null.model))

## [1] 0

# Predict with gender
model1 <- glm(art ~ fem, family=poisson, data=ab)
summary(model1)

## 
## Call:
## glm(formula = art ~ fem, family = poisson, data = ab)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9404  -1.7148  -0.4119   0.4139   7.3221  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.63265    0.03279  19.293  < 2e-16 ***
## femWomen    -0.24718    0.05187  -4.765 1.89e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1817.4  on 914  degrees of freedom
## Residual deviance: 1794.4  on 913  degrees of freedom
## AIC: 3466.1
## 
## Number of Fisher Scoring iterations: 5

exp(coef(model1))

## (Intercept)    femWomen 
##   1.8825911   0.7810027

1-pchisq(deviance(null.model)-deviance(model1), 
         df.residual(null.model)-df.residual(model1))

## [1] 1.596071e-06

# Predict with gender and marriage
model2 <- glm(art ~ fem + mar, family=poisson, data=ab)
summary(model2)

## 
## Call:
## glm(formula = art ~ fem + mar, family = poisson, data = ab)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9466  -1.7019  -0.4268   0.4332   7.3072  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.63902    0.03513  18.190  < 2e-16 ***
## femWomen    -0.24054    0.05353  -4.493 7.01e-06 ***
## marSingle   -0.02815    0.05632  -0.500    0.617    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1817.4  on 914  degrees of freedom
## Residual deviance: 1794.1  on 912  degrees of freedom
## AIC: 3467.9
## 
## Number of Fisher Scoring iterations: 5

exp(coef(model2))

## (Intercept)    femWomen   marSingle 
##   1.8946194   0.7862031   0.9722457

1-pchisq(deviance(model1)-deviance(model2), 
         df.residual(model1)-df.residual(model2))

## [1] 0.6167193

# Predict with gender and marriage and interaction
model3 <- glm(art ~ fem*mar, family=poisson, data=ab)
summary(model3)

## 
## Call:
## glm(formula = art ~ fem * mar, family = poisson, data = ab)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9733  -1.6660  -0.4669   0.4872   7.3459  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.62247    0.03753  16.586  < 2e-16 ***
## femWomen           -0.18924    0.06550  -2.889  0.00386 ** 
## marSingle           0.04377    0.07716   0.567  0.57050    
## femWomen:marSingle -0.14931    0.11186  -1.335  0.18193    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1817.4  on 914  degrees of freedom
## Residual deviance: 1792.4  on 911  degrees of freedom
## AIC: 3468.1
## 
## Number of Fisher Scoring iterations: 5

exp(coef(model3))

##        (Intercept)           femWomen          marSingle 
##          1.8635171          0.8275869          1.0447464 
## femWomen:marSingle 
##          0.8613011

1-pchisq(deviance(model1)-deviance(model3), 
         df.residual(model1)-df.residual(model3))

## [1] 0.363465

## Predict with number of kids
model4 <- glm(art ~ kid5, family=poisson, data=ab)
summary(model4)

## 
## Call:
## glm(formula = art ~ kid5, family = poisson, data = ab)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8708  -1.8067  -0.5333   0.3694   7.4916  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.55960    0.02988  18.728   <2e-16 ***
## kid5        -0.06978    0.03450  -2.023   0.0431 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1817.4  on 914  degrees of freedom
## Residual deviance: 1813.2  on 913  degrees of freedom
## AIC: 3485
## 
## Number of Fisher Scoring iterations: 5

exp(coef(model4))

## (Intercept)        kid5 
##   1.7499725   0.9325973

# Show predictions
ab$predicted <- exp(predict(model4, ab))
plot(jitter(ab$kid5), jitter(ab$art))
lines(ab$kid5, ab$predicted, lw=3, col=2)

# Let's try a model with all the predictors
full.model <- glm(art ~ fem + mar + kid5 + phd + ment, family=poisson, data=ab)
summary(full.model)

## 
## Call:
## glm(formula = art ~ fem + mar + kid5 + phd + ment, family = poisson, 
##     data = ab)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5672  -1.5398  -0.3660   0.5722   5.4467  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.459860   0.093335   4.927 8.35e-07 ***
## femWomen    -0.224594   0.054613  -4.112 3.92e-05 ***
## marSingle   -0.155243   0.061374  -2.529   0.0114 *  
## kid5        -0.184883   0.040127  -4.607 4.08e-06 ***
## phd          0.012823   0.026397   0.486   0.6271    
## ment         0.025543   0.002006  12.733  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1817.4  on 914  degrees of freedom
## Residual deviance: 1634.4  on 909  degrees of freedom
## AIC: 3314.1
## 
## Number of Fisher Scoring iterations: 5

exp(coef(full.model))

## (Intercept)    femWomen   marSingle        kid5         phd        ment 
##   1.5838526   0.7988403   0.8562068   0.8312018   1.0129051   1.0258718

1-pchisq(deviance(model1)-deviance(full.model), 
         df.residual(model1)-df.residual(full.model))

## [1] 0

## The fit is still terrible, but it's better than a model with no predictors


## Prediction for a single female with 1 kids under the age of 6
exp(coef(full.model)[1]+coef(full.model)[2]+coef(full.model)[3]+(2*coef(full.model)[4])+(mean(ab$phd)*coef(full.model)[5])+(mean(ab$ment)*coef(full.model)[6]))

## (Intercept) 
##   0.9743212

## Prediction for a married male with no kids under the age of 6
exp(coef(full.model)[1]+(mean(ab$phd)*coef(full.model)[5])+(mean(ab$ment)*coef(full.model)[6]))

## (Intercept) 
##    2.061819

##################
## Next example ##
#### Absences ####
##################

# Load data
absences <- read.csv("http://www.bradthiessen.com/html5/data/absences.csv")
absences$prog <- factor(absences$prog, levels = 1:3, 
                        labels = c("General", "Academic", "Vocational"))
head(absences)

##     id gender math daysabs     prog
## 1 1001   male   63       4 Academic
## 2 1002   male   27       4 Academic
## 3 1003 female   20       2 Academic
## 4 1004 female   16       3 Academic
## 5 1005 female    2       3 Academic
## 6 1006 female   71      13 Academic

## Summarize
summary(absences)

##        id          gender         math          daysabs      
##  Min.   :1001   female:160   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.:1079   male  :154   1st Qu.:28.00   1st Qu.: 1.000  
##  Median :1158                Median :48.00   Median : 4.000  
##  Mean   :1576                Mean   :48.27   Mean   : 5.955  
##  3rd Qu.:2078                3rd Qu.:70.00   3rd Qu.: 8.000  
##  Max.   :2157                Max.   :99.00   Max.   :35.000  
##          prog    
##  General   : 40  
##  Academic  :167  
##  Vocational:107  
##                  
##                  
##

ggplot(absences, aes(daysabs, fill = prog)) + 
  geom_histogram(binwidth = 1, alpha=.5) + 
  facet_grid(prog ~ ., margins = TRUE, scales = "free")

plot(absences$math, absences$daysabs)

# Let's try a model with all the predictors
pois.model <- glm(daysabs ~ gender + math + prog, family=poisson, data=absences)
summary(pois.model)

## 
## Call:
## glm(formula = daysabs ~ gender + math + prog, family = poisson, 
##     data = absences)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2218  -2.1892  -0.9238   0.7218   7.0048  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.7594786  0.0637731  43.270  < 2e-16 ***
## gendermale     -0.2424762  0.0467765  -5.184 2.18e-07 ***
## math           -0.0069561  0.0009354  -7.437 1.03e-13 ***
## progAcademic   -0.4260327  0.0567308  -7.510 5.92e-14 ***
## progVocational -1.2707199  0.0779143 -16.309  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2217.7  on 313  degrees of freedom
## Residual deviance: 1746.8  on 309  degrees of freedom
## AIC: 2640.2
## 
## Number of Fisher Scoring iterations: 5

exp(coef(pois.model))

##    (Intercept)     gendermale           math   progAcademic progVocational 
##     15.7916072      0.7846824      0.9930680      0.6530950      0.2806295

## Fit is terrible
## We're assuming mean = variance
mean(absences$daysabs ~ absences$prog)

##    General   Academic Vocational 
##  10.650000   6.934132   2.672897

var(absences$daysabs ~ absences$prog)

##    General   Academic Vocational 
##   67.25897   55.44744   13.93916

## What can we do with this overdispersion?
## We could try a negative binomial regression
## It has an extra parameter to model overdispersion
library(MASS)
neg.bin.model <- glm.nb(daysabs ~ gender + math + prog, data=absences)
summary(neg.bin.model)

## 
## Call:
## glm.nb(formula = daysabs ~ gender + math + prog, data = absences, 
##     init.theta = 1.047288915, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1567  -1.0761  -0.3810   0.2856   2.7235  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.707484   0.204275  13.254  < 2e-16 ***
## gendermale     -0.211086   0.121989  -1.730   0.0836 .  
## math           -0.006236   0.002492  -2.502   0.0124 *  
## progAcademic   -0.424540   0.181725  -2.336   0.0195 *  
## progVocational -1.252615   0.199699  -6.273 3.55e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.0473) family taken to be 1)
## 
##     Null deviance: 431.67  on 313  degrees of freedom
## Residual deviance: 358.87  on 309  degrees of freedom
## AIC: 1740.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.047 
##           Std. Err.:  0.108 
## 
##  2 x log-likelihood:  -1728.307

exp(coef(neg.bin.model))

##    (Intercept)     gendermale           math   progAcademic progVocational 
##     14.9915148      0.8097046      0.9937839      0.6540708      0.2857565

1-pchisq(deviance(neg.bin.model), 
         df.residual(neg.bin.model))

## [1] 0.02656535

# R first displays the call and the deviance residuals. Next, we see the regression coefficients for each of the variables, along with standard errors, z-scores, and p-values. The variable math has a coefficient of -0.006, which is statistically significant. This means that for each one-unit increase in math, the expected log count of the number of days absent decreases by 0.006. The indicator variable shown as progAcademic is the expected difference in log count between group 2 and the reference group (prog=1). The expected log count for level 2 of prog is 0.44 lower than the expected log count for level 1. The indicator variable for progVocational is the expected difference in log count between group 3 and the reference group.The expected log count for level 3 of prog is 1.28 lower than the expected log count for level 1. To determine if prog itself, overall, is statistically significant, we can compare a model with and without prog. The reason it is important to fit separate models, is that unless we do, the overdispersion parameter is held constant.

neg.bin.no.prog <- update(neg.bin.model, . ~ . - prog)
anova(neg.bin.model, neg.bin.no.prog)

## Likelihood ratio tests of Negative Binomial Models
## 
## Response: daysabs
##                  Model     theta Resid. df    2 x log-lik.   Test    df
## 1        gender + math 0.8705939       311       -1772.074             
## 2 gender + math + prog 1.0472889       309       -1728.307 1 vs 2     2
##   LR stat.      Pr(Chi)
## 1                      
## 2 43.76737 3.133546e-10

neg.bin.model <- glm.nb(daysabs ~ math + prog, data=absences)
newdata1 <- data.frame(math = mean(absences$math), prog = factor(1:3, levels = 1:3, 
    labels = levels(absences$prog)))
newdata1$phat <- predict(neg.bin.model, newdata1, type = "response")
newdata1

##       math       prog      phat
## 1 48.26752    General 10.236899
## 2 48.26752   Academic  6.587927
## 3 48.26752 Vocational  2.850083

newdata2 <- data.frame(
  math = rep(seq(from = min(absences$math), to = max(absences$math), length.out = 100), 3),
  prog = factor(rep(1:3, each = 100), levels = 1:3, labels =
  levels(absences$prog)))

newdata2 <- cbind(newdata2, predict(neg.bin.model, newdata2, type = "link", se.fit=TRUE))
newdata2 <- within(newdata2, {
  DaysAbsent <- exp(fit)
  LL <- exp(fit - 1.96 * se.fit)
  UL <- exp(fit + 1.96 * se.fit)
})

ggplot(newdata2, aes(math, DaysAbsent)) +
  geom_ribbon(aes(ymin = LL, ymax = UL, fill = prog), alpha = .25) +
  geom_line(aes(colour = prog), size = 2) +
  labs(x = "Math Score", y = "Predicted Days Absent")

###################
## Final example ##
###################


# Load data
medicare <- read.csv("http://www.bradthiessen.com/html5/data/medicare.csv")
tbl_df(medicare)

## Source: local data frame [4,406 x 19]
## 
##    ofp ofnp opp opnp emer hosp  health numchron adldiff  region age black
## 1    5    0   0    0    0    1 average        2      no   other 6.9   yes
## 2    1    0   2    0    2    0 average        2      no   other 7.4    no
## 3   13    0   0    0    3    3    poor        4     yes   other 6.6   yes
## 4   16    0   5    0    1    1    poor        2     yes   other 7.6    no
## 5    3    0   0    0    0    0 average        2     yes   other 7.9    no
## 6   17    0   0    0    0    0    poor        5     yes   other 6.6    no
## 7    9    0   0    0    0    0 average        0      no midwest 7.5    no
## 8    3    0   0    0    0    0 average        0      no midwest 8.7    no
## 9    1    0   0    0    0    0 average        0      no midwest 7.3    no
## 10   0    0   0    0    0    0 average        0      no midwest 7.8    no
## .. ...  ... ...  ...  ...  ...     ...      ...     ...     ... ...   ...
## Variables not shown: gender (fctr), married (fctr), school (int), faminc
##   (dbl), employed (fctr), privins (fctr), medicaid (fctr)

## Source:  US National Medical Expenditure Survey (NMES) for 1987/88
## ofp = number of physician office visits
## hosp = number of hospital stays
## health = self-perceived health status
## numchron = number of chronic conditions
## gender
## school = years of education
## privins = private insurance?

## Select only these variables
med.data <- medicare[, c(1, 6:8, 13, 15, 18)]

## Plot number of doctor office visits
plot(table(med.data$ofp))

hist(med.data$ofp, breaks = 0:90 - 0.5)

## Bad plot
plot(ofp ~ numchron, data = med.data)

## Continuity-corrected log transform function
clog <- function(x) log(x + 0.5)

## Function to transform count to factor variable
cfac <- function(x, breaks = NULL) {
  if(is.null(breaks)) breaks <- unique(quantile(x, 0:10/10))
  x <- cut(x, breaks, include.lowest = TRUE, right = FALSE)
levels(x) <- paste(breaks[-length(breaks)], ifelse(diff(breaks) > 1,
  c(paste("-", breaks[-c(1, length(breaks))] - 1, sep = ""), "+"), ""),
  sep = "")
return(x)}

## Good plots
plot(clog(ofp) ~ cfac(numchron), data = med.data)

plot(clog(ofp) ~ health, data = med.data, varwidth = TRUE)

plot(clog(ofp) ~ cfac(numchron), data = med.data)

plot(clog(ofp) ~ privins, data = med.data, varwidth = TRUE)

plot(clog(ofp) ~ cfac(hosp, c(0:2, 8)), data = med.data)

plot(clog(ofp) ~ gender, data = med.data, varwidth = TRUE)

plot(cfac(ofp, c(0:2, 4, 6, 10, 100)) ~ school, data = med.data, breaks = 9)

## Poisson regression
fm_pois <- glm(ofp ~ ., data = med.data, family = poisson)
summary(fm_pois)

## 
## Call:
## glm(formula = ofp ~ ., family = poisson, data = med.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.4055  -1.9962  -0.6737   0.7049  16.3620  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.028874   0.023785  43.258   <2e-16 ***
## hosp             0.164797   0.005997  27.478   <2e-16 ***
## healthexcellent -0.361993   0.030304 -11.945   <2e-16 ***
## healthpoor       0.248307   0.017845  13.915   <2e-16 ***
## numchron         0.146639   0.004580  32.020   <2e-16 ***
## gendermale      -0.112320   0.012945  -8.677   <2e-16 ***
## school           0.026143   0.001843  14.182   <2e-16 ***
## privinsyes       0.201687   0.016860  11.963   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 26943  on 4405  degrees of freedom
## Residual deviance: 23168  on 4398  degrees of freedom
## AIC: 35959
## 
## Number of Fisher Scoring iterations: 5

## Sandwich standard errors
## As the exploratory analysis suggested that 
## over-dispersion is present in this data set, 
## we re-compute the Wald tests using sandwich 
## standard errors
library(sandwich)
library(lmtest)
coeftest(fm_pois, vcov = sandwich)

## 
## z test of coefficients:
## 
##                  Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)      1.028874   0.064530 15.9442 < 2.2e-16 ***
## hosp             0.164797   0.021945  7.5095 5.935e-14 ***
## healthexcellent -0.361993   0.077449 -4.6740 2.954e-06 ***
## healthpoor       0.248307   0.054022  4.5964 4.298e-06 ***
## numchron         0.146639   0.012908 11.3605 < 2.2e-16 ***
## gendermale      -0.112320   0.035343 -3.1780  0.001483 ** 
## school           0.026143   0.005084  5.1422 2.715e-07 ***
## privinsyes       0.201687   0.043128  4.6765 2.919e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Quasi-poisson regression
fm_qpois <- glm(ofp ~ ., data = med.data, family = quasipoisson)
summary(fm_qpois)

## 
## Call:
## glm(formula = ofp ~ ., family = quasipoisson, data = med.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.4055  -1.9962  -0.6737   0.7049  16.3620  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.028874   0.061594  16.704  < 2e-16 ***
## hosp             0.164797   0.015531  10.611  < 2e-16 ***
## healthexcellent -0.361993   0.078476  -4.613 4.09e-06 ***
## healthpoor       0.248307   0.046211   5.373 8.13e-08 ***
## numchron         0.146639   0.011860  12.364  < 2e-16 ***
## gendermale      -0.112320   0.033523  -3.351 0.000813 ***
## school           0.026143   0.004774   5.477 4.58e-08 ***
## privinsyes       0.201687   0.043661   4.619 3.96e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 6.706254)
## 
##     Null deviance: 26943  on 4405  degrees of freedom
## Residual deviance: 23168  on 4398  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

## estimated dispersion of φˆ = 6.706 is larger than 1 
## confirming that over-dispersion is present in the data


## Negative binomial regression
library(MASS)
fm_nbin <- glm.nb(ofp ~ ., data = med.data)
summary(fm_nbin)

## 
## Call:
## glm.nb(formula = ofp ~ ., data = med.data, init.theta = 1.206603534, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0469  -0.9955  -0.2948   0.2961   5.8185  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.929257   0.054591  17.022  < 2e-16 ***
## hosp             0.217772   0.020176  10.793  < 2e-16 ***
## healthexcellent -0.341807   0.060924  -5.610 2.02e-08 ***
## healthpoor       0.305013   0.048511   6.288 3.23e-10 ***
## numchron         0.174916   0.012092  14.466  < 2e-16 ***
## gendermale      -0.126488   0.031216  -4.052 5.08e-05 ***
## school           0.026815   0.004394   6.103 1.04e-09 ***
## privinsyes       0.224402   0.039464   5.686 1.30e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.2066) family taken to be 1)
## 
##     Null deviance: 5743.7  on 4405  degrees of freedom
## Residual deviance: 5044.5  on 4398  degrees of freedom
## AIC: 24359
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.2066 
##           Std. Err.:  0.0336 
## 
##  2 x log-likelihood:  -24341.1070

## both regression coefficients and standard errors 
## are similar to the quasi-Poisson and 
## the sandwich-adjusted Poisson results


## Compare estimated coefficients
fm <- list("ML-Pois" = fm_pois, "Quasi-Pois" = fm_qpois, "NB" = fm_nbin)
sapply(fm, function(x) coef(x)[1:8])

##                     ML-Pois  Quasi-Pois          NB
## (Intercept)      1.02887420  1.02887420  0.92925658
## hosp             0.16479739  0.16479739  0.21777223
## healthexcellent -0.36199320 -0.36199320 -0.34180660
## healthpoor       0.24830697  0.24830697  0.30501303
## numchron         0.14663928  0.14663928  0.17491552
## gendermale      -0.11231992 -0.11231992 -0.12648813
## school           0.02614299  0.02614299  0.02681508
## privinsyes       0.20168688  0.20168688  0.22440187

## Compare standard errors
cbind("ML-Pois" = sqrt(diag(vcov(fm_pois))),
 "Adj-Pois" = sqrt(diag(sandwich(fm_pois))),
 sapply(fm[-1], function(x) sqrt(diag(vcov(x)))[1:8]))

##                     ML-Pois    Adj-Pois  Quasi-Pois          NB
## (Intercept)     0.023784601 0.064529808 0.061593641 0.054591271
## hosp            0.005997367 0.021945186 0.015531043 0.020176492
## healthexcellent 0.030303905 0.077448586 0.078476316 0.060923623
## healthpoor      0.017844531 0.054021990 0.046210977 0.048510797
## numchron        0.004579677 0.012907865 0.011859732 0.012091749
## gendermale      0.012945146 0.035343487 0.033523316 0.031215523
## school          0.001843329 0.005084002 0.004773565 0.004393971
## privinsyes      0.016859826 0.043128006 0.043660942 0.039463744

## Log-likelihood
rbind(logLik = sapply(fm, function(x) round(logLik(x), digits = 0)),
   Df = sapply(fm, function(x) attr(logLik(x), "df")))

##        ML-Pois Quasi-Pois     NB
## logLik  -17972         NA -12171
## Df           8          8      9

## The ML Poisson model is clearly inferior to all other fits.

## Expected vs observed zero counts
round(c("Obs" = sum(med.data$ofp < 1),
   "ML-Pois" = sum(dpois(0, fitted(fm_pois))),
   "NB" = sum(dnbinom(0, mu = fitted(fm_nbin), size = fm_nbin$theta))))

##     Obs ML-Pois      NB 
##     683      47     608

Assignment Stuff

Retention

Load data

retention <- read.csv(“http://www.bradthiessen.com/html5/data/retention1113.csv”) students2014 <- read.csv(“http://www.bradthiessen.com/html5/data/retention14.csv”)

Remove missing data

retention2 <- retention %>% filter(!is.na(retained))

Convert to factor variables

str(retention2) retention2$female <- as.factor(retention2$female) retention2$race <- as.factor(retention2$race) retention2$mom_education <- as.factor(retention2$mom_education) retention2$dad_education <- as.factor(retention2$dad_education) retention2$residence <- as.factor(retention2$residence) retention2$athlete <- as.factor(retention2$athlete)

Retention numbers by year

xtabs(~retained + year, data = retention) table(retention2$dropped, retention2$year)

Number of students in 2014

table(students2014$year)

Bootstrap estimate of retention rate for 2014

resample <- do(100000) * mean(sample(retention2$retained, 533, replace=T)) densityplot(~result, data=resample, plot.points = FALSE, col=“darkblue”, lwd=4) confint(resample, level = 0.95, method = “quantile”) mean(resample)

Logistic regression

logitmodel <- glm(formula = dropped ~ act_comp + hsgpa + female*athlete, data=retention2, family=binomial)

before.enroll.model <- glm(formula = dropped ~ act_math + act_read + hsgpa + female + race + athlete + residence + mom_education + dad_education, data=retention2, family=binomial) summary(before.enroll.model) wald.test(b = coef(before.enroll.model), Sigma = vcov(before.enroll.model), Terms = 3:8)

reduced.enroll.model <- glm(formula = dropped ~ hsgpa + athlete + residence, data=retention2, family=binomial) summary(reduced.enroll.model)

with(reduced.enroll.model, null.deviance - deviance) with(reduced.enroll.model, df.null - df.residual) with(reduced.enroll.model, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE)) logLik(admitlogit)

september.model <- glm(formula = dropped ~ risk_rating + act_math + hsgpa + female + race + factor_commit + factor_finance + factor_basic + athlete + residence, data=retention2, family=binomial) summary(september.model)

risk.model <- glm(formula = dropped ~ risk_rating, data=retention2, family=binomial) summary(risk.model)

summary(logitmodel) # display results confint(logitmodel) # 95% CI for the coefficients exp(coef(logitmodel)) # exponentiated coefficients exp(confint(logitmodel)) # 95% CI for exponentiated coefficients exp(cbind(OR = coef(logitmodel), confint(logitmodel)))

anova(logitmodel, test=“Chisq”) 1 - pchisq(16.467, df=1)

wald.test(b = coef(logitmodel), Sigma = vcov(logitmodel), Terms = 2:3)

Create empty dataframe to store predictions and residuals

predictions <- data.frame(matrix(nrow=1568)) predictions$first <- predict(logitmodel, type="response") # predicted values predictions$residuals <- residuals(logitmodel, type=“deviance”) # residuals

densityplot(~first, data=predictions, plot.points = FALSE, col=“darkblue”, lwd=4) densityplot(~residuals, data=predictions, plot.points = FALSE, col=“darkblue”, lwd=4)

cdplot(dropped~act_comp, data=retention2) cdplot(retained~act_comp, data=retention2)

newdata = data.frame(act_comp=18, hsgpa=2.50, falltermgpa=2.00) predict(logitmodel, newdata, type=“response”)

newdata1 <- with(retention2, data.frame(act_comp = mean(act_comp, na.rm=T), hsgpa = mean(hsgpa, na.rm=T), athlete=factor(c(0,1,1,0)), female = factor(0:1)))

newdata1$rankP <- predict(logitmodel, newdata = newdata1, type = “response”)

newdata2 <- with(mydata, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100), 4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))

lines(retention2$act_comp, logitmodel$fitted, type=“l”, col=“red”)

Titanic

Load data

titanic <- read.csv(“http://www.bradthiessen.com/html5/data/titanic.csv”) tbl_df(titanic)

Predict admission from gre scores

survived.model <- glm(survived ~ sex + age + fare, data=titanic, family=“binomial”) summary(survived.model)

CIs using profiled log-likelihood

confint(survived.model) ## CIs using standard errors confint.default(survived.model)

library(aod) wald.test(b = coef(survived.model), Sigma = vcov(survived.model), Terms = 2)

odds ratios only

exp(coef(survived.model)) ## odds ratios and 95% CI exp(cbind(OR = coef(survived.model), confint(survived.model)))

Plot logistic curve through proportions

survived <- titanic %>% group_by(fare) %>% summarize(admitprop=mean(admit), meangre=mean(gre), admittotal=sum(admit), count=n(), notadmit=count-admittotal)

admitprop$predicted <- predict(admit.gre, admitprop) plot(admitprop ~ gre, data=admitprop) lines(admitprop$gre, (exp(admitprop$predicted)/(1+exp(admitprop$predicted))), type=“l”, col=“red”, lw=3)

MATH 301: Activity 13

Brad Thiessen

October 26, 2014

Load packages

Logistic Regression

Assignment Stuff

Retention

Load data

Remove missing data

Convert to factor variables

Retention numbers by year

Number of students in 2014

Bootstrap estimate of retention rate for 2014

Logistic regression

Create empty dataframe to store predictions and residuals

Titanic

Load data

Predict admission from gre scores

CIs using profiled log-likelihood

odds ratios only

Plot logistic curve through proportions