# Load packages
library(mosaic)
library(ggplot2)   # mosaic may attach this, but load explicitly for ggplot()
library(dplyr)     # for %>%, group_by(), and summarize()
library(ggvis)
library(aod)
library(MASS)
library(corrplot)
library(car)
library(reshape2)
library(bestglm)
# Input binge drinking data
male <- c(rep(1,7152), rep(0,9944))
binge <- c(rep(1,1624), rep(0,5528), rep(1,1690), rep(0,8254))
drink <- data.frame(gender = factor(male, labels=c("female", "male")),
drink = factor(binge, labels=c("Not binge", "binge")))
# Look at data
head(drink)
## gender drink
## 1 male binge
## 2 male binge
## 3 male binge
## 4 male binge
## 5 male binge
## 6 male binge
tail(drink)
## gender drink
## 17091 female Not binge
## 17092 female Not binge
## 17093 female Not binge
## 17094 female Not binge
## 17095 female Not binge
## 17096 female Not binge
# Table of results
tally(drink ~ gender, margins=TRUE, data=drink)
## gender
## drink female male
## Not binge 8254 5528
## binge 1690 1624
## Total 9944 7152
# Plot results (aes() uses the raw 0/1 vectors male and binge, not the factors in drink)
ggplot(drink, aes(x=male, y=binge)) +
  geom_point(shape=1, alpha=.008, position=position_jitter(width=.05, height=.05)) +
  stat_smooth(method="glm", method.args=list(family="binomial"), se=FALSE)
# Set parameters of our binomial distribution
N <- 17096
X <- 3314
# Set range of values to look for max likelihood estimate
p <- seq(.001,1-.001,by=.001)
# Calculate likelihoods of our data across the range of possible probabilities
L <- sapply(p, dbinom, size=N, x=X)
# Plot the likelihoods
plot(p, L, main="max(Likelihood): p=0.19385", type="l", xlab="Probability p", ylab="L(p)")
# Add a line at the maximum
abline(v=X/N,lty=2)
# Let's plot the log-likelihood
plot(p, log(L), main="max(Log-Likelihood): p=0.19385", type="l",
     xlab="Probability p", ylab="l(p)")
# Add a line at the maximum
abline(v=X/N,lty=2)
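# As a cross-check (a sketch, not part of the original analysis), optimize()
# can maximize the binomial log-likelihood numerically; the result should
# agree with the closed-form MLE X/N
mle <- optimize(function(p) dbinom(X, size=N, prob=p, log=TRUE),
                interval=c(.001, .999), maximum=TRUE)
mle$maximum   # should be close to X/N = 0.19385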
# Calculate for females
p.binge.female <- 1690/9944
odds.binge.female <- p.binge.female / (1 - p.binge.female)
ln.odds.b.f <- log(odds.binge.female)
# Calculate for males
p.binge.male <- 1624/7152
odds.binge.male <- p.binge.male / (1 - p.binge.male)
ln.odds.b.m <- log(odds.binge.male)
# Calculate odds ratio
odds.ratio <- odds.binge.male / odds.binge.female
# Paste results all together
cat("Females: prob =", round(p.binge.female, 5), "; odds =", round(odds.binge.female, 3),
"; ln(odds) =", round(ln.odds.b.f, 3), "\n ",
"males: prob =", round(p.binge.male, 5), "; odds =", round(odds.binge.male, 3),
"; ln(odds) =", round(ln.odds.b.m, 3), "\nodds ratio =",
round(odds.binge.male / odds.binge.female, 4), "\nrelative probability",
round(p.binge.male / p.binge.female, 4))
## Females: prob = 0.16995 ; odds = 0.205 ; ln(odds) = -1.586
## males: prob = 0.22707 ; odds = 0.294 ; ln(odds) = -1.225
## odds ratio = 1.4348
## relative probability 1.3361
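# The same odds ratio straight from the 2x2 counts (a quick arithmetic check)
(1624/5528) / (1690/8254)   # odds(male binge) / odds(female binge) = 1.4348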
# Fit model (notice glm and family=binomial)
drink.logmodel <- glm(drink ~ gender, data=drink, family=binomial)
# Summarize model
summary(drink.logmodel)
##
## Call:
## glm(formula = drink ~ gender, family = binomial, data = drink)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7177 -0.7177 -0.6104 -0.6104 1.8827
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.58597 0.02670 -59.400 <2e-16 ***
## gendermale 0.36104 0.03885 9.292 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16814 on 17095 degrees of freedom
## Residual deviance: 16728 on 17094 degrees of freedom
## AIC: 16732
##
## Number of Fisher Scoring iterations: 4
# Odds ratio (exponential function of coefficients)
# Notice the odds ratio matches what we calculated earlier
exp(coef(drink.logmodel))
## (Intercept) gendermale
## 0.2047492 1.4348145
# Create new dataset of 1 female and 1 male
gender <- c(0,1)
new <- data.frame(gender = factor(gender, labels=c("female", "male")))
# Predict the probability of binge drinking for the new data.frame
# Notice that these match the probabilities we calculated earlier
predict(drink.logmodel, newdata=new, type="response")
## 1 2
## 0.1699517 0.2270694
# Clear out memory
rm(list = ls())
# Load data
admit <- read.csv("http://www.bradthiessen.com/html5/data/admit.csv")
# Look at structure
str(admit)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
# Calculate means
sapply(admit, mean)
## admit gre gpa rank
## 0.3175 587.7000 3.3899 2.4850
# Convert rank to a factor
admit$rank <- factor(admit$rank)
# Panel function that draws histograms on the diagonal of the scatterplot matrix
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(admit,
cex = 1, pch = 1, bg = "steelblue",
diag.panel = panel.hist, cex.labels = 1, font.labels = 2)
# Fit logistic model with GRE as predictor
gre.model <- glm(admit ~ gre, family=binomial, data=admit)
# Summarize model
summary(gre.model)
##
## Call:
## glm(formula = admit ~ gre, family = binomial, data = admit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1623 -0.9052 -0.7547 1.3486 1.9879
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.901344 0.606038 -4.787 1.69e-06 ***
## gre 0.003582 0.000986 3.633 0.00028 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 486.06 on 398 degrees of freedom
## AIC: 490.06
##
## Number of Fisher Scoring iterations: 4
# Predicted probabilities from our model
admit$prob <- predict(gre.model, type="response")
# Calculate likelihood of model: Product of [P^y * (1-P)^(1-y)]
admit$like <- (admit$prob ^ admit$admit) * ((1 - admit$prob) ^ (1-admit$admit))
model.likelihood <- prod(admit$like)
# Calculate deviance: -2LL
-2 * log(model.likelihood)
## [1] 486.0561
# Fit null model with no predictors
null.model <- glm(admit ~ 1, family=binomial, data=admit)
# Predicted probabilities from our null model
admit$nullprob <- predict(null.model, type="response")
# Calculate likelihood of null model
admit$nulllike <- (admit$nullprob ^ admit$admit) * ((1 - admit$nullprob) ^ (1-admit$admit))
null.likelihood <- prod(admit$nulllike)
# Calculate deviance of model
-2 * log(null.likelihood)
## [1] 499.9765
Notice how those deviance values match the output from the glm() command.
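# A shortcut worth knowing (a sketch): logLik() extracts the log-likelihood
# directly, so -2*logLik reproduces the deviances computed above
-2 * as.numeric(logLik(gre.model))    # residual deviance, ~486.06
-2 * as.numeric(logLik(null.model))   # null deviance, ~499.98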
# Chi-square = difference in deviance (null model - GRE model)
modelChi <- gre.model$null.deviance - gre.model$deviance
# Degrees-of-freedom (parameters in null model - parameters in GRE model)
chidf <- gre.model$df.null - gre.model$df.residual
# Calculate p-value for that chi-square statistic
chisq.prob <- 1 - pchisq(modelChi, chidf)
# Paste results
paste("p-value for chi-square test of difference in deviance =", round(chisq.prob, 5))
## [1] "p-value for chi-square test of difference in deviance = 0.00019"
# Generate data.frame with GRE scores of 400 and 401
gre.new <- c(400, 401)
gre.new <- data.frame(gre = gre.new)
# Generate predicted log-odds
gre.new$log.odds <- -2.901344 + (0.003582 * gre.new$gre)
# Generate predicted odds
gre.new$odds <- exp(-2.901344 + (0.003582 * gre.new$gre))
# Calculate odds ratio for a 1-point GRE increase (this equals exp(slope))
gre.odds.ratio <- exp(-2.901344 + 0.003582) / exp(-2.901344)
# Generate predicted probabilities
gre.new$predicted <- predict(gre.model, newdata=gre.new, type="response")
# Paste results
cat("GRE of 401: log-odds =", round(gre.new$log.odds[gre.new$gre==401], 5), "; odds =",
round(gre.new$odds[gre.new$gre==401], 5), "; prob =",
round(gre.new$predicted[gre.new$gre==401], 5), "\nGRE of 400:",
"log-odds =", round(gre.new$log.odds[gre.new$gre==400], 5), "; odds =",
round(gre.new$odds[gre.new$gre==400], 5), "; prob =",
round(gre.new$predicted[gre.new$gre==400], 5), "\nOdds ratio:",
round(gre.odds.ratio, 6))
## GRE of 401: log-odds = -1.46496 ; odds = 0.23109 ; prob = 0.18772
## GRE of 400: log-odds = -1.46854 ; odds = 0.23026 ; prob = 0.18718
## Odds ratio: 1.003588
# We can get this odds ratio by exponentiating our logistic model
# Here I will get the odds ratio and 95% confidence intervals
# Notice we get the same odds ratio of 1.003588
exp(cbind(OR = coef(gre.model), confint(gre.model)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.0549493 0.01624471 0.1755632
## gre 1.0035886 1.00168137 1.0055682
# Create data.frame with various GRE scores
gre.new.2 <- c(200, 300, 400, 500, 600, 700, 800)
gre.new.2 <- data.frame(gre = gre.new.2)
# Generate predicted log-odds
gre.new.2$log.odds <- -2.901344 + (0.003582 * gre.new.2$gre)
# Generate predicted odds
gre.new.2$odds <- exp(-2.901344 + (0.003582 * gre.new.2$gre))
# Generate predicted probabilities
gre.new.2$predicted <- predict(gre.model, newdata=gre.new.2, type="response")
# Table of predicted odds and probabilities
gre.new.2 %>%
group_by(gre) %>%
summarize(predicted.Odds = round(mean(odds),3), Probability = round(mean(predicted),3))
## Source: local data frame [7 x 3]
##
## gre predicted.Odds Probability
## (dbl) (dbl) (dbl)
## 1 200 0.112 0.101
## 2 300 0.161 0.139
## 3 400 0.230 0.187
## 4 500 0.329 0.248
## 5 600 0.471 0.320
## 6 700 0.674 0.403
## 7 800 0.965 0.491
# Plot logistic curve through proportions
# Summarize proportion admitted by GRE score
admitprop <- admit %>%
group_by(gre) %>%
summarize(admitprop=mean(admit))
# Predict log-odds for these GRE scores (converted to probabilities when plotted)
admitprop$predicted <- predict(gre.model, admitprop)
# Plot proportions admitted by GRE score
plot(admitprop ~ gre, data=admitprop)
# Add a line for the predicted probabilities (inverse-logit of the log-odds)
lines(admitprop$gre, exp(admitprop$predicted)/(1+exp(admitprop$predicted)), col="red", lwd=3)
# Fit logistic model with all predictors
admit.model <- glm(admit ~ gre + gpa + rank, family=binomial, data=admit)
# Summarize model
summary(admit.model)
##
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = binomial, data = admit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6268 -0.8662 -0.6388 1.1490 2.0790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.989979 1.139951 -3.500 0.000465 ***
## gre 0.002264 0.001094 2.070 0.038465 *
## gpa 0.804038 0.331819 2.423 0.015388 *
## rank2 -0.675443 0.316490 -2.134 0.032829 *
## rank3 -1.340204 0.345306 -3.881 0.000104 ***
## rank4 -1.551464 0.417832 -3.713 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 458.52 on 394 degrees of freedom
## AIC: 470.52
##
## Number of Fisher Scoring iterations: 4
# Test improvement in deviance
# Compared to model with only gre as a predictor
modelChi <- gre.model$deviance - admit.model$deviance
chidf <- gre.model$df.residual - admit.model$df.residual
chisq.prob <- 1 - pchisq(modelChi, chidf)
paste("p-value for chi-square test of difference in deviance =", round(chisq.prob, 8))
## [1] "p-value for chi-square test of difference in deviance = 1.547e-05"
# Get the odds ratios and confidence intervals
exp(cbind(OR = coef(admit.model), confint(admit.model)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.0185001 0.001889165 0.1665354
## gre 1.0022670 1.000137602 1.0044457
## gpa 2.2345448 1.173858216 4.3238349
## rank2 0.5089310 0.272289674 0.9448343
## rank3 0.2617923 0.131641717 0.5115181
## rank4 0.2119375 0.090715546 0.4706961
# Create the colorful graph (lots of work)
newdata1 <- with(admit, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
newdata1$rankP <- predict(admit.model, newdata = newdata1, type = "response")
newdata2 <- with(admit, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))
newdata3 <- cbind(newdata2, predict(admit.model, newdata = newdata2, type = "link",
se = TRUE))
newdata3 <- within(newdata3, {
PredictedProb <- plogis(fit)
LL <- plogis(fit - (1.96 * se.fit))
UL <- plogis(fit + (1.96 * se.fit))
})
# view first few rows of final dataset
head(newdata3)
## gre gpa rank fit se.fit residual.scale UL
## 1 200.0000 3.3899 1 -0.8114870 0.5147714 1 0.5492064
## 2 206.0606 3.3899 1 -0.7977632 0.5090986 1 0.5498513
## 3 212.1212 3.3899 1 -0.7840394 0.5034491 1 0.5505074
## 4 218.1818 3.3899 1 -0.7703156 0.4978239 1 0.5511750
## 5 224.2424 3.3899 1 -0.7565919 0.4922237 1 0.5518545
## 6 230.3030 3.3899 1 -0.7428681 0.4866494 1 0.5525464
## LL PredictedProb
## 1 0.1393812 0.3075737
## 2 0.1423880 0.3105042
## 3 0.1454429 0.3134499
## 4 0.1485460 0.3164108
## 5 0.1516973 0.3193867
## 6 0.1548966 0.3223773
# Create the plot
ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,
ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank),
size = 1)
# Test rank of undergraduate institution using Wald statistic
# Requires library(aod)
# Compare terms 4-6 in our logistic model vs. a null model
wald.test(b = coef(admit.model), Sigma = vcov(admit.model), Terms = 4:6)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 20.9, df = 3, P(> X2) = 0.00011
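# A likelihood-ratio alternative to the Wald test (a sketch: refit the model
# without rank and compare deviances)
no.rank.model <- glm(admit ~ gre + gpa, family=binomial, data=admit)
anova(no.rank.model, admit.model, test="Chisq")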
# Compare terms 4 vs 5 (rank 2 vs rank 3)
# Set-up a contrast (like we did with the Scheffe method)
l <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(admit.model), Sigma = vcov(admit.model), L = l)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 5.5, df = 1, P(> X2) = 0.019
# Load data from MASS library
# library(MASS)
data(biopsy)
# Examine structure
str(biopsy)
## 'data.frame': 699 obs. of 11 variables:
## $ ID : chr "1000025" "1002945" "1015425" "1016277" ...
## $ V1 : int 5 5 3 6 4 8 1 2 2 4 ...
## $ V2 : int 1 4 1 8 1 10 1 1 1 2 ...
## $ V3 : int 1 4 1 8 1 10 1 2 1 1 ...
## $ V4 : int 1 5 1 1 3 8 1 1 1 1 ...
## $ V5 : int 2 7 2 3 2 7 2 2 2 2 ...
## $ V6 : int 1 10 2 4 1 10 10 1 1 1 ...
## $ V7 : int 3 3 3 3 3 9 3 3 1 2 ...
## $ V8 : int 1 2 1 7 1 7 1 1 1 1 ...
## $ V9 : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Eliminate ID variable
biopsy <- biopsy[, -1]
# Name variables
names(biopsy) <- c("thick", "u.size", "u.shape", "adhsn", "s.size", "nucl", "chrom",
"n.nuc", "mit", "class")
# Eliminate cases with missing values
biopsy2 <- na.omit(biopsy)
head(biopsy2)
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit class
## 1 5 1 1 1 2 1 3 1 1 benign
## 2 5 4 4 5 7 10 3 2 1 benign
## 3 3 1 1 1 2 2 3 1 1 benign
## 4 6 8 8 1 3 4 3 7 1 benign
## 5 4 1 1 3 2 1 3 1 1 benign
## 6 8 10 10 8 7 10 9 7 1 malignant
# Construct boxplots of predictor variables by each value of our dependent variable
# Convert to long format (so the boxplots can be faceted by variable)
biop.m <- melt(biopsy2, id.var="class")
# Construct the boxplots
ggplot(data=biop.m, aes(x=class, y=value)) +
geom_boxplot() +
facet_wrap(~variable, ncol=3)
# Nucl looks important; mit does not
# Let's look at correlations
# library(corrplot)   # already loaded above
bc = cor(biopsy2[, 1:9])
corrplot.mixed(bc)
# Collinearity problem?
# Check VIF
# library(car)
# Fit full model
full.model <- glm(class ~ ., family=binomial, data=biopsy2)
# Calculate VIF
round(vif(full.model),3)
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit
## 1.188 2.819 2.763 1.186 1.347 1.142 1.215 1.220 1.042
# Create training and test datasets
# Set random number seed
set.seed(71930)
# Create random labels: ~70% 1s and ~30% 2s
ind <- sample(2, nrow(biopsy2), replace=TRUE, prob=c(.7, .3))
# Training dataset = 1; test = 2
train <- biopsy2[ind==1, ]
test <- biopsy2[ind==2, ]
# See if we have a relatively equal occurrence of cancer in each set
tally(~class, data=train, format="prop")
##
## benign malignant
## 0.6582278 0.3417722
tally(~class, data=test, format="prop")
##
## benign malignant
## 0.6315789 0.3684211
# Fit logistic model with all predictors
full.model <- glm(class ~ ., family=binomial, data=train)
summary(full.model)
##
## Call:
## glm(formula = class ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4692 -0.0778 -0.0358 0.0045 2.2562
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.07441 1.97761 -6.106 1.02e-09 ***
## thick 0.75495 0.22554 3.347 0.000816 ***
## u.size -0.26684 0.28954 -0.922 0.356730
## u.shape 0.22282 0.28379 0.785 0.432360
## adhsn 0.58666 0.20702 2.834 0.004600 **
## s.size -0.06207 0.23246 -0.267 0.789455
## nucl 0.52526 0.14448 3.635 0.000277 ***
## chrom 0.71157 0.23284 3.056 0.002243 **
## n.nuc 0.44724 0.17153 2.607 0.009125 **
## mit 0.44050 0.41143 1.071 0.284318
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 608.809 on 473 degrees of freedom
## Residual deviance: 55.022 on 464 degrees of freedom
## AIC: 75.022
##
## Number of Fisher Scoring iterations: 9
# Chi-square test for improvement in deviance
modelChi <- full.model$null.deviance - full.model$deviance
chidf <- full.model$df.null - full.model$df.residual
chisq.prob <- 1 - pchisq(modelChi, chidf)
chisq.prob   # effectively zero (below machine precision)
## [1] 0
# Odds ratios and confidence intervals
exp(cbind(OR = coef(full.model), confint(full.model)))
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## OR 2.5 % 97.5 %
## (Intercept) 5.703626e-06 5.036254e-08 0.0001420459
## thick 2.127503e+00 1.434655e+00 3.5320359815
## u.size 7.657944e-01 4.385016e-01 1.4097698097
## u.shape 1.249599e+00 6.721431e-01 2.1223295619
## adhsn 1.797980e+00 1.210043e+00 2.8107870239
## s.size 9.398173e-01 5.959236e-01 1.5117183251
## nucl 1.690893e+00 1.304239e+00 2.3273236618
## chrom 2.037192e+00 1.335426e+00 3.3920793079
## n.nuc 1.563996e+00 1.124172e+00 2.2327291779
## mit 1.553485e+00 8.836812e-01 3.5117318200
# See how it fits our training dataset
train$probs <- predict(full.model, type="response")
train$predict <- rep("benign", nrow(train))
train$predict[train$probs > 0.50] <- "malignant"
table(train$predict, train$class)
##
## benign malignant
## benign 306 4
## malignant 6 158
# Let's get the overall % correct
mean(train$predict == train$class)
## [1] 0.978903
# Let's try it on the test dataset
test$probs <- predict(full.model, newdata=test, type="response")
test$predict <- rep("benign", nrow(test))
test$predict[test$probs > 0.50] <- "malignant"
table(test$predict, test$class)
##
## benign malignant
## benign 127 4
## malignant 5 73
mean(test$predict == test$class)
## [1] 0.9569378
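# Beyond overall accuracy, the confusion matrix gives sensitivity and
# specificity (a quick sketch using the counts tabled above)
tab <- table(test$predict, test$class)
tab["malignant", "malignant"] / sum(tab[, "malignant"])   # sensitivity: 73/77
tab["benign", "benign"] / sum(tab[, "benign"])            # specificity: 127/132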
# Let's try cross-validation
# library(bestglm)
# Recode our outcome variable as 0-1
train$y <- rep(0, nrow(train))
train$y[train$class == "malignant"] <- 1
# Could have used this code instead of the previous 2 lines
# train$y <- as.numeric(train$class)-1
# Remove all extraneous variables and make sure
# outcome variable is in last column
biopsy.cv <- train[ , -10:-12]
head(biopsy.cv)
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit y
## 1 5 1 1 1 2 1 3 1 1 0
## 4 6 8 8 1 3 4 3 7 1 0
## 5 4 1 1 3 2 1 3 1 1 0
## 6 8 10 10 8 7 10 9 7 1 1
## 7 1 1 1 1 2 10 3 1 1 0
## 10 4 2 1 1 2 1 2 1 1 0
# Find best model
bestglm(Xy = biopsy.cv, IC="CV", CVArgs = list(Method="HTF", K=10, REP=1), family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## CV(K = 10, REP = 1)
## BICq equivalent for q in (0.0421013207411016, 0.311136342210368)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.6047190 1.4089470 -7.526698 5.203948e-14
## thick 0.7600489 0.1645563 4.618776 3.860096e-06
## nucl 0.5420547 0.1166692 4.646082 3.382978e-06
## chrom 0.7864741 0.1933391 4.067847 4.744949e-05
## n.nuc 0.4006591 0.1256526 3.188625 1.429511e-03
# Fit that reduced model to our training dataset
reduce.model <- glm(class ~ thick + nucl + chrom + n.nuc, family=binomial, data=train)
# Test fit
train$cv.probs <- predict(reduce.model, type="response")
train$cv.predict <- rep("benign", nrow(train))
train$cv.predict[train$cv.probs > 0.50] <- "malignant"
table(train$cv.predict, train$class)
##
## benign malignant
## benign 306 7
## malignant 6 155
# Let's get the overall % correct
mean(train$cv.predict == train$class)
## [1] 0.9725738
# Test reduced model on new data
test$cv.probs <- predict(reduce.model, newdata=test, type="response")
test$cv.predict <- rep("benign", nrow(test))
test$cv.predict[test$cv.probs > 0.50] <- "malignant"
table(test$cv.predict, test$class)
##
## benign malignant
## benign 127 6
## malignant 5 71
mean(test$cv.predict == test$class)
## [1] 0.9473684
# That didn't really improve anything
# Let's try again with BIC as our criterion
bestglm(Xy = biopsy.cv, IC="BIC", family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## BIC
## BICq equivalent for q in (0.311136342210368, 0.896220081100116)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.5782399 1.7150431 -6.750991 1.468389e-11
## thick 0.8101726 0.1836012 4.412675 1.021013e-05
## adhsn 0.5290541 0.1932962 2.737012 6.200000e-03
## nucl 0.5007588 0.1269485 3.944582 7.993917e-05
## chrom 0.6695838 0.2124473 3.151764 1.622873e-03
## n.nuc 0.4021677 0.1288032 3.122341 1.794187e-03
BIC.model <- glm(class ~ thick + adhsn + nucl + chrom + n.nuc, family=binomial, data=train)
train$BIC.probs <- predict(BIC.model, type="response")
train$BIC.predict <- rep("benign", nrow(train))
train$BIC.predict[train$BIC.probs > 0.50] <- "malignant"
table(train$BIC.predict, train$class)
##
## benign malignant
## benign 306 6
## malignant 6 156
# Let's get the overall % correct
mean(train$BIC.predict == train$class)
## [1] 0.9746835
# Test reduced model on new data
test$BIC.probs <- predict(BIC.model, newdata=test, type="response")
test$BIC.predict <- rep("benign", nrow(test))
test$BIC.predict[test$BIC.probs > 0.50] <- "malignant"
table(test$BIC.predict, test$class)
##
## benign malignant
## benign 127 4
## malignant 5 73
mean(test$BIC.predict == test$class)
## [1] 0.9569378
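# If the pROC package is available (an assumption; it is not loaded above),
# an ROC curve summarizes classification across all cutoffs, not just 0.50
library(pROC)
roc.obj <- roc(test$class, test$BIC.probs)
auc(roc.obj)    # area under the ROC curve
plot(roc.obj)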
# From here, we could turn to discriminant analysis to see whether classification improves
# Clear out memory
rm(list = ls())
# Load data
toenail <- read.csv("http://www.bradthiessen.com/html5/data/toenail.csv")
# Plot proportions over time
toenailprop <- toenail %>%
group_by(treatment, visit) %>%
summarize(propinfect=mean(outcome), meanmonth=mean(month))
ggplot(toenailprop, aes(meanmonth, propinfect)) +
geom_point() + geom_line(size=2) +
facet_grid(. ~ treatment)
## Fit model
toenail.model <- glm(outcome ~ treatment*month, data=toenail, family="binomial")
summary(toenail.model)
##
## Call:
## glm(formula = outcome ~ treatment * month, family = "binomial",
## data = toenail)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9519 -0.7892 -0.5080 -0.2532 2.7339
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5566273 0.1089628 -5.108 3.25e-07 ***
## treatmentTerbinafine -0.0005817 0.1561463 -0.004 0.9970
## month -0.1703078 0.0236199 -7.210 5.58e-13 ***
## treatmentTerbinafine:month -0.0672216 0.0375235 -1.791 0.0732 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1980.5 on 1907 degrees of freedom
## Residual deviance: 1816.0 on 1904 degrees of freedom
## AIC: 1824
##
## Number of Fisher Scoring iterations: 5
## CIs using profiled log-likelihood
confint(toenail.model)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.7719213 -0.344475861
## treatmentTerbinafine -0.3067395 0.305615999
## month -0.2182556 -0.125489310
## treatmentTerbinafine:month -0.1419292 0.005563446
## CIs using standard errors
confint.default(toenail.model)
## 2.5 % 97.5 %
## (Intercept) -0.7701903 -0.343064177
## treatmentTerbinafine -0.3066229 0.305459549
## month -0.2166020 -0.124013574
## treatmentTerbinafine:month -0.1407662 0.006323002
## odds ratios only
exp(coef(toenail.model))
## (Intercept) treatmentTerbinafine
## 0.5731389 0.9994185
## month treatmentTerbinafine:month
## 0.8434052 0.9349880
## odds ratios and 95% CI
exp(cbind(OR = coef(toenail.model), confint(toenail.model)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.5731389 0.4621244 0.7085917
## treatmentTerbinafine 0.9994185 0.7358422 1.3574609
## month 0.8434052 0.8039199 0.8820652
## treatmentTerbinafine:month 0.9349880 0.8676827 1.0055790
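# A small sketch: the predicted probability of infection for a patient on
# Terbinafine at month 6 (the level name is taken from the coefficients above)
predict(toenail.model, newdata=data.frame(treatment="Terbinafine", month=6),
        type="response")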
# Plot logistic curves through proportions
# I need to rename a variable back to "month"
toenailprop$month <- toenailprop$meanmonth
toenailprop$predicted <- predict(toenail.model, toenailprop)
ggplot(toenailprop, aes(month, propinfect)) +
geom_point() +
geom_line(aes(month, (exp(predicted)/(1+exp(predicted)))),
size=2, col="red", linetype="dashed") +
geom_line(aes(month, propinfect), size=2, col="black") +
facet_grid(. ~ treatment)
# Let's load a dataset on the mating of elephants
# Mating is a rare event that may follow a Poisson distribution.
library(gpk)
data(elephant)
head(elephant)
## Age_in_Years Number_of_Matings
## 1 27 0
## 2 28 1
## 3 28 1
## 4 28 1
## 5 28 3
## 6 29 0
tail(elephant)
## Age_in_Years Number_of_Matings
## 36 43 9
## 37 44 3
## 38 45 5
## 39 47 7
## 40 48 2
## 41 52 9
elephant$Age <- elephant$Age_in_Years
elephant$Matings <- elephant$Number_of_Matings
## Scatterplot
elephant %>%
  ggvis(x=~Age, y=~jitter(Matings), fill := "steelblue", stroke := "steelblue",
        fillOpacity := .7, strokeOpacity := .8) %>%
layer_points() %>%
add_axis("x", title = "Age") %>%
scale_numeric("x", domain=c(25, 55), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Matings") %>%
scale_numeric("y", domain=c(0, 10), nice=FALSE, clamp=TRUE)
# It looks like the number of matings tends to be higher
# for older elephants, AND there seems to be more variability
# in the number of matings as age increases.
mean(elephant$Matings)
## [1] 2.682927
var(elephant$Matings)
## [1] 5.071951
mean(elephant$Matings)/var(elephant$Matings)
## [1] 0.5289733
mean(elephant$Age)
## [1] 35.85366
var(elephant$Age)
## [1] 43.27805
eg <- elephant %>%
group_by(Age) %>%
summarize(mean=mean(Matings), var=var(Matings))
plot(log(eg$mean), log(eg$var), xlab="log(Mean matings)", ylab="log(Variance of matings)",
main="Figure 4.1. Mean-Variance Relationship")
mtext("matings by age", padj=-0.5)
abline(0,1)
## Null model
null.model <- glm(Matings ~ 1, family=poisson, data=elephant)
summary(null.model)
##
## Call:
## glm(formula = Matings ~ 1, family = poisson, data = elephant)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3164 -1.1799 -0.4368 0.1899 3.0252
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.98691 0.09535 10.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 75.372 on 40 degrees of freedom
## Residual deviance: 75.372 on 40 degrees of freedom
## AIC: 178.82
##
## Number of Fisher Scoring iterations: 5
exp(coef(null.model))
## (Intercept)
## 2.682927
mean(elephant$Matings)
## [1] 2.682927
## Show predictions
elephant$null.pred <- exp(predict(null.model, elephant))
plot(elephant$Age, jitter(elephant$Matings))
lines(elephant$Age, elephant$null.pred, lwd=3, col=1)
## Deviance
deviance(null.model)
## [1] 75.37174
# Note: this computes P(chi-square < 0.95) on 40 df, which is essentially zero;
# a conventional goodness-of-fit p-value would instead use the deviance,
# e.g., 1 - pchisq(deviance(null.model), df.residual(null.model))
pchisq(0.95, df.residual(null.model))
## [1] 8.940942e-26
## Model with age
model <- glm(Matings ~ Age, family=poisson, data=elephant)
summary(model)
##
## Call:
## glm(formula = Matings ~ Age, family = poisson, data = elephant)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.80798 -0.86137 -0.08629 0.60087 2.17777
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.58201 0.54462 -2.905 0.00368 **
## Age 0.06869 0.01375 4.997 5.81e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 75.372 on 40 degrees of freedom
## Residual deviance: 51.012 on 39 degrees of freedom
## AIC: 156.46
##
## Number of Fisher Scoring iterations: 5
exp(coef(model))
## (Intercept) Age
## 0.2055619 1.0711071
## Predict matings from age
elephant$predicted <- exp(predict(model, elephant))
lines(elephant$Age, elephant$predicted, lwd=3, col=2)
## Prediction for 30 year-old elephant
exp(coef(model)[1]+coef(model)[2]*30)
## (Intercept)
## 1.614098
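# The same prediction via predict() (a cross-check of the hand calculation)
predict(model, newdata=data.frame(Age=30), type="response")   # ~1.614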
## Test model fit
deviance(null.model)
## [1] 75.37174
deviance(model)
## [1] 51.01163
pchisq(0.95, df.residual(model))   # same caveat as above about pchisq(0.95, df)
## [1] 5.841306e-25
1-pchisq(deviance(null.model)-deviance(model), df.residual(null.model)-df.residual(model))
## [1] 7.99062e-07
# Since the p-value is small, there is evidence that adding age
# explains a significant amount of additional deviance
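# The same test in one call (an equivalent shortcut)
anova(null.model, model, test="Chisq")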
# Load data
absences <- read.csv("http://www.bradthiessen.com/html5/data/absences.csv")
absences$prog <- factor(absences$prog, levels = 1:3,
labels = c("General", "Academic", "Vocational"))
head(absences)
## id gender math daysabs prog
## 1 1001 male 63 4 Academic
## 2 1002 male 27 4 Academic
## 3 1003 female 20 2 Academic
## 4 1004 female 16 3 Academic
## 5 1005 female 2 3 Academic
## 6 1006 female 71 13 Academic
tail(absences)
## id gender math daysabs prog
## 309 2152 female 46 1 Vocational
## 310 2153 male 26 1 Academic
## 311 2154 female 79 3 Vocational
## 312 2155 female 59 0 Academic
## 313 2156 female 90 0 Vocational
## 314 2157 female 77 2 Vocational
# Summarize
summary(absences)
## id gender math daysabs
## Min. :1001 female:160 Min. : 1.00 Min. : 0.000
## 1st Qu.:1079 male :154 1st Qu.:28.00 1st Qu.: 1.000
## Median :1158 Median :48.00 Median : 4.000
## Mean :1576 Mean :48.27 Mean : 5.955
## 3rd Qu.:2078 3rd Qu.:70.00 3rd Qu.: 8.000
## Max. :2157 Max. :99.00 Max. :35.000
## prog
## General : 40
## Academic :167
## Vocational:107
##
##
##
mean(absences$daysabs)
## [1] 5.955414
var(absences$daysabs)
## [1] 49.51877
ggplot(absences, aes(daysabs, fill = prog)) +
geom_histogram(binwidth = 1, alpha=.5) +
facet_grid(prog ~ ., margins = TRUE, scales = "free")
plot(absences$math, absences$daysabs)
# Let's try a model with all the predictors
pois.model <- glm(daysabs ~ gender + math + prog, family=poisson, data=absences)
summary(pois.model)
##
## Call:
## glm(formula = daysabs ~ gender + math + prog, family = poisson,
## data = absences)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2218 -2.1892 -0.9238 0.7218 7.0048
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7594786 0.0637731 43.270 < 2e-16 ***
## gendermale -0.2424762 0.0467765 -5.184 2.18e-07 ***
## math -0.0069561 0.0009354 -7.437 1.03e-13 ***
## progAcademic -0.4260327 0.0567308 -7.510 5.92e-14 ***
## progVocational -1.2707199 0.0779143 -16.309 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2217.7 on 313 degrees of freedom
## Residual deviance: 1746.8 on 309 degrees of freedom
## AIC: 2640.2
##
## Number of Fisher Scoring iterations: 5
exp(coef(pois.model))
## (Intercept) gendermale math progAcademic progVocational
## 15.7916072 0.7846824 0.9930680 0.6530950 0.2806295
## Fit is terrible
## We're assuming mean = variance
mean(daysabs ~ prog, data=absences)
## General Academic Vocational
## 10.650000 6.934132 2.672897
var(daysabs ~ prog, data=absences)
## General Academic Vocational
## 67.25897 55.44744 13.93916
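# A quick sketch of the overdispersion: the Pearson chi-square divided by its
# residual degrees of freedom should be near 1 if the Poisson assumption holds
sum(residuals(pois.model, type="pearson")^2) / df.residual(pois.model)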
## What can we do with this overdispersion?
## We could try a negative binomial regression
## It has an extra parameter to model overdispersion
# library(MASS)   # already loaded above
neg.bin.model <- glm.nb(daysabs ~ gender + math + prog, data=absences)
summary(neg.bin.model)
##
## Call:
## glm.nb(formula = daysabs ~ gender + math + prog, data = absences,
## init.theta = 1.047288915, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1567 -1.0761 -0.3810 0.2856 2.7235
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.707484 0.204275 13.254 < 2e-16 ***
## gendermale -0.211086 0.121989 -1.730 0.0836 .
## math -0.006236 0.002492 -2.502 0.0124 *
## progAcademic -0.424540 0.181725 -2.336 0.0195 *
## progVocational -1.252615 0.199699 -6.273 3.55e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.0473) family taken to be 1)
##
## Null deviance: 431.67 on 313 degrees of freedom
## Residual deviance: 358.87 on 309 degrees of freedom
## AIC: 1740.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.047
## Std. Err.: 0.108
##
## 2 x log-likelihood: -1728.307
exp(coef(neg.bin.model))
## (Intercept) gendermale math progAcademic progVocational
## 14.9915148 0.8097046 0.9937839 0.6540708 0.2857565
1-pchisq(deviance(neg.bin.model),
df.residual(neg.bin.model))
## [1] 0.02656535
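# A side-by-side AIC comparison of the two count models (lower is better;
# a quick sketch assuming pois.model is still in memory)
AIC(pois.model, neg.bin.model)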
neg.bin.no.prog <- update(neg.bin.model, . ~ . - prog)
anova(neg.bin.model, neg.bin.no.prog)
## Likelihood ratio tests of Negative Binomial Models
##
## Response: daysabs
## Model theta Resid. df 2 x log-lik. Test df
## 1 gender + math 0.8705939 311 -1772.074
## 2 gender + math + prog 1.0472889 309 -1728.307 1 vs 2 2
## LR stat. Pr(Chi)
## 1
## 2 43.76737 3.133546e-10
# Drop the non-significant gender term and refit
neg.bin.model <- glm.nb(daysabs ~ math + prog, data=absences)
newdata1 <- data.frame(math = mean(absences$math), prog = factor(1:3, levels = 1:3,
labels = levels(absences$prog)))
newdata1$phat <- predict(neg.bin.model, newdata1, type = "response")
newdata1
## math prog phat
## 1 48.26752 General 10.236899
## 2 48.26752 Academic 6.587927
## 3 48.26752 Vocational 2.850083
newdata2 <- data.frame(
math = rep(seq(from = min(absences$math), to = max(absences$math), length.out = 100), 3),
prog = factor(rep(1:3, each = 100), levels = 1:3, labels =
levels(absences$prog)))
newdata2 <- cbind(newdata2, predict(neg.bin.model, newdata2, type = "link", se.fit=TRUE))
newdata2 <- within(newdata2, {
DaysAbsent <- exp(fit)
LL <- exp(fit - 1.96 * se.fit)
UL <- exp(fit + 1.96 * se.fit)
})
ggplot(newdata2, aes(math, DaysAbsent)) +
geom_ribbon(aes(ymin = LL, ymax = UL, fill = prog), alpha = .25) +
geom_line(aes(colour = prog), size = 2) +
labs(x = "Math Score", y = "Predicted Days Absent")