# Load packages
library(mosaic)
library(ggplot2)   # mosaic may attach this, but load explicitly for ggplot()
library(dplyr)     # for %>%, group_by(), and summarize()
library(ggvis)
library(aod)
library(MASS)
library(corrplot)
library(car)
library(reshape2)
library(bestglm)
# Input binge drinking data
male <- c(rep(1,7152), rep(0,9944))
binge <- c(rep(1,1624), rep(0,5528), rep(1,1690), rep(0,8254))
drink <- data.frame(gender = factor(male, labels=c("female", "male")),
drink = factor(binge, labels=c("Not binge", "binge")))
# Look at data
head(drink)
## gender drink
## 1 male binge
## 2 male binge
## 3 male binge
## 4 male binge
## 5 male binge
## 6 male binge
tail(drink)
## gender drink
## 17091 female Not binge
## 17092 female Not binge
## 17093 female Not binge
## 17094 female Not binge
## 17095 female Not binge
## 17096 female Not binge
# Table of results
tally(drink ~ gender, margins=TRUE, data=drink)
## gender
## drink female male
## Not binge 8254 5528
## binge 1690 1624
## Total 9944 7152
# Plot results (aes() uses the raw 0/1 vectors male and binge, not the factors in drink)
ggplot(drink, aes(x=male, y=binge)) +
  geom_point(shape=1, alpha=.008, position=position_jitter(width=.05, height=.05)) +
  stat_smooth(method="glm", method.args=list(family="binomial"), se=FALSE)
# Set parameters of our binomial distribution
N <- 17096
X <- 3314
# Set range of values to look for max likelihood estimate
p <- seq(.001,1-.001,by=.001)
# Calculate likelihoods of our data across the range of possible probabilities
L <- sapply(p, dbinom, size=N, x=X)
# Plot the likelihoods
plot(p, L, main="max(Likelihood): p=0.19385", type="l", xlab="Probability p", ylab="L(p)")
# Add a line at the maximum
abline(v=X/N,lty=2)
# Let's plot the log-likelihood
plot(p, log(L), main="max(Log-Likelihood): p=0.19385", type="l",
     xlab="Probability p", ylab="l(p)")
# Add a line at the maximum
abline(v=X/N,lty=2)
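# As a cross-check (a sketch, not part of the original analysis), optimize()
# can maximize the binomial log-likelihood numerically; the result should
# agree with the closed-form MLE X/N
mle <- optimize(function(p) dbinom(X, size=N, prob=p, log=TRUE),
                interval=c(.001, .999), maximum=TRUE)
mle$maximum   # should be close to X/N = 0.19385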
# Calculate for females
p.binge.female <- 1690/9944
odds.binge.female <- p.binge.female / (1 - p.binge.female)
ln.odds.b.f <- log(odds.binge.female)
# Calculate for males
p.binge.male <- 1624/7152
odds.binge.male <- p.binge.male / (1 - p.binge.male)
ln.odds.b.m <- log(odds.binge.male)
# Calculate odds ratio
odds.ratio <- odds.binge.male / odds.binge.female
# Paste results all together
cat("Females: prob =", round(p.binge.female, 5), "; odds =", round(odds.binge.female, 3),
"; ln(odds) =", round(ln.odds.b.f, 3), "\n ",
"males: prob =", round(p.binge.male, 5), "; odds =", round(odds.binge.male, 3),
"; ln(odds) =", round(ln.odds.b.m, 3), "\nodds ratio =",
round(odds.binge.male / odds.binge.female, 4), "\nrelative probability",
round(p.binge.male / p.binge.female, 4))
## Females: prob = 0.16995 ; odds = 0.205 ; ln(odds) = -1.586
## males: prob = 0.22707 ; odds = 0.294 ; ln(odds) = -1.225
## odds ratio = 1.4348
## relative probability 1.3361
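# The same odds ratio straight from the 2x2 counts (a quick arithmetic check)
(1624/5528) / (1690/8254)   # odds(male binge) / odds(female binge) = 1.4348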
# Fit model (notice glm and family=binomial)
drink.logmodel <- glm(drink ~ gender, data=drink, family=binomial)
# Summarize model
summary(drink.logmodel)
##
## Call:
## glm(formula = drink ~ gender, family = binomial, data = drink)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7177 -0.7177 -0.6104 -0.6104 1.8827
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.58597 0.02670 -59.400 <2e-16 ***
## gendermale 0.36104 0.03885 9.292 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16814 on 17095 degrees of freedom
## Residual deviance: 16728 on 17094 degrees of freedom
## AIC: 16732
##
## Number of Fisher Scoring iterations: 4
# Odds ratio (exponential function of coefficients)
# Notice the odds ratio matches what we calculated earlier
exp(coef(drink.logmodel))
## (Intercept) gendermale
## 0.2047492 1.4348145
# Create new dataset of 1 female and 1 male
gender <- c(0,1)
new <- data.frame(gender = factor(gender, labels=c("female", "male")))
# Predict the probability of binge drinking for the new data.frame
# Notice that these match the probabilities we calculated earlier
predict(drink.logmodel, newdata=new, type="response")
## 1 2
## 0.1699517 0.2270694
# Clear out memory
rm(list = ls())
# Load data
admit <- read.csv("http://www.bradthiessen.com/html5/data/admit.csv")
# Look at structure
str(admit)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
# Calculate means
sapply(admit, mean)
## admit gre gpa rank
## 0.3175 587.7000 3.3899 2.4850
# Convert rank to a factor
admit$rank <- factor(admit$rank)
# Panel function that draws histograms on the diagonal of the scatterplot matrix
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(admit,
cex = 1, pch = 1, bg = "steelblue",
diag.panel = panel.hist, cex.labels = 1, font.labels = 2)
# Fit logistic model with GRE as predictor
gre.model <- glm(admit ~ gre, family=binomial, data=admit)
# Summarize model
summary(gre.model)
##
## Call:
## glm(formula = admit ~ gre, family = binomial, data = admit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1623 -0.9052 -0.7547 1.3486 1.9879
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.901344 0.606038 -4.787 1.69e-06 ***
## gre 0.003582 0.000986 3.633 0.00028 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 486.06 on 398 degrees of freedom
## AIC: 490.06
##
## Number of Fisher Scoring iterations: 4
# Predicted probabilities from our model
admit$prob <- predict(gre.model, type="response")
# Calculate likelihood of model: Product of [P^y * (1-P)^(1-y)]
admit$like <- (admit$prob ^ admit$admit) * ((1 - admit$prob) ^ (1-admit$admit))
model.likelihood <- prod(admit$like)
# Calculate deviance: -2LL
-2 * log(model.likelihood)
## [1] 486.0561
# Fit null model with no predictors
null.model <- glm(admit ~ 1, family=binomial, data=admit)
# Predicted probabilities from our null model
admit$nullprob <- predict(null.model, type="response")
# Calculate likelihood of null model
admit$nulllike <- (admit$nullprob ^ admit$admit) * ((1 - admit$nullprob) ^ (1-admit$admit))
null.likelihood <- prod(admit$nulllike)
# Calculate deviance of model
-2 * log(null.likelihood)
## [1] 499.9765
Notice how those deviance values match the output from the glm() command.
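# A shortcut worth knowing (a sketch): logLik() extracts the log-likelihood
# directly, so -2*logLik reproduces the deviances computed above
-2 * as.numeric(logLik(gre.model))    # residual deviance, ~486.06
-2 * as.numeric(logLik(null.model))   # null deviance, ~499.98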
# Chi-square = difference in deviance (null model - GRE model)
modelChi <- gre.model$null.deviance - gre.model$deviance
# Degrees-of-freedom (parameters in null model - parameters in GRE model)
chidf <- gre.model$df.null - gre.model$df.residual
# Calculate p-value for that chi-square statistic
chisq.prob <- 1 - pchisq(modelChi, chidf)
# Paste results
paste("p-value for chi-square test of difference in deviance =", round(chisq.prob, 5))
## [1] "p-value for chi-square test of difference in deviance = 0.00019"
# Generate data.frame with GRE scores of 400 and 401
gre.new <- c(400, 401)
gre.new <- data.frame(gre = gre.new)
# Generate predicted log-odds
gre.new$log.odds <- -2.901344 + (0.003582 * gre.new$gre)
# Generate predicted odds
gre.new$odds <- exp(-2.901344 + (0.003582 * gre.new$gre))
# Calculate odds ratio for a 1-point GRE increase (this equals exp(slope))
gre.odds.ratio <- exp(-2.901344 + 0.003582) / exp(-2.901344)
# Generate predicted probabilities
gre.new$predicted <- predict(gre.model, newdata=gre.new, type="response")
# Paste results
cat("GRE of 401: log-odds =", round(gre.new$log.odds[gre.new$gre==401], 5), "; odds =",
round(gre.new$odds[gre.new$gre==401], 5), "; prob =",
round(gre.new$predicted[gre.new$gre==401], 5), "\nGRE of 400:",
"log-odds =", round(gre.new$log.odds[gre.new$gre==400], 5), "; odds =",
round(gre.new$odds[gre.new$gre==400], 5), "; prob =",
round(gre.new$predicted[gre.new$gre==400], 5), "\nOdds ratio:",
round(gre.odds.ratio, 6))
## GRE of 401: log-odds = -1.46496 ; odds = 0.23109 ; prob = 0.18772
## GRE of 400: log-odds = -1.46854 ; odds = 0.23026 ; prob = 0.18718
## Odds ratio: 1.003588
# We can get this odds ratio by exponentiating our logistic model
# Here I will get the odds ratio and 95% confidence intervals
# Notice we get the same odds ratio of 1.003588
exp(cbind(OR = coef(gre.model), confint(gre.model)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.0549493 0.01624471 0.1755632
## gre 1.0035886 1.00168137 1.0055682
# Create data.frame with various GRE scores
gre.new.2 <- c(200, 300, 400, 500, 600, 700, 800)
gre.new.2 <- data.frame(gre = gre.new.2)
# Generate predicted log-odds
gre.new.2$log.odds <- -2.901344 + (0.003582 * gre.new.2$gre)
# Generate predicted odds
gre.new.2$odds <- exp(-2.901344 + (0.003582 * gre.new.2$gre))
# Generate predicted probabilities
gre.new.2$predicted <- predict(gre.model, newdata=gre.new.2, type="response")
# Table of predicted odds and probabilities
gre.new.2 %>%
group_by(gre) %>%
summarize(predicted.Odds = round(mean(odds),3), Probability = round(mean(predicted),3))
## Source: local data frame [7 x 3]
##
## gre predicted.Odds Probability
## (dbl) (dbl) (dbl)
## 1 200 0.112 0.101
## 2 300 0.161 0.139
## 3 400 0.230 0.187
## 4 500 0.329 0.248
## 5 600 0.471 0.320
## 6 700 0.674 0.403
## 7 800 0.965 0.491
# Plot logistic curve through proportions
# Summarize proportion admitted by GRE score
admitprop <- admit %>%
group_by(gre) %>%
summarize(admitprop=mean(admit))
# Predict log-odds for these GRE scores (converted to probabilities when plotted)
admitprop$predicted <- predict(gre.model, admitprop)
# Plot proportions admitted by GRE score
plot(admitprop ~ gre, data=admitprop)
# Add a line for the predicted probabilities (inverse-logit of the log-odds)
lines(admitprop$gre, exp(admitprop$predicted)/(1+exp(admitprop$predicted)), col="red", lwd=3)
# Fit logistic model with all predictors
admit.model <- glm(admit ~ gre + gpa + rank, family=binomial, data=admit)
# Summarize model
summary(admit.model)
##
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = binomial, data = admit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6268 -0.8662 -0.6388 1.1490 2.0790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.989979 1.139951 -3.500 0.000465 ***
## gre 0.002264 0.001094 2.070 0.038465 *
## gpa 0.804038 0.331819 2.423 0.015388 *
## rank2 -0.675443 0.316490 -2.134 0.032829 *
## rank3 -1.340204 0.345306 -3.881 0.000104 ***
## rank4 -1.551464 0.417832 -3.713 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 458.52 on 394 degrees of freedom
## AIC: 470.52
##
## Number of Fisher Scoring iterations: 4
# Test improvement in deviance
# Compared to model with only gre as a predictor
modelChi <- gre.model$deviance - admit.model$deviance
chidf <- gre.model$df.residual - admit.model$df.residual
chisq.prob <- 1 - pchisq(modelChi, chidf)
paste("p-value for chi-square test of difference in deviance =", round(chisq.prob, 8))
## [1] "p-value for chi-square test of difference in deviance = 1.547e-05"
# Get the odds ratios and confidence intervals
exp(cbind(OR = coef(admit.model), confint(admit.model)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.0185001 0.001889165 0.1665354
## gre 1.0022670 1.000137602 1.0044457
## gpa 2.2345448 1.173858216 4.3238349
## rank2 0.5089310 0.272289674 0.9448343
## rank3 0.2617923 0.131641717 0.5115181
## rank4 0.2119375 0.090715546 0.4706961
# Create the colorful graph (lots of work)
newdata1 <- with(admit, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
newdata1$rankP <- predict(admit.model, newdata = newdata1, type = "response")
newdata2 <- with(admit, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))
newdata3 <- cbind(newdata2, predict(admit.model, newdata = newdata2, type = "link",
se = TRUE))
newdata3 <- within(newdata3, {
PredictedProb <- plogis(fit)
LL <- plogis(fit - (1.96 * se.fit))
UL <- plogis(fit + (1.96 * se.fit))
})
# view first few rows of final dataset
head(newdata3)
## gre gpa rank fit se.fit residual.scale UL
## 1 200.0000 3.3899 1 -0.8114870 0.5147714 1 0.5492064
## 2 206.0606 3.3899 1 -0.7977632 0.5090986 1 0.5498513
## 3 212.1212 3.3899 1 -0.7840394 0.5034491 1 0.5505074
## 4 218.1818 3.3899 1 -0.7703156 0.4978239 1 0.5511750
## 5 224.2424 3.3899 1 -0.7565919 0.4922237 1 0.5518545
## 6 230.3030 3.3899 1 -0.7428681 0.4866494 1 0.5525464
## LL PredictedProb
## 1 0.1393812 0.3075737
## 2 0.1423880 0.3105042
## 3 0.1454429 0.3134499
## 4 0.1485460 0.3164108
## 5 0.1516973 0.3193867
## 6 0.1548966 0.3223773
# Create the plot
ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,
ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank),
size = 1)
# Test rank of undergraduate institution using Wald statistic
# Requires library(aod)
# Compare terms 4-6 in our logistic model vs. a null model
wald.test(b = coef(admit.model), Sigma = vcov(admit.model), Terms = 4:6)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 20.9, df = 3, P(> X2) = 0.00011
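# A likelihood-ratio alternative to the Wald test (a sketch: refit the model
# without rank and compare deviances)
no.rank.model <- glm(admit ~ gre + gpa, family=binomial, data=admit)
anova(no.rank.model, admit.model, test="Chisq")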
# Compare terms 4 vs 5 (rank 2 vs rank 3)
# Set-up a contrast (like we did with the Scheffe method)
l <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(admit.model), Sigma = vcov(admit.model), L = l)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 5.5, df = 1, P(> X2) = 0.019
# Load data from MASS library
# library(MASS)
data(biopsy)
# Examine structure
str(biopsy)
## 'data.frame': 699 obs. of 11 variables:
## $ ID : chr "1000025" "1002945" "1015425" "1016277" ...
## $ V1 : int 5 5 3 6 4 8 1 2 2 4 ...
## $ V2 : int 1 4 1 8 1 10 1 1 1 2 ...
## $ V3 : int 1 4 1 8 1 10 1 2 1 1 ...
## $ V4 : int 1 5 1 1 3 8 1 1 1 1 ...
## $ V5 : int 2 7 2 3 2 7 2 2 2 2 ...
## $ V6 : int 1 10 2 4 1 10 10 1 1 1 ...
## $ V7 : int 3 3 3 3 3 9 3 3 1 2 ...
## $ V8 : int 1 2 1 7 1 7 1 1 1 1 ...
## $ V9 : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Eliminate ID variable
biopsy <- biopsy[, -1]
# Name variables
names(biopsy) <- c("thick", "u.size", "u.shape", "adhsn", "s.size", "nucl", "chrom",
"n.nuc", "mit", "class")
# Eliminate cases with missing values
biopsy2 <- na.omit(biopsy)
head(biopsy2)
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit class
## 1 5 1 1 1 2 1 3 1 1 benign
## 2 5 4 4 5 7 10 3 2 1 benign
## 3 3 1 1 1 2 2 3 1 1 benign
## 4 6 8 8 1 3 4 3 7 1 benign
## 5 4 1 1 3 2 1 3 1 1 benign
## 6 8 10 10 8 7 10 9 7 1 malignant
# Construct boxplots of predictor variables by each value of our dependent variable
# Convert to long format (so the boxplots can be faceted by variable)
biop.m <- melt(biopsy2, id.var="class")
# Construct the boxplots
ggplot(data=biop.m, aes(x=class, y=value)) +
geom_boxplot() +
facet_wrap(~variable, ncol=3)
# Nucl looks important; mit does not
# Let's look at correlations
# library(corrplot)   # already loaded above
bc = cor(biopsy2[, 1:9])
corrplot.mixed(bc)
# Collinearity problem?
# Check VIF
# library(car)
# Fit full model
full.model <- glm(class ~ ., family=binomial, data=biopsy2)
# Calculate VIF
round(vif(full.model),3)
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit
## 1.188 2.819 2.763 1.186 1.347 1.142 1.215 1.220 1.042
# Create training and test datasets
# Set random number seed
set.seed(71930)
# Create random labels: ~70% 1s and ~30% 2s
ind <- sample(2, nrow(biopsy2), replace=TRUE, prob=c(.7, .3))
# Training dataset = 1; test = 2
train <- biopsy2[ind==1, ]
test <- biopsy2[ind==2, ]
# See if we have a relatively equal occurrence of cancer in each set
tally(~class, data=train, format="prop")
##
## benign malignant
## 0.6582278 0.3417722
tally(~class, data=test, format="prop")
##
## benign malignant
## 0.6315789 0.3684211
# Fit logistic model with all predictors
full.model <- glm(class ~ ., family=binomial, data=train)
summary(full.model)
##
## Call:
## glm(formula = class ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4692 -0.0778 -0.0358 0.0045 2.2562
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.07441 1.97761 -6.106 1.02e-09 ***
## thick 0.75495 0.22554 3.347 0.000816 ***
## u.size -0.26684 0.28954 -0.922 0.356730
## u.shape 0.22282 0.28379 0.785 0.432360
## adhsn 0.58666 0.20702 2.834 0.004600 **
## s.size -0.06207 0.23246 -0.267 0.789455
## nucl 0.52526 0.14448 3.635 0.000277 ***
## chrom 0.71157 0.23284 3.056 0.002243 **
## n.nuc 0.44724 0.17153 2.607 0.009125 **
## mit 0.44050 0.41143 1.071 0.284318
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 608.809 on 473 degrees of freedom
## Residual deviance: 55.022 on 464 degrees of freedom
## AIC: 75.022
##
## Number of Fisher Scoring iterations: 9
# Chi-square test for improvement in deviance
modelChi <- full.model$null.deviance - full.model$deviance
chidf <- full.model$df.null - full.model$df.residual
chisq.prob <- 1 - pchisq(modelChi, chidf)
chisq.prob   # effectively zero (below machine precision)
## [1] 0
# Odds ratios and confidence intervals
exp(cbind(OR = coef(full.model), confint(full.model)))
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## OR 2.5 % 97.5 %
## (Intercept) 5.703626e-06 5.036254e-08 0.0001420459
## thick 2.127503e+00 1.434655e+00 3.5320359815
## u.size 7.657944e-01 4.385016e-01 1.4097698097
## u.shape 1.249599e+00 6.721431e-01 2.1223295619
## adhsn 1.797980e+00 1.210043e+00 2.8107870239
## s.size 9.398173e-01 5.959236e-01 1.5117183251
## nucl 1.690893e+00 1.304239e+00 2.3273236618
## chrom 2.037192e+00 1.335426e+00 3.3920793079
## n.nuc 1.563996e+00 1.124172e+00 2.2327291779
## mit 1.553485e+00 8.836812e-01 3.5117318200
# See how it fits our training dataset
train$probs <- predict(full.model, type="response")
train$predict <- rep("benign", nrow(train))
train$predict[train$probs > 0.50] <- "malignant"
table(train$predict, train$class)
##
## benign malignant
## benign 306 4
## malignant 6 158
# Let's get the overall % correct
mean(train$predict == train$class)
## [1] 0.978903
# Let's try it on the test dataset
test$probs <- predict(full.model, newdata=test, type="response")
test$predict <- rep("benign", nrow(test))
test$predict[test$probs > 0.50] <- "malignant"
table(test$predict, test$class)
##
## benign malignant
## benign 127 4
## malignant 5 73
mean(test$predict == test$class)
## [1] 0.9569378
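# Beyond overall accuracy, the confusion matrix gives sensitivity and
# specificity (a quick sketch using the counts tabled above)
tab <- table(test$predict, test$class)
tab["malignant", "malignant"] / sum(tab[, "malignant"])   # sensitivity: 73/77
tab["benign", "benign"] / sum(tab[, "benign"])            # specificity: 127/132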
# Let's try cross-validation
# library(bestglm)
# Recode our outcome variable as 0-1
train$y <- rep(0, nrow(train))
train$y[train$class == "malignant"] <- 1
# Could have used this code instead of the previous 2 lines
# train$y <- as.numeric(train$class)-1
# Remove all extraneous variables and make sure
# outcome variable is in last column
biopsy.cv <- train[ , -10:-12]
head(biopsy.cv)
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit y
## 1 5 1 1 1 2 1 3 1 1 0
## 4 6 8 8 1 3 4 3 7 1 0
## 5 4 1 1 3 2 1 3 1 1 0
## 6 8 10 10 8 7 10 9 7 1 1
## 7 1 1 1 1 2 10 3 1 1 0
## 10 4 2 1 1 2 1 2 1 1 0
# Find best model
bestglm(Xy = biopsy.cv, IC="CV", CVArgs = list(Method="HTF", K=10, REP=1), family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## CV(K = 10, REP = 1)
## BICq equivalent for q in (0.0421013207411016, 0.311136342210368)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.6047190 1.4089470 -7.526698 5.203948e-14
## thick 0.7600489 0.1645563 4.618776 3.860096e-06
## nucl 0.5420547 0.1166692 4.646082 3.382978e-06
## chrom 0.7864741 0.1933391 4.067847 4.744949e-05
## n.nuc 0.4006591 0.1256526 3.188625 1.429511e-03
# Fit that reduced model to our training dataset
reduce.model <- glm(class ~ thick + nucl + chrom + n.nuc, family=binomial, data=train)
# Test fit
train$cv.probs <- predict(reduce.model, type="response")
train$cv.predict <- rep("benign", nrow(train))
train$cv.predict[train$cv.probs > 0.50] <- "malignant"
table(train$cv.predict, train$class)
##
## benign malignant
## benign 306 7
## malignant 6 155
# Let's get the overall % correct
mean(train$cv.predict == train$class)
## [1] 0.9725738
# Test reduced model on new data
test$cv.probs <- predict(reduce.model, newdata=test, type="response")
test$cv.predict <- rep("benign", nrow(test))
test$cv.predict[test$cv.probs > 0.50] <- "malignant"
table(test$cv.predict, test$class)
##
## benign malignant
## benign 127 6
## malignant 5 71
mean(test$cv.predict == test$class)
## [1] 0.9473684
# That didn't really improve anything
# Let's try again with BIC as our criterion
bestglm(Xy = biopsy.cv, IC="BIC", family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## BIC
## BICq equivalent for q in (0.311136342210368, 0.896220081100116)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.5782399 1.7150431 -6.750991 1.468389e-11
## thick 0.8101726 0.1836012 4.412675 1.021013e-05
## adhsn 0.5290541 0.1932962 2.737012 6.200000e-03
## nucl 0.5007588 0.1269485 3.944582 7.993917e-05
## chrom 0.6695838 0.2124473 3.151764 1.622873e-03
## n.nuc 0.4021677 0.1288032 3.122341 1.794187e-03
BIC.model <- glm(class ~ thick + adhsn + nucl + chrom + n.nuc, family=binomial, data=train)
train$BIC.probs <- predict(BIC.model, type="response")
train$BIC.predict <- rep("benign", nrow(train))
train$BIC.predict[train$BIC.probs > 0.50] <- "malignant"
table(train$BIC.predict, train$class)
##
## benign malignant
## benign 306 6
## malignant 6 156
# Let's get the overall % correct
mean(train$BIC.predict == train$class)
## [1] 0.9746835
# Test reduced model on new data
test$BIC.probs <- predict(BIC.model, newdata=test, type="response")
test$BIC.predict <- rep("benign", nrow(test))
test$BIC.predict[test$BIC.probs > 0.50] <- "malignant"
table(test$BIC.predict, test$class)
##
## benign malignant
## benign 127 4
## malignant 5 73
mean(test$BIC.predict == test$class)
## [1] 0.9569378
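# If the pROC package is available (an assumption; it is not loaded above),
# an ROC curve summarizes classification across all cutoffs, not just 0.50
library(pROC)
roc.obj <- roc(test$class, test$BIC.probs)
auc(roc.obj)    # area under the ROC curve
plot(roc.obj)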
# From here, we could turn to discriminant analysis to see whether classification improves
# Clear out memory
rm(list = ls())
# Load data
toenail <- read.csv("http://www.bradthiessen.com/html5/data/toenail.csv")
# Plot proportions over time
toenailprop <- toenail %>%
group_by(treatment, visit) %>%
summarize(propinfect=mean(outcome), meanmonth=mean(month))
ggplot(toenailprop, aes(meanmonth, propinfect)) +
geom_point() + geom_line(size=2) +
facet_grid(. ~ treatment)
## Fit model
toenail.model <- glm(outcome ~ treatment*month, data=toenail, family="binomial")
summary(toenail.model)
##
## Call:
## glm(formula = outcome ~ treatment * month, family = "binomial",
## data = toenail)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9519 -0.7892 -0.5080 -0.2532 2.7339
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5566273 0.1089628 -5.108 3.25e-07 ***
## treatmentTerbinafine -0.0005817 0.1561463 -0.004 0.9970
## month -0.1703078 0.0236199 -7.210 5.58e-13 ***
## treatmentTerbinafine:month -0.0672216 0.0375235 -1.791 0.0732 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1980.5 on 1907 degrees of freedom
## Residual deviance: 1816.0 on 1904 degrees of freedom
## AIC: 1824
##
## Number of Fisher Scoring iterations: 5
## CIs using profiled log-likelihood
confint(toenail.model)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.7719213 -0.344475861
## treatmentTerbinafine -0.3067395 0.305615999
## month -0.2182556 -0.125489310
## treatmentTerbinafine:month -0.1419292 0.005563446
## CIs using standard errors
confint.default(toenail.model)
## 2.5 % 97.5 %
## (Intercept) -0.7701903 -0.343064177
## treatmentTerbinafine -0.3066229 0.305459549
## month -0.2166020 -0.124013574
## treatmentTerbinafine:month -0.1407662 0.006323002
## odds ratios only
exp(coef(toenail.model))
## (Intercept) treatmentTerbinafine
## 0.5731389 0.9994185
## month treatmentTerbinafine:month
## 0.8434052 0.9349880
## odds ratios and 95% CI
exp(cbind(OR = coef(toenail.model), confint(toenail.model)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.5731389 0.4621244 0.7085917
## treatmentTerbinafine 0.9994185 0.7358422 1.3574609
## month 0.8434052 0.8039199 0.8820652
## treatmentTerbinafine:month 0.9349880 0.8676827 1.0055790
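# A small sketch: the predicted probability of infection for a patient on
# Terbinafine at month 6 (the level name is taken from the coefficients above)
predict(toenail.model, newdata=data.frame(treatment="Terbinafine", month=6),
        type="response")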
# Plot logistic curves through proportions
# I need to rename a variable back to "month"
toenailprop$month <- toenailprop$meanmonth
toenailprop$predicted <- predict(toenail.model, toenailprop)
ggplot(toenailprop, aes(month, propinfect)) +
geom_point() +
geom_line(aes(month, (exp(predicted)/(1+exp(predicted)))),
size=2, col="red", linetype="dashed") +
geom_line(aes(month, propinfect), size=2, col="black") +
facet_grid(. ~ treatment)
# Let's load a dataset on the mating of elephants
# Mating is a rare event that may follow a Poisson distribution.
library(gpk)
data(elephant)
head(elephant)
## Age_in_Years Number_of_Matings
## 1 27 0
## 2 28 1
## 3 28 1
## 4 28 1
## 5 28 3
## 6 29 0
tail(elephant)
## Age_in_Years Number_of_Matings
## 36 43 9
## 37 44 3
## 38 45 5
## 39 47 7
## 40 48 2
## 41 52 9
elephant$Age <- elephant$Age_in_Years
elephant$Matings <- elephant$Number_of_Matings
## Scatterplot
elephant %>%
  ggvis(x=~Age, y=~jitter(Matings), fill := "steelblue", stroke := "steelblue",
        fillOpacity := .7, strokeOpacity := .8) %>%
layer_points() %>%
add_axis("x", title = "Age") %>%
scale_numeric("x", domain=c(25, 55), nice=FALSE, clamp=TRUE) %>%
add_axis("y", title = "Matings") %>%
scale_numeric("y", domain=c(0, 10), nice=FALSE, clamp=TRUE)
# It looks like the number of matings tends to be higher
# for older elephants, AND there seems to be more variability
# in the number of matings as age increases.
mean(elephant$Matings)
## [1] 2.682927
var(elephant$Matings)
## [1] 5.071951
mean(elephant$Matings)/var(elephant$Matings)
## [1] 0.5289733
mean(elephant$Age)
## [1] 35.85366
var(elephant$Age)
## [1] 43.27805
eg <- elephant %>%
group_by(Age) %>%
summarize(mean=mean(Matings), var=var(Matings))
plot(log(eg$mean), log(eg$var), xlab="log(Mean matings)", ylab="log(Variance of matings)",
main="Figure 4.1. Mean-Variance Relationship")
mtext("matings by age", padj=-0.5)
abline(0,1)
## Null model
null.model <- glm(Matings ~ 1, family=poisson, data=elephant)
summary(null.model)
##
## Call:
## glm(formula = Matings ~ 1, family = poisson, data = elephant)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3164 -1.1799 -0.4368 0.1899 3.0252
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.98691 0.09535 10.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 75.372 on 40 degrees of freedom
## Residual deviance: 75.372 on 40 degrees of freedom
## AIC: 178.82
##
## Number of Fisher Scoring iterations: 5
exp(coef(null.model))
## (Intercept)
## 2.682927
mean(elephant$Matings)
## [1] 2.682927
## Show predictions
elephant$null.pred <- exp(predict(null.model, elephant))
plot(elephant$Age, jitter(elephant$Matings))
lines(elephant$Age, elephant$null.pred, lwd=3, col=1)
## Deviance
deviance(null.model)
## [1] 75.37174
# Note: this computes P(chi-square < 0.95) on 40 df, which is essentially zero;
# a conventional goodness-of-fit p-value would instead use the deviance,
# e.g., 1 - pchisq(deviance(null.model), df.residual(null.model))
pchisq(0.95, df.residual(null.model))
## [1] 8.940942e-26
## Model with age
model <- glm(Matings ~ Age, family=poisson, data=elephant)
summary(model)
##
## Call:
## glm(formula = Matings ~ Age, family = poisson, data = elephant)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.80798 -0.86137 -0.08629 0.60087 2.17777
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.58201 0.54462 -2.905 0.00368 **
## Age 0.06869 0.01375 4.997 5.81e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 75.372 on 40 degrees of freedom
## Residual deviance: 51.012 on 39 degrees of freedom
## AIC: 156.46
##
## Number of Fisher Scoring iterations: 5
exp(coef(model))
## (Intercept) Age
## 0.2055619 1.0711071
## Predict matings from age
elephant$predicted <- exp(predict(model, elephant))
lines(elephant$Age, elephant$predicted, lwd=3, col=2)
## Prediction for 30 year-old elephant
exp(coef(model)[1]+coef(model)[2]*30)
## (Intercept)
## 1.614098
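# The same prediction via predict() (a cross-check of the hand calculation)
predict(model, newdata=data.frame(Age=30), type="response")   # ~1.614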
## Test model fit
deviance(null.model)
## [1] 75.37174
deviance(model)
## [1] 51.01163
pchisq(0.95, df.residual(model))   # same caveat as above about pchisq(0.95, df)
## [1] 5.841306e-25
1-pchisq(deviance(null.model)-deviance(model), df.residual(null.model)-df.residual(model))
## [1] 7.99062e-07
# Since the p-value is small, there is evidence that adding age
# explains a significant amount of additional deviance
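# The same test in one call (an equivalent shortcut)
anova(null.model, model, test="Chisq")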
# Load data
absences <- read.csv("http://www.bradthiessen.com/html5/data/absences.csv")
absences$prog <- factor(absences$prog, levels = 1:3,
labels = c("General", "Academic", "Vocational"))
head(absences)
## id gender math daysabs prog
## 1 1001 male 63 4 Academic
## 2 1002 male 27 4 Academic
## 3 1003 female 20 2 Academic
## 4 1004 female 16 3 Academic
## 5 1005 female 2 3 Academic
## 6 1006 female 71 13 Academic
tail(absences)
## id gender math daysabs prog
## 309 2152 female 46 1 Vocational
## 310 2153 male 26 1 Academic
## 311 2154 female 79 3 Vocational
## 312 2155 female 59 0 Academic
## 313 2156 female 90 0 Vocational
## 314 2157 female 77 2 Vocational
# Summarize
summary(absences)
## id gender math daysabs
## Min. :1001 female:160 Min. : 1.00 Min. : 0.000
## 1st Qu.:1079 male :154 1st Qu.:28.00 1st Qu.: 1.000
## Median :1158 Median :48.00 Median : 4.000
## Mean :1576 Mean :48.27 Mean : 5.955
## 3rd Qu.:2078 3rd Qu.:70.00 3rd Qu.: 8.000
## Max. :2157 Max. :99.00 Max. :35.000
## prog
## General : 40
## Academic :167
## Vocational:107
##
##
##
mean(absences$daysabs)
## [1] 5.955414
var(absences$daysabs)
## [1] 49.51877
ggplot(absences, aes(daysabs, fill = prog)) +
geom_histogram(binwidth = 1, alpha=.5) +
facet_grid(prog ~ ., margins = TRUE, scales = "free")
plot(absences$math, absences$daysabs)
# Let's try a model with all the predictors
pois.model <- glm(daysabs ~ gender + math + prog, family=poisson, data=absences)
summary(pois.model)
##
## Call:
## glm(formula = daysabs ~ gender + math + prog, family = poisson,
## data = absences)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2218 -2.1892 -0.9238 0.7218 7.0048
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7594786 0.0637731 43.270 < 2e-16 ***
## gendermale -0.2424762 0.0467765 -5.184 2.18e-07 ***
## math -0.0069561 0.0009354 -7.437 1.03e-13 ***
## progAcademic -0.4260327 0.0567308 -7.510 5.92e-14 ***
## progVocational -1.2707199 0.0779143 -16.309 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2217.7 on 313 degrees of freedom
## Residual deviance: 1746.8 on 309 degrees of freedom
## AIC: 2640.2
##
## Number of Fisher Scoring iterations: 5
exp(coef(pois.model))
## (Intercept) gendermale math progAcademic progVocational
## 15.7916072 0.7846824 0.9930680 0.6530950 0.2806295
## Fit is terrible
## We're assuming mean = variance
mean(daysabs ~ prog, data=absences)
## General Academic Vocational
## 10.650000 6.934132 2.672897
var(daysabs ~ prog, data=absences)
## General Academic Vocational
## 67.25897 55.44744 13.93916
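# A quick sketch of the overdispersion: the Pearson chi-square divided by its
# residual degrees of freedom should be near 1 if the Poisson assumption holds
sum(residuals(pois.model, type="pearson")^2) / df.residual(pois.model)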
## What can we do with this overdispersion?
## We could try a negative binomial regression
## It has an extra parameter to model overdispersion
# library(MASS)   # already loaded above
neg.bin.model <- glm.nb(daysabs ~ gender + math + prog, data=absences)
summary(neg.bin.model)
##
## Call:
## glm.nb(formula = daysabs ~ gender + math + prog, data = absences,
## init.theta = 1.047288915, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1567 -1.0761 -0.3810 0.2856 2.7235
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.707484 0.204275 13.254 < 2e-16 ***
## gendermale -0.211086 0.121989 -1.730 0.0836 .
## math -0.006236 0.002492 -2.502 0.0124 *
## progAcademic -0.424540 0.181725 -2.336 0.0195 *
## progVocational -1.252615 0.199699 -6.273 3.55e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.0473) family taken to be 1)
##
## Null deviance: 431.67 on 313 degrees of freedom
## Residual deviance: 358.87 on 309 degrees of freedom
## AIC: 1740.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.047
## Std. Err.: 0.108
##
## 2 x log-likelihood: -1728.307
exp(coef(neg.bin.model))
## (Intercept) gendermale math progAcademic progVocational
## 14.9915148 0.8097046 0.9937839 0.6540708 0.2857565
1-pchisq(deviance(neg.bin.model),
df.residual(neg.bin.model))
## [1] 0.02656535
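# A side-by-side AIC comparison of the two count models (lower is better;
# a quick sketch assuming pois.model is still in memory)
AIC(pois.model, neg.bin.model)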
neg.bin.no.prog <- update(neg.bin.model, . ~ . - prog)
anova(neg.bin.model, neg.bin.no.prog)
## Likelihood ratio tests of Negative Binomial Models
##
## Response: daysabs
## Model theta Resid. df 2 x log-lik. Test df
## 1 gender + math 0.8705939 311 -1772.074
## 2 gender + math + prog 1.0472889 309 -1728.307 1 vs 2 2
## LR stat. Pr(Chi)
## 1
## 2 43.76737 3.133546e-10
# Drop the non-significant gender term and refit
neg.bin.model <- glm.nb(daysabs ~ math + prog, data=absences)
newdata1 <- data.frame(math = mean(absences$math), prog = factor(1:3, levels = 1:3,
labels = levels(absences$prog)))
newdata1$phat <- predict(neg.bin.model, newdata1, type = "response")
newdata1
## math prog phat
## 1 48.26752 General 10.236899
## 2 48.26752 Academic 6.587927
## 3 48.26752 Vocational 2.850083
newdata2 <- data.frame(
math = rep(seq(from = min(absences$math), to = max(absences$math), length.out = 100), 3),
prog = factor(rep(1:3, each = 100), levels = 1:3, labels =
levels(absences$prog)))
newdata2 <- cbind(newdata2, predict(neg.bin.model, newdata2, type = "link", se.fit=TRUE))
newdata2 <- within(newdata2, {
DaysAbsent <- exp(fit)
LL <- exp(fit - 1.96 * se.fit)
UL <- exp(fit + 1.96 * se.fit)
})
ggplot(newdata2, aes(math, DaysAbsent)) +
geom_ribbon(aes(ymin = LL, ymax = UL, fill = prog), alpha = .25) +
geom_line(aes(colour = prog), size = 2) +
labs(x = "Math Score", y = "Predicted Days Absent")