Chapter 5 Logistic Regression

5.0.1 Simple Example

Generating data

X=runif(100000,0,100)
g=-3+.06 * X
Pi=(1/(1+exp(-g)))
U=runif(100000)
Y=(U<Pi) * 1

Estimating model coefficients

model=glm(Y~X,family=binomial)
summary(model)

## 
## Call:
## glm(formula = Y ~ X, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4713  -0.6868  -0.3107   0.6862   2.4725  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.0096470  0.0198147  -151.9   <2e-16 ***
## X            0.0601529  0.0003612   166.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138629  on 99999  degrees of freedom
## Residual deviance:  93059  on 99998  degrees of freedom
## AIC: 93063
## 
## Number of Fisher Scoring iterations: 4

MLE for beta

betahat=coef(model)
betahat

## (Intercept)           X 
## -3.00964698  0.06015287

New Data

new.data=data.frame(X=1:100)

# Estimating g for New Data
ghat=cbind(1,1:100)%*%betahat
plot(1:100,ghat,type="l",col="red")

ghat=predict(model,newdata=new.data)
plot(1:100,ghat,type="l",col="red")

#Estimating Pi for New Data
Pihat = 1/(1+exp(-ghat))
plot(1:100,Pihat,type="l",col="red")

Pihat = predict(model,newdata=new.data,type="response")
plot(1:100,Pihat,type="l",col="red")

5.0.2 Plots

Y vs. X (Not very useful)

plot(X,Y)
lines(1:100,Pihat,col="red")

#Grouplot of Pihat vs X
source("http://faculty.tarleton.edu/crawford/documents/math505/LogisticRegressionFunctions.txt")

groupplot.results=groupplot(X,Y,20)
plot(groupplot.results$x,groupplot.results$Pi,
     xlab="X",
     ylab="Pihat")

lines(1:100,Pihat,col="red")

Comments The data set has been broken into 20 groups based on values of X

For each point in the plot, the x-coordinate is the average value of X in that group.

For each point in the plot, the y-coordinate is the proportion of observations in that group where Y=1. In other words, the y-coordinates provide an estimate of Pi = P(Y=1) for each group.

5.0.3 Plot of ghat vs X

group.Pihat=groupplot.results$Pi
group.ghat=log(group.Pihat/(1-group.Pihat))

plot(groupplot.results$x,group.ghat)
lines(1:100,ghat,col="red")

plot(groupplot.results$x,groupplot.results$g,
     xlab="X",
     ylab="Logit")
lines(1:100,ghat,col="red")

#Boxplot of X vs. Y
plot(as.factor(Y),X)

5.0.4 An Example Involving Curvature

set.seed(5366)
X=runif(100000,0,100)
g=-3+.05524*X -.002333*X^2 + .00002381*X^3
Pi=(1/(1+exp(-g)))
U=runif(100000)
Y=(U<Pi) * 1


#Degree 1 model Pihat vs X
model1=glm(Y~X,family=binomial)
summary(model1)

## 
## Call:
## glm(formula = Y ~ X, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3886  -0.5858  -0.2904  -0.1404   3.1861  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.0709619  0.0355466  -142.7   <2e-16 ***
## X            0.0555489  0.0004697   118.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92006  on 99999  degrees of freedom
## Residual deviance: 69547  on 99998  degrees of freedom
## AIC: 69551
## 
## Number of Fisher Scoring iterations: 6

groupplot.results=groupplot(X,Y,20)
plot(groupplot.results$x,groupplot.results$Pi,
     xlab="X",
     ylab="Pihat")

x.plot.vector=1:100
Pi.plot.vector=predict(model1,
                       newdata=data.frame(X=x.plot.vector),
                       type="response")
lines(x.plot.vector,
      Pi.plot.vector,
      col="red")

5.0.5 Degree 1 model Logit (ghat) vs X

plot(groupplot.results$x,groupplot.results$g,
     xlab="X",
     ylab="Logit")
x.plot.vector=1:100
g.plot.vector=predict(model1,
                      newdata=data.frame(X=x.plot.vector))
lines(x.plot.vector,
      g.plot.vector,
      col="red")

5.0.6 Degree 2 model Pihat vs X

model2=glm(Y~X+I(X^2),family=binomial)
summary(model2)

## 
## Call:
## glm(formula = Y ~ X + I(X^2), family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0622  -0.4332  -0.2843  -0.2347   2.6938  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.797e+00  3.373e-02  -53.27   <2e-16 ***
## X           -9.966e-02  1.568e-03  -63.56   <2e-16 ***
## I(X^2)       1.376e-03  1.501e-05   91.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92006  on 99999  degrees of freedom
## Residual deviance: 60972  on 99997  degrees of freedom
## AIC: 60978
## 
## Number of Fisher Scoring iterations: 5

groupplot.results=groupplot(X,Y,20)
plot(groupplot.results$x,groupplot.results$Pi,
     xlab="X",
     ylab="Pihat")

x.plot.vector=1:100
Pi.plot.vector=predict(model2,
                       newdata=data.frame(X=x.plot.vector),
                       type="response")
lines(x.plot.vector,
      Pi.plot.vector,
      col="red")

5.0.7 Degree 2 model Logit (ghat) vs X

plot(groupplot.results$x,groupplot.results$g,
     xlab="X",
     ylab="Logit")
x.plot.vector=1:100
g.plot.vector=predict(model2,
                      newdata=data.frame(X=x.plot.vector))
lines(x.plot.vector,
      g.plot.vector,
      col="red")

5.0.8 Degree 3 model Pihat vs X

model3=glm(Y~X+I(X^2)+I(X^3),family=binomial)
summary(model3)

## 
## Call:
## glm(formula = Y ~ X + I(X^2) + I(X^3), family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4714  -0.3712  -0.3361  -0.3006   2.5057  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.026e+00  5.438e-02  -55.65   <2e-16 ***
## X            5.595e-02  4.523e-03   12.37   <2e-16 ***
## I(X^2)      -2.336e-03  9.953e-05  -23.48   <2e-16 ***
## I(X^3)       2.380e-05  6.321e-07   37.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92006  on 99999  degrees of freedom
## Residual deviance: 59457  on 99996  degrees of freedom
## AIC: 59465
## 
## Number of Fisher Scoring iterations: 5

groupplot.results=groupplot(X,Y,20)
plot(groupplot.results$x,groupplot.results$Pi,
     xlab="X",
     ylab="Pihat")

x.plot.vector=1:100
Pi.plot.vector=predict(model3,
                       newdata=data.frame(X=x.plot.vector),
                       type="response")
lines(x.plot.vector,
      Pi.plot.vector,
      col="red")

5.0.9 Degree 3 model Logit (ghat) vs X

plot(groupplot.results$x,groupplot.results$g,
     xlab="X",
     ylab="Logit")
x.plot.vector=1:100
g.plot.vector=predict(model3,
                      newdata=data.frame(X=x.plot.vector))
lines(x.plot.vector,
      g.plot.vector,
      col="red")

5.0.10 Degree 4 model Pihat vs X

model4=glm(Y~X+I(X^2)+I(X^3)+I(X^4),family=binomial)
summary(model4)

## 
## Call:
## glm(formula = Y ~ X + I(X^2) + I(X^3) + I(X^4), family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4759  -0.3704  -0.3365  -0.3013   2.5035  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.012e+00  6.906e-02 -43.615  < 2e-16 ***
## X            5.330e-02  9.144e-03   5.829 5.58e-09 ***
## I(X^2)      -2.221e-03  3.595e-04  -6.179 6.44e-10 ***
## I(X^3)       2.207e-05  5.239e-06   4.212 2.53e-05 ***
## I(X^4)       8.449e-09  2.534e-08   0.333    0.739    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92006  on 99999  degrees of freedom
## Residual deviance: 59457  on 99995  degrees of freedom
## AIC: 59467
## 
## Number of Fisher Scoring iterations: 5

groupplot.results=groupplot(X,Y,20)
plot(groupplot.results$x,groupplot.results$Pi,
     xlab="X",
     ylab="Pihat")

x.plot.vector=1:100
Pi.plot.vector=predict(model4,
                       newdata=data.frame(X=x.plot.vector),
                       type="response")
lines(x.plot.vector,
      Pi.plot.vector,
      col="red")

5.0.11 Degree 4 model Logit (ghat) vs X

plot(groupplot.results$x,groupplot.results$g,
     xlab="X",
     ylab="Logit")
x.plot.vector=1:100
g.plot.vector=predict(model4,
                      newdata=data.frame(X=x.plot.vector))
lines(x.plot.vector,
      g.plot.vector,
      col="red")

5.1 Testing model1 vs model3

G=model1$deviance-model3$deviance
pchisq(G,lower.tail=FALSE,df=model1$df.residual-model2$df.residual)

## [1] 0

LRtest(model1,model3)   #This is a function from LogisticRegressionFunctions.txt

## [1] 0

LRtest(model3,model4)

## [1] 0.7388044

5.2 A Multivariate Example

set.seed(5366)
Rank=rnorm(200,.67,.1)
SAT=round(rnorm(200,980,132),-1)
ShoeSize=round(rnorm(200,9,1),1)
U=runif(200,0,1)
Mother=1:200


for(i in 1:200){
  if(U[i]<.5){Mother[i]="SomeCollege"}
  if((U[i]>.5)&(U[i]<.8)){Mother[i]="Bachelors"}
  if((U[i]>.8)&(U[i]<.95)){Mother[i]="HighSchool"}
  if(U[i]>0.95){Mother[i]="GradDegree"}
}


Mother=as.factor(Mother)
Mother=relevel(Mother,4)


tempmodel=lm(1:200~Rank+SAT+Mother)
X=model.matrix(tempmodel)
beta=c(-9.3,10,.00364,1.5,1.5,0)
g=X%*%beta
Pi=1/(1+exp(-g))
U2=runif(200,0,1)
Retention=(U2<Pi)*1
Retention.data=data.frame(Rank,SAT,ShoeSize,Mother)

#Fitting a model
model=glm(Retention~.,data=Retention.data,family=binomial)
summary(model)

## 
## Call:
## glm(formula = Retention ~ ., family = binomial, data = Retention.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7118   0.1951   0.3948   0.6374   1.3498  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -8.293975   3.095703  -2.679 0.007380 ** 
## Rank             10.277291   2.467877   4.164 3.12e-05 ***
## SAT               0.004871   0.001576   3.090 0.002002 ** 
## ShoeSize         -0.233967   0.207032  -1.130 0.258435    
## MotherBachelors   1.927406   0.555780   3.468 0.000524 ***
## MotherGradDegree  0.882259   1.180404   0.747 0.454809    
## MotherHighSchool -0.028233   0.514632  -0.055 0.956249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 156.62  on 193  degrees of freedom
## AIC: 170.62
## 
## Number of Fisher Scoring iterations: 5

Better Plotting Function

quantlogitplot=function(Y,x,degree,xname,Yname,numgroups,yrange){
  xrange=range(x)
  n=length(Y)
  X=rep(1,n)
  for(i in 1:degree){
    X=cbind(X,x^i)
  }
  model=glm(Y~X-1,family='binomial')
  L=groupplot(x,Y,numgroups)
  plot(L$x,L$g,xlab=xname,ylab=Yname,ylim=yrange)
  a=xrange[1]
  b=xrange[2]
  index=(1:100)/100
  index=a+(b-a)*index
  Index=rep(1,100)
  for(i in 1:degree){
    Index=cbind(Index,index^i)
  }
  betahat=coef(model)
  lines(index,Index%*%betahat,col='red')
  
  
}

Rank model

quantlogitplot(Retention,Rank,1,"Rank","Logit",10,c(-3,3))

rank.model=glm(Retention~Rank,family=binomial)
rank.model.2=glm(Retention~Rank+I(Rank^2),family=binomial)
summary(rank.model)

## 
## Call:
## glm(formula = Retention ~ Rank, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6285   0.3307   0.5337   0.7007   1.1764  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -3.522      1.339  -2.631 0.008505 ** 
## Rank           7.685      2.124   3.619 0.000296 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 182.27  on 198  degrees of freedom
## AIC: 186.27
## 
## Number of Fisher Scoring iterations: 5

summary(rank.model.2)

## 
## Call:
## glm(formula = Retention ~ Rank + I(Rank^2), family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4764   0.3629   0.5269   0.6848   1.2464  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -6.161      7.049  -0.874    0.382
## Rank          16.100     22.111   0.728    0.467
## I(Rank^2)     -6.574     17.137  -0.384    0.701
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 182.12  on 197  degrees of freedom
## AIC: 188.12
## 
## Number of Fisher Scoring iterations: 5

LRtest(rank.model,rank.model.2)

## [1] 0.7062637

SAT model

quantlogitplot(Retention,SAT,1,"SAT","Logit",10,c(-3,3))

SAT.model=glm(Retention~SAT,family=binomial)
SAT.model.2=glm(Retention~SAT+I(SAT^2),family=binomial)
summary(SAT.model)

## 
## Call:
## glm(formula = Retention ~ SAT, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1306   0.4343   0.5911   0.7097   1.0585  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -2.020442   1.342033  -1.506   0.1322  
## SAT          0.003604   0.001422   2.534   0.0113 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 190.36  on 198  degrees of freedom
## AIC: 194.36
## 
## Number of Fisher Scoring iterations: 4

summary(SAT.model.2)

## 
## Call:
## glm(formula = Retention ~ SAT + I(SAT^2), family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1492   0.4182   0.5974   0.7177   0.9973  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.841e-01  7.425e+00  -0.052    0.959
## SAT          1.239e-04  1.561e-02   0.008    0.994
## I(SAT^2)     1.817e-06  8.128e-06   0.224    0.823
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 190.31  on 197  degrees of freedom
## AIC: 196.31
## 
## Number of Fisher Scoring iterations: 5

LRtest(SAT.model,SAT.model.2)

## [1] 0.821137

Shoe Size model

quantlogitplot(Retention,ShoeSize,1,"Shoe Size","Logit",10,c(-3,3))

summary(glm(Retention~ShoeSize,family=binomial))

## 
## Call:
## glm(formula = Retention ~ ShoeSize, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0653   0.5434   0.6263   0.6758   0.8619  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   3.9444     1.6823   2.345    0.019 *
## ShoeSize     -0.2808     0.1843  -1.524    0.128  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 195.00  on 198  degrees of freedom
## AIC: 199
## 
## Number of Fisher Scoring iterations: 4

Mother’s Education Level

table(Mother)

## Mother
## SomeCollege   Bachelors  GradDegree  HighSchool 
##          89          71           6          34

model=glm(Retention~.,data=Retention.data,family=binomial)
summary(model)

## 
## Call:
## glm(formula = Retention ~ ., family = binomial, data = Retention.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7118   0.1951   0.3948   0.6374   1.3498  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -8.293975   3.095703  -2.679 0.007380 ** 
## Rank             10.277291   2.467877   4.164 3.12e-05 ***
## SAT               0.004871   0.001576   3.090 0.002002 ** 
## ShoeSize         -0.233967   0.207032  -1.130 0.258435    
## MotherBachelors   1.927406   0.555780   3.468 0.000524 ***
## MotherGradDegree  0.882259   1.180404   0.747 0.454809    
## MotherHighSchool -0.028233   0.514632  -0.055 0.956249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 156.62  on 193  degrees of freedom
## AIC: 170.62
## 
## Number of Fisher Scoring iterations: 5

model0=glm(Retention~Rank+SAT+ShoeSize,data=Retention.data,family=binomial)
summary(model0)

## 
## Call:
## glm(formula = Retention ~ Rank + SAT + ShoeSize, family = binomial, 
##     data = Retention.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3186   0.2787   0.4652   0.6840   1.3253  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.576068   2.787273  -2.001 0.045441 *  
## Rank         7.552550   2.170519   3.480 0.000502 ***
## SAT          0.003959   0.001482   2.670 0.007580 ** 
## ShoeSize    -0.182597   0.195038  -0.936 0.349163    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 173.74  on 196  degrees of freedom
## AIC: 181.74
## 
## Number of Fisher Scoring iterations: 5

LRtest(model0,model)

## [1] 0.0006693412

Recoding Mother’s Education Level

table(Mother)

## Mother
## SomeCollege   Bachelors  GradDegree  HighSchool 
##          89          71           6          34

table(as.numeric(Mother))

## 
##  1  2  3  4 
## 89 71  6 34

Mother.Recode=c("HighSchool/SomeCollege",
                "Bachelors/GradDegree",
                "Bachelors/GradDegree",
                "HighSchool/SomeCollege"
                )
Mother2=Mother.Recode[Mother]
table(Mother2)

## Mother2
##   Bachelors/GradDegree HighSchool/SomeCollege 
##                     77                    123

model2=glm(Retention~Rank+SAT+ShoeSize+Mother2,data=Retention.data,family=binomial)
summary(model2)

## 
## Call:
## glm(formula = Retention ~ Rank + SAT + ShoeSize + Mother2, family = binomial, 
##     data = Retention.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6724   0.2069   0.4023   0.6406   1.3438  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -6.339209   2.964094  -2.139  0.03246 *  
## Rank                          10.073942   2.417513   4.167 3.09e-05 ***
## SAT                            0.004881   0.001572   3.104  0.00191 ** 
## ShoeSize                      -0.235572   0.206512  -1.141  0.25399    
## Mother2HighSchool/SomeCollege -1.827392   0.500327  -3.652  0.00026 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 157.24  on 195  degrees of freedom
## AIC: 167.24
## 
## Number of Fisher Scoring iterations: 5

LRtest(model2,model)

## [1] 0.7323225

Removing Shoe Size

model3=glm(Retention~Rank+SAT+Mother2,data=Retention.data,family=binomial)
summary(model3)

## 
## Call:
## glm(formula = Retention ~ Rank + SAT + Mother2, family = binomial, 
##     data = Retention.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6402   0.2009   0.4144   0.6228   1.3829  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -8.683127   2.187868  -3.969 7.22e-05 ***
## Rank                          10.436952   2.409344   4.332 1.48e-05 ***
## SAT                            0.004852   0.001561   3.107 0.001887 ** 
## Mother2HighSchool/SomeCollege -1.793748   0.496847  -3.610 0.000306 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 158.57  on 196  degrees of freedom
## AIC: 166.57
## 
## Number of Fisher Scoring iterations: 5

LRtest(model3,model)

## [1] 0.584133

5.2.1 Variable Selection

Stepwise

model=glm(Retention~.,data=Retention.data,family=binomial)
step.model=step(model)

## Start:  AIC=170.62
## Retention ~ Rank + SAT + ShoeSize + Mother
## 
##            Df Deviance    AIC
## - ShoeSize  1   157.92 169.92
## <none>          156.62 170.62
## - SAT       1   167.33 179.33
## - Mother    3   173.74 181.74
## - Rank      1   178.21 190.21
## 
## Step:  AIC=169.92
## Retention ~ Rank + SAT + Mother
## 
##          Df Deviance    AIC
## <none>        157.92 169.92
## - SAT     1   168.57 178.57
## - Mother  3   174.62 180.62
## - Rank    1   181.53 191.53

#Best Subsets
library(bestglm)

X=model.matrix(model)
X=X[,2:(ncol(X))]
y=Retention
Xy=data.frame(X,y)

best.model=bestglm(Xy,family=binomial)

## Morgan-Tatar search since family is non-gaussian.

names(best.model)

## [1] "BestModel"   "BestModels"  "Bestq"       "qTable"      "Subsets"     "Title"      
## [7] "ModelReport"

summary(best.model$BestModel)

## 
## Call:
## glm(formula = y ~ ., family = family, data = Xi, weights = weights)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6715   0.1966   0.4066   0.6264   1.3696  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -10.429845   2.332693  -4.471 7.78e-06 ***
## Rank             10.610051   2.447711   4.335 1.46e-05 ***
## SAT               0.004724   0.001547   3.054 0.002257 ** 
## MotherBachelors   1.857860   0.518495   3.583 0.000339 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.36  on 199  degrees of freedom
## Residual deviance: 158.45  on 196  degrees of freedom
## AIC: 166.45
## 
## Number of Fisher Scoring iterations: 5

5.3 Assessing Model Performance and Fit

source("http://faculty.tarleton.edu/crawford/documents/Math5364/MiscRFunctions.txt")

#Training Data
Retention.data.2=data.frame(Retention,Rank,SAT,Mother2)
train=trainsample(Retention.data.2,0.7)


#Fitting Model

model=glm(Retention~.,
          data=Retention.data.2[train,],
          family=binomial)
summary(model)

## 
## Call:
## glm(formula = Retention ~ ., family = binomial, data = Retention.data.2[train, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4849   0.1980   0.4345   0.6524   1.3690  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -9.168525   2.582276  -3.551 0.000384 ***
## Rank                          10.491690   2.887293   3.634 0.000279 ***
## SAT                            0.005166   0.001989   2.597 0.009392 ** 
## Mother2HighSchool/SomeCollege -1.734604   0.581591  -2.983 0.002859 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 142.84  on 139  degrees of freedom
## Residual deviance: 115.67  on 136  degrees of freedom
## AIC: 123.67
## 
## Number of Fisher Scoring iterations: 5

Classification Accuracy

#confmatrix function from previous tutorial
confmatrix=function(y,predy){
  matrix=table(y,predy)
  accuracy=sum(diag(matrix))/sum(matrix)
  return(list(matrix=matrix,accuracy=accuracy,error=1-accuracy))
}

Pihat=predict(model,
              newdata=Retention.data.2[-train,],
              type="response")
pred.retention=(Pihat>=0.5)*1
table(pred.retention)

## pred.retention
##  0  1 
##  8 52

confmatrix(Retention[-train],pred.retention)

## $matrix
##    predy
## y    0  1
##   0  5  5
##   1  3 47
## 
## $accuracy
## [1] 0.8666667
## 
## $error
## [1] 0.1333333

ROC Curve

library(pROC)

plot(roc(Retention[-train],Pihat))

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

Hosmer-Lemeshow Goodness-of-fit Test

# dependecy doesnt exist cant use 
# Hosmer-Lemeshow goodness of fit tests from this package
# library(MKmisc)

model=glm(Retention~.,
          data=Retention.data.2,
          family=binomial)

Pihat=predict(model,
              type="response")

# HLgof.test(fit=Pihat,obs=Retention)

Visualizing HLgof.test

hlgof=function(pihat,Y,ngr){
  sortframe=sort(pihat,index=TRUE)
  pihat=sortframe$x
  Y=Y[sortframe$ix]
  s=floor(length(pihat)/ngr)  #groupsize
  pipred=1:ngr
  piobs=1:ngr
  Chat=0
  
  for(i in 1:ngr){
    index=(((i-1)*s+1):(i*s))
    if(i==ngr){index=(((i-1)*s+1):(length(pihat)))}
    pipred[i]=mean(pihat[index])
    nk=length(index)
    ok=sum(Y[index])
    piobs[i]=ok/nk
    Chat=Chat+(ok-nk*pipred[i])^2/(nk*pipred[i]*(1-pipred[i]))}
  
  pvalue=1-pchisq(Chat,ngr-2)
  return(list(pipred=pipred,piobs=piobs,Chat=Chat,pvalue=pvalue))}



hlgof.results=hlgof(Pihat,Retention,10)

Comparing Pihat and Piobs

plot(hlgof.results$pipred,hlgof.results$piobs,
     xlim=c(0,1),
     ylim=c(0,1))
lines((1:100)/100,(1:100)/100,col="red")

piobs=hlgof.results$piobs
pipred=hlgof.results$pipred
cbind(piobs,pipred)

##       piobs    pipred
##  [1,]  0.35 0.3965338
##  [2,]  0.75 0.6039958
##  [3,]  0.60 0.7213944
##  [4,]  0.85 0.8006625
##  [5,]  0.75 0.8485368
##  [6,]  0.95 0.8831521
##  [7,]  1.00 0.9113384
##  [8,]  0.90 0.9372917
##  [9,]  0.90 0.9635551
## [10,]  1.00 0.9835395

cbind(piobs,pipred)*20

##       piobs    pipred
##  [1,]     7  7.930676
##  [2,]    15 12.079916
##  [3,]    12 14.427888
##  [4,]    17 16.013249
##  [5,]    15 16.970736
##  [6,]    19 17.663041
##  [7,]    20 18.226769
##  [8,]    18 18.745833
##  [9,]    18 19.271102
## [10,]    20 19.670789

sum((20*piobs-20*pipred)^2/(20*pipred*(1-pipred)))

## [1] 11.1661