#  This file contains a logistic regression analysis of
#  the checker data from Will Welch's notes, Chapter 1.
#      
#  Authors:
#       R.W. Oldford, 2004
#
#
# A handy function to build the URL for the files we will load.
web441 <- function(x)
         {paste('http://www.undergrad.math.uwaterloo.ca/~stat441/R-code/',
                 x,
                 sep='')
         }
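# For example, web441('checker.R') returns
# 'http://www.undergrad.math.uwaterloo.ca/~stat441/R-code/checker.R'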
# (In class we just loaded this file from a local copy since the
#  machine wasn't connected to the internet.)
#
# Get the data:
source(web441('checker.R'))

# plot it
x1.range <- range(checker.train[,"x1"])
x2.range <- range(checker.train[,"x2"])
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
        col="red", cex=1.2)
points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
        col="blue", cex=1.2)


# Fit the generalized additive (smooth) logistic model
library(mgcv)
fit1 <- gam(y ~ s(x1) + s(x2), family ="binomial", data = checker.train)

# Although the algorithm failed to converge, we can still look at the results:
summary(fit1)

# Note that this table gives some idea of how well this (unconverged) model fits
# the data.  As with generalized linear models, there is a deviance (chi-square) 
# test associated with each term in the model and a percent of the deviance explained.
# Note also the fractional degrees of freedom associated with some of the terms.
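# As a quick check, the "deviance explained" figure reported by summary() can
# be reproduced from components stored on the fitted object:
1 - fit1$deviance / fit1$null.deviance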

# We can look at each term (since the model is additive) to see that term's fitted
# values as a function of the data.
# There are two terms in this model so there will be two functions to view.

# First set up to view both plots in the one window
par(mfrow=c(2,1))
# Display the two fits plus the standard errors (pointwise)
plot(fit1, se=T)

# Note that the algorithm failed to converge.  We could increase the number of
# iterations.  Increasing them to 50 as below (from the default of 20) still
# does not make the algorithm converge.
# The tolerance shown (epsilon) is the default (1 x 10^-4).
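# (A quick way to inspect the control defaults on your installation:)
str(gam.control())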

fit1a <- gam(y ~ s(x1) + s(x2), 
           family ="binomial", 
           data = checker.train, 
           control = gam.control(epsilon=1e-04, maxit= 50, globit= 50))
           
# Even this fails to converge.  The results are much the same as before,
# although the percentage of deviance explained by this fit may differ slightly.

summary(fit1a)
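# (quartz() opens a new plotting window in the Mac R environment; on other
#  platforms use X11() or another device; see help("devices").)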
quartz()
par(mfrow=c(2,1))
plot(fit1a, se = T)

# Here are the predicted probabilities of belonging to class 1 (y = 1, so
# the second of the two classes) for the new cases "Bob" and "Mary" in xnew
# (xnew is assumed to have been defined when checker.R was sourced).

predict(fit1, newdata = xnew, type = "response")

# Depending on our rule, "Bob" and "Mary" will be classified according to the
# size of these estimated probabilities.
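# As a minimal sketch of the usual 0.5 cutoff rule (pNew is just a name
# introduced here):
pNew <- predict(fit1, newdata = xnew, type = "response")
ifelse(pNew > 0.5, 1, 0)   # classify as class 1 when estimated P(y=1|x) > 1/2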

# To see the regions chosen by this, we plot contours of the predicted
# probability over a grid of points.
# Compute the fitted probabilities over a grid so that we could plot them.
xgrid <- expand.grid(x1=seq(x1.range[1], x1.range[2], length = 21),
                     x2=seq(x2.range[1], x2.range[2], length = 21))
pHat <- predict(fit1, newdata = xgrid, type="response")

# Put the predicted probabilities in the grid to be used by the contour
#  program

pHat <- matrix(pHat, nrow = 21, ncol = 21, byrow = FALSE)
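# (expand.grid varies x1 fastest, so filling the matrix column-wise, byrow = FALSE,
#  puts x1 along the rows and x2 along the columns; that is the orientation
#  contour() expects given x = unique(xgrid[,1]) and y = unique(xgrid[,2]).)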

# Add contours corresponding to odds of 10:1, 1:1, and 1:10; since
# odds = p/(1-p), these are the probability levels 10/11, 1/2, and 1/11.
# Make the line type for 1:1 different.
# (NB none of these contours appears in the range of the data!
#  Considering the summary of the fit, this is not too surprising.)
# Set up a single plot in a new window.
quartz()
par(mfrow=c(1,1))
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
        col="red", cex=1.2)
points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
        col="blue", cex=1.2)

contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat, 
        levels = 0.5, 
        add=T, 
        lty=1, col="brown", lwd=2)
contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat, 
        levels =c(1/11, 10/11), 
        add=T, 
        lty=2, col="brown", lwd=2)

# Let's look at 4 contours that do appear in the range of the data.
contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat, 
        nlevels = 4, 
        add=T, 
        lty=2, col="green", lwd=2)

#
#  Get a look at the misclassification probabilities on the training data
#

table(checker.train[,"y"], 
      predict(fit1, newdata= checker.train, type="response") > 0.5)
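# That table can be reduced to a single training error rate (confmat is just a
# name introduced for this sketch; it assumes both predicted classes occur,
# so that the table is 2 x 2):
confmat <- table(checker.train[,"y"],
                 predict(fit1, newdata = checker.train, type = "response") > 0.5)
1 - sum(diag(confmat)) / sum(confmat)   # proportion misclassified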

#
#  Get a look at the misclassification probabilities on the test data
#

table(checker.test[,"y"], 
      predict(fit1, newdata= checker.test, type="response") > 0.5)

#
######################
#
# Fit the generalized additive logistic model including a smooth of the
# product x1 * x2 (an interaction term; the model is still additive in its terms).

fit2 <- gam(y ~ s(x1) + s(x2) + s(x1 * x2),
           family = "binomial", data = checker.train,
           control = gam.control(epsilon=1e-04, maxit= 100, globit= 100))

#
# but that failed to converge
#
# Even so, look at results.  Note significance levels associated with the coefficients.
summary(fit2)

# Note that because of the smoothed interaction term, the plot function will not
# work here. (It looks as though it should for some combination of parameters,
# but I did not find a combination that avoided an error; for example
#          plot(fit2, se = FALSE, pers=TRUE, rug=FALSE)
#          plot(fit2, se = FALSE, select=1)
#  each produced the same error
#          Error in "[.data.frame"(x$model, x$smooth[[i]]$term) :
#              undefined columns selected
#   )
# Instead, we will look at the regions as chosen (predicted) by the fit.
# To see this, we plot contours of the predicted
# probability over a grid of points
pHat2 <- predict(fit2, newdata = xgrid, type="response")

# Put the predicted probabilities in the grid to be used by the contour
#  program

pHat2 <- matrix(pHat2, nrow = 21, ncol = 21, byrow = FALSE)

# Note that the following puts up a new window in the Mac R  environment
# You may need to do something different for your environment (e.g. X11())
# See help("devices") or dev.cur() for more info.


quartz()
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
       col="red", cex=1.2)
points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
       col="blue", cex=1.2)


contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat2, 
        levels = 0.5, 
        add=T, 
        lty=1, col="brown", lwd=2)

contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat2, 
        levels = c(1/11, 10/11), 
        add=T, 
        lty=2, col="black", lwd=2)
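# Aside: the plot / points / contour sequence above recurs for each fit below,
# so a small helper could do the same job.  (checkerPlot is a convenience
# sketched here; it is not part of the original notes.)
checkerPlot <- function(pHat, ...) {
  plot(0, 0, xlim = x1.range, ylim = x2.range,
       xlab = "x1", ylab = "x2", type = "n", ...)
  points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")],
         pch = "0", col = "red", cex = 1.2)
  points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")],
         pch = "+", col = "blue", cex = 1.2)
  # 1:1 odds (p = 0.5) decision boundary
  contour(x = unique(xgrid[,1]), y = unique(xgrid[,2]), pHat,
          levels = 0.5, add = TRUE, lty = 1, col = "brown", lwd = 2)
  # 1:10 and 10:1 odds contours
  contour(x = unique(xgrid[,1]), y = unique(xgrid[,2]), pHat,
          levels = c(1/11, 10/11), add = TRUE, lty = 2, col = "black", lwd = 2)
}
# e.g. checkerPlot(pHat2) reproduces the plot just drawn.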


#
#  Get a look at the misclassification probabilities on the training data
#

table(checker.train[,"y"], 
      predict(fit2, newdata= checker.train, type="response") > 0.5)

#
#  Get a look at the misclassification probabilities on the test data
#

table(checker.test[,"y"], 
      predict(fit2, newdata= checker.test, type="response") > 0.5)


#
######################
#
# Fit the generalized additive logistic model with the smoothed interaction
# term, but dropping the s(x1) term since it did not seem significant; recall:

summary(fit2)


fit3 <- gam(y ~ s(x2) + s(x1 * x2),
           family = "binomial", data = checker.train,
           control = gam.control(epsilon=1e-04, maxit= 100, globit= 100))
#
# which at least converges
#
# So, look at results.  Note significance levels associated with the coefficients
# and the proportion of deviance explained.

summary(fit3)
# Now look at the regions as chosen (predicted) by the fit.
pHat3 <- predict(fit3, newdata = xgrid, type="response")

# Put the predicted probabilities in the grid to be used by the contour
#  program

pHat3 <- matrix(pHat3, nrow = 21, ncol = 21, byrow = FALSE)

# Note that the following puts up a new window in the Mac R  environment
# You may need to do something different for your environment (e.g. X11())
# See help("devices") or dev.cur() for more info.


quartz()
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
       col="red", cex=1.2)
points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
       col="blue", cex=1.2)


contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat3, 
        levels = 0.5, 
        add=T, 
        lty=1, col="brown", lwd=2)

contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat3, 
        levels = c(1/11, 10/11), 
        add=T, 
        lty=2, col="black", lwd=2)

#
#  Get a look at the misclassification probabilities on the training data
#

table(checker.train[,"y"], 
      predict(fit3, newdata= checker.train, type="response") > 0.5)

#
#  Get a look at the misclassification probabilities on the test data
#

table(checker.test[,"y"], 
      predict(fit3, newdata= checker.test, type="response") > 0.5)

#
######################
#
# The same reasoning suggests
# dropping the s(x1 * x2) term since this did not seem significant, recall:

summary(fit3)


fit4 <- gam(y ~   s(x2) , 
           family ="binomial", data = checker.train, 
           control = gam.control(epsilon=1e-04,maxit= 100, globit= 100))
#
# which again converges
#
# Again, look at results.  Note significance levels associated with the coefficients
# and the proportion of deviance explained.

summary(fit4)
# Now look at the regions as chosen (predicted) by the fit.
pHat4 <- predict(fit4, newdata = xgrid, type="response")

# Put the predicted probabilities in the grid to be used by the contour
#  program

pHat4 <- matrix(pHat4, nrow = 21, ncol = 21, byrow = FALSE)

# Note that the following puts up a new window in the Mac R  environment
# You may need to do something different for your environment (e.g. X11())
# See help("devices") or dev.cur() for more info.


quartz()
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
       col="red", cex=1.2)
points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
       col="blue", cex=1.2)


contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat4, 
        levels = 0.5, 
        add=T, 
        lty=1, col="brown", lwd=2)

contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat4, 
        levels = c(1/11, 10/11), 
        add=T, 
        lty=2, col="black", lwd=2)
# None of these contours appears within the range of the data, so add some that do:

contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat4, 
        nlevels = 4, 
        add=T, 
        lty=2, col="black", lwd=2)
#
#  The result looks a lot like the logistic regression without any smooth!
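# To check that resemblance, one could fit the plain (unsmoothed) logistic
# regression for comparison (a sketch; the exact model used earlier in the
# notes may differ):
fit.glm <- glm(y ~ x1 + x2, family = binomial, data = checker.train)
summary(fit.glm)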

#
#  Get a look at the misclassification probabilities on the training data
#

table(checker.train[,"y"], 
      predict(fit4, newdata= checker.train, type="response") > 0.5)

#
#  Get a look at the misclassification probabilities on the test data
#

table(checker.test[,"y"], 
      predict(fit4, newdata= checker.test, type="response") > 0.5)
           

#
######################
#
# Instead of dropping the s(x1 * x2) term, try dropping the s(x2) term.

fit5 <- gam(y ~ s(x1 * x2),
           family = "binomial", data = checker.train)
           
#
# which again converges
#
# Again, look at results.  Note significance levels associated with the coefficients
# and the proportion of deviance explained.

summary(fit5)
# Now look at the regions as chosen (predicted) by the fit.
pHat5 <- predict(fit5, newdata = xgrid, type="response")

# Put the predicted probabilities in the grid to be used by the contour
#  program

pHat5 <- matrix(pHat5, nrow = 21, ncol = 21, byrow = FALSE)

# Note that the following puts up a new window in the Mac R  environment
# You may need to do something different for your environment (e.g. X11())
# See help("devices") or dev.cur() for more info.


quartz()
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
       col="red", cex=1.2)
points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
       col="blue", cex=1.2)


contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat5, 
        levels = 0.5, 
        add=T, 
        lty=1, col="brown", lwd=2)

contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat5, 
        levels = c(1/11, 10/11), 
        add=T, 
        lty=2, col="black", lwd=2)
# Also add some contours that do appear within the range of the data:
contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat5, 
        levels = c(0.3, 0.45, 0.55, 0.7), 
        add=T, 
        lty=2, col="green", lwd=2)
#
#  Again, the result looks a lot like the logistic regression without any smooth!
#
#  Get a look at the misclassification probabilities on the training data
#

table(checker.train[,"y"], 
      predict(fit5, newdata= checker.train, type="response") > 0.5)

#
#  Get a look at the misclassification probabilities on the test data
#

table(checker.test[,"y"], 
      predict(fit5, newdata= checker.test, type="response") > 0.5)
           

#
######################
#
# Fit the generalized additive logistic model with a single bivariate smooth,
# s(x1, x2), of the two predictors jointly.
fit6 <- gam(y ~ s(x1,x2), 
           family ="binomial", 
           data = checker.train)

# Look at results.  Note significance levels associated with the coefficients.
summary(fit6)
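# Aside: for a bivariate smooth such as s(x1, x2), mgcv's vis.gam() can
# display the fitted surface directly, e.g.
vis.gam(fit6, plot.type = "contour")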

# To see the regions chosen by this, we plot contours of the predicted
# probability over a grid of points
pHat6 <- predict(fit6, newdata = xgrid, type="response")

# Put the predicted probabilities in the grid to be used by the contour
#  program

pHat6 <- matrix(pHat6, nrow = 21, ncol = 21, byrow = FALSE)

# Note that the following puts up a new window in the Mac R  environment
# You may need to do something different for your environment (e.g. X11())
# See help("devices") or dev.cur() for more info.

quartz()
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
       col="red", cex=1.2)
points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
       col="blue", cex=1.2)


contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat6, 
        levels = c(0.5), 
        add=T, 
        lty=1, col="black", lwd=2)
        
contour(x=unique(xgrid[,1]), y=unique(xgrid[,2]),
        pHat6, 
        levels = c(1/11, 10/11), 
        add=T, 
        lty=2, col="black", lwd=2)

#
#  Get a look at the misclassification probabilities on the training data
#

table(checker.train[,"y"], 
      predict(fit6, newdata= checker.train, type="response") > 0.5)

#
#  Get a look at the misclassification probabilities on the test data
#

table(checker.test[,"y"], 
      predict(fit6, newdata= checker.test, type="response") > 0.5)
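
#
######################
#
# A wrap-up sketch (not part of the original notes): compute the test error
# rate for every fit in one pass.  (testError is just a name introduced here.)
testError <- function(fit) {
  phat <- predict(fit, newdata = checker.test, type = "response")
  mean((phat > 0.5) != (checker.test[,"y"] == 1))
}
sapply(list(fit1 = fit1, fit2 = fit2, fit3 = fit3,
            fit4 = fit4, fit5 = fit5, fit6 = fit6), testError)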