# This file illustrates the use of K nearest neighbour methods by
# analysis of the "nuggets" data contained in nuggets.train and nuggets.test
# (these are generated like "nuggets2" from Will Welch's notes Chapt. 1)
#
# Authors:
#       R.W. Oldford, 2004
#
#
# quartz() is the Mac OS call for a new window.
# Change the call inside from quartz() to whatever the call is
# for the OS you are using, e.g. X11() or pdf(), etc.

newwindow <- function(x) {quartz()}

#
# A handy function to capture the pathname for the files we will load.

web441 <- function(x) {
  paste('http://www.undergrad.math.uwaterloo.ca/~stat441/R-code/', x, sep='')
}

# (In class we just loaded this file since the
#  machine wasn't connected to the internet.)
#
# Get the data:

source(web441('nuggets.R'))

#
# Plot the training data
#
#
gridlength <- 42
x1.range <- range(nuggets.train[,"x1"])
x2.range <- range(nuggets.train[,"x2"])
xgrid <- expand.grid(x1 = seq(x1.range[1], x1.range[2], length = gridlength),
                     x2 = seq(x2.range[1], x2.range[2], length = gridlength))

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "nuggets training data")
points(nuggets.train[nuggets.train[,"y"] == 0, c("x1", "x2")],
       pch = "0", col = "red", cex = 1.2)
points(nuggets.train[nuggets.train[,"y"] == 1, c("x1", "x2")],
       pch = "+", col = "blue", cex = 1.2)

#
# Plot the test data
#
newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "nuggets test data")
points(nuggets.test[nuggets.test[,"y"] == 0, c("x1", "x2")],
       pch = "0", col = "red", cex = 1.2)
points(nuggets.test[nuggets.test[,"y"] == 1, c("x1", "x2")],
       pch = "+", col = "blue", cex = 1.2)

#
# Load the library containing the knn code.
#
library(class)

#
# The knn code requires the explanatory variates to be separated
# from the response (or class) variate.
# It also requires that the class be a factor, so we
# put the data in this form.
#
# The training data:
#
x.tr <- nuggets.train[, c("x1", "x2")]
y.tr <- factor(nuggets.train[,"y"])
#
# The test data:
#
x.te <- nuggets.test[, c("x1", "x2")]
y.te <- factor(nuggets.test[,"y"])

#
# Produce the knn classification (argument descriptions from help(knn)).
# This classifies the test data according to the class of its
# k nearest neighbours in the training set.
# The result is a prediction of the class for each observation in
# the test data.

nuggets.knn <- knn(train = x.tr,     # matrix or data frame of training set cases.
                   test  = x.te,     # matrix or data frame of test set cases.  A vector will be
                                     # interpreted as a row vector for a single case.
                   cl = y.tr,        # factor of true classifications of the training set.
                   k = 1,            # number of neighbours considered.
                   l = 0,            # minimum vote for definite decision, otherwise 'doubt'.  (More
                                     # precisely, less than 'k-l' dissenting votes are allowed, even
                                     # if 'k' is increased by ties.)
                   prob = TRUE,      # if this is true, the proportion of the votes for the winning
                                     # class is returned as attribute 'prob'.
                   use.all = TRUE    # controls handling of ties.  If true, all distances equal to
                                     # the 'k'th largest are included.  If false, a random selection
                                     # of distances equal to the 'k'th is chosen to use exactly 'k'
                                     # neighbours.
                   )

#
# There is no special summary method written for knn, so
# the following is pretty useless.

summary(nuggets.knn)

#
# You can always look at the object itself.
#
nuggets.knn     # i.e. the class predictions for the test set
                # (doubt is NA, possible only if l > 0)
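#
# A small illustrative sketch (not part of the original notes): the 'l'
# argument above sets the minimum vote needed for a definite decision.
# With k = 5 and l = 4, for example, a definite decision requires a strong
# majority of the 5 neighbours (see the parenthetical note on 'l' above);
# predictions that fall short of the requirement come back as NA ('doubt').
# The object name nuggets.knn.doubt is made up for this sketch.
#
nuggets.knn.doubt <- knn(train = x.tr, test = x.te, cl = y.tr,
                         k = 5, l = 4, prob = TRUE, use.all = TRUE)
sum(is.na(nuggets.knn.doubt))   # number of test points left in doubt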
#
# The help function makes reference to the attribute 'prob'.
# The value of prob is found as one of the attributes of nuggets.knn
#
attributes(nuggets.knn)
#
# So prob is found as
#
attributes(nuggets.knn)$prob
#
# which gives the proportion of the nearest neighbours
# voting for the predicted class (not especially interesting in the k=1 case).
#
#
# Check the misclassification table on the test data:
#
table(y.te, nuggets.knn)

##########################################################################
#
# The regions
#
#
# What do the regions look like?
# Redo the above but predict on the entire grid of points.
# (Note that knn does the prediction, so there is no "predict" function
#  for knn as we have seen for others.)
#
nuggets.knn1 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                    k = 1, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=1 regions for nuggets data")
points(xgrid[nuggets.knn1 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn1 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# How about when k = 3?
#
nuggets.knn3 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                    k = 3, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=3 regions for nuggets data")
points(xgrid[nuggets.knn3 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn3 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# How about when k = 5?
#
nuggets.knn5 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                    k = 5, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=5 regions for nuggets data")
points(xgrid[nuggets.knn5 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn5 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# How about when k = 10?
#
nuggets.knn10 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                     k = 10, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=10 regions for nuggets data")
points(xgrid[nuggets.knn10 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn10 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# Note how the regions change with k, generally removing smaller populated
# regions.
# Note however also how large k can produce more regions in areas where
# the data are sparse.
#

#############################################################
#
# How might we choose the value of k?
#
#############################################################
#
# There is a function called knn.cv which predicts the class
# of each point in the training data based on the k nearest
# neighbours to itself.  It does not use the `true class' of
# the point itself and so provides the information necessary
# to produce a leave-one-out cross-validation estimate of the
# misclassification rate from the training data alone.
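#
# A small sketch (not part of the original notes): before turning to
# cross-validation, record the test-set misclassification rate of the
# k = 1 fit made earlier (nuggets.knn), so it can be set beside the
# cross-validated estimates computed below.  The name mp1.test is made up
# here, chosen to parallel the mp1, mp3, ... names used below.
#
mp1.test <- sum(nuggets.knn != y.te) / length(y.te)
mp1.test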
#
# For the case of k = 1, the predictions at the training points
# (ignoring the known class value at each point) are obtained from

nuggets.knncv1 <- knn.cv(train = x.tr, cl = y.tr,
                         k = 1, l = 0, prob = TRUE, use.all = TRUE)
nuggets.knncv1

#
# Comparing these leave-one-out predictions to the true values
# allows us to give a cross-validated estimate of the misclassification
# probability, namely

mp1 <- sum(nuggets.knncv1 != y.tr) / length(y.tr)
mp1

#
# Similarly for k = 3
#
nuggets.knncv3 <- knn.cv(train = x.tr, cl = y.tr,
                         k = 3, l = 0, prob = TRUE, use.all = TRUE)
mp3 <- sum(nuggets.knncv3 != y.tr) / length(y.tr)
mp3

#
# ... for k = 5
#
nuggets.knncv5 <- knn.cv(train = x.tr, cl = y.tr,
                         k = 5, l = 0, prob = TRUE, use.all = TRUE)
mp5 <- sum(nuggets.knncv5 != y.tr) / length(y.tr)
mp5

#
# ... and for k = 10
#
nuggets.knncv10 <- knn.cv(train = x.tr, cl = y.tr,
                          k = 10, l = 0, prob = TRUE, use.all = TRUE)
mp10 <- sum(nuggets.knncv10 != y.tr) / length(y.tr)
mp10

#
# And so we might plot these versus k
#
newwindow()
plot(x = c(1, 3, 5, 10),
     y = c(mp1, mp3, mp5, mp10),
     main = "Cross-validated misclassification prob",
     sub  = "nuggets data",
     xlab = "k",
     ylab = "p")
lines(x = c(1, 3, 5, 10), y = c(mp1, mp3, mp5, mp10), lty = 2)

#
# Note BTW what happens when we repeat this for k = 10
#
nuggets.knncv10 <- knn.cv(train = x.tr, cl = y.tr,
                          k = 10, l = 0, prob = TRUE, use.all = TRUE)
mp10 <- sum(nuggets.knncv10 != y.tr) / length(y.tr)
mp10

# and again
nuggets.knncv10 <- knn.cv(train = x.tr, cl = y.tr,
                          k = 10, l = 0, prob = TRUE, use.all = TRUE)
mp10 <- sum(nuggets.knncv10 != y.tr) / length(y.tr)
mp10

# and again
nuggets.knncv10 <- knn.cv(train = x.tr, cl = y.tr,
                          k = 10, l = 0, prob = TRUE, use.all = TRUE)
mp10 <- sum(nuggets.knncv10 != y.tr) / length(y.tr)
mp10

#
# If the vote is tied for any point, the tie is broken at random.
# Because there are only two classes in this case, there will
# occasionally be ties when k is even.
# For the two-class situation, then, we might want to restrict
# consideration to odd values of k so that no ties are possible.
#
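#
# A small sketch (not part of the original notes): the run-to-run changes in
# mp10 above come from the random tie-breaking.  Assuming that randomness is
# drawn from R's usual random number generator, set.seed() makes a run
# reproducible, and repeating the calculation gives a sense of how variable
# any single estimate is.  (The seed value 441 and the number of repetitions
# are arbitrary choices for illustration.)
#
set.seed(441)
mp10.reps <- replicate(25, {
  pred <- knn.cv(train = x.tr, cl = y.tr, k = 10, l = 0,
                 prob = FALSE, use.all = TRUE)
  sum(pred != y.tr) / length(y.tr)
})
mean(mp10.reps)   # an averaged cross-validated estimate for k = 10
sd(mp10.reps)     # variability of the single-run estimates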
############################################################
#
# We could automate this graph with the following function
# (note that the default k avoids even k, which makes some
# sense for the case of two classes):

knn.cvmin <- function (train, cl, k = seq(1, 21, 2), l = 0,
                       use.all = TRUE, plot = TRUE)
{
  n <- length(cl)
  # A vector to store the misclassification probabilities
  mp <- vector("numeric", length(k))
  for (i in 1:length(k)) {
    pred <- knn.cv(train = train, cl = cl, k = k[i], l = l,
                   prob = FALSE, use.all = use.all)
    mp[i] <- sum(pred != cl) / n
  }
  kmin <- order(mp)[1]
  if (plot) {
    plot(x = k, y = mp,
         main = "Cross-validated misclassification probabilities",
         xlab = "k", ylab = "p")
    lines(x = k, y = mp, lty = 2)
  }
  pmin <- mp[kmin]
  # Return the relevant values.
  return(list(k = k, p = mp, kmin = k[mp == pmin], pmin = mp[kmin]))
}

#
# And try it out on the nuggets data
#
newwindow()
results <- knn.cvmin(train = x.tr, cl = y.tr)

#
# Which suggests the value
results$kmin
#
# k = 3, on the grid
#
nuggets.knn3 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                    k = 3, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=3 regions for nuggets data")
points(xgrid[nuggets.knn3 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn3 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# k = 3, on the test data
#
nuggets.knn3te <- knn(train = x.tr, test = x.te, cl = y.tr,
                      k = 3, l = 0, prob = TRUE, use.all = TRUE)
table(y.te, nuggets.knn3te)
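#
# A small sketch (not part of the original notes): the test-set
# misclassification rate for k = 3, to set beside the cross-validated
# estimate mp3 obtained earlier (mp3.test is a made-up name parallel
# to mp1.test above).
#
mp3.test <- sum(nuggets.knn3te != y.te) / length(y.te)
mp3.test
mp3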