# This file illustrates the use of K nearest neighbour methods by
# analysis of the "nuggets" data contained in nuggets.train and nuggets.test
# (these are generated like "nuggets2" from Will Welch's notes Chapt. 1)
#
# Authors:
#       R.W. Oldford, 2004
#
#
# quartz() is the Mac OS call for a new window.
# Change the call inside from quartz() to whatever the call is
# for the OS you are using, e.g. X11() or pdf(), etc.

newwindow <- function(x) {quartz()}

#
# A handy function to capture the pathname for the files we will load.

web441 <- function(x) {
  paste('http://www.undergrad.math.uwaterloo.ca/~stat441/R-code/', x, sep='')
}

# (In class we just loaded this file since the
#  machine wasn't connected to the internet.)
#
# Get the data:

source(web441('nuggets.R'))

#
# Plot the training data
#
#
gridlength <- 42
x1.range <- range(nuggets.train[,"x1"])
x2.range <- range(nuggets.train[,"x2"])
xgrid <- expand.grid(x1 = seq(x1.range[1], x1.range[2], length = gridlength),
                     x2 = seq(x2.range[1], x2.range[2], length = gridlength))

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "nuggets training data")
points(nuggets.train[nuggets.train[,"y"] == 0, c("x1", "x2")],
       pch = "0", col = "red", cex = 1.2)
points(nuggets.train[nuggets.train[,"y"] == 1, c("x1", "x2")],
       pch = "+", col = "blue", cex = 1.2)

#
# Plot the test data
#
newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "nuggets test data")
points(nuggets.test[nuggets.test[,"y"] == 0, c("x1", "x2")],
       pch = "0", col = "red", cex = 1.2)
points(nuggets.test[nuggets.test[,"y"] == 1, c("x1", "x2")],
       pch = "+", col = "blue", cex = 1.2)

#
# Load the library containing the knn code.
#
library(class)

#
# The knn code requires the explanatory variates to be separated
# from the response (or class) variate.
# It also requires that the class be a factor, so we
# put the data in this form.
#
# The training data:
#
x.tr <- nuggets.train[, c("x1", "x2")]
y.tr <- factor(nuggets.train[,"y"])
#
# The test data:
#
x.te <- nuggets.test[, c("x1", "x2")]
y.te <- factor(nuggets.test[,"y"])

#
# Produce the knn classification (argument descriptions from help(knn)).
# This classifies the test data according to the class of its
# k nearest neighbours in the training set.
# The result is a prediction of the class for each observation in
# the test data.

nuggets.knn <- knn(train = x.tr,     # matrix or data frame of training set cases.
                   test  = x.te,     # matrix or data frame of test set cases.  A vector will be
                                     # interpreted as a row vector for a single case.
                   cl = y.tr,        # factor of true classifications of the training set.
                   k = 1,            # number of neighbours considered.
                   l = 0,            # minimum vote for definite decision, otherwise 'doubt'.  (More
                                     # precisely, less than 'k-l' dissenting votes are allowed, even
                                     # if 'k' is increased by ties.)
                   prob = TRUE,      # if this is true, the proportion of the votes for the winning
                                     # class is returned as attribute 'prob'.
                   use.all = TRUE    # controls handling of ties.  If true, all distances equal to
                                     # the 'k'th largest are included.  If false, a random selection
                                     # of distances equal to the 'k'th is chosen to use exactly 'k'
                                     # neighbours.
                   )

#
# There is no special summary method written for knn, so
# the following is pretty useless.

summary(nuggets.knn)

#
# You can always look at the object itself.
#
nuggets.knn     # i.e. the class predictions for the test set
                # (doubt is NA, possible only if l > 0)
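#
# A small illustrative sketch (not part of the original notes): the 'l'
# argument above sets the minimum vote needed for a definite decision.
# With k = 5 and l = 4, for example, a definite decision requires a strong
# majority of the 5 neighbours (see the parenthetical note on 'l' above);
# predictions that fall short of the requirement come back as NA ('doubt').
# The object name nuggets.knn.doubt is made up for this sketch.
#
nuggets.knn.doubt <- knn(train = x.tr, test = x.te, cl = y.tr,
                         k = 5, l = 4, prob = TRUE, use.all = TRUE)
sum(is.na(nuggets.knn.doubt))   # number of test points left in doubt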
#
# The help function makes reference to the attribute 'prob'.
# The value of prob is found as one of the attributes of nuggets.knn
#
attributes(nuggets.knn)
#
# So prob is found as
#
attributes(nuggets.knn)$prob
#
# which gives the proportion of the nearest neighbours
# voting for the predicted class (not especially interesting in the k=1 case).
#
#
# Check the misclassification table on the test data:
#
table(y.te, nuggets.knn)

##########################################################################
#
# The regions
#
#
# What do the regions look like?
# Redo the above but predict on the entire grid of points.
# (Note that knn does the prediction, so there is no "predict" function
#  for knn as we have seen for others.)
#
nuggets.knn1 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                    k = 1, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=1 regions for nuggets data")
points(xgrid[nuggets.knn1 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn1 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# How about when k = 3?
#
nuggets.knn3 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                    k = 3, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=3 regions for nuggets data")
points(xgrid[nuggets.knn3 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn3 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# How about when k = 5?
#
nuggets.knn5 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                    k = 5, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=5 regions for nuggets data")
points(xgrid[nuggets.knn5 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn5 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# How about when k = 10?
#
nuggets.knn10 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                     k = 10, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=10 regions for nuggets data")
points(xgrid[nuggets.knn10 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn10 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# Note how the regions change with k, generally removing smaller populated
# regions.
# Note however also how large k can produce more regions in areas where
# the data are sparse.
#

#############################################################
#
# How might we choose the value of k?
#
#############################################################
#
# There is a function called knn.cv which predicts the class
# of each point in the training data based on the k nearest
# neighbours to itself.  It does not use the `true class' of
# the point itself and so provides the information necessary
# to produce a leave-one-out cross-validation estimate of the
# misclassification rate from the training data alone.
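#
# A small sketch (not part of the original notes): before turning to
# cross-validation, record the test-set misclassification rate of the
# k = 1 fit made earlier (nuggets.knn), so it can be set beside the
# cross-validated estimates computed below.  The name mp1.test is made up
# here, chosen to parallel the mp1, mp3, ... names used below.
#
mp1.test <- sum(nuggets.knn != y.te) / length(y.te)
mp1.test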
#
# For the case of k = 1, the predictions at the training points
# (ignoring the known class value at each point) are obtained from

nuggets.knncv1 <- knn.cv(train = x.tr, cl = y.tr,
                         k = 1, l = 0, prob = TRUE, use.all = TRUE)
nuggets.knncv1

#
# Comparing these leave-one-out predictions to the true values
# allows us to give a cross-validated estimate of the misclassification
# probability, namely

mp1 <- sum(nuggets.knncv1 != y.tr) / length(y.tr)
mp1

#
# Similarly for k = 3
#
nuggets.knncv3 <- knn.cv(train = x.tr, cl = y.tr,
                         k = 3, l = 0, prob = TRUE, use.all = TRUE)
mp3 <- sum(nuggets.knncv3 != y.tr) / length(y.tr)
mp3

#
# ... for k = 5
#
nuggets.knncv5 <- knn.cv(train = x.tr, cl = y.tr,
                         k = 5, l = 0, prob = TRUE, use.all = TRUE)
mp5 <- sum(nuggets.knncv5 != y.tr) / length(y.tr)
mp5

#
# ... and for k = 10
#
nuggets.knncv10 <- knn.cv(train = x.tr, cl = y.tr,
                          k = 10, l = 0, prob = TRUE, use.all = TRUE)
mp10 <- sum(nuggets.knncv10 != y.tr) / length(y.tr)
mp10

#
# And so we might plot these versus k
#
newwindow()
plot(x = c(1, 3, 5, 10),
     y = c(mp1, mp3, mp5, mp10),
     main = "Cross-validated misclassification prob",
     sub  = "nuggets data",
     xlab = "k",
     ylab = "p")
lines(x = c(1, 3, 5, 10), y = c(mp1, mp3, mp5, mp10), lty = 2)

#
# Note BTW what happens when we repeat this for k = 10
#
nuggets.knncv10 <- knn.cv(train = x.tr, cl = y.tr,
                          k = 10, l = 0, prob = TRUE, use.all = TRUE)
mp10 <- sum(nuggets.knncv10 != y.tr) / length(y.tr)
mp10

# and again
nuggets.knncv10 <- knn.cv(train = x.tr, cl = y.tr,
                          k = 10, l = 0, prob = TRUE, use.all = TRUE)
mp10 <- sum(nuggets.knncv10 != y.tr) / length(y.tr)
mp10

# and again
nuggets.knncv10 <- knn.cv(train = x.tr, cl = y.tr,
                          k = 10, l = 0, prob = TRUE, use.all = TRUE)
mp10 <- sum(nuggets.knncv10 != y.tr) / length(y.tr)
mp10

#
# If the vote is tied for any point, the tie is broken at random.
# Because there are only two classes in this case, there will
# occasionally be ties when k is even.
# For the two-class situation, then, we might want to restrict
# consideration to odd values of k so that no ties are possible.
#
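#
# A small sketch (not part of the original notes): the run-to-run changes in
# mp10 above come from the random tie-breaking.  Assuming that randomness is
# drawn from R's usual random number generator, set.seed() makes a run
# reproducible, and repeating the calculation gives a sense of how variable
# any single estimate is.  (The seed value 441 and the number of repetitions
# are arbitrary choices for illustration.)
#
set.seed(441)
mp10.reps <- replicate(25, {
  pred <- knn.cv(train = x.tr, cl = y.tr, k = 10, l = 0,
                 prob = FALSE, use.all = TRUE)
  sum(pred != y.tr) / length(y.tr)
})
mean(mp10.reps)   # an averaged cross-validated estimate for k = 10
sd(mp10.reps)     # variability of the single-run estimates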
############################################################
#
# We could automate this graph with the following function
# (note that the default k avoids even k, which makes some
# sense for the case of two classes):

knn.cvmin <- function (train, cl, k = seq(1, 21, 2), l = 0,
                       use.all = TRUE, plot = TRUE)
{
  n <- length(cl)
  # A vector to store the misclassification probabilities
  mp <- vector("numeric", length(k))
  for (i in 1:length(k)) {
    pred <- knn.cv(train = train, cl = cl, k = k[i], l = l,
                   prob = FALSE, use.all = use.all)
    mp[i] <- sum(pred != cl) / n
  }
  kmin <- order(mp)[1]
  if (plot) {
    plot(x = k, y = mp,
         main = "Cross-validated misclassification probabilities",
         xlab = "k", ylab = "p")
    lines(x = k, y = mp, lty = 2)
  }
  pmin <- mp[kmin]
  # Return the relevant values.
  return(list(k = k, p = mp, kmin = k[mp == pmin], pmin = mp[kmin]))
}

#
# And try it out on the nuggets data
#
newwindow()
results <- knn.cvmin(train = x.tr, cl = y.tr)

#
# Which suggests the value
results$kmin
#
# k = 3, on the grid
#
nuggets.knn3 <- knn(train = x.tr, test = xgrid, cl = y.tr,
                    k = 3, l = 0, prob = TRUE, use.all = TRUE)

newwindow()
plot(0, 0, xlim = x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type = "n",
     main = "Knn k=3 regions for nuggets data")
points(xgrid[nuggets.knn3 == 0, c("x1", "x2")], col = "red", cex = 1.2)
points(xgrid[nuggets.knn3 == 1, c("x1", "x2")], col = "blue", cex = 1.2)

#
# k = 3, on the test data
#
nuggets.knn3te <- knn(train = x.tr, test = x.te, cl = y.tr,
                      k = 3, l = 0, prob = TRUE, use.all = TRUE)
table(y.te, nuggets.knn3te)
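#
# A small sketch (not part of the original notes): the test-set
# misclassification rate for k = 3, to set beside the cross-validated
# estimate mp3 obtained earlier (mp3.test is a made-up name parallel
# to mp1.test above).
#
mp3.test <- sum(nuggets.knn3te != y.te) / length(y.te)
mp3.test
mp3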