# This file contains some dimension reduction methods illustrated on # the Checker data from Will Welch's notes Chapt. 1 # # Authors: # R.W. Oldford, 2004 # # quartz() is the Mac OS call for a new window. # Change the call inside from quartz() to whatever the call is # for the OS you are using. e.g. X11() or pdf(), etc. newwindow <- function(x) {quartz()} # just a handy function to capture the pathname for the files we will load. web441 <- function(x) {paste('http://www.undergrad.math.uwaterloo.ca/~stat441/R-code/', x, sep='') } # (in class we just loaded this file since the # machine wasn't connected to the internet). # # Get the data: source(web441('checker.R')) # plot it newwindow() x1.range <- range(checker.train[,"x1"]) x2.range <- range(checker.train[,"x2"]) plot(0,0,xlim=x1.range, ylim = x2.range, xlab = "x1", ylab = "x2", type ="n") points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0", col="red", cex=1.2) points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+", col="blue", cex=1.2) # # Find the directions for the principal components # via the singular value decomposition of the X matrix # # First centre and scale the data checker.x <- checker.train[,c("x1","x2")] checker.xbar <- apply(checker.x,2,mean) checker.sd <- apply(checker.x,2,sd) checker.x01 <- (checker.x - matrix(checker.xbar, nrow(checker.x), ncol(checker.x), byrow = TRUE) ) / matrix(checker.sd, nrow(checker.x), ncol(checker.x), byrow = TRUE) # plot the standardized (zero mean, sd = 1) newwindow() x1.range <- range(checker.x01[,"x1"]) x2.range <- range(checker.x01[,"x2"]) plot(0,0,xlim=x1.range, ylim = x2.range, xlab = "x1", ylab = "x2", type ="n") points(checker.x01[checker.train[,"y"] == 0, c("x1", "x2")], pch="0", col="red", cex=1.2) points(checker.x01[checker.train[,"y"] == 1, c("x1", "x2")], pch="+", col="blue", cex=1.2) # # Now let's look for the principal directions # checker.svd <- svd (checker.x01[,c("x1","x2")]) u <- checker.svd$u d <- checker.svd$d v <- checker.svd$v X <- checker.x01 # # Note that # u %*% diag(d) %*% t(v) # # equals # X # # the difference is zero (subject to floating point arithmetic) # X - u %*% diag(d) %*% t(v) # # Note that the values of d are the square roots of the eigen values # of t(X) %*% X and so are proportional to variances olong the # principal axes. # The proportion of the total variance explained by only the first principal # direction is # d2 <- d^2 d[1]^2/sum(d2) # # Or to do this for every direction sumulatively summing to get # the proportion of the total variance explained by the principal # directions up to that point cumsum(d2/sum(d2)) # I (rwo) think that ratios of standard deviations are more meaningful # (statistically and geometrically) and so would rather look at # d / max(d) # to give me an idea of the relative spreads of the data in the # different principal directions. # We might call these the standardized singular values. # These values kind of describe the dimensionality of the data. # as in sum(d/max(d)) # More interesting is to identify the point at which the # spread drops dramatically or where it drops beyond some # threshold (e.g. 1/30). An interesting plot might be # newwindow() par(mfcol=c(1,2)) plot(0,0,xlim=c(1,length(d)), ylim = c(0,1), main = "Standardized singular values", xlab = "i", ylab = "sing. val", type ="n") points(1:length(d), d / max(d)) lines(1:length(d), d/max(d), lty = 2) plot(0,0,xlim=c(1,length(d)), ylim = c(0,1), main = "Proportion of total variance", xlab = "i", ylab = "Cumulative proportion", type ="n") points(1:length(d),cumsum(d2/sum(d2))) lines(1:length(d), cumsum(d2/sum(d2)), lty = 2) # # The data in the new coordinate system are # xpc <- u %*% diag(d) newwindow() par(mfcol=c(1,1)) plot(0,0,xlim = range(xpc[,1]), ylim = range(xpc[,2]), type ="n", main = "First two principal components", xlab = "pc 1", ylab = "pc2") points(xpc[checker.train[,"y"] == 0,], pch="0", col="red", cex=1.2) points(xpc[checker.train[,"y"] == 1, ], pch="+", col="blue", cex=1.2)