#  This file contains some dimension reduction methods illustrated on 
#  the Checker data from Will Welch's notes Chapt. 1
#      
#  Authors:
#       R.W. Oldford, 2004
#

# quartz() is the Mac OS call for a new window.
# Change the call inside from quartz() to whatever the call is
# for the OS you are using. e.g. X11() or pdf(), etc.

newwindow <- function(x) {quartz()}

# just a handy function to capture the pathname for the files we will load.
web441 <- function(x)
         {paste('http://www.undergrad.math.uwaterloo.ca/~stat441/R-code/',
                 x,
                 sep='')
         }

# (in class we just loaded this file since the 
#  machine wasn't connected to the internet).
#
# Get the data:
source(web441('checker.R'))

# plot it
newwindow()
x1.range <- range(checker.train[,"x1"])
x2.range <- range(checker.train[,"x2"])
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.train[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
        col="red", cex=1.2)
points(checker.train[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
        col="blue", cex=1.2)


#
# Find the directions for the principal components
# via the singular value decomposition of the X matrix
#
# First centre and scale the data

checker.x <- checker.train[,c("x1","x2")]
checker.xbar <- apply(checker.x,2,mean)
checker.sd <- apply(checker.x,2,sd)
checker.x01 <- (checker.x - 
                matrix(checker.xbar,
                       nrow(checker.x), 
                       ncol(checker.x), 
                       byrow = TRUE)  ) /
                                matrix(checker.sd,
                                       nrow(checker.x), 
                                       ncol(checker.x), 
                                       byrow = TRUE)


# plot the standardized  (zero mean, sd = 1)
newwindow()
x1.range <- range(checker.x01[,"x1"])
x2.range <- range(checker.x01[,"x2"])
plot(0,0,xlim=x1.range, ylim = x2.range,
     xlab = "x1", ylab = "x2", type ="n")
points(checker.x01[checker.train[,"y"] == 0, c("x1", "x2")], pch="0",
        col="red", cex=1.2)
points(checker.x01[checker.train[,"y"] == 1, c("x1", "x2")], pch="+",
        col="blue", cex=1.2)

#
# Now let's look for the principal directions
#
checker.svd <- svd (checker.x01[,c("x1","x2")])

u <- checker.svd$u
d <- checker.svd$d
v <- checker.svd$v

X <- checker.x01

#
# Note that
#

u %*% diag(d) %*% t(v)

#
# equals
# 

X

#
# the difference is zero (subject to floating point arithmetic)
#

X  - u %*% diag(d) %*% t(v)

#
#  Note that the values of d are the square roots of the eigen values
#  of t(X) %*% X  and so are proportional to variances olong the
#  principal axes.
#  The proportion of the total variance explained by only the first principal
#  direction is
#


d2 <- d^2
d[1]^2/sum(d2)

#
#  Or to do this for every direction sumulatively summing to get
#  the proportion of the total variance explained by the principal
#  directions up to that point

cumsum(d2/sum(d2))

#  I (rwo) think that ratios of standard deviations are more meaningful
#  (statistically and geometrically) and so would rather look at
#

d / max(d)

#  to give me an idea of the relative spreads of the data in the
#  different principal directions.
#  We might call these the standardized singular values.
#  These values kind of describe the dimensionality of the data.
#  as in 
sum(d/max(d))
#  More interesting is to identify the point at which the 
#  spread drops dramatically or where it drops beyond some
#  threshold (e.g. 1/30).  An interesting plot might be
#  


newwindow()
par(mfcol=c(1,2))
plot(0,0,xlim=c(1,length(d)), ylim = c(0,1),
     main = "Standardized singular values", 
    xlab = "i", ylab = "sing. val", type ="n")
points(1:length(d), d / max(d))
lines(1:length(d), d/max(d), lty = 2)

plot(0,0,xlim=c(1,length(d)), ylim = c(0,1),
     main = "Proportion of total variance", 
    xlab = "i", ylab = "Cumulative proportion", type ="n")
points(1:length(d),cumsum(d2/sum(d2)))
lines(1:length(d), cumsum(d2/sum(d2)), lty = 2)

#
#  The data in the new coordinate system are
#
 xpc <- u %*% diag(d)

newwindow()
par(mfcol=c(1,1))
plot(0,0,xlim = range(xpc[,1]), ylim = range(xpc[,2]), type ="n",
     main = "First two principal components", 
     xlab = "pc 1", ylab = "pc2")
points(xpc[checker.train[,"y"] == 0,], pch="0", col="red", cex=1.2)
points(xpc[checker.train[,"y"] == 1, ], pch="+", col="blue", cex=1.2)