#  This file contains some dimension reduction methods illustrated on 
#  the iris data
#      
#  Authors:
#       R.W. Oldford, 2004
#

# quartz() is the Mac OS call for a new window.
# Change the call inside from quartz() to whatever the call is
# for the OS you are using. e.g. X11() or pdf(), etc.

newwindow <- function(x) {quartz()}

#
# Get the data:
#
library(iris3)


#
#  Now we will build the iris data frame just as was done
# in the file web441("iris-multinom.R")
#
#  Of the two methods of building a data frame described there,
#  The first will be used here.
#
# Each species is a slice in a three way array,
# which will be separated into three matrices to
# begin with. 

setosa <- iris3[,,1]
versicolor <- iris3[,,2]
virginica <- iris3[,,3]

# Put these together into a single array

iris <- rbind(setosa,versicolor,virginica)

# Construct some Species labels
#

Species <- c(rep("Setosa",nrow(setosa)),
             rep("Versicolor",nrow(versicolor)),
             rep("Virginica",nrow(virginica))
            )

# some meaningful row labels

rownames(iris) <-c(paste("Setosa", 1:nrow(setosa)), 
                   paste("Versicolor", 1:nrow(versicolor)), 
                   paste("Virginica", 1:nrow(virginica)))
#
# And to be perverse, a more meaningful collection
# of variate names
#

irisVars <- c("SepalLength", "SepalWidth", 
                        "PetalLength", "PetalWidth")

#
# which can replace the column names we started with
#

colnames(iris) <- irisVars

#
# Now  to construct the data frame
#

iris.df <- data.frame(iris, Species = Species)

# Select a training set (Say half the data)
# Note that unlike other authors, I choose to randomly
# sample from the whole data set rather than stratify by the
# Species.  The thinking here is to produce a training and
# a test set according to how the data might arrive rather than
# to have two sets which most resemble the whole dataset.
#

set.seed(2568)
n <- nrow(iris.df)
train <- sort(sample(1:n, floor(n/2)))

# Training data will be:

iris.train <- iris.df[train,]

# negate the indices to get the test data:
iris.test <- iris.df[-train,]

# plot the data
xvars <- 1:4
newwindow()
pairs(iris.df[,xvars], main = "Anderson's Iris Data -- 3 species",
            col = c("red", "green3", "blue")[iris.df[,"Species"]])


#
# Find the directions for the principal components
# via the singular value decomposition of the X matrix
#
# First centre and scale the data

iris.x <- iris.train[,xvars]
iris.xbar <- apply(iris.x,2,mean)
iris.sd <- apply(iris.x,2,sd)
iris.x01 <- (iris.x - 
                matrix(iris.xbar,
                       nrow(iris.x), 
                       ncol(iris.x), 
                       byrow = TRUE)  ) /
                                matrix(iris.sd,
                                       nrow(iris.x), 
                                       ncol(iris.x), 
                                       byrow = TRUE)


# plot the standardized  (zero mean, sd = 1)
newwindow()
pairs(iris.x01[,xvars], main = "Anderson's Iris Data (standardized) -- 3 species",
            col = c("red", "green3", "blue")[iris.train[,"Species"]])

#
# Now let's look for the principal directions
#
iris.svd <- svd (iris.x01[,xvars])

u <- iris.svd$u
d <- iris.svd$d
v <- iris.svd$v

X <- iris.x01
#
# Note that
#

u %*% diag(d) %*% t(v)

#
# equals
# 

X

#
# the difference is zero (subject to floating point arithmetic)
#

X  - u %*% diag(d) %*% t(v)

#
#  Note that the values of d are the square roots of the eigen values
#  of t(X) %*% X  and so are proportional to variances olong the
#  principal axes.
#  The proportion of the total variance explained by only the first principal
#  direction is
#


d2 <- d^2
d[1]^2/sum(d2)

#
#  Or to do this for every direction sumulatively summing to get
#  the proportion of the total variance explained by the principal
#  directions up to that point

cumsum(d2/sum(d2))

#  I (rwo) think that ratios of standard deviations are more meaningful
#  (statistically and geometrically) and so would rather look at
#

d / max(d)

#  to give me an idea of the relative spreads of the data in the
#  different principal directions.
#  We might call these the standardized singular values.
#  These values kind of describe the dimensionality of the data.
#  as in 
sum(d/max(d))
#  More interesting is to identify the point at which the 
#  spread drops dramatically or where it drops beyond some
#  threshold (e.g. 1/30).  An interesting plot might be
#  

newwindow()
par(mfcol=c(1,2))
plot(0,0,xlim=c(1,length(d)), ylim = c(0,1),
     main = "Standardized singular values", 
    xlab = "i", ylab = "sing. val", type ="n")
points(1:length(d), d / max(d))
lines(1:length(d), d/max(d), lty = 2)
plot(0,0,xlim=c(1,length(d)), ylim = c(0,1),
     main = "Proportion of total variance", 
    xlab = "i", ylab = "Cumulative proportion", type ="n")
points(1:length(d),cumsum(d2/sum(d2)))
lines(1:length(d), cumsum(d2/sum(d2)), lty = 2)

#
#  Might get away with one or two of the principal components.
#
#
#  The data in the new coordinate system are
#
xpc <- u %*% diag(d)
colnames(xpc) <- paste("PC", 1:ncol(xpc),sep="_")
newwindow()
par(mfcol=c(1,1))
pairs(xpc, main = "Anderson's Iris Data (principal components)-- 3 species",
            col = c("red", "green3", "blue")[iris.train[,"Species"]])


#
#  Let's look at the contents of the vectors that correspond to the first
#  two principal components

v1 <- v[,1]
v2 <- v[,2]

#
#  We can associate the weights with the variates through the
#  following formatting (which is less complicated than it looks)

cat("First principal component: ",
    paste(prettyNum(v1[1:(length(v1)-1)], digits = 2), 
          irisVars[1:(length(v1)-1)], 
          "+"), 
    paste(prettyNum(v1[length(v1)], digits = 2), 
          irisVars[length(v1)])
    )

# looks like a difference on sepal dims (approx) + an average on the petals

cat("Second principal component: ",
    paste(prettyNum(v2[1:(length(v2)-1)], digits = 2), 
          irisVars[1:(length(v2)-1)], 
          "+"), 
    paste(prettyNum(v2[length(v2)], digits = 2), 
          irisVars[length(v2)])
    )

# which seems to be mostly a sepal size.

#####################################
#
#  Try fitting a multinomial model using only the first two pcs
#

#
#  Fit a multinomial model
#

library(MASS)
library(nnet)

iris.pc <- data.frame(xpc, Species = Species[train])

fitpc <- multinom(formula = Species ~ PC_1 + PC_2, 
                          data = iris.pc, 
                          maxit = 1000
                         )

summary(fitpc)

phat <- predict(fitpc, iris.pc[,1:4], type="probs")

pred.tr <- apply(phat,1,
                     function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )


table(c("Setosa", "Versicolor","Virginica")[iris.pc[,"Species"]], pred.tr)

#
#  On the test data
# 

iris.xte <- iris.test[,xvars]
iris.x01te <- (iris.xte - 
                matrix(iris.xbar,
                       nrow(iris.xte), 
                       ncol(iris.xte), 
                       byrow = TRUE)  ) /
                                matrix(iris.sd,
                                       nrow(iris.xte), 
                                       ncol(iris.xte), 
                                       byrow = TRUE)


xpc.te <- as.matrix(iris.x01te) %*% v
colnames(xpc.te) <- colnames(xpc)

phat.te <- predict(fitpc, xpc.te[,1:4], type="probs")

pred.te <- apply(phat.te,1,
                     function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )


table(c("Setosa", "Versicolor","Virginica")[iris.test[,"Species"]], pred.te)

##############
#
#  Compare this to a multinomial model based on all 4 original variates
#

iris.mn4 <- multinom(formula = Species ~ PetalWidth + PetalLength  +
                                        SepalWidth + SepalLength, 
                          data = iris.train, 
                          maxit = 1000
                         )

#
#  On the training data
pred4 <- apply(predict(iris.mn4, iris.train[,1:4], type="probs"),
                  1,function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )

#
#  and to a multinomial model based on 2 of the original variates
#

iris.mn2 <- multinom(formula = Species ~ PetalWidth + PetalLength, 
                          data = iris.train, 
                          maxit = 1000
                         )

#
#  On the training data
pred2 <- apply(predict(iris.mn2, iris.train[,1:4], type="probs"),
                  1,function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )


#
#  Using 4 variates
table(c("Setosa", "Versicolor","Virginica")[iris.train[,"Species"]], pred4)
#
#  Using 2 variates
table(c("Setosa", "Versicolor","Virginica")[iris.train[,"Species"]], pred2)
#
# Using the first two pcs
table(c("Setosa", "Versicolor","Virginica")[iris.pc[,"Species"]], pred.tr)

#
#  On the test data
pred4 <- apply(predict(iris.mn4, iris.test[,1:4], type="probs"),
                  1,function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )
#
#  On the test data
pred2 <- apply(predict(iris.mn2, iris.test[,1:4], type="probs"),
                  1,function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )

#
# Using 4 variates
table(c("Setosa", "Versicolor","Virginica")[iris.test[,"Species"]], pred4)
#
# Using 2 variates
table(c("Setosa", "Versicolor","Virginica")[iris.test[,"Species"]], pred2)
#
# Using the first two pcs
table(c("Setosa", "Versicolor","Virginica")[iris.test[,"Species"]], pred.te)