#  This file  contains multinomial and nnet discrimination with c > 2 classes
#      
#  Authors:
#       R.W. Oldford, 2004
#
#

# quartz() is the Mac OS call for a new window.
# Change the call inside from quartz() to whatever the call is
# for the OS you are using. e.g. X11() or pdf(), etc.

newwindow <- function(x) {quartz()}

#

library(MASS)
library(nnet)
data(iris3)

#
# We need to construct an appropriate "data frame" for
# the iris data to be used by the various modelling
# functions.
# There are several way in which we might
# proceed.  TWO DIFFERENT approaches will be shown here.
#
#
# First some info common to the two approaches.
#
# Extract the separate Species of the data from the
# original source where they are stored in a three way array. 
# Each species is a slice in that three way array,
# and we would like these as separate matrices to
# begin with. 

setosa <- iris3[,,1]
versicolor <- iris3[,,2]
virginica <- iris3[,,3]

# Put these together into a single array

iris <- rbind(setosa,versicolor,virginica)

# Construct some Species labels
#

Species <- c(rep("Setosa",nrow(setosa)),
             rep("Versicolor",nrow(versicolor)),
             rep("Virginica",nrow(virginica))
            )

# some meaningful row labels

rownames(iris) <-c(paste("Setosa", 1:nrow(setosa)), 
                   paste("Versicolor", 1:nrow(versicolor)), 
                   paste("Virginica", 1:nrow(virginica)))
#
# And to be perverse, a more meaningful collection
# of variate names
#

irisVars <- c("SepalLength", "SepalWidth", 
                        "PetalLength", "PetalWidth")

#
# which can replace the column names we started with
#

colnames(iris) <- irisVars

#
# Now  to construct the data frame(s)
#

#  METHOD 1

iris.df1 <- data.frame(iris, Species = Species)

#  METHOD 2

iris.df2 <- data.frame(cbind(iris, Species = factor(Species)))


# plot the data

pairs(iris.df1, main = "Anderson's Iris Data -- 3 species",
            col = c("red", "green3", "blue")[iris.df1[,"Species"]])

# or equivalently,


pairs(iris.df2, main = "Anderson's Iris Data -- 3 species",
            col = c("red", "green3", "blue")[iris.df2[,"Species"]])

# or to avoid the Species appearing,

newwindow()
pairs(~ PetalWidth + PetalLength  + SepalWidth + SepalLength, 
      data = iris.df1, main = "Anderson's Iris Data -- 3 species",
      col = c("red", "green3", "blue")[iris.df1[,"Species"]])

# or how about,
newwindow()
pairs(cbind(iris.df1[,1:4], 
            sqrt(iris.df1[,1]*iris.df1[,2]),  
            sqrt(iris.df1[,3]*iris.df1[,4]) ) ,
      labels = c(colnames(iris.df1[,1:4]), 
                 "Sqrt(SepalArea)", 
                 "Sqrt(PetalArea)"),
      main = "Anderson's Iris Data -- 3 species",
      col = c("red", "green3", "blue")[iris.df1[,"Species"]])

#
# Select a training sample
# (Say half the data)
# Note that unlike other authors, I choose to randomly
# sample from the whole data set rather than stratify by the
# Species.  The thinking here is to produce a training and
# a test set according to how the data might arrive rather than
# to have two sets which most resemble the whole dataset.
#

set.seed(2568)
n <- nrow(iris.df1)
train <- sort(sample(1:n, floor(n/2)))

# Training data will be:

iris.train1 <- iris.df1[train,]
# or equivalently
iris.train2 <- iris.df2[train,]

# negate the indices to get the test data:
iris.test1 <- iris.df1[-train,]
# or equivalently
iris.test2 <- iris.df2[-train,]

# Training data look like:

newwindow()
pairs(~ PetalWidth + PetalLength  + SepalWidth + SepalLength, 
      data = iris.train1, main = "Iris Training Data -- 3 species",
      col = c("red", "green3", "blue")[iris.train1[,"Species"]])

# Test data look like:

newwindow()
pairs(~ PetalWidth + PetalLength  + SepalWidth + SepalLength, 
      data = iris.test1, main = "Iris Test Data -- 3 species",
      col = c("red", "green3", "blue")[iris.test1[,"Species"]])

#
#  Fit a multinomial model
#

iris.mn <- multinom(formula = Species ~ PetalWidth + PetalLength  +
                                        SepalWidth + SepalLength, 
                          data = iris.train1, 
                          maxit = 1000
                         )

summary(iris.mn)

phat <- predict(iris.mn, iris.train1[,1:4], type="probs")

pred.tr <- apply(phat,1,
                     function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )


table(c("Setosa", "Versicolor","Virginica")[iris.train1[,"Species"]], pred.tr)

#
# A picture
#

pred.class <- apply(phat,1,
                     function(x){
                                c("Setosa", 
                                  "Versicolor", 
                                  "Virginica")[x==max(x)]
                                }
                       )
                       
point.symbol <- apply(matrix( pred.class== iris.train1[,"Species"]), 
                    1,
                    function(x){if (x) 1 else 3}
                    )
                    
#
# The above would have to be modified for iris.df2 because there
# Species has numeric valules.  Instead, something like:
# 

point.symbol <- apply(matrix( pred.class == 
                          c("Setosa", "Versicolor", "Virginica")[iris.train2[,"Species"]]), 
                    1,
                    function(x){if (x) 1 else 3}
                    )
                    
# would be necessary.

pred.col <- apply(phat,1,
                     function(x){
                                c("red", "green3", "blue")[x==max(x)]
                                }
                       )
newwindow()
pairs(~ PetalWidth + PetalLength  + SepalWidth + SepalLength, 
      data = iris.train1, 
      main = "Iris Training Data -- predicted colours (mistakes are +)",
      col = pred.col ,
      cex = 2,
      pch = point.symbol)

#
# Try on the test data
#

phat.te <- predict(iris.mn, iris.test1[,1:4], type="probs")

pred.te <- apply(phat.te,1,
                     function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )
#
# Alternatively we could have used "class" for here instead of the apply
# function.
#

predict(iris.mn, iris.test1[,1:4], type="class")


table(c("Setosa", "Versicolor","Virginica")[iris.test1[,"Species"]], pred.te)

#
# A picture
#

pred.class <- apply(phat.te,1,
                     function(x){
                                c("Setosa", 
                                  "Versicolor", 
                                  "Virginica")[x==max(x)]
                                }
                       )

#
#  See previous comment about iris.df2  to adjust the following
#
point.symbol <- apply(matrix( pred.class== iris.test1[,"Species"]), 
                    1,
                    function(x){if (x) 1 else 3}
                    )

pred.col <- apply(phat.te,1,
                     function(x){
                                c("red", "green3", "blue")[x==max(x)]
                                }
                       )
newwindow()
pairs(~ PetalWidth + PetalLength  + SepalWidth + SepalLength, 
      data = iris.test1, 
      main = "Iris Test Data -- predicted colours (mistakes are +)",
      col = pred.col ,
      cex = 2,
      pch = point.symbol)

#
#  Except where otherwise noted,
#  the above should work identically with iris.df2, etc.
#  i.e. with the second method of constructing the data frame.
#
#

#
# A new multinomial model
# Include the two "area terms"
#

iris.mn1 <- multinom(formula = Species ~ PetalWidth * PetalLength  +
                                        SepalWidth * SepalLength, 
                          data = iris.df1, 
                          maxit = 1000
                         )

summary(iris.mn1)

phat1 <- predict(iris.mn1, iris.train1[,1:4], type="probs")

pred1.tr <- apply(phat1,1,
                     function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )

table(c("Setosa", "Versicolor","Virginica")[iris.train1[,"Species"]], pred1.tr)

#
# Try on the test data
#

phat1.te <- predict(iris.mn1, iris.test1[,1:4], type="probs")

pred1.te <- apply(phat1.te,1,
                     function(x){
                                c("Setosa pred", 
                                  "Versicolor pred", 
                                  "Virginica pred")[x==max(x)]
                                }
                       )


table(c("Setosa", "Versicolor","Virginica")[iris.test1[,"Species"]], pred1.te)

#
# A picture
#

pred.class <- apply(phat1.te,1,
                     function(x){
                                c("Setosa", 
                                  "Versicolor", 
                                  "Virginica")[x==max(x)]
                                }
                       )

#
#  Note the difference for df2 again (as before) for the following:
#

point.symbol <- apply(matrix( pred.class== iris.test1[,"Species"]), 
                    1,
                    function(x){if (x) 1 else 3}
                    )

pred.col <- apply(phat1.te,1,
                     function(x){
                                c("red", "green3", "blue")[x==max(x)]
                                }
                       )
newwindow()
pairs(~ PetalWidth + PetalLength  + SepalWidth + SepalLength, 
      data = iris.test1, 
      main = "Iris Test Data -- predicted colours (mistakes are +)",
      col = pred.col ,
      cex = 2,
      pch = point.symbol)


###############################################################
#
# A neural net model  (Done in two ways ... see help("nnet")
#


# First way with iris.df1

iris.nnet1 <- nnet(Species ~ . , data = iris.df1, 
                   subset = train,
                   size=4,
                   skip = TRUE,
                   decay=1.0E-2,
                   maxit = 1000) 

#
#  Note the multiple outoput nodes and the direct connection
#  of the input to the output (skip layer). 
#  Note also that the subset argument is used together 
#  with the whole of the dataset ... see help("nnet")
#
summary(iris.nnet1)

#
# Look at the predicted values for the test data
#


table(iris.df1$Species[-train], 
      predict(iris.nnet1, iris.df1[-train,], type = "class")
      )
      

#
# IMPORTANT NOTE: The above does not seeem to work for iris.df2
#

# Try it with iris.df2

iris.nnet2 <- nnet(Species ~ . , data = iris.df2, 
                   subset = train,
                   size=4,
                   skip = TRUE,
                   decay=1.0E-2,
                   maxit = 1000) 

# The above works but seems to converge at a much higher value
# of the criterion.
#
# Indeed,

summary(iris.nnet2)

# shows that there is only one out put node.
# And the following
#
#  predict(iris.nnet2, iris.df2[-train,], type = "class")
#
#  fails (hence commented out here).  As does
#
#  predict(iris.nnet2, iris.df2[-train,], type = "probs")
#
#  The only predictions available are the following:

predict(iris.nnet2, iris.df2[-train,], type = "raw")

# which do not appear to be of much use (single output, all near 1)
#
#  Instead, we have to build the model as follows with iris.df2
#

#
#  First we need to produce multi-valued categorical data for
#  Species.
#  "targets" will be a matrix of three columns full of 1s and 0s
#  to indicate whether the 

targets <- class.ind(Species)

#
#  The nnet is fit using the data from the data frame
#  just as if it were a plain matrix rather than the
#  richer data frame structure.
#
#  IMPORTANT NOTE.  We do not use the subset argument here.
#                   Rather, we subset the data directly *before*
#                   nnet handles it.

iris.nnet2 <- nnet(iris.df2[train,1:4], 
                   targets[train,],
                   size=4,
                   skip = TRUE,
                   decay=1.0E-2,
                   maxit = 1000) 

#  RELATED TO THE ABOVE NOTE:
#  The following fits the whole of the data, the
#  subset argument is ignored.  Try it.
#
set.seed(12345)
iris.nnet2a <- nnet(iris.df2[,1:4], 
                   targets,
                   subset = train,    #  <-- ignored
                   size=4,
                   skip = TRUE,
                   decay=1.0E-2,
                   maxit = 1000) 

set.seed(12345)
iris.nnet2b <- nnet(iris.df2[,1:4], 
                   targets,
                   size=4,
                   skip = TRUE,
                   decay=1.0E-2,
                   maxit = 1000) 


#
#  OK,  back to the model for df2
#

summary(iris.nnet2)

#
#  Which has a network of the correct toplogy.
#
#  To get predictions on its test data takes
#  a little more work; the following, for example, 
#  fails:
#
#      predict(iris.nnet2, iris.df2[-train,], type = "class")
#
#  as does
#
#      predict(iris.nnet2, iris.df2[-train,], type = "probs")
#
#  We must instead use

predict(iris.nnet2, iris.df2[-train,], type = "raw")

# which contains the probability estimates of the three classes.
#
#  Or, equivalently, 
#

predict(iris.nnet2, iris.df2[-train,])

#
#  We can find out which column has the largest value
#  for each row:
#  

pred <- max.col(predict(iris.nnet2, iris.df2[-train,], type = "raw"))

#
# Similarly for the true values we have
#

true <- max.col(targets[-train,])

#
#  And so, build our classification table as follows
#

table(c("Setosa", "Versicolor", "Virginica")[true], 
      c("Pred Setosa", "Pred Versicolor", "Pred Virginica")[pred] )