# This file contains a recursive partitioning for the Fisher Iris data.
#
#
# Authors:
#    R.W. Oldford, 2004
#
#
# quartz() is the Mac OS call for a new window.
# Change the call inside from quartz() to whatever the call is
# for the OS you are using, e.g. X11() or pdf(), etc.

newwindow <- function(x) {quartz()}

# A handy function to capture the pathname for class files:

web441 <- function(x) {
  paste('http://www.undergrad.math.uwaterloo.ca/~stat441/R-code/', x, sep='')
}
#
#
# Get the data:

data(iris3)
#
# Load the tree fitting/recursive partitioning library.
#
library(rpart)
#
# Now we will build the iris data frame just as was done
# in the file web441("iris-multinom.R").
#
# Of the two methods of building a data frame described there,
# the first will be used here.
#
# Each species is a slice in a three-way array,
# which will be separated into three matrices to
# begin with.

setosa     <- iris3[,,1]
versicolor <- iris3[,,2]
virginica  <- iris3[,,3]

# Put these together into a single matrix.

iris <- rbind(setosa, versicolor, virginica)

# Construct some Species labels.
#
Species <- c(rep("Setosa",     nrow(setosa)),
             rep("Versicolor", nrow(versicolor)),
             rep("Virginica",  nrow(virginica)))

# Some meaningful row labels.

rownames(iris) <- c(paste("Setosa",     1:nrow(setosa)),
                    paste("Versicolor", 1:nrow(versicolor)),
                    paste("Virginica",  1:nrow(virginica)))
#
# And, to be perverse, a more meaningful collection
# of variate names,
#
irisVars <- c("SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
#
# which can replace the column names we started with.
#
colnames(iris) <- irisVars
#
# Now to construct the data frame.
#
iris.df <- data.frame(iris, Species = Species)

# Select a training set (say half the data).
# Note that, unlike other authors, I choose to randomly
# sample from the whole data set rather than stratify by
# Species.  The thinking here is to produce a training and
# a test set according to how the data might arrive, rather
# than to have two sets which most resemble the whole data set.
#
set.seed(2568)
n <- nrow(iris.df)
train <- sort(sample(1:n, floor(n/2)))

# Training data will be:
iris.train <- iris.df[train,]

# Negate the indices to get the test data:
iris.test <- iris.df[-train,]
#
# Build a tree for the training data.
#
iris.rp <- rpart(Species ~ . ,     # formula: . means all other variates
                 data = iris.df,   # data frame
                 subset = train,   # indices of training set
                 method = "class", # classification tree
                 parms = list(split = "information"), # use entropy/deviance
                 maxsurrogate = 0, # since no missing values
                 cp = 0,           # no size penalty
                 minsplit = 5,     # nodes of size 5 (or more) can be split,
                 minbucket = 2     # provided each sub-node contains
                                   # at least 2 observations
                 )

# The above size-related parameters (cp, minsplit, minbucket)
# have little impact on the iris data.
#
#
# The summary (can be huge for large trees).
#
summary(iris.rp)
#
# The classification tree itself - plotted and labelled.
# See help("plot.rpart") and help("text.rpart")
# for parameter details.

newwindow()
plot(iris.rp, uniform = TRUE, compress = TRUE, margin = .2)
text(iris.rp, use.n = TRUE, all = TRUE, fancy = TRUE)
#
# Note that only the petal variates were used
# in the splits.
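#
# As an optional sanity check -- a sketch beyond the original analysis,
# using only objects defined above (the name fit.train is introduced
# here for illustration) -- tabulate the fitted classes against the
# truth on the training data and compute the apparent (resubstitution)
# error rate.  This rate is optimistic; the test-set tables below give
# the honest assessment.
#
fit.train <- predict(iris.rp, newdata = iris.df[train,], type = "class")
table(iris.df$Species[train], fit.train)   # training confusion table
mean(fit.train != iris.df$Species[train])  # apparent error rate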
#
newwindow()
colours <- apply(matrix(as.character(iris.df[,"Species"])), 1,
                 function(x) {if (x == "Setosa") 1
                              else if (x == "Versicolor") 2
                              else 3})
colours <- c("red", "green3", "blue")[colours]
plot(iris.df[train, "PetalWidth"], iris.df[train, "PetalLength"],
     col = colours[train],
     main = "Recursive partitioning regions")
lines(x = c(0, 2.5),     y = c(2.45, 2.45), lty = 1)
lines(x = c(1.65, 1.65), y = c(2.45, 7),    lty = 2)
lines(x = c(0, 1.65),    y = c(4.95, 4.95), lty = 3)
#
# Get the predictions.
#
pred.rp <- predict(iris.rp, newdata = iris.df[-train,], type = "class")
pred.rp
#
# Other predictive info.
#
predict(iris.rp, newdata = iris.df[-train,], type = "prob")
predict(iris.rp, newdata = iris.df[-train,], type = "vector")
predict(iris.rp, newdata = iris.df[-train,], type = "matrix")
#
# Classification table on the test data.
#
table(iris.df$Species[-train], pred.rp)
#
# Some CP (complexity parameter) information.
#
printcp(iris.rp)
newwindow()
plotcp(iris.rp)
#
# Prune this tree.
#
iris.pruned <- prune(iris.rp, cp = 0.1)

# The pruned tree itself.
newwindow()
plot(iris.pruned, compress = TRUE, margin = .2)
text(iris.pruned, use.n = TRUE, all = TRUE, fancy = TRUE)

####################################################
#
# A different tree model
#
####################################################

iris.rpint <- rpart(Species ~ PetalWidth + PetalLength +
                              SepalWidth + SepalLength +
                              sqrt(PetalWidth*PetalLength) +
                              sqrt(SepalWidth*SepalLength),
                                       # formula: includes product terms
                    data = iris.df,    # data frame
                    subset = train,    # indices of training set
                    method = "class",  # classification tree
                    parms = list(split = "information"), # use entropy/deviance
                    maxsurrogate = 0,  # since no missing values
                    cp = 0,            # no size penalty
                    minsplit = 5,      # nodes of size 5 (or more) can be split,
                    minbucket = 2      # provided each sub-node contains
                                       # at least 2 observations
                    )

# Note that rpart cannot handle interaction terms in a formula;
# the * above is an actual multiplication inside sqrt().
#
#
# The summary (can be huge for large trees).
#
summary(iris.rpint)
#
# The classification tree itself - plotted and labelled.
# See help("plot.rpart") and help("text.rpart")
# for parameter details.

newwindow()
plot(iris.rpint, uniform = TRUE, compress = TRUE, margin = .15)
text(iris.rpint, use.n = TRUE, all = TRUE, fancy = TRUE)
#
pred.rpint <- predict(iris.rpint, newdata = iris.df[-train,], type = "class")
#
# Classification table on the test data.
#
table(iris.df$Species[-train], pred.rpint)
#
# Some CP (complexity parameter) information.
#
printcp(iris.rpint)
newwindow()
plotcp(iris.rpint)
#
# Prune this tree.
#
iris.prunint <- prune(iris.rpint, cp = 0.1)
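#
# Rather than reading cp = 0.1 off the printcp()/plotcp() output by
# eye, the cp value can be picked automatically from the cross-validated
# errors stored in the fitted object's cptable.  A minimal sketch (the
# names cpt, cp.best and iris.prunint.cv are introduced here for
# illustration only):
#
cpt <- iris.rpint$cptable    # columns: CP, nsplit, rel error, xerror, xstd
cp.best <- cpt[which.min(cpt[, "xerror"]), "CP"]  # cp minimizing cv error
iris.prunint.cv <- prune(iris.rpint, cp = cp.best)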
# The pruned tree itself.
newwindow()
plot(iris.prunint, compress = TRUE, margin = .2)
text(iris.prunint, use.n = TRUE, all = TRUE, fancy = TRUE)
#
# Again a function only of petal length and width.
#
newwindow()
plot(iris.df[train, "PetalWidth"], iris.df[train, "PetalLength"],
     col = colours[train],
     main = "Recursive partitioning regions (with int)")
lines(x = c(0.8, 0.8), y = c(0, 7), lty = 1)
x <- seq(0.8, 2.5, .1)
y <- (2.725 ^ 2) / x    # boundary where sqrt(PetalWidth*PetalLength) = 2.725
lines(x = x, y = y, lty = 2)
#
# Classification table on the test data.
#
table(iris.df$Species[-train],
      predict(iris.prunint, newdata = iris.df[-train,], type = "class"))

####################################################
#
# A different tree model
#
####################################################

iris.rpint1 <- rpart(Species ~ sqrt(PetalWidth*PetalLength) +
                               sqrt(SepalWidth*SepalLength),
                                        # formula: product terms only
                     data = iris.df,    # data frame
                     subset = train,    # indices of training set
                     method = "class",  # classification tree
                     parms = list(split = "information"), # use entropy/deviance
                     maxsurrogate = 0,  # since no missing values
                     cp = 0,            # no size penalty
                     minsplit = 5,      # nodes of size 5 (or more) can be split,
                     minbucket = 2      # provided each sub-node contains
                                        # at least 2 observations
                     )

# Note that rpart cannot handle interaction terms in a formula;
# the * above is an actual multiplication inside sqrt().
#
#
# The summary (can be huge for large trees).
#
summary(iris.rpint1)
#
# The classification tree itself - plotted and labelled.
# See help("plot.rpart") and help("text.rpart")
# for parameter details.

newwindow()
plot(iris.rpint1, uniform = TRUE, compress = TRUE, margin = .15)
text(iris.rpint1, use.n = TRUE, all = TRUE, fancy = TRUE)
#
pred.rpint1 <- predict(iris.rpint1, newdata = iris.df[-train,], type = "class")
#
# Classification table on the test data.
#
table(iris.df$Species[-train], pred.rpint1)
#
# Some CP (complexity parameter) information.
#
printcp(iris.rpint1)
newwindow()
plotcp(iris.rpint1)
#
# Prune this tree.
#
iris.prunint1 <- prune(iris.rpint1, cp = 0.1)

# The pruned tree itself.
newwindow()
plot(iris.prunint1, compress = TRUE, margin = .2)
text(iris.prunint1, use.n = TRUE, all = TRUE, fancy = TRUE)
#
# Classification table on the test data.
#
table(iris.df$Species[-train],
      predict(iris.prunint1, newdata = iris.df[-train,], type = "class"))
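#
# Finally, to compare the three pruned trees on a common footing, one
# could collect their test-set error rates in one place.  A minimal
# sketch; the helper test.error is introduced here for illustration
# and is not part of the original analysis.
#
test.error <- function(fit) {
  pred <- predict(fit, newdata = iris.df[-train,], type = "class")
  mean(pred != iris.df$Species[-train])   # proportion misclassified
}
sapply(list(pruned   = iris.pruned,
            prunint  = iris.prunint,
            prunint1 = iris.prunint1),
       test.error)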