#PubH Stat Learning and Data Mining
#Example 5.1: Use of CART function tree().

library(tree)

################Classification tree: (similar for a regression tree)

data(iris)
# This famous (Fisher's or Anderson's) iris data set gives the
# measurements in centimeters of the variables sepal length and
# width and petal length and width, respectively, for 50 flowers
# from each of 3 species of iris. The species are _Iris setosa_,
# _versicolor_, and _virginica_.

iris[1:3,]
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1          5.1         3.5          1.4         0.2  setosa
#2          4.9         3.0          1.4         0.2  setosa
#3          4.7         3.2          1.3         0.2  setosa

###Generate a classification tree with tree():
#Usage:
#     tree(formula, data, weights, subset,
#          na.action = na.pass, control = tree.control(nobs, ...),
#          method = "recursive.partition",
#          split = c("deviance", "gini"),
#          model = FALSE, x = FALSE, y = TRUE, wts = TRUE, ...)

ir.tr <- tree(Species ~ ., iris)
ir.tr
#node), split, n, deviance, yval, (yprob)
#      * denotes terminal node
#
# 1) root 150 329.600 setosa ( 0.33333 0.33333 0.33333 )
#   2) Petal.Length < 2.45 50   0.000 setosa ( 1.00000 0.00000 0.00000 ) *
#   3) Petal.Length > 2.45 100 138.600 versicolor ( 0.00000 0.50000 0.50000 )
#     6) Petal.Width < 1.75 54  33.320 versicolor ( 0.00000 0.90741 0.09259 )
#      12) Petal.Length < 4.95 48   9.721 versicolor ( 0.00000 0.97917 0.02083 )
#        24) Sepal.Length < 5.15 5   5.004 versicolor ( 0.00000 0.80000 0.20000 ) *
#        25) Sepal.Length > 5.15 43  0.000 versicolor ( 0.00000 1.00000 0.00000 ) *
#      13) Petal.Length > 4.95 6   7.638 virginica ( 0.00000 0.33333 0.66667 ) *
#     7) Petal.Width > 1.75 46   9.635 virginica ( 0.00000 0.02174 0.97826 )
#      14) Petal.Length < 4.95 6   5.407 virginica ( 0.00000 0.16667 0.83333 ) *
#      15) Petal.Length > 4.95 40  0.000 virginica ( 0.00000 0.00000 1.00000 ) *

summary(ir.tr)
#Classification tree:
#tree(formula = Species ~ ., data = iris)
#Variables actually used in tree construction:
#[1] "Petal.Length" "Petal.Width"  "Sepal.Length"
#Number of terminal nodes:  6
#Residual mean deviance:  0.1253 = 18.05 / 144
#Misclassification error rate: 0.02667 = 4 / 150

#draw the tree and add its labels:
pdf("C:/Users/panxx014/Documents/courses/7475/Examples/figs/ex5.1.pdf")
par(mfrow=c(1,3))
plot(ir.tr)
text(ir.tr)

###Use K-fold CV to select a subtree with the best predictive power:
#Usage:
#     cv.tree(object, rand, FUN = prune.tree, K = 10, ...)

ir.cv <- cv.tree(ir.tr)
ir.cv
#$size
#[1] 6 5 4 3 2 1
#
#$dev
#[1]  39.83629  32.39274  31.59802  53.38336 140.42627 332.72424
#
#$k   ##k is the cost-complexity parameter:
#[1]       -Inf   4.228650   4.717398  15.957916  95.676543 190.954250
#
#$method
#[1] "deviance"
#
#attr(,"class")
#[1] "prune"         "tree.sequence"

#Because the folds are chosen randomly, you can repeat CV several times:
cv.tree(ir.tr)
#$size
#[1] 6 5 4 3 2 1
#
#$dev
#[1]  37.21388  40.16131  39.47645  77.54604 139.79067 332.15956
#
#$k
#[1]       -Inf   4.228650   4.717398  15.957916  95.676543 190.954250
#
#$method
#[1] "deviance"
#
#attr(,"class")
#[1] "prune"         "tree.sequence"

#average the CV deviances over 5 runs to stabilize the results:
for(i in 2:5) ir.cv$dev <- ir.cv$dev + cv.tree(ir.tr)$dev
ir.cv$dev <- ir.cv$dev/5
ir.cv$dev
#[1]  62.45148  58.11907  57.43846  71.81656 142.21535 333.65499

plot(ir.cv)

##specify a subtree by k (or by best=s with s as the size of the subtree):
ir.cv.tr <- prune.tree(ir.tr, k=4.72)
plot(ir.cv.tr); text(ir.cv.tr)

dev.off()

##misclassification counts (on the training data):
misclass.tree(ir.tr)
#[1] 4
misclass.tree(ir.cv.tr)
#[1] 4
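
##A minimal sketch (not in the original notes; best.size and ir.best.tr
##are illustrative names): instead of hand-picking k=4.72 from the plot,
##take the subtree size minimizing the averaged CV deviance and prune
##with best=, as described above.
best.size <- ir.cv$size[which.min(ir.cv$dev)]   #with the devs above: size 4
ir.best.tr <- prune.tree(ir.tr, best=best.size)
misclass.tree(ir.best.tr)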
##doing prediction:
#Usage:
#     predict(object, newdata = list(),
#             type = c("vector", "tree", "class", "where"),
#             split = FALSE, nwts, eps = 1e-3, ...)
#Value:
#     If 'type = "vector"': vector of predicted responses or, if the
#     response is a factor, matrix of predicted class probabilities.
#     This new object is obtained by dropping 'newdata' down 'object'.
#     For factor predictors, if an observation contains a level not used
#     to grow the tree, it is left at the deepest possible node and
#     'frame$yval' or 'frame$yprob' at that node is the prediction.
#
#     If 'type = "tree"': an object of class '"tree"' is returned with
#     new values for 'frame$n' and 'frame$dev'. If 'newdata' does not
#     contain a column for the response in the formula, the value of
#     'frame$dev' will be 'NA', and if some values in the response are
#     missing, some of the deviances will be 'NA'.
#
#     If 'type = "class"': for a classification tree, a factor of the
#     predicted classes (that with the highest posterior probability,
#     with ties split randomly).
#
#     If 'type = "where"': the nodes the cases reach.
#     ......

predict(ir.cv.tr)
#   setosa versicolor  virginica
#1       1 0.00000000 0.00000000
#2       1 0.00000000 0.00000000
#3       1 0.00000000 0.00000000
#...
#50      1 0.00000000 0.00000000
#51      0 0.97916667 0.02083333
#52      0 0.97916667 0.02083333
#53      0 0.97916667 0.02083333
#...

####Finally, there is a default control on how large a tree is grown:
#Usage:
#     tree.control(nobs, mincut = 5, minsize = 10, mindev = 0.01)
#Arguments:
#    nobs: The number of observations in the training set.
#  mincut: The minimum number of observations to include in either
#          child node. This is a weighted quantity; the observational
#          weights are used to compute the 'number'. The default is 5.
# minsize: The smallest allowed node size: a weighted quantity. The
#          default is 10.
#  mindev: The within-node deviance must be at least this times that of
#          the root node for the node to be split.
#Details:
#     This function produces default values of 'mincut' and 'minsize',
#     and ensures that 'mincut' is at most half 'minsize'. To produce a
#     tree that fits the data perfectly, set 'mindev = 0' and
#     'minsize = 2', if the limit on tree depth allows such a tree.
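
##A minimal sketch (not in the original notes; ir.big is an illustrative
##name): relax the default controls as the Details above describe to grow
##a maximal tree that (nearly) perfectly fits the training data.
ir.big <- tree(Species ~ ., iris,
               control=tree.control(nobs=nrow(iris), mindev=0, minsize=2))
summary(ir.big)       #more terminal nodes than the default tree
misclass.tree(ir.big) #expect (close to) 0 training errors, i.e. overfitting;
                      #such a tree would typically be pruned back with
                      #cv.tree()/prune.tree() as above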