#PubH Stat Learning and Data Mining
#Example 5.1: Use of CART function tree().

library(tree)

################Classification tree: (similar for a regression tree)

data(iris)
# This famous (Fisher's or Anderson's) iris data set gives the
# measurements in centimeters of the variables sepal length and
# width and petal length and width, respectively, for 50 flowers
# from each of 3 species of iris. The species are _Iris setosa_,
# _versicolor_, and _virginica_.

iris[1:3,]
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1          5.1         3.5          1.4         0.2  setosa
#2          4.9         3.0          1.4         0.2  setosa
#3          4.7         3.2          1.3         0.2  setosa

###Generate a classification tree with tree():
#Usage:
#     tree(formula, data, weights, subset,
#          na.action = na.pass, control = tree.control(nobs, ...),
#          method = "recursive.partition",
#          split = c("deviance", "gini"),
#          model = FALSE, x = FALSE, y = TRUE, wts = TRUE, ...)

ir.tr <- tree(Species ~ ., iris)
ir.tr
#node), split, n, deviance, yval, (yprob)
#      * denotes terminal node
#
# 1) root 150 329.600 setosa ( 0.33333 0.33333 0.33333 )
#   2) Petal.Length < 2.45 50   0.000 setosa ( 1.00000 0.00000 0.00000 ) *
#   3) Petal.Length > 2.45 100 138.600 versicolor ( 0.00000 0.50000 0.50000 )
#     6) Petal.Width < 1.75 54  33.320 versicolor ( 0.00000 0.90741 0.09259 )
#      12) Petal.Length < 4.95 48   9.721 versicolor ( 0.00000 0.97917 0.02083 )
#        24) Sepal.Length < 5.15 5   5.004 versicolor ( 0.00000 0.80000 0.20000 ) *
#        25) Sepal.Length > 5.15 43  0.000 versicolor ( 0.00000 1.00000 0.00000 ) *
#      13) Petal.Length > 4.95 6   7.638 virginica ( 0.00000 0.33333 0.66667 ) *
#     7) Petal.Width > 1.75 46   9.635 virginica ( 0.00000 0.02174 0.97826 )
#      14) Petal.Length < 4.95 6   5.407 virginica ( 0.00000 0.16667 0.83333 ) *
#      15) Petal.Length > 4.95 40  0.000 virginica ( 0.00000 0.00000 1.00000 ) *

summary(ir.tr)
#Classification tree:
#tree(formula = Species ~ ., data = iris)
#Variables actually used in tree construction:
#[1] "Petal.Length" "Petal.Width"  "Sepal.Length"
#Number of terminal nodes:  6
#Residual mean deviance:  0.1253 = 18.05 / 144
#Misclassification error rate: 0.02667 = 4 / 150

#draw the tree and add its labels:
pdf("C:/Users/panxx014/Documents/courses/7475/Examples/figs/ex5.1.pdf")
par(mfrow=c(1,3))
plot(ir.tr)
text(ir.tr)

###Use K-fold CV to select a subtree with the best predictive power:
#Usage:
#     cv.tree(object, rand, FUN = prune.tree, K = 10, ...)

ir.cv <- cv.tree(ir.tr)
ir.cv
#$size
#[1] 6 5 4 3 2 1
#
#$dev
#[1]  39.83629  32.39274  31.59802  53.38336 140.42627 332.72424
#
#$k   ##k is the cost-complexity parameter:
#[1]       -Inf   4.228650   4.717398  15.957916  95.676543 190.954250
#
#$method
#[1] "deviance"
#
#attr(,"class")
#[1] "prune"         "tree.sequence"

#Because the folds are chosen randomly, you can repeat CV several times:
cv.tree(ir.tr)
#$size
#[1] 6 5 4 3 2 1
#
#$dev
#[1]  37.21388  40.16131  39.47645  77.54604 139.79067 332.15956
#
#$k
#[1]       -Inf   4.228650   4.717398  15.957916  95.676543 190.954250
#
#$method
#[1] "deviance"
#
#attr(,"class")
#[1] "prune"         "tree.sequence"

#average the CV deviances over 5 runs to stabilize the results:
for(i in 2:5) ir.cv$dev <- ir.cv$dev + cv.tree(ir.tr)$dev
ir.cv$dev <- ir.cv$dev/5
ir.cv$dev
#[1]  62.45148  58.11907  57.43846  71.81656 142.21535 333.65499

plot(ir.cv)

##specify a subtree by k (or by best=s with s as the size of the subtree):
ir.cv.tr <- prune.tree(ir.tr, k=4.72)
plot(ir.cv.tr); text(ir.cv.tr)

dev.off()

##misclassification counts (on the training data):
misclass.tree(ir.tr)
#[1] 4
misclass.tree(ir.cv.tr)
#[1] 4
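
##A minimal sketch (not in the original notes; best.size and ir.best.tr
##are illustrative names): instead of hand-picking k=4.72 from the plot,
##take the subtree size minimizing the averaged CV deviance and prune
##with best=, as described above.
best.size <- ir.cv$size[which.min(ir.cv$dev)]   #with the devs above: size 4
ir.best.tr <- prune.tree(ir.tr, best=best.size)
misclass.tree(ir.best.tr)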
##doing prediction:
#Usage:
#     predict(object, newdata = list(),
#             type = c("vector", "tree", "class", "where"),
#             split = FALSE, nwts, eps = 1e-3, ...)
#Value:
#     If 'type = "vector"': vector of predicted responses or, if the
#     response is a factor, matrix of predicted class probabilities.
#     This new object is obtained by dropping 'newdata' down 'object'.
#     For factor predictors, if an observation contains a level not used
#     to grow the tree, it is left at the deepest possible node and
#     'frame$yval' or 'frame$yprob' at that node is the prediction.
#
#     If 'type = "tree"': an object of class '"tree"' is returned with
#     new values for 'frame$n' and 'frame$dev'. If 'newdata' does not
#     contain a column for the response in the formula, the value of
#     'frame$dev' will be 'NA', and if some values in the response are
#     missing, some of the deviances will be 'NA'.
#
#     If 'type = "class"': for a classification tree, a factor of the
#     predicted classes (that with the highest posterior probability,
#     with ties split randomly).
#
#     If 'type = "where"': the nodes the cases reach.
#     ......

predict(ir.cv.tr)
#   setosa versicolor  virginica
#1       1 0.00000000 0.00000000
#2       1 0.00000000 0.00000000
#3       1 0.00000000 0.00000000
#...
#50      1 0.00000000 0.00000000
#51      0 0.97916667 0.02083333
#52      0 0.97916667 0.02083333
#53      0 0.97916667 0.02083333
#...

####Finally, there is a default control on how large a tree is grown:
#Usage:
#     tree.control(nobs, mincut = 5, minsize = 10, mindev = 0.01)
#Arguments:
#    nobs: The number of observations in the training set.
#  mincut: The minimum number of observations to include in either
#          child node. This is a weighted quantity; the observational
#          weights are used to compute the 'number'. The default is 5.
# minsize: The smallest allowed node size: a weighted quantity. The
#          default is 10.
#  mindev: The within-node deviance must be at least this times that of
#          the root node for the node to be split.
#Details:
#     This function produces default values of 'mincut' and 'minsize',
#     and ensures that 'mincut' is at most half 'minsize'. To produce a
#     tree that fits the data perfectly, set 'mindev = 0' and
#     'minsize = 2', if the limit on tree depth allows such a tree.
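
##A minimal sketch (not in the original notes; ir.big is an illustrative
##name): relax the default controls as the Details above describe to grow
##a maximal tree that (nearly) perfectly fits the training data.
ir.big <- tree(Species ~ ., iris,
               control=tree.control(nobs=nrow(iris), mindev=0, minsize=2))
summary(ir.big)       #more terminal nodes than the default tree
misclass.tree(ir.big) #expect (close to) 0 training errors, i.e. overfitting;
                      #such a tree would typically be pruned back with
                      #cv.tree()/prune.tree() as above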