#Stat Learning and Data Mining
#Example 6.1: Use of random forest.

#install.packages("randomForest")
library(randomForest)

################RF of classification trees (similar for RF of regression trees):
data(iris)

###generate an RF:
#training data: n=100; test data: n=50
set.seed(022104)
tr <- sample(1:150, 100)
ir.rf <- randomForest(Species ~ ., data=iris[tr,], ntree=100, mtry=2, nodesize=1,
                      importance=T)

##########Input: among others,
#     ntree: Number of trees to grow. This should not be set to too small
#            a number, to ensure that every input row gets predicted at
#            least a few times.
#      mtry: Number of variables randomly sampled as candidates at each
#            split. Note that the default values are different for
#            classification (sqrt(p), where p is the number of variables in
#            'x') and regression (p/3).
#  nodesize: Minimum size of terminal nodes. Setting this number larger
#            causes smaller trees to be grown (and thus take less time).
#            Note that the default values are different for classification (1)
#            and regression (5).
#importance: Should importance of predictors be assessed?

#########Returned value: among others,
#     An object of class 'randomForest', which is a list with the
#     following components:
#      call: the original call to 'randomForest'.
#      type: one of 'regression', 'classification', or 'unsupervised'.
# predicted: the predicted values of the input data based on out-of-bag
#            samples.
#importance: a matrix with 'nclass' + 2 (for classification) or two (for
#            regression) columns. For classification, the first 'nclass'
#            columns are the class-specific measures computed as mean
#            decrease in accuracy. The 'nclass' + 1st column is the mean
#            decrease in accuracy over all classes. The last column is
#            the mean decrease in Gini index. For regression, the first
#            column is the mean decrease in accuracy and the second the
#            mean decrease in MSE. If 'importance=FALSE', the last measure
#            is still returned as a vector.

ir.rf
#Call:
# randomForest.formula(x = Species ~ ., data = iris[tr, ], ntree = 100, mtry = 2, nodesize = 1, importance = T)
#               Type of random forest: classification
#                     Number of trees: 100
#No. of variables tried at each split: 2
#
#        OOB estimate of error rate: 5%
#Confusion matrix:
#           setosa versicolor virginica class.error
#setosa         39          0         0  0.00000000
#versicolor      0         28         2  0.06666667
#virginica       0          3        28  0.09677419

#importance of predictors:
ir.rf$importance
#                  setosa versicolor  virginica MeanDecreaseAccuracy MeanDecreaseGini
#Sepal.Length 0.047338180 0.04280001 0.05037952          0.045145177         8.097233
#Sepal.Width  0.004380952 0.02358514 0.00454354          0.009634833         1.332693
#Petal.Length 0.314906637 0.25346277 0.30725495          0.289167170        28.605365
#Petal.Width  0.259779248 0.27519991 0.35352016          0.288425856        27.655176

pdf("C:/Users/panxx014/Documents/courses/7475/Examples/figs/ex6.1.pdf")
#par(mfrow=c(1,3))
#but par() cannot control varImpPlot()--it will plot on the next page anyway!

#plot the OOB misclassification error rates (overall and per class) vs the number of trees:
plot(ir.rf)
#draw the importance of predictors:
varImpPlot(ir.rf)
dev.off()

predict(ir.rf, newdata=iris[-tr,], type="prob")
#    type: one of 'response', 'prob', or 'votes', indicating the type of
#          output: predicted values, matrix of class probabilities, or
#          matrix of vote counts. 'class' is allowed, but automatically
#          converted to 'response', for backward compatibility.
#   setosa versicolor virginica
#6    1.00       0.00      0.00
#13   1.00       0.00      0.00
#15   0.96       0.04      0.00
#16   0.98       0.02      0.00
#29   1.00       0.00      0.00
#......
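
#####A small added sketch (not part of the original example): convert the
#predicted class probabilities above into hard labels and compute the test
#error rate. The object names prob.te, pred.te, and err.te are illustrative.
prob.te <- predict(ir.rf, newdata=iris[-tr,], type="prob")
pred.te <- colnames(prob.te)[max.col(prob.te)]  #class with the largest probability
err.te <- mean(pred.te != iris$Species[-tr])    #proportion misclassified on the test set
err.te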
table(iris$Species[-tr], predict(ir.rf, iris[-tr,], type="response"))
#             setosa versicolor virginica
#  setosa         11          0         0
#  versicolor      0         19         1
#  virginica       0          2        17

#a much larger forest gives the same test-set confusion matrix here:
ir.rfL <- randomForest(Species ~ ., data=iris[tr,], ntree=10000, mtry=2)
table(iris$Species[-tr], predict(ir.rfL, iris[-tr,], type="response"))
#             setosa versicolor virginica
#  setosa         11          0         0
#  versicolor      0         19         1
#  virginica       0          2        17

#...as does using mtry=1:
ir.rfL <- randomForest(Species ~ ., data=iris[tr,], ntree=10000, mtry=1)
table(iris$Species[-tr], predict(ir.rfL, iris[-tr,], type="response"))
#             setosa versicolor virginica
#  setosa         11          0         0
#  versicolor      0         19         1
#  virginica       0          2        17

#####you can extract the kth tree by
getTree(ir.rf, k=2)
#which returns the tree in a matrix format with the following columns:
# left daughter, right daughter, split var, split point, status, prediction

#####you can also add a few more trees into a random forest by
ir.rf2 <- grow(ir.rf, 100)
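
#####A small added sketch (not part of the original example): instead of
#comparing test-set confusion matrices across mtry values as above, one can
#compare OOB error rates via the err.rate component of a fitted forest;
#the names oob.err and m below are illustrative.
set.seed(022104)
oob.err <- sapply(1:4, function(m) {
  rf <- randomForest(Species ~ ., data=iris[tr,], ntree=500, mtry=m)
  rf$err.rate[500, "OOB"]  #OOB error rate using all 500 trees
})
oob.err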