#Stat Learning and Data Mining
#Example 6.1: Use of random forest.

#install.packages("randomForest")
library(randomForest)

################RF of classification trees (similar for RF of regression trees):
data(iris)

###generate an RF:
#training data: n=100; test data: n=50
set.seed(022104)
tr <- sample(1:150, 100)
ir.rf <- randomForest(Species ~ ., data=iris[tr,], ntree=100, mtry=2, nodesize=1,
                      importance=T)

##########Input: among others,
#     ntree: Number of trees to grow. This should not be set to too small
#            a number, to ensure that every input row gets predicted at
#            least a few times.
#      mtry: Number of variables randomly sampled as candidates at each
#            split. Note that the default values are different for
#            classification (sqrt(p), where p is the number of variables in
#            'x') and regression (p/3).
#  nodesize: Minimum size of terminal nodes. Setting this number larger
#            causes smaller trees to be grown (and thus take less time).
#            Note that the default values are different for classification (1)
#            and regression (5).
#importance: Should importance of predictors be assessed?

#########Returned value: among others,
#     An object of class 'randomForest', which is a list with the
#     following components:
#      call: the original call to 'randomForest'.
#      type: one of 'regression', 'classification', or 'unsupervised'.
# predicted: the predicted values of the input data based on out-of-bag
#            samples.
#importance: a matrix with 'nclass' + 2 (for classification) or two (for
#            regression) columns. For classification, the first 'nclass'
#            columns are the class-specific measures computed as mean
#            decrease in accuracy. The 'nclass' + 1st column is the mean
#            decrease in accuracy over all classes. The last column is
#            the mean decrease in Gini index. For regression, the first
#            column is the mean decrease in accuracy and the second the
#            mean decrease in MSE. If 'importance=FALSE', the last measure
#            is still returned as a vector.

ir.rf
#Call:
# randomForest.formula(x = Species ~ ., data = iris[tr, ], ntree = 100, mtry = 2, nodesize = 1, importance = T)
#               Type of random forest: classification
#                     Number of trees: 100
#No. of variables tried at each split: 2
#
#        OOB estimate of error rate: 5%
#Confusion matrix:
#           setosa versicolor virginica class.error
#setosa         39          0         0  0.00000000
#versicolor      0         28         2  0.06666667
#virginica       0          3        28  0.09677419

#importance of predictors:
ir.rf$importance
#                  setosa versicolor  virginica MeanDecreaseAccuracy MeanDecreaseGini
#Sepal.Length 0.047338180 0.04280001 0.05037952          0.045145177         8.097233
#Sepal.Width  0.004380952 0.02358514 0.00454354          0.009634833         1.332693
#Petal.Length 0.314906637 0.25346277 0.30725495          0.289167170        28.605365
#Petal.Width  0.259779248 0.27519991 0.35352016          0.288425856        27.655176

pdf("C:/Users/panxx014/Documents/courses/7475/Examples/figs/ex6.1.pdf")
#par(mfrow=c(1,3))
#but par() cannot control varImpPlot()--it will plot on the next page anyway!

#plot the OOB misclassification error rates (overall and per class) vs the number of trees:
plot(ir.rf)
#draw the importance of predictors:
varImpPlot(ir.rf)
dev.off()

predict(ir.rf, newdata=iris[-tr,], type="prob")
#    type: one of 'response', 'prob', or 'votes', indicating the type of
#          output: predicted values, matrix of class probabilities, or
#          matrix of vote counts. 'class' is allowed, but automatically
#          converted to 'response', for backward compatibility.
#   setosa versicolor virginica
#6    1.00       0.00      0.00
#13   1.00       0.00      0.00
#15   0.96       0.04      0.00
#16   0.98       0.02      0.00
#29   1.00       0.00      0.00
#......
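
#####A small added sketch (not part of the original example): convert the
#predicted class probabilities above into hard labels and compute the test
#error rate. The object names prob.te, pred.te, and err.te are illustrative.
prob.te <- predict(ir.rf, newdata=iris[-tr,], type="prob")
pred.te <- colnames(prob.te)[max.col(prob.te)]  #class with the largest probability
err.te <- mean(pred.te != iris$Species[-tr])    #proportion misclassified on the test set
err.te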
table(iris$Species[-tr], predict(ir.rf, iris[-tr,], type="response"))
#             setosa versicolor virginica
#  setosa         11          0         0
#  versicolor      0         19         1
#  virginica       0          2        17

#a much larger forest gives the same test-set confusion matrix here:
ir.rfL <- randomForest(Species ~ ., data=iris[tr,], ntree=10000, mtry=2)
table(iris$Species[-tr], predict(ir.rfL, iris[-tr,], type="response"))
#             setosa versicolor virginica
#  setosa         11          0         0
#  versicolor      0         19         1
#  virginica       0          2        17

#...as does using mtry=1:
ir.rfL <- randomForest(Species ~ ., data=iris[tr,], ntree=10000, mtry=1)
table(iris$Species[-tr], predict(ir.rfL, iris[-tr,], type="response"))
#             setosa versicolor virginica
#  setosa         11          0         0
#  versicolor      0         19         1
#  virginica       0          2        17

#####you can extract the kth tree by
getTree(ir.rf, k=2)
#which returns the tree in a matrix format with the following columns:
# left daughter, right daughter, split var, split point, status, prediction

#####you can also add a few more trees into a random forest by
ir.rf2 <- grow(ir.rf, 100)
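
#####A small added sketch (not part of the original example): instead of
#comparing test-set confusion matrices across mtry values as above, one can
#compare OOB error rates via the err.rate component of a fitted forest;
#the names oob.err and m below are illustrative.
set.seed(022104)
oob.err <- sapply(1:4, function(m) {
  rf <- randomForest(Species ~ ., data=iris[tr,], ntree=500, mtry=m)
  rf$err.rate[500, "OOB"]  #OOB error rate using all 500 trees
})
oob.err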