#Stat Learning and Data Mining
#Example 6.2: Use of gradient/generalized boosting models in "gbm".

#The "distribution" argument specifies the loss function. Currently available
#options are "gaussian" (squared error), "laplace" (absolute loss), "tdist"
#(t-distribution loss), "bernoulli" (logistic regression for 0-1 outcomes),
#"huberized" (huberized hinge loss for 0-1 outcomes), "multinomial"
#(classification when there are more than 2 classes), "adaboost" (the AdaBoost
#exponential loss for 0-1 outcomes), "poisson" (count outcomes), "coxph" (right
#censored observations), "quantile", or "pairwise" (ranking measure using the
#LambdaMART algorithm).
#Use either ?gbm or vignette("gbm") for more details.

#Ref: Greg Ridgeway (2007). Generalized Boosted Models: A guide to the gbm package.

#install.packages("gbm")
library(gbm)

data(iris)
# This famous (Fisher's or Anderson's) iris data set gives the
# measurements in centimeters of the variables sepal length and
# width and petal length and width, respectively, for 50 flowers
# from each of 3 species of iris. The species are _Iris setosa_,
# _versicolor_, and _virginica_.

iris[1:3,]
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1          5.1         3.5          1.4         0.2  setosa
#2          4.9         3.0          1.4         0.2  setosa
#3          4.7         3.2          1.3         0.2  setosa

iris2 <- iris[1:100,]
iris2$Species <- c(rep(0, 50), rep(1, 50))

###generate a boosted tree model:
#training data: n=70; test data: n=30
set.seed(022104)
tr <- sample(1:100, 70)

ir.adaboost <- gbm(Species ~ ., data=iris2[tr,], distribution="adaboost",
                   n.trees=100, shrinkage=0.01)

##########Input: among others,
#distribution: a description of the error distribution to be used in the
#          model. Currently available options are "gaussian" (squared
#          error), "laplace" (absolute loss), "bernoulli" (logistic
#          regression for 0-1 outcomes), "adaboost" (the AdaBoost
#          exponential loss for 0-1 outcomes), "poisson" (count
#          outcomes), and "coxph" (censored observations). The current
#          version's Laplace distribution does not handle non-constant
#          weights and will stop.
#cv.folds: number of cross-validation folds to perform. If 'cv.folds'>1
#          then 'gbm', in addition to the usual fit, will perform
#          cross-validation and calculate an estimate of generalization
#          error, returned in 'cv.error'. Default=0.
#shrinkage=0.001: a shrinkage parameter applied to each tree in the expansion.
#          Also known as the learning rate or step-size reduction.
#bag.fraction=0.5: the fraction of the training set observations randomly
#          selected to propose the next tree in the expansion.
#          This introduces randomness into the model fit.
#          ----bag.fraction < 1 is USED to obtain OOB error estimates!
#          Friedman 2002, Stochastic Gradient Boosting. CSDA 38(4):367-378.
#train.fraction=1: the first train.fraction * nrows(data) observations are
#          used to fit the gbm and the remainder are used for computing
#          out-of-sample estimates of the loss function----TEST errors.
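
## Added sketch (not in the original notes): variable importance for the model
## just fitted. summary() on a gbm object reports the relative influence of each
## predictor; plotit=FALSE suppresses the bar plot and just returns the table.
summary(ir.adaboost, n.trees=100, plotit=FALSE)
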
#########Returned values in a "gbm.object": among others,
# initF: the "intercept" term, the initial predicted value to which
#          trees make adjustments
# fit: a vector containing the fitted values on the scale of the
#          regression function (e.g. log-odds scale for bernoulli, log
#          scale for poisson)
#train.error: a vector of length equal to the number of fitted trees
#          containing the value of the loss function for each boosting
#          iteration evaluated on the training data
#valid.error: a vector of length equal to the number of fitted trees
#          containing the value of the loss function for each boosting
#          iteration evaluated on the validation data
#oobag.improve: a vector of length equal to the number of fitted trees
#          containing an out-of-bag estimate of the marginal reduction
#          in the expected value of the loss function. The out-of-bag
#          estimate uses only the training data and is useful for
#          estimating the optimal number of boosting iterations. See
#          'gbm.perf'.
# trees: a list containing the tree structures. The components are
#          best viewed using 'pretty.gbm.tree'

############A very useful function:
#Usage:
#     gbm.perf(object, plot.it = TRUE, oobag.curve = TRUE, overlay = TRUE,
#              method = c("OOB","test")[1])
#Arguments:
#  object: a 'gbm.object' created from an initial call to 'gbm'.
# plot.it: an indicator of whether or not to plot the performance
#          measures. Setting 'plot.it=TRUE' creates two plots. The first
#          plot shows 'object$train.error' (in black) and
#          'object$valid.error' (in red) versus the iteration number. The
#          scale of the error measurement, shown on the left vertical
#          axis, depends on the 'distribution' argument used in the
#          initial call to 'gbm'.
#oobag.curve: indicates whether to plot the out-of-bag performance
#          measures in a second plot.
# overlay: if TRUE and oobag.curve=TRUE then a right y-axis is added to
#          the training and test error plot, and the estimated cumulative
#          improvement in the loss function is plotted versus the
#          iteration number.
#  method: indicates the method used to estimate the optimal number of
#          boosting iterations. 'method="OOB"' computes the out-of-bag
#          estimate and 'method="test"' uses the test (or validation)
#          dataset to compute an out-of-sample estimate. 'method="cv"'
#          extracts the optimal number of iterations using
#          cross-validation, if 'gbm' was called with cv.folds>1.
#Value:
#     'gbm.perf' returns the estimated optimal number of iterations. The
#     method of computation depends on the 'method' argument.

gbm.perf(ir.adaboost, method="OOB")
#[1] 100
#Warning message:
#OOB generally underestimates the optimal number of iterations although
#predictive performance is reasonably competitive. Using cv.folds>0 when
#calling gbm usually results in improved predictive performance.
#in: gbm.perf(ir.adaboost, method = "OOB")
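
## Added sketch (not in the original notes), following the warning above:
## refit with cv.folds>1 for a cross-validated choice of the number of trees,
## and with train.fraction<1 so that method="test" also has a held-out set.
## The object name ir.cv and the argument values are only illustrative.
ir.cv <- gbm(Species ~ ., data=iris2[tr,], distribution="bernoulli",
             n.trees=3000, shrinkage=0.01, cv.folds=5,
             bag.fraction=0.5, train.fraction=0.7)
gbm.perf(ir.cv, method="cv")    # optimal number of trees from 5-fold CV
gbm.perf(ir.cv, method="test")  # optimal number of trees from the held-out 30%
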
##########Try gradient boosting with the bernoulli (logistic) loss, MART-like:
ir.logitboost <- gbm(Species ~ ., data=iris2[tr,], distribution="bernoulli",
                     n.trees=100, shrinkage=0.01)

#doing prediction on the test data:
predict.gbm(ir.adaboost, newdata=iris2[-tr,], n.trees=100, type="response")
# [1] 0.1253360 0.1253360 0.1253360 0.1297872 0.1253360 0.1253360 0.1253360
# [8] 0.1253360 0.1253360 0.1297872 0.1253360 0.1253360 0.1253360 0.1253360
#[15] 0.1253360 0.1253360 0.8866689 0.8866689 0.8866689 0.8825869 0.8825869
#[22] 0.8866689 0.8866689 0.8866689 0.8866689 0.8866689 0.8866689 0.8866689
#[29] 0.8866689 0.8825869

predict.gbm(ir.adaboost, newdata=iris2[-tr,], n.trees=100, type="link")
# [1] -0.9714208 -0.9714208 -0.9714208 -0.9514208 -0.9714208 -0.9714208
# [7] -0.9714208 -0.9714208 -0.9714208 -0.9514208 -0.9714208 -0.9714208
#[13] -0.9714208 -0.9714208 -0.9714208 -0.9714208  1.0285792  1.0285792
#[19]  1.0285792  1.0085792  1.0085792  1.0285792  1.0285792  1.0285792
#[25]  1.0285792  1.0285792  1.0285792  1.0285792  1.0285792  1.0085792

predict.gbm(ir.logitboost, newdata=iris2[-tr,], n.trees=100, type="response")
# [1] 0.1886994 0.1886994 0.1886994 0.1925906 0.1886994 0.1886994 0.1886994
# [8] 0.1886994 0.1886994 0.1925906 0.1886994 0.1886994 0.1886994 0.1886994
#[15] 0.1886994 0.1886994 0.8217412 0.8217412 0.8217412 0.8180170 0.8180170
#[22] 0.8217412 0.8217412 0.8217412 0.8217412 0.8217412 0.8217412 0.8217412
#[29] 0.8217412 0.8180170

predict.gbm(ir.logitboost, newdata=iris2[-tr,], n.trees=100, type="link")
# [1] -1.458484 -1.458484 -1.458484 -1.433264 -1.458484 -1.458484 -1.458484
# [8] -1.458484 -1.458484 -1.433264 -1.458484 -1.458484 -1.458484 -1.458484
#[15] -1.458484 -1.458484  1.528189  1.528189  1.528189  1.502970  1.502970
#[22]  1.528189  1.528189  1.528189  1.528189  1.528189  1.528189  1.528189
#[29]  1.528189  1.502970

#####Why below??? For "bernoulli" the link f is the log-odds, so p=1/(1+exp(-f));
#####for "adaboost" the link is half the log-odds, f=log(p/(1-p))/2, so
#####p=1/(1+exp(-2*f)).
#> 1/(1+exp(-1.502970))
#[1] 0.818017
#> 1/(1+exp(-1.0085792))
#[1] 0.732742
#> log(0.8825869/(1-0.8825869))/2
#[1] 1.008579
#> 1/(1+exp(-2*1.0085792))
#[1] 0.8825869

iris2$Species[-tr]
# [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1

#########can it be further improved? add 3000 more trees to the existing fit:
ir.logitboost2 <- gbm.more(ir.logitboost, 3000)

pdf("C:/Users/panxx014/Documents/courses/7475/Examples/figs/ex6.2.pdf")
par(mfrow=c(3,1))
gbm.perf(ir.adaboost, method="OOB")
#[1] 100
gbm.perf(ir.logitboost, method="OOB")
#[1] 100
gbm.perf(ir.logitboost2, method="OOB")
#[1] 2996
dev.off()
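
## Added sketch (not in the original notes): test-set misclassification error for
## the two 100-tree fits, thresholding the predicted class-1 probability at 0.5.
## The object names phat.ada and phat.ber are only illustrative.
phat.ada <- predict(ir.adaboost, newdata=iris2[-tr,], n.trees=100, type="response")
phat.ber <- predict(ir.logitboost, newdata=iris2[-tr,], n.trees=100, type="response")
mean((phat.ada > 0.5) != iris2$Species[-tr])   # test error under the adaboost loss
mean((phat.ber > 0.5) != iris2$Species[-tr])   # test error under the bernoulli loss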