#Stat Learning and Data Mining
#Example 6.2: Use of gradient/generalized boosting models in "gbm".

#The "distribution" argument specifies the loss function. Currently available
#options are "gaussian" (squared error), "laplace" (absolute loss), "tdist"
#(t-distribution loss), "bernoulli" (logistic regression for 0-1 outcomes),
#"huberized" (huberized hinge loss for 0-1 outcomes), "multinomial"
#(classification when there are more than 2 classes), "adaboost" (the AdaBoost
#exponential loss for 0-1 outcomes), "poisson" (count outcomes), "coxph" (right
#censored observations), "quantile", or "pairwise" (ranking measure using the
#LambdaMART algorithm).
#Use either ?gbm or vignette("gbm") for more details.

#Ref: Greg Ridgeway (2007). Generalized Boosted Models: A guide to the gbm package.

#install.packages("gbm")
library(gbm)

data(iris)
# This famous (Fisher's or Anderson's) iris data set gives the
# measurements in centimeters of the variables sepal length and
# width and petal length and width, respectively, for 50 flowers
# from each of 3 species of iris. The species are _Iris setosa_,
# _versicolor_, and _virginica_.

iris[1:3,]
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1          5.1         3.5          1.4         0.2  setosa
#2          4.9         3.0          1.4         0.2  setosa
#3          4.7         3.2          1.3         0.2  setosa

iris2 <- iris[1:100,]
iris2$Species <- c(rep(0, 50), rep(1, 50))

###generate a boosted tree model:
#training data: n=70; test data: n=30
set.seed(022104)
tr <- sample(1:100, 70)

ir.adaboost <- gbm(Species ~ ., data=iris2[tr,], distribution="adaboost",
                   n.trees=100, shrinkage=0.01)

##########Input: among others,
#distribution: a description of the error distribution to be used in the
#          model. Currently available options are "gaussian" (squared
#          error), "laplace" (absolute loss), "bernoulli" (logistic
#          regression for 0-1 outcomes), "adaboost" (the AdaBoost
#          exponential loss for 0-1 outcomes), "poisson" (count
#          outcomes), and "coxph" (censored observations). The current
#          version's Laplace distribution does not handle non-constant
#          weights and will stop.
#cv.folds: number of cross-validation folds to perform. If 'cv.folds'>1
#          then 'gbm', in addition to the usual fit, will perform
#          cross-validation and calculate an estimate of generalization
#          error, returned in 'cv.error'. Default=0.
#shrinkage=0.001: a shrinkage parameter applied to each tree in the expansion.
#          Also known as the learning rate or step-size reduction.
#bag.fraction=0.5: the fraction of the training set observations randomly
#          selected to propose the next tree in the expansion.
#          This introduces randomness into the model fit.
#          ----bag.fraction < 1 is USED to obtain OOB error estimates!
#          Friedman 2002, Stochastic Gradient Boosting. CSDA 38(4):367-378.
#train.fraction=1: the first train.fraction * nrows(data) observations are
#          used to fit the gbm and the remainder are used for computing
#          out-of-sample estimates of the loss function----TEST errors.
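
## Added sketch (not in the original notes): variable importance for the model
## just fitted. summary() on a gbm object reports the relative influence of each
## predictor; plotit=FALSE suppresses the bar plot and just returns the table.
summary(ir.adaboost, n.trees=100, plotit=FALSE)
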
#########Returned values in a "gbm.object": among others,
# initF: the "intercept" term, the initial predicted value to which
#          trees make adjustments
# fit: a vector containing the fitted values on the scale of the
#          regression function (e.g. log-odds scale for bernoulli, log
#          scale for poisson)
#train.error: a vector of length equal to the number of fitted trees
#          containing the value of the loss function for each boosting
#          iteration evaluated on the training data
#valid.error: a vector of length equal to the number of fitted trees
#          containing the value of the loss function for each boosting
#          iteration evaluated on the validation data
#oobag.improve: a vector of length equal to the number of fitted trees
#          containing an out-of-bag estimate of the marginal reduction
#          in the expected value of the loss function. The out-of-bag
#          estimate uses only the training data and is useful for
#          estimating the optimal number of boosting iterations. See
#          'gbm.perf'.
# trees: a list containing the tree structures. The components are
#          best viewed using 'pretty.gbm.tree'

############A very useful function:
#Usage:
#     gbm.perf(object, plot.it = TRUE, oobag.curve = TRUE, overlay = TRUE,
#              method = c("OOB","test")[1])
#Arguments:
#  object: a 'gbm.object' created from an initial call to 'gbm'.
# plot.it: an indicator of whether or not to plot the performance
#          measures. Setting 'plot.it=TRUE' creates two plots. The first
#          plot shows 'object$train.error' (in black) and
#          'object$valid.error' (in red) versus the iteration number. The
#          scale of the error measurement, shown on the left vertical
#          axis, depends on the 'distribution' argument used in the
#          initial call to 'gbm'.
#oobag.curve: indicates whether to plot the out-of-bag performance
#          measures in a second plot.
# overlay: if TRUE and oobag.curve=TRUE then a right y-axis is added to
#          the training and test error plot, and the estimated cumulative
#          improvement in the loss function is plotted versus the
#          iteration number.
#  method: indicates the method used to estimate the optimal number of
#          boosting iterations. 'method="OOB"' computes the out-of-bag
#          estimate and 'method="test"' uses the test (or validation)
#          dataset to compute an out-of-sample estimate. 'method="cv"'
#          extracts the optimal number of iterations using
#          cross-validation, if 'gbm' was called with cv.folds>1.
#Value:
#     'gbm.perf' returns the estimated optimal number of iterations. The
#     method of computation depends on the 'method' argument.

gbm.perf(ir.adaboost, method="OOB")
#[1] 100
#Warning message:
#OOB generally underestimates the optimal number of iterations although
#predictive performance is reasonably competitive. Using cv.folds>0 when
#calling gbm usually results in improved predictive performance.
#in: gbm.perf(ir.adaboost, method = "OOB")
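
## Added sketch (not in the original notes), following the warning above:
## refit with cv.folds>1 for a cross-validated choice of the number of trees,
## and with train.fraction<1 so that method="test" also has a held-out set.
## The object name ir.cv and the argument values are only illustrative.
ir.cv <- gbm(Species ~ ., data=iris2[tr,], distribution="bernoulli",
             n.trees=3000, shrinkage=0.01, cv.folds=5,
             bag.fraction=0.5, train.fraction=0.7)
gbm.perf(ir.cv, method="cv")    # optimal number of trees from 5-fold CV
gbm.perf(ir.cv, method="test")  # optimal number of trees from the held-out 30%
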
##########Try gradient boosting with the bernoulli (logistic) loss, MART-like:
ir.logitboost <- gbm(Species ~ ., data=iris2[tr,], distribution="bernoulli",
                     n.trees=100, shrinkage=0.01)

#doing prediction on the test data:
predict.gbm(ir.adaboost, newdata=iris2[-tr,], n.trees=100, type="response")
# [1] 0.1253360 0.1253360 0.1253360 0.1297872 0.1253360 0.1253360 0.1253360
# [8] 0.1253360 0.1253360 0.1297872 0.1253360 0.1253360 0.1253360 0.1253360
#[15] 0.1253360 0.1253360 0.8866689 0.8866689 0.8866689 0.8825869 0.8825869
#[22] 0.8866689 0.8866689 0.8866689 0.8866689 0.8866689 0.8866689 0.8866689
#[29] 0.8866689 0.8825869

predict.gbm(ir.adaboost, newdata=iris2[-tr,], n.trees=100, type="link")
# [1] -0.9714208 -0.9714208 -0.9714208 -0.9514208 -0.9714208 -0.9714208
# [7] -0.9714208 -0.9714208 -0.9714208 -0.9514208 -0.9714208 -0.9714208
#[13] -0.9714208 -0.9714208 -0.9714208 -0.9714208  1.0285792  1.0285792
#[19]  1.0285792  1.0085792  1.0085792  1.0285792  1.0285792  1.0285792
#[25]  1.0285792  1.0285792  1.0285792  1.0285792  1.0285792  1.0085792

predict.gbm(ir.logitboost, newdata=iris2[-tr,], n.trees=100, type="response")
# [1] 0.1886994 0.1886994 0.1886994 0.1925906 0.1886994 0.1886994 0.1886994
# [8] 0.1886994 0.1886994 0.1925906 0.1886994 0.1886994 0.1886994 0.1886994
#[15] 0.1886994 0.1886994 0.8217412 0.8217412 0.8217412 0.8180170 0.8180170
#[22] 0.8217412 0.8217412 0.8217412 0.8217412 0.8217412 0.8217412 0.8217412
#[29] 0.8217412 0.8180170

predict.gbm(ir.logitboost, newdata=iris2[-tr,], n.trees=100, type="link")
# [1] -1.458484 -1.458484 -1.458484 -1.433264 -1.458484 -1.458484 -1.458484
# [8] -1.458484 -1.458484 -1.433264 -1.458484 -1.458484 -1.458484 -1.458484
#[15] -1.458484 -1.458484  1.528189  1.528189  1.528189  1.502970  1.502970
#[22]  1.528189  1.528189  1.528189  1.528189  1.528189  1.528189  1.528189
#[29]  1.528189  1.502970

#####Why below??? For "bernoulli" the link f is the log-odds, so p=1/(1+exp(-f));
#####for "adaboost" the link is half the log-odds, f=log(p/(1-p))/2, so
#####p=1/(1+exp(-2*f)).
#> 1/(1+exp(-1.502970))
#[1] 0.818017
#> 1/(1+exp(-1.0085792))
#[1] 0.732742
#> log(0.8825869/(1-0.8825869))/2
#[1] 1.008579
#> 1/(1+exp(-2*1.0085792))
#[1] 0.8825869

iris2$Species[-tr]
# [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1

#########can it be further improved? add 3000 more trees to the existing fit:
ir.logitboost2 <- gbm.more(ir.logitboost, 3000)

pdf("C:/Users/panxx014/Documents/courses/7475/Examples/figs/ex6.2.pdf")
par(mfrow=c(3,1))
gbm.perf(ir.adaboost, method="OOB")
#[1] 100
gbm.perf(ir.logitboost, method="OOB")
#[1] 100
gbm.perf(ir.logitboost2, method="OOB")
#[1] 2996
dev.off()
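
## Added sketch (not in the original notes): test-set misclassification error for
## the two 100-tree fits, thresholding the predicted class-1 probability at 0.5.
## The object names phat.ada and phat.ber are only illustrative.
phat.ada <- predict(ir.adaboost, newdata=iris2[-tr,], n.trees=100, type="response")
phat.ber <- predict(ir.logitboost, newdata=iris2[-tr,], n.trees=100, type="response")
mean((phat.ada > 0.5) != iris2$Species[-tr])   # test error under the adaboost loss
mean((phat.ber > 0.5) != iris2$Species[-tr])   # test error under the bernoulli loss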