# Stat Learning and Data Mining
# Example 6.0: comparing the performance of single unpruned trees,
# single pruned trees, bagging unpruned trees, bagging pruned trees
# using the simulated data as described in Hastie, Tibshirani and
# Friedman's 2001 book in section 8.7.1.

library(tree)  # available on smelt and Linux machines (e.g. chromium)

# Run the bagging-vs-single-tree comparison V1 times with B bootstrap
# samples per replication.
#
# Each replication:
#   * simulates n = 30 training and N = 2000 test observations with
#     genHTF8.7.1(),
#   * fits one large unpruned classification tree and one CV-pruned tree,
#   * grows B bagged (bootstrap) trees, unpruned and CV-pruned,
#   * prints test error rates / tree sizes and appends one record per
#     bootstrap step to `foutname`.
#
# Arguments:
#   V1       number of simulation replications
#   B        number of bootstrap samples (bagged trees) per replication
#   fname    file used to hold the simulated train+test data
#   foutname results file (appended to)
#
# NOTE(review): an old header comment reported 'Error in eval(expr, envir,
# enclos): Object "y" not found' when this was called as a function (but
# not when pasted into the console).  That is the known cv.tree() scoping
# issue: the data were looked up outside the calling frame.  Current
# versions of the tree package evaluate the stored call in the formula's
# environment, so this should now work -- confirm before long runs.
HTFBagtree5 <- function(V1 = 1, B = 2,
    fname = "C:/Users/panxx014/Documents/courses/7475/Examples/Ex6.0traintest.dat",
    foutname = "C:/Users/panxx014/Documents/courses/7475/Examples/Ex6.0Res5.dat") {

  n <- 30          # training-set size (as in HTF sec. 8.7.1)
  N <- n0 <- 2000  # test-set size
  #set.seed(030504)  # seed for the first 1 run
  #set.seed(1111)    # seed for the first 10 runs
  #set.seed(11111)   # seed for the next 5 runs
  #set.seed(1111111) # seed for the next 7 runs
  set.seed(11)       # seed for the next 3 runs

  # Pick a cost-complexity parameter k by cross-validated deviance; among
  # ties at the minimal deviance take the LAST entry (largest k, i.e. the
  # smallest tied tree).  Assumes `tr` has more than one node.
  # Returns: k       selected cost-complexity parameter
  #          ids     index of the selection in cv.tree's sequence
  #          nseq    length of cv.tree's sequence
  #          size.full  size of the unpruned tree (first sequence entry)
  #          size.sel   size of the selected (pruned) tree
  cvPruneInfo <- function(tr) {
    tr.cv <- cv.tree(tr, FUN = prune.tree)
    tie <- tr.cv$dev == min(tr.cv$dev)
    ids <- which(tie)
    k <- tr.cv$k[tie]
    if (length(k) > 1) {
      k <- k[length(k)]
      ids <- ids[length(ids)]
    }
    size.sel <- tr.cv$size[tie]
    size.sel <- size.sel[length(size.sel)]
    list(k = k, ids = ids, nseq = length(tr.cv$dev),
         size.full = tr.cv$size[1], size.sel = size.sel)
  }

  for (v in 1:V1) {
    genHTF8.7.1(fname, n = n + N)
    sdat <- read.table(fname, col.names = c("y", "x1", "x2", "x3", "x4", "x5"))
    sdat$y <- factor(sdat$y)
    tdat <- sdat[(n + 1):(n + N), ]  # test set
    sdat <- sdat[1:n, ]              # training set
    ones <- rep(1, N)
    predb <- predsb <- matrix(0, nrow = B, ncol = N)
    size1 <- size2 <- rep(0, B)

    ## --- single trees (unpruned and CV-pruned) ------------------------
    tr0 <- tree(y ~ ., data = sdat, mindev = 1e-6, minsize = 2)
    pred <- predict(tr0, tdat, type = "class")
    singleerr <- sum(ones[as.numeric(pred) != as.numeric(tdat$y)])
    if (summary(tr0)$size == 1) {
      # root-only tree: nothing to prune or cross-validate
      size0 <- size0s <- 1
      singleerr2 <- singleerr
    } else {
      cvi <- cvPruneInfo(tr0)
      size0 <- cvi$size.full
      size0s <- cvi$size.sel
      ptr0 <- if (cvi$ids == 1) tr0 else prune.tree(tr0, k = cvi$k)
      pred <- predict(ptr0, tdat, type = "class")
      singleerr2 <- sum(ones[as.numeric(pred) != as.numeric(tdat$y)])
    }
    cat("Results for v=", v, " and B=", B, ":\n")
    cat("Error rate of a single tree w/o pruning = ", singleerr / (n0), "\n")
    cat(" Size=", size0, "\n")
    cat("Error rate of a single tree with pruning = ", singleerr2 / (n0), "\n")
    cat(" Size=", size0s, "\n")

    ## --- bagged trees -------------------------------------------------
    for (b in 1:B) {
      ind <- sample(1:n, n, replace = TRUE)  # bootstrap sample
      dat1 <- sdat[ind, ]
      trb <- tree(y ~ ., data = dat1, mindev = 1e-6, minsize = 2)
      if (summary(trb)$size == 1) {
        size1[b] <- size2[b] <- 1
        ptrb <- trb
      } else {
        cvi <- cvPruneInfo(trb)
        size2[b] <- cvi$size.sel
        size1[b] <- cvi$size.full
        ## A pruned 1-node tree is NOT a valid tree object in this
        ## package (summary/prune/cv.tree all misbehave on it), so fall
        ## back to the unpruned tree in the degenerate cases.
        if (cvi$ids == 1 || cvi$ids == cvi$nseq || size1[b] == 1) {
          ptrb <- trb
        } else {
          ptrb <- prune.tree(trb, k = cvi$k)
        }
      }
      ## factor predictions are coerced to their integer codes (1/2)
      ## when stored in the numeric matrices
      predb[b, ] <- predict(trb, tdat, type = "class")
      predsb[b, ] <- predict(ptrb, tdat, type = "class")
      ## majority vote over the first b bagged trees (drop = FALSE keeps
      ## a 1-row matrix when b == 1, so no special case is needed)
      userr <- sum(ones[HTFmajor(predb[1:b, , drop = FALSE]) != as.numeric(tdat$y)])
      s1err <- sum(ones[HTFmajor(predsb[1:b, , drop = FALSE]) != as.numeric(tdat$y)])
      cat("Error rate of bagged trees w/o pruning = ", userr / (n0), "\n")
      cat(" Av. Size=", sum(size1[1:b]) / b, "\n")
      cat("Error rate of bagged trees with pruning = ", s1err / (n0), "\n")
      cat(" Av. Size=", sum(size2[1:b]) / b, "\n")
      ## append one record per bootstrap step to the results file
      sink(foutname, append = TRUE)
      cat(singleerr / (n0), size0, singleerr2 / (n0), size0s,
          userr / (n0), sum(size1[1:b]) / b,
          s1err / (n0), sum(size2[1:b]) / b, V1, b, "\n")
      sink()
    } # B
  } # V1
}

# Column-wise majority vote.
#
# pred: an m x n matrix of class codes (one row per bagged tree, one
#       column per test case); entries are matched against `vals` after
#       as.numeric() coercion.
# vals: the possible class codes.
# Returns a length-n vector holding the most frequent code in each
# column; ties are broken uniformly at random (so results are only
# reproducible given the RNG state).
HTFmajor <- function(pred, vals = c(1, 2)) {
  n <- ncol(pred)
  mxv0 <- numeric(n)
  for (i in seq_len(n)) {
    col.i <- as.numeric(pred[, i])
    freq <- vapply(vals, function(v) sum(col.i == v), numeric(1))
    winners <- vals[freq == max(freq)]
    mxv0[i] <- if (length(winners) > 1) sample(winners, 1) else winners
  }
  mxv0
}

# Simulate data as discussed in HTF 8.7.1 (p. 247 of HTF 2001) and write
# it to `fname`, one "y x1 x2 x3 x4 x5" row per observation.
#
# X ~ N(0, Sigma) with Var(Xi) = 1 and Cov(Xi, Xj) = 0.95;
# P(Y = 1 | X1 <= 0.5) = 0.2 and P(Y = 1 | X1 > 0.5) = 0.8.
#
# NOTE(review): an earlier comment claimed mean (1,...,1); the code has
# always generated mean 0 -- confirm which is intended.
genHTF8.7.1 <- function(fname, n = 20) {
  Sigma <- matrix(0.95, nrow = 5, ncol = 5)
  diag(Sigma) <- 1
  # symmetric square root of Sigma via SVD (U = V for symmetric Sigma):
  # Sigma12 %*% Sigma12 == Sigma
  sv <- svd(Sigma)
  Sigma12 <- sv$u %*% diag(sqrt(sv$d)) %*% t(sv$v)
  # one redirection for the whole file; on.exit restores the sink even
  # on error
  sink(fname)
  on.exit(sink(), add = TRUE)
  for (i in seq_len(n)) {
    # BUG FIX: the original multiplied by solve(Sigma12), which yields
    # covariance Sigma^{-1} (neither unit variances nor 0.95 covariances);
    # Sigma12 %*% z gives the intended N(0, Sigma).
    x <- Sigma12 %*% rnorm(5)
    z <- runif(1, 0, 1)
    p1 <- if (x[1, 1] <= 0.5) 0.2 else 0.8
    y <- if (z <= p1) 1 else 0
    cat(y, x, "\n")
  }
}