# PubH Stat Learning and Data Mining
# Example 5.2: GAM; use AIC/BIC/CV to select the smoothing parameter in a GAM.

library(gam)  # available on titanium

# Use the Kyphosis data (81 children who had corrective spinal surgery).
data(kyphosis)
kyphosis[1:5, ]
#   Kyphosis Age Number Start
# 1   absent  71      3     5
# 2   absent 158      3    14
# 3  present 128      4     5
# 4   absent   2      5     1
# 5   absent   1      4    15

n <- nrow(kyphosis)  # 81

kyphosis[1, ]$Kyphosis
# [1] absent
# Levels: "absent" "present"
#
# Splus (R too) glm()/gam() rule for a factor response: the first level is
# coded 0 and the other 1.  You can also specify the binary response
# explicitly as (Kyphosis == "present") in the formula of glm() or gam().

# Fit a GAM with loess (lo) smooth terms:
fit1 <- gam(Kyphosis ~ lo(Age) + lo(Number) + lo(Start),
            family = binomial, data = kyphosis)
summary(fit1)
#     Null Deviance: 83.2345 on 80      degrees of freedom
# Residual Deviance: 42.9555 on 69.4146 degrees of freedom
# AIC: 66.1264
#
# Number of Local Scoring Iterations: 11
#
# DF for Terms and Chi-squares for Nonparametric Effects
#
#              Df Npar Df Npar Chisq P(Chi)
# (Intercept) 1.0
# lo(Age)     1.0     2.5     6.4581 0.0646
# lo(Number)  1.0     2.7     4.1960 0.2061
# lo(Start)   1.0     2.3     5.3104 0.0921

# Same model with smoothing-spline (s) terms and an explicit binary response:
fit2 <- gam((Kyphosis == "present") ~ s(Age) + s(Number) + s(Start),
            family = binomial, data = kyphosis)
summary(fit2)
#     Null Deviance: 83.2345 on 80      degrees of freedom
# Residual Deviance: 40.526  on 67.9997 degrees of freedom
# AIC: 66.5266
#
# Number of Local Scoring Iterations: 11
#
# DF for Terms and Chi-squares for Nonparametric Effects
#
#             Df Npar Df Npar Chisq P(Chi)
# (Intercept)  1
# s(Age)       1       3     5.7937 0.1221
# s(Number)    1       3     5.6880 0.1278
# s(Start)     1       3     5.8876 0.1172

postscript("ex5.2.ps")
par(mfrow = c(2, 3))
plot(fit1, se = TRUE)  # the dashed curves are pointwise +- 2*SE
plot(fit2, se = TRUE)
dev.off()

# AIC/BIC over a grid of smoothing parameters.
# For s(), the smoothing parameter is the target degrees of freedom.
spSet <- 1:10
AIC <- BIC <- numeric(length(spSet))
for (k in seq_along(spSet)) {
  sp0 <- spSet[k]
  fit <- gam(Kyphosis ~ s(Age, sp0) + s(Start),
             family = binomial, data = kyphosis)
  # Effective number of parameters = n - residual df of the fit.
  AIC[k] <- fit$deviance + 2 * (n - fit$df.residual)
  BIC[k] <- fit$deviance + log(n) * (n - fit$df.residual)
}

# Print out:
cat("df=", spSet, "\n")
cat("AIC=", AIC, "\n")
cat("BIC=", BIC, "\n")
# df= 1 2 3 4 5 6 7 8 9 10
# AIC= 67.73905 64.10075 64.88276 66.29924 67.57357 68.74138 69.85456 70.89715 71.83167 72.65876
# BIC= 82.1086 80.8621 84.03873 87.84968 91.51828 95.08143 98.58787 102.0244 105.3531 108.5762

######### CV:
# Leave-one-out cross-validated log-likelihood over a grid of spans.
# For lo(), the smoothing parameter is the span.
spanSet <- (1:10) / 10
logL <- numeric(length(spanSet))
for (k in seq_along(spanSet)) {
  span0 <- spanSet[k]
  for (i in 1:n) {
    # Fit on all observations except the i-th, then predict obs i.
    fit <- gam(Kyphosis ~ lo(Age, span = span0) + lo(Start),
               family = binomial, data = kyphosis,
               subset = (1:n)[-i])
    predpi <- predict(fit, newdata = kyphosis[i, ], type = "response")
    if (kyphosis[i, ]$Kyphosis == "present") {
      logL[k] <- logL[k] + log(predpi)
    } else {
      logL[k] <- logL[k] + log(1 - predpi)
    }
  }
}
# There were 50 or more warnings (use warnings() to see the first 50)
# ... related to convergence and extrapolation.

# Print out:
cat("span=", spanSet, "\n")
cat("CV logL=", logL, "\n")
# span= 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
# CV logL= -208.7682 -76.56352 -47.05613 -37.47847 -35.11352 -34.12871 -33.30827 -32.56275 -32.07728 -32.18931

# Components of a fitted gam object:
# > names(fit1)
#  [1] "coefficients"         "residuals"            "fitted.values"
#  [4] "R"                    "rank"                 "smooth"
#  [7] "nl.df"                "df.residual"          "var"
# [10] "assign"               "terms"                "call"
# [13] "formula"              "family"               "nl.chisq"
# [16] "y"                    "weights"              "iter"
# [19] "additive.predictors"  "deviance"             "null.deviance"
# [22] "contrasts"

# See whether there is any big difference with various df = 2, 5, 10:
fit1 <- gam(Kyphosis ~ s(Age, 2) + s(Start), family = binomial, data = kyphosis)
fit2 <- gam(Kyphosis ~ s(Age, 5) + s(Start), family = binomial, data = kyphosis)
fit3 <- gam(Kyphosis ~ s(Age, 10) + s(Start), family = binomial, data = kyphosis)

postscript("ex5.2b.ps")
par(mfcol = c(2, 3))
plot(fit1, se = TRUE)
title(main = "df=2")
plot(fit2, se = TRUE)
title(main = "df=5")
plot(fit3, se = TRUE)
title(main = "df=10")
dev.off()