#PubH Stat Learning and Data Mining #Example 3.1: Use of lars() function for LASSO # LASSO implemented in a package: library("lars") data(diabetes) # a dataset in the package, which contains 442 rows and 3 columns; # the 3 columns: x, a matrix with 10 col's, standardized with L2 norm=1 and # mean=0; x1-x4: age, sex, BMI, BP; x5-x10: serum measurements # y, a numeric vector; a measure of disease progression one # year after baseline # x2, a matrix with 64 col's= x + some interactions. attach(diabetes) #fit a LM using LASSO: lasso.fit<-lars(x, y, type="lasso") #plots: postscript("ex3.1.ps", horizontal=T) par(mfrow=c(2,2)) #plot: coef's vs L1 norm constraint plot(lasso.fit, xvar="norm", plottype="coef"); #plot: coef's vs Cp plot(lasso.fit, xvar="norm", plottype="Cp"); #uisng CV to select the constraint value: # a plot of CV error vs fraction is automatically generated # (fraction: norm of the estimates over that of the unconstrained/LS one.) cv.lasso.fit<-cv.lars(x, y, K=10, index=seq(from=0, to=1, length=20)) #CV curve: cv.lasso.fit$cv # [1] 5948.330 5232.499 4619.749 4121.997 3743.630 3463.015 3273.865 3150.120 # [9] 3068.908 3020.260 2998.642 3004.509 3010.827 3018.382 3020.136 3019.254 #[17] 3017.750 3018.513 3018.650 3019.232 #SE of CV curve cv.lasso.fit$cv.error # [1] 267.4495 240.5127 228.3613 222.9639 217.1919 209.7210 195.1075 178.4819 # [9] 168.6708 162.8642 164.0596 167.4581 168.9714 169.7249 170.0747 169.5920 #[17] 169.0756 168.7129 167.8434 167.0005 #cv.lasso.fit$frac # [1] 0.00000000 0.05263158 0.10526316 0.15789474 0.21052632 0.26315789 # [7] 0.31578947 0.36842105 0.42105263 0.47368421 0.52631579 0.57894737 #[13] 0.63157895 0.68421053 0.73684211 0.78947368 0.84210526 0.89473684 #[19] 0.94736842 1.00000000 #plot(cv.lasso.fit$fraction, cv.lasso.fit$cv) #####Using "1-SE rule": fraction=0.35 is selected #Using fact=0.368 to do prediction and see its estimated coef's: yhat<-predict(lasso.fit, newx=x, s=0.368, type="fit", mode="fraction") plot(y, yhat$fit) abline(a=0, b=1) # coef's: coef(lasso.fit, s=0.368, mode="fraction") # age sex bmi map tc ldl # 0.000000 -8.900973 506.335421 196.365100 0.000000 0.000000 # hdl tch ltg glu #-120.708159 0.000000 440.972170 0.000000 # how about that at s=0.526 with minimum CV error: coef(lasso.fit, s=0.526, mode="fraction") # age sex bmi map tc ldl hdl # 0.00000 -176.25935 519.70338 285.97415 -77.89586 0.00000 -216.93877 # tch ltg glu # 0.00000 499.12201 44.06909 yhat2<-predict(lasso.fit, newx=x, s=0.526, type="fit", mode="fraction") #plot(y, yhat2$fit) dev.off()