# PubH Stat Learning and Data Mining
# Example 3.1: Use of glmnet for LASSO, Elastic net, ...
#
# glmnet(x, y, family = c("gaussian", "binomial", "poisson", "multinomial",
#        "cox", "mgaussian"), weights, offset = NULL, alpha = 1,
#        nlambda = 100, lambda = NULL, standardize = TRUE, intercept = TRUE,
#        penalty.factor = rep(1, nvars), ...)
#   alpha: mixing coefficient for the Enet penalty (alpha = 1 gives LASSO);
#   penalty.factor: per-variable multiplier, lambda[j] = penalty.factor[j] * lambda.

library("glmnet")
library("lars")  # provides the diabetes data

# diabetes: a dataset with 442 rows and 3 columns:
#   x : matrix with 10 columns, **standardized** with L2 norm = 1 and mean = 0;
#       x1-x4: age, sex, BMI, BP; x5-x10: serum measurements.
#   y : numeric vector; a measure of disease progression one year after baseline.
#   x2: matrix with 64 columns = x + some interactions.
# Q: why standardize x?  (So the penalty treats all coefficients on the same scale.)
data(diabetes)

# Bind the components explicitly rather than using attach() (attach() pollutes
# the search path and makes it unclear where x and y come from).
x <- diabetes$x
y <- diabetes$y

# Fit a linear model over the whole LASSO regularization path:
lasso.fit <- glmnet(x, y, family = "gaussian")

pdf("ex3.1.pdf")
par(mfrow = c(2, 2))

# Plot: coefficient profiles vs the L1-norm constraint
plot(lasso.fit, xvar = "norm")
# Plot: coefficient profiles vs log(lambda)
plot(lasso.fit, xvar = "lambda")

# Using CV to select the constraint value; plot() draws CV error vs log(lambda).
# (fraction: norm of the estimates over that of the unconstrained/LS one.)
set.seed(1)  # fold assignment is random; fix the seed for reproducibility
cv.lasso.fit <- cv.glmnet(x, y, nfolds = 10, family = "gaussian")
plot(cv.lasso.fit)

# Mean CV error at each lambda (100 values by default):
cv.lasso.fit$cvm
# e.g.  [1] 5919.483 5585.973 5199.052 4866.371 ... decreasing to a minimum of
#       about 2995.7 near the 42nd lambda, then flattening around 2997-3003.

# SE of the CV error:
cv.lasso.fit$cvsd

# Lambda sequence used in CV (decreasing, log-spaced):
cv.lasso.fit$lambda
# e.g.  [1] 45.16003002 41.14813742 37.49265030 ... down to [88] 0.01379122

# Value of lambda giving the minimum CV error:
cv.lasso.fit$lambda.min
# [1] 0.9958377

# Using the "1-SE rule": the largest lambda such that the CV error is within
# one standard error of the minimum (a sparser, more conservative model):
cv.lasso.fit$lambda.1se
# [1] 7.71041

# Predict (default s = "lambda.1se") and compare to the observed response:
yhat <- predict(cv.lasso.fit, newx = x)
plot(y, yhat)
abline(a = 0, b = 1)

# Coefficients at lambda.1se; the fit is sparse ("." means exactly zero):
coef(cv.lasso.fit)
# 11 x 1 sparse Matrix of class "dgCMatrix"
#                     1
# (Intercept) 152.13348
# age           .
# sex           .
# bmi         493.42401
# map         172.00436
# tc            .
# ldl           .
# hdl         -94.48438
# tch           .
# ltg         428.55200
# glu           .

dev.off()

# USUALLY, you'd refit at one of the two CV-selected lambda values:
fit1 <- glmnet(x, y, family = "gaussian", lambda = cv.lasso.fit$lambda.min)
coef(fit1)
fit2 <- glmnet(x, y, family = "gaussian", lambda = cv.lasso.fit$lambda.1se)
coef(fit2)  # same as coef(cv.lasso.fit)