#PubH Stat Learning and Data Mining #Example 3.4: High-Deimensional Inference with LASSO in (generalized) linear models. #install.packages("hdi") library(hdi) # riboflavin (vitamin B2) production by Bacillus subtilis (a bacterium) data(riboflavin) x=riboflavin$x y=riboflavin$y dim(x) #[1] 71 4088 lasso.fit<-lasso.proj(x=x, y=y) ##very slow! x1=x[, 1:200] lasso.fit<-lasso.proj(x=x1, y=y) summary(lasso.fit$pval) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.0004842 0.2533296 0.4914348 0.4897518 0.7144223 0.9988645 summary(lasso.fit$pval.corr) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.09684 1.00000 1.00000 0.99548 1.00000 1.00000 #bootstrap-based pvalues, perhaps better finite-sample performance # instead of using the (approx or asympt) Normal distr, use the empirical one from the bootstrap estimates. boot.lasso.fit<-boot.lasso.proj(x=x1, y=y, B=1000) summary(boot.lasso.fit$pval) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.000999 0.269730 0.480519 0.496324 0.734266 0.992008 summary(boot.lasso.fit$pval.corr) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.06593 1.00000 1.00000 0.99513 1.00000 1.00000 # Multi-split p-values set.seed(1) fit.multi <- multi.split(x, y, B = 100) fit.multi$pval #[1] NA names(fit.multi) # [1] "pval" "pval.corr" "pvals.nonaggr" "ci.level" [5] "lci" "uci" "gamma.min" "sel.models" [9] "method" "call" "clusterGroupTest" # BUT "pval" is NOT in the output list given in the R manual! #Multiple testing corrected p-values for each parameter: summary(fit.multi$pval.corr) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.00775 1.00000 1.00000 0.99952 1.00000 1.00000 fit.multi #alpha = 0.01: Selected predictors: 4003 #alpha = 0.05: Selected predictors: 4003 #------ #Familywise error rate controlled at level alpha.