### PubH 7450 ###
### Example: Penalized semi-parametric Cox regression with the Lasso.

# glmnet also fits penalized linear regression, binomial/multinomial
# logistic regression and Poisson regression, and supports the
# elastic-net (and ridge) penalty in addition to the Lasso.
library(survival)
library(glmnet)

## Generate simulated data ----
set.seed(1)
n <- 100
p <- 1000

# True regression coefficients: only the first 3 are non-zero.
beta0 <- c(1, 2, 3, rep(0, p - 3))

# Design matrix: entries iid N(0, 1).
Z <- matrix(rnorm(n * p), nrow = n, ncol = p)

# Survival times: exponential with subject-specific rate exp(Z %*% beta0).
h <- as.vector(exp(Z %*% beta0))
surv_time <- rexp(n, h)
# > summary(surv_time)
#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
# 1.939e-05 3.348e-02 6.332e-01 5.527e+02 5.122e+00 4.376e+04

# Censoring times: Uniform(0, 10); delta = 1 means the event was observed.
# (Renamed T -> obs_time: T shadows the TRUE shorthand in R.)
C <- runif(n, 0, 10)
delta <- ifelse(C >= surv_time, 1, 0)
# > sum(delta)
# [1] 76
obs_time <- ifelse(C >= surv_time, surv_time, C)

# Response for family = "cox": a 2-column matrix (time, status).
Y <- cbind(time = obs_time, status = delta)

## Lasso-penalized Cox regression over a lambda path ----
res1 <- glmnet(Z, Y, family = "cox")
# Warning message:
# from glmnet Fortran code - Outer loop convergence not reached after 100 iterations

# Penalty parameter values tried (67 values, decreasing):
res1$lambda
#  [1] 0.65013305 0.62058350 0.59237702 0.56545257 0.53975188 0.51521932
#  ...
# [67] 0.03017650

# Null deviance:
res1$nulldev
# [1] 601.8533

# Deviance explained as a fraction of the null deviance, per lambda.
# (Use the full component name dev.ratio; the original `res1$dev` only
# worked via `$` partial matching, which is fragile.)
res1$dev.ratio
#  [1] 0.00000000 0.01062054 0.02052076 0.02974111 0.03832101 0.04631108
#  ...
# [67] 0.64650201

# Coefficient matrix: one column per lambda.
dim(res1$beta)
# [1] 1000   67

# At the largest lambda every coefficient is shrunk to exactly zero:
res1$beta[1:10, 1]
sum(abs(res1$beta[, 1]) > 1e-20)
# [1] 0

# As lambda decreases, variables enter the model; the 3 true signals
# (V1-V3) appear first, then noise variables creep in:
res1$beta[1:10, 30]
#        V1        V2        V3  (V4..V10 all 0)
# 0.1654565 0.6562847 1.3480505
sum(abs(res1$beta[, 30]) > 1e-20)
# [1] 10

res1$beta[1:10, 45]
# 0.424818 1.180838 2.358454  (V4..V10 all 0)
sum(abs(res1$beta[, 45]) > 1e-20)
# [1] 49

res1$beta[1:10, 67]
# 1.021282 2.823881 5.458915  (V4..V10 all 0)
sum(abs(res1$beta[, 67]) > 1e-20)
# [1] 80

## 5-fold cross-validation to select lambda ----
res2 <- cv.glmnet(Z, Y, family = "cox", nfolds = 5)
# Warning message:
# from glmnet Fortran code - Outer loop convergence not reached after 100 iterations

pdf("pcoxCV.pdf")
plot(res2)
title("Lasso Cox regression")
dev.off()

# CV results: the lambda path matches res1's.
res2$lambda
#  [1] 0.65013305 ... [67] 0.03017650

# Mean cross-validated error (partial-likelihood deviance) per lambda:
res2$cvm
#  [1] 9.664710 9.606496 ... minimum near index 36 ... [67] 16.813191

# cvsd / cvup / cvlo give the SE, cvm + cvsd and cvm - cvsd; or simply:
res2$lambda.min
# [1] 0.1276238
res2$lambda.1se
# [1] 0.1851602

# Fitted models for the full data:
names(res2$glmnet.fit)
#  [1] "a0"        "beta"      "df"        "dim"       "lambda"    "dev.ratio"
#  [7] "nulldev"   "npasses"   "jerr"      "offset"    "call"
dim(res2$glmnet.fit$beta)
# [1] 1000   67

## Final model ----
# lambda.min is taken verbatim from the lambda vector, so exact == is safe here.
idx_min <- which(res2$lambda == res2$lambda.min)
# [1] 36
res2$glmnet.fit$beta[1:10, idx_min]
#        V1        V2        V3  (V4..V10 all 0)
# 0.2525216 0.8540193 1.6910531
# FIX: count non-zeros in the CV fit itself (the original summed res1$beta;
# the two coincide only because the lambda paths happen to match).
sum(abs(res2$glmnet.fit$beta[, idx_min]) > 1e-20)
# [1] 28

## Compare to the MLE of a much simplified model ----
# Refit the final Lasso model at the selected lambda only:
res3 <- glmnet(Z, Y, family = "cox", lambda = 0.1276238)
res3$beta[1:10, 1]
#        V1        V2        V3  (V4..V10 all 0)
# 0.2525085 0.8539859 1.6909904

# MLE (lambda = 0) with 50 predictors fails — kept commented out so the
# script can be sourced end to end:
# res4 <- glmnet(Z[, 1:50], Y, family = "cox", lambda = 0)
# Error in validObject(.Object) :
#   invalid class "dgCMatrix" object: length(Dimnames[[2]])' must match Dim[2]
# In addition: Warning message:
# from glmnet Fortran code - Outer loop convergence not reached after 100 iterations

# MLE with 30 predictors: the unpenalized estimates of the 3 true signals
# are inflated, and the 27 noise coefficients are all non-zero.
res4 <- glmnet(Z[, 1:30], Y, family = "cox", lambda = 0)
res4$beta[1:10, 1]
#          V1          V2          V3          V4          V5          V6
# 1.075992762 2.198681433 3.398372600 0.096284602 -0.087567099 -0.014552377
#          V7          V8          V9         V10
# 0.137935138 -0.256006874 -0.178628134 -0.019035074
# (V11..V30 similarly non-zero, roughly in (-0.49, 0.41).)