##################################################### ## setting up data attach(fms) akt1Dat <- fms[,substr(names(fms),1,4)=="akt1"] #> dim(akt1Dat) #[1] 1397 24 # check if all SNPs have 3 levels q1=rep(NA,24) for(i in 1:24) q1[i]=nlevels(akt1Dat[,i]) which(q1!=3) table(akt1Dat[,15]) table(akt1Dat[,17]) # exclude the 15th SNP and make the Und elements of akt1Dat into NA levels(akt1Dat[,17])=c("AA", "GA", "GG",NA) akt1Dat=akt1Dat[,-15] # here is indicator of having at least 1 copy of the minor allele getMnrAll <- function(y){ t1=table(y) n1=levels(y) if(t1[1]<=t1[3]) x=ifelse(y!=n1[3],1,0) else x=ifelse(y!=n1[1],1,0) x } akt1Bin=matrix(NA,1397,23) for(i in 1:23) akt1Bin[,i]=getMnrAll(akt1Dat[,i]) # get a complete data set MissDat <- apply(is.na(akt1Bin),1,any) | is.na(NDRM.CH) akt1BinC <- akt1Bin[!MissDat,] > dim(akt1BinC) [1] 707 23 TraitC <- NDRM.CH[!MissDat] # now dichotomize TraitCD <- TraitC > median(TraitC) > summary(TraitCD) Mode FALSE TRUE NA's logical 398 309 0 then set 100 observations aside as test cases set.seed(102) testSmp <- sample(1:707,100) akt1BinC.ts <- akt1BinC[testSmp,] Trait.ts <- TraitCD[testSmp] akt1BinC.ls <- akt1BinC[-testSmp,] Trait.ls <- TraitCD[-testSmp] > summary(Trait.ts) Mode FALSE TRUE NA's logical 56 44 0 > summary(Trait.ls) Mode FALSE TRUE NA's logical 342 265 0 # so 44% TRUE overall and in both samples ###################################### ## linear model clsLM <- lm(Trait.ls~.,data=data.frame(akt1BinC.ls)) prdLM <- predict(clsLM,newdata=data.frame(akt1BinC.ts)) install.packages("pROC") library(pROC) roc(Trait.ts,prdLM,plot=T) ############################# ## random forests install.packages("randomForest") library(randomForest) clsRF <- randomForest(akt1BinC.ls, factor(Trait.ls)) prdRF <- predict(clsRF,akt1BinC.ts) t1=table(prdRF,Trait.ts) # Trait.ts #prdRF FALSE TRUE # FALSE 50 39 # TRUE 6 5 t1[2,2]/sum(t1[,2]) t1[1,1]/sum(t1[,1]) prdRFp <- predict(clsRF,akt1BinC.ts,type="prob") install.packages("pROC") library(pROC) rfROC <- roc(Trait.ts,prdRFp[,1],plot=TRUE) # but the output is random, so perhaps average over many tries? # hard way: by hand #M=10 #sensRF=matrix(NA,M,100) #specRF=matrix(NA,M,100) #for(i in 1:M){ # clsRF <- randomForest(akt1BinC.ls, factor(Trait.ls)) # prdRFp <- predict(clsRF,akt1BinC.ts,type="prob") # rfROC <- roc(Trait.ts,prdRFp[,1]) # nn <- length(rfROC$sensitivities) # sensRF[i,1:nn]=rfROC$sensitivities # specRF[i,1:nn]=rfROC$specificities #} #plot(specRF[1,],sensRF[1,],type="l") #for(i in 2:10) lines(specRF[i,],sens[i,]) # or simpler for(i in 1:M){ clsRF <- randomForest(akt1BinC.ls, factor(Trait.ls)) prdRFp <- predict(clsRF,akt1BinC.ts,type="prob") rfROC <- lines.roc(Trait.ts,prdRFp[,1]) } ############################ ## MARS install.packages("earth") library(earth) # a little tricky how predict works with earth, for example clsMARS <- earth(factor(Trait.ls)~akt1BinC.ls, degree=2) prdMARS <- predict(clsMARS,akt1BinC.ts) # gives an error, while the following works, although not sure how to set threshold, default is 0.5 but that performs poorly clsMARS <- earth(factor(Trait.ls)~., data=data.frame(akt1BinC.ls), degree=2) prdMARS <- predict(clsMARS,akt1BinC.ts,type="class",thresh=0.5) t2=table(prdMARS,Trait.ts) # Trait.ts #prdMARS FALSE TRUE # FALSE 54 41 # TRUE 2 3 t2[2,2]/sum(t2[,2]) t2[1,1]/sum(t2[,1]) prdMARS <- predict(clsMARS,akt1BinC.ts,type="class",thresh=.442) table(prdMARS,Trait.ts) prdMARS <- predict(clsMARS,akt1BinC.ts,type="class",thresh=.443) table(prdMARS,Trait.ts) # so very sensitive to the threshold, maybe try different degree? clsMARS <- earth(factor(Trait.ls)~., data=data.frame(akt1BinC.ls), degree=1) # same problem # to get ROC curve we will vary the threshold M=200 specMARS=rep(NA,M) sensMARS=rep(NA,M) threshVec=rep(NA,M) thr=seq(0,1,len=M) for(i in 1:M){ prdMARS <- predict(clsMARS,akt1BinC.ts,type="class",thresh=thr[i]) t2=table(prdMARS,Trait.ts) sensMARS[i] <- t2[2,2]/sum(t2[,2]) specMARS[i] <- t2[1,1]/sum(t2[,1]) } plot(specMARS,sensMARS,type="l") #hmm specMARS thr[78] thr[144] M=200 specMARS=rep(NA,M) sensMARS=rep(NA,M) threshVec=rep(NA,M) thr=seq(.38,.72,len=M) for(i in 1:M){ prdMARS <- predict(clsMARS,akt1BinC.ts,type="class",thresh=thr[i]) t2=table(prdMARS,Trait.ts) sensMARS[i] <- t2[2,2]/sum(t2[,2]) specMARS[i] <- t2[1,1]/sum(t2[,1]) } specMARS > thr[33:34] [1] 0.4346734 0.4363819 ############################ ## support vector machines install.packages("kernlab") library("kernlab") clsSVM <- ksvm(factor(Trait.ls)~., data=data.frame(akt1BinC.ls), kernel="rbfdot", kpar="automatic", C=60, cross=3, prob.model=TRUE) prdSVM <- predict(clsSVM,akt1BinC.ts) t3=table(prdSVM,Trait.ts) # Trait.ts #prdSVM FALSE TRUE # FALSE 38 31 # TRUE 18 13 t3[2,2]/sum(t3[,2]) t3[1,1]/sum(t3[,1]) # but most would recommend "tuning" the parameters, i.e. change at least C and examine the CV error estimate clsSVM <- ksvm(factor(Trait.ls)~., data=data.frame(akt1BinC.ls), kernel="rbfdot", kpar="automatic", C=10, cross=3, prob.model=TRUE) # for instance this does well by that measure clsSVM <- ksvm(factor(Trait.ls)~., data=data.frame(akt1BinC.ls), kernel="rbfdot", kpar="automatic", C=.1, cross=3, prob.model=TRUE) > t3[2,2]/sum(t3[,2]) #[1] 0.2954545 > t3[1,1]/sum(t3[,1]) #[1] 0.6785714 # but gives exactly the same out of sample performance prdSVMp <- predict(clsSVM,akt1BinC.ts,type="probabilities") svmROC <- roc(Trait.ts,prdSVMp[,1],plot=TRUE) can compare the methods in terms of confidence intervals for the AUC ci(svmROC) ci(rfROC) # can also test for differences roc.test(rfROC,svmROC,paired=TRUE)