###################################################### # R functions for Fang H and Pan W (2010). # Powerful Multi-marker Association Tests: Unifying Genomic # Distance-Based Regression and Logistic Regression. # To appear in Genet Epi. # Downloadable as Rearch Report 2010-5 at: # http://www.sph.umn.edu/biostatistics/research.asp # April 6, 2010 # Programmer: Fang HAN # Email: fanghan@biostat.umn.edu ## Acknowledgement: GDBR() function is largely based on the program of ## Dr WY Lin from Lin and Schaid (2009, Genet Epi, 33:432-441); ## We thank Dr Lin for generously sharing his program with us. ###WARNING: GDBR F-test is only applicable for an equal # of ### cases and controls; it should eb easy to implement a more ### general version based on the formula given by Lin and Schaid (2009). ####################################### ### Base Functions ################### ####################################### library(mvtnorm) library(haplo.stats) alpha=0.05 PowerUniv <- function(U,V){ n <- dim(V)[1] x <- as.numeric(max(abs(U))) TER <- as.numeric(1-pmvnorm(lower=c(rep(-x,n)),upper=c(rep(x,n)),mean=c(rep(0,n)),sigma=V)) return(TER) } Xsquare <- function(X){ n <- nrow(X) p <- ncol(X) XX2 = NULL for(i in 1:p) for(j in i:p) XX2 = cbind(XX2,X[,i]*X[,j]) return(XX2) } Xsquare2 <- function(X){ n <- nrow(X) p <- ncol(X) XX2 = NULL for(i in 1:(p-1)) for(j in (i+1):p) XX2 = cbind(XX2,X[,i]*X[,j]) return(XX2) } getorders<-function(x,c){ return(x[order(x)][c*length(x)]) } pvTest <- function(Y,X){ Xg<-X Xbar<-apply(Xg, 2, mean) Xgb<-Xg for(i in 1:nrow(Xg)) Xgb[i,]<-Xg[i,]-Xbar #########SumSqUs: U<-t(Xg) %*% (Y-mean(Y)) #SumSqU: Tg1<- t(U) %*% U ##cov of the score stats: CovS<- mean(Y)*(1-mean(Y))*(t(Xgb) %*% Xgb) aaa <- min(abs(CovS[CovS>0]))/2 CovS[CovS==0] <- CovS[CovS==0]+aaa ##distr of Tg1 is sum of cr Chisq_1: cr<-eigen(CovS, only.values=TRUE)$values ##approximate the distri by alpha Chisq_d + beta: alpha1<-sum(cr*cr*cr)/sum(cr*cr) beta1<-sum(cr) - (sum(cr*cr)^2)/(sum(cr*cr*cr)) d1<-(sum(cr*cr)^3)/(sum(cr*cr*cr)^2) alpha1<-as.real(alpha1) beta1<-as.real(beta1) d1<-as.real(d1) pTg1<-as.numeric(1-pchisq((Tg1-beta1)/alpha1, d1)) #SumSqUw: Tg2<- t(U) %*% diag(1/diag(CovS)) %*% U ##distr of Tg1 is sum of cr Chisq_1: cr<-eigen(CovS %*% diag(1/diag(CovS)), only.values=TRUE)$values ##approximate the distri by alpha Chisq_d + beta: alpha2<-sum(cr*cr*cr)/sum(cr*cr) beta2<-sum(cr) - (sum(cr*cr)^2)/(sum(cr*cr*cr)) d2<-(sum(cr*cr)^3)/(sum(cr*cr*cr)^2) alpha2<-as.real(alpha2) beta2<-as.real(beta2) d2<-as.real(d2) pTg2<-as.numeric(1-pchisq((Tg2-beta2)/alpha2, d2)) ##########score test: ##gInv of CovS: CovS.edecomp<-eigen(CovS) CovS.rank<-sum(abs(CovS.edecomp$values)> 1e-8) inveigen<-ifelse(abs(CovS.edecomp$values) >1e-8, 1/CovS.edecomp$values, 0) P<-solve(CovS.edecomp$vectors) gInv.CovS<-t(P) %*% diag(inveigen) %*% P Tscore<- t(U) %*% gInv.CovS %*% U pTscore<-as.numeric( 1-pchisq(Tscore, CovS.rank) ) #### V1 %*% t(V1)=CovS: CovS.ev<-ifelse(abs(CovS.edecomp$values) >1e-8, CovS.edecomp$values, 0) V1<-CovS.edecomp$vectors %*% diag(sqrt(abs(CovS.ev))) ##univariate/marginal tests: nSNP<-ncol(Xg) Tus<-as.vector(abs(U)/sqrt(diag(CovS))) Vs <- matrix(c(rep(0,nSNP^2)),nrow=nSNP) for(i in 1:nSNP){ for(j in 1:nSNP){ Vs[i,j] <- CovS[i,j]/sqrt(CovS[i,i]*CovS[j,j]) } } pTus <- as.numeric(PowerUniv(Tus,Vs)) return(cbind(pTscore,pTg1,pTg2,pTus)) } SSU <- function(Y,Z){ nSNP<-ncol(Z) ###########SSU##################### Xg<-Z Xbar<-apply(Xg, 2, mean) Xgb<-Xg for(i in 1:nrow(Xg)) Xgb[i,]<-Xg[i,]-Xbar #########SumSqUs: U<-t(Xg) %*% (Y-mean(Y)) #SumSqU: Tg1<- t(U) %*% U ##cov of the score stats: CovS<- mean(Y)*(1-mean(Y))*(t(Xgb) %*% Xgb) ##distr of Tg1 is sum of cr Chisq_1: cr<-eigen(CovS, only.values=TRUE)$values ##approximate the distri by alpha Chisq_d + beta: alpha1<-sum(cr*cr*cr)/sum(cr*cr) beta1<-sum(cr) - (sum(cr*cr)^2)/(sum(cr*cr*cr)) d1<-(sum(cr*cr)^3)/(sum(cr*cr*cr)^2) alpha1<-as.real(alpha1) beta1<-as.real(beta1) d1<-as.real(d1) pTg1<-as.numeric(1-pchisq((Tg1-beta1)/alpha1, d1)) ##gInv of CovS: CovS.edecomp<-eigen(CovS) CovS.rank<-sum(abs(CovS.edecomp$values)> 1e-8) inveigen<-ifelse(abs(CovS.edecomp$values) >1e-8, 1/CovS.edecomp$values, 0) P<-solve(CovS.edecomp$vectors) gInv.CovS<-t(P) %*% diag(inveigen) %*% P #### V1 %*% t(V1)=CovS: CovS.ev<-ifelse(abs(CovS.edecomp$values) >1e-8, CovS.edecomp$values, 0) V1<-CovS.edecomp$vectors %*% diag(sqrt(CovS.ev)) return(cbind(rep(pTg1,nSNP),V1)) } simpleSSU <- function(Us,V){ #SumSqU: Tg1<- t(Us) %*% Us ##cov of the score stats: CovS<- V ##distr of Tg1 is sum of cr Chisq_1: cr<-eigen(CovS, only.values=TRUE)$values ##approximate the distri by alpha Chisq_d + beta: alpha1<-sum(cr*cr*cr)/sum(cr*cr) beta1<-sum(cr) - (sum(cr*cr)^2)/(sum(cr*cr*cr)) d1<-(sum(cr*cr)^3)/(sum(cr*cr*cr)^2) alpha1<-as.real(alpha1) beta1<-as.real(beta1) d1<-as.real(d1) pTg1<-1-pchisq((Tg1-beta1)/alpha1, d1) return(pTg1) } SSULMODEL <-function(V1,V2,V3,V4,V5,B){ n1 <- dim(V1)[1] n2 <- dim(V2)[1] n3 <- dim(V3)[1] n4 <- dim(V4)[1] n5 <- dim(V5)[1] pL2 <- pL3 <- pL4 <- pL5 <- NULL for(sim in 1:B){ set.seed(sim) Us1<-rnorm(n1) Us1<-V1 %*% Us1 Us2<-rnorm(n2) Us2<-V2 %*% Us2 Us3<-rnorm(n3) Us3<-V3 %*% Us3 Us4<-rnorm(n4) Us4<-V4 %*% Us4 Us5<-rnorm(n5) Us5<-V5 %*% Us5 p1 <- simpleSSU(Us1,V1%*%t(V1)) p2 <- simpleSSU(Us2,V2%*%t(V2)) p3 <- simpleSSU(Us3,V3%*%t(V3)) p4 <- simpleSSU(Us4,V4%*%t(V4)) p5 <- simpleSSU(Us5,V5%*%t(V5)) pvs <- c(p1,p2,p3,p4,p5) a <- c(1,2,3,4,5) pvs3 <- pvs[a] a <- c(1,2) pvs2 <- pvs[a] a <- c(1,3,4,5) pvs4 <- pvs[a] a <- c(1,5) pvs5 <- pvs[a] pL2<-rbind(pL2, cbind(min(pvs2), prod(pvs2), prod(pvs2[pvs2<0.05]))) pL3<-rbind(pL3, cbind(min(pvs3), prod(pvs3), prod(pvs3[pvs3<0.05]))) pL4<-rbind(pL4, cbind(min(pvs4), prod(pvs4), prod(pvs4[pvs4<0.05]))) pL5<-rbind(pL5, cbind(min(pvs5), prod(pvs5), prod(pvs5[pvs5<0.05]))) } return(cbind(pL2,pL3,pL4,pL5)) } UnivSSU <- function(Y,X){ p <- dim(X)[2] pv<-NULL for(i in 1:p){ pv1<-summary(glm(Y~X[,i],family=binomial(link="logit")))$coefficients[,4][2] pv <- c(pv,pv1) } return(pv) } arank<-function(a,alpha){ p <- length(a) a <- a[a>0] a <- sort(a,decreasing=T) s0 <- sum(a) s <- 0 for(i in 1:p){ s = s+a[i] if(s/s0>0.95) break } return(max(i-1,2)) } ################################################## ################################################## ###WARNING: GDBR F-test is only applicable for an equal # of ### cases and controls ##############Introduction to function parameters # Y: the disease indicator # X: the genotype matrix # alpha: the cut-off value that we merge rare haplotypes # n.per: the number of permutations in each repetition # sig: the significance level # B: the repetition times for test combination part of GDBR model. GDBR <- function(Y,X,alpha=0.01,n.per=1e3,sig=0.05,B=1e3){ n.indi<-length(Y) n.CN <- n.indi/2 n.CS <- n.indi/2 Copy<-X # geno-sim Dm <- matrix(0,n.indi,n.indi) # Distance matrix for(i in 1:(n.indi)){ if(i1){ for(j in 1:(i-1)){ Dm[i,j] <- Dm[j,i] } } } A1 <- (-0.5)*(Dm**2) # the association matrix F.va1 <- sum(A1[1:n.CN,(n.CN+1):n.indi])/(sum(A1[1:n.CN,1:n.CN])+sum(A1[(n.CN+1):n.indi,(n.CN+1):n.indi]))-0.5 # haplo-sim (counting measure) # use haplo.em function first genotype <- matrix(NA,n.indi,(ncol(Copy)*2)) # create input file for haplo.em for(i in 1:n.indi){ for(k in 1:ncol(Copy)){ if(Copy[i,k]==0){ genotype[i,(2*k-1):(2*k)] <- 1 } else if(Copy[i,k]==1){ genotype[i,(2*k-1)] <- 1 genotype[i,(2*k)] <- 2 } else if(Copy[i,k]==2){ genotype[i,(2*k-1):(2*k)] <- 2 } } } haplo.object <- haplo.em(genotype) haplotype <- haplo.object$haplotype haplo.data <- cbind(haplo.object$subj.id, haplo.object$hap1code, haplo.object$hap2code, haplo.object$post) ### merge rare haplotypes with its most similar common haplotypes for(i in 1:nrow(haplo.object$haplotype)){ if(haplo.object$hap.prob[i]=alpha)){ tmp.dis <- sum(haplo.object$haplotype[i,]!=haplo.object$haplotype[j,]) if(tmp.dishaplo.object$hap.prob[tmp.sub.hap]){ tmp.dis.min <- tmp.dis tmp.sub.hap <- j } } } } haplo.data[haplo.data[,2]==i,2] <- tmp.sub.hap haplo.data[haplo.data[,3]==i,3] <- tmp.sub.hap } } # haplo-score # haplo-max y <- rep(0,n.CN) y[(n.CN+1):n.indi] <- rep(1,n.CS) tmp <- haplo.score(y,genotype,trait.type='binomial',simulate=TRUE,skip.haplo=alpha) global.pvalue <- tmp$score.global.p.sim # keep the permutation p value for 'haplo-score' and 'haplo-max' max.pvalue <- tmp$score.max.p.sim # We won't permute for 'haplo-score' nor 'haplo-max' # haplo-sim (counting measure) # haplo-match (matching measure) S <- matrix(1,n.indi,n.indi) # similarity matrix for haplo-sim S2 <- matrix(1,n.indi,n.indi) # similarity matrix for haplo-match for(i in 1:(n.indi)){ if(i1){ for(j in 1:(i-1)){ S[i,j] <- S[j,i] S2[i,j] <- S2[j,i] } } } Dm1 <- (1-S) # Distance matrix A4 <- (-0.5)*(Dm1**2) # Association matrix F.va4 <- sum(A4[1:n.CN,(n.CN+1):n.indi])/(sum(A4[1:n.CN,1:n.CN])+sum(A4[(n.CN+1):n.indi,(n.CN+1):n.indi]))-0.5 Dm2 <- (1-S2) # Distance matrix A7 <- (-0.5)*(Dm2**2) # Association matrix F.va7 <- sum(A7[1:n.CN,(n.CN+1):n.indi])/(sum(A7[1:n.CN,1:n.CN])+sum(A7[(n.CN+1):n.indi,(n.CN+1):n.indi]))-0.5 # can-cor can <- (cancor(y,Copy)$cor)**2 ### geno-LRT dev <- glm(y~Copy,family=binomial)$null.deviance-glm(y~Copy,family=binomial)$deviance ### permutation ### We permute for 'geno-sim', 'haplo-sim', 'geno-LRT', 'can-cor', and 'haplo-match' ### 'haplo-score' and 'haplo-max' have already permuted by the R package 'haplo.stats' count <- rep(0,7) # counting the number that the permutated statistic is more extreme than the observed statistic for(pp in 1:n.per){ # 'pp' is the index for permutation ## n.per is the number of permutations X <- matrix(1,n.indi,1) ran <- runif(n.indi,0,1) rank.ran <- rank(ran) for(i in 1:n.indi){ if(rank.ran[i]<=n.CN){ X[i] <- (-1) ### we randomly shuffle the phenotypes } } H <- (1/n.indi)*(X%*%t(X)) # geno-sim F.null <- (n.indi*sum(H*A1))/(-2*sum((H>0)*A1)) if(F.null>F.va1){ count[1] <- count[1]+1 } # haplo-sim F.null <- (n.indi*sum(H*A4))/(-2*sum((H>0)*A4)) if(F.null>F.va4){ count[4] <- count[4]+1 } # haplo-match F.null <- (n.indi*sum(H*A7))/(-2*sum((H>0)*A7)) if(F.null>F.va7){ count[7] <- count[7]+1 } # can-cor for(i in 1:n.indi){ if(X[i] == (-1)){ X[i] <- 0 } } if(((cancor(X,Copy)$cor)**2)>can){ count[5] <- count[5]+1 } # geno-LRT if((glm(X~Copy,family=binomial)$null.deviance-glm(X~Copy,family=binomial)$deviance)>dev){ count[3] <- count[3]+1 } #if((count[1]>(n.per*sig)) & (count[3]>(n.per*sig)) & (count[4]>(n.per*sig)) & (count[5]>(n.per*sig)) & (count[7]>(n.per*sig))){ # count[1] <- n.per*0.5 # count[3] <- n.per*0.5 # count[4] <- n.per*0.5 # count[5] <- n.per*0.5 # count[7] <- n.per*0.5 # break #} } # end permutation loop per.pvalue <- (count/n.per) # permutation p value ppG1 = per.pvalue[1] ppH1 = per.pvalue[4] ppH2 = per.pvalue[7] #####Calculating Z. I <- matrix(0,n.indi,n.indi) for(i in 1:n.indi) I[i,i]<-1 ones <- rep(1,n.indi) G1 = (I-ones%*%t(ones)/n.indi)%*%A1%*%(I-ones%*%t(ones)/n.indi) G4 = (I-ones%*%t(ones)/n.indi)%*%A4%*%(I-ones%*%t(ones)/n.indi) G7 = (I-ones%*%t(ones)/n.indi)%*%A7%*%(I-ones%*%t(ones)/n.indi) ### Z1 %*% t(Z1)=G1: G1.edecomp<-eigen(G1) G1.rank<-arank(G1.edecomp$values) Z1=cmdscale(d=Dm, k=max(G1.rank,3)) Z1.indi<-c(1,2) for(i in 3:ncol(Z1)){ if(getorders(abs(Z1[,i]),0.9)>1e-5) Z1.indi<-c(Z1.indi,i) } Z1 <- Z1[,Z1.indi] ### Z2 %*% t(Z2)=G4: G4.edecomp<-eigen(G4) G4.rank<-arank(G4.edecomp$values) Z2=cmdscale(d=Dm1, k=max(G4.rank,3)) Z2.indi<-c(1,2) for(i in 3:ncol(Z2)){ if(getorders(abs(Z2[,i]),0.9)>1e-5) Z2.indi<-c(Z2.indi,i) } Z2 <- Z2[,Z2.indi] ### Z3 %*% t(Z3)=G7: G7.edecomp<-eigen(G7) G7.rank<-arank(G7.edecomp$values) Z3=cmdscale(d=Dm2, k=max(G7.rank,3)) Z3.indi<-c(1,2) for(i in 3:ncol(Z3)){ if(getorders(abs(Z3[,i]),0.9)>1e-5) Z3.indi<-c(Z3.indi,i) } Z3 <- Z3[,Z3.indi] nZ1<-dim(Z1)[2] nZ2<-dim(Z2)[2] nZ3<-dim(Z3)[2] XX2 <- Xsquare2(Copy) X <- Copy pvG1 <- pvTest(Y,Z1) pvH1 <- pvTest(Y,Z2) pvH2 <- pvTest(Y,Z3) pvL1 <- pvTest(Y,X) pvL2 <- pvTest(Y,cbind(X,XX2)) pvL3 <- pvTest(Y,cbind(X,XX2,Z1,Z2,Z3)) pvL4 <- pvTest(Y,cbind(X,Z1,Z2,Z3)) pvL5 <- pvTest(Y,cbind(X,Z3)) #####TC on L2-L5 MODEL pX <- SSU(Y,X) pXX2 <- SSU(Y,XX2) pG <- SSU(Y,Z1) pH1 <- SSU(Y,Z2) pH2 <- SSU(Y,Z3) pvs <- c(pX[1,1],pXX2[1,1],pG[1,1],pH1[1,1],pH2[1,1]) V1 <-pX[,-1] V2 <-pXX2[,-1] V3 <-pG[,-1] V4 <-pH1[,-1] V5 <-pH2[,-1] ######L2-l5 MODEL a <- c(1,2,3,4,5) pvs3 <- pvs[a] a <- c(1,2) pvs2 <- pvs[a] a <- c(1,3,4,5) pvs4 <- pvs[a] a <- c(1,5) pvs5 <- pvs[a] pL2<-c(min(pvs2), prod(pvs2), prod(pvs2[pvs2<0.05])) pL3<-c(min(pvs3), prod(pvs3), prod(pvs3[pvs3<0.05])) pL4<-c(min(pvs4), prod(pvs4), prod(pvs4[pvs4<0.05])) pL5<-c(min(pvs5), prod(pvs5), prod(pvs5[pvs5<0.05])) simpvs <- SSULMODEL(V1,V2,V3,V4,V5,B) epL2<-cbind(sum(pL2[1]>simpvs[,1])/B, sum(pL2[2]>simpvs[,2])/B, sum(pL2[3]>simpvs[,3])/B) epL3<-cbind(sum(pL3[1]>simpvs[,4])/B, sum(pL3[2]>simpvs[,5])/B, sum(pL3[3]>simpvs[,6])/B) epL4<-cbind(sum(pL4[1]>simpvs[,7])/B, sum(pL4[2]>simpvs[,8])/B, sum(pL4[3]>simpvs[,9])/B) epL5<-cbind(sum(pL5[1]>simpvs[,10])/B, sum(pL5[2]>simpvs[,11])/B, sum(pL5[3]>simpvs[,12])/B) return(ppG1,ppH1,ppH2,pvL1,pvL2,pvL3,pvL4,pvL5,epL2,epL3,epL4,epL5) } # Output is the pvalues of the methods, the order is: # 1-3: G1, H1 and H2 model under Lin's paper # 4-7: Score, SSU, SSUw, UminP tests on L1 # 8-11: Score, SSU, SSUw, UminP tests on L2 # 12-15: Score, SSU, SSUw, UminP tests on L3 # 16-19: Score, SSU, SSUw, UminP tests on L4 # 20-23: Score, SSU, SSUw, UminP tests on L5 # 24-26: minP, Fisher,TPM based on "univariate" SSU tests on L2 # 27-29: minP, Fisher,TPM based on "univariate" SSU tests on L3 # 30-32: minP, Fisher,TPM based on "univariate" SSU tests on L4 # 33-35: minP, Fisher,TPM based on "univariate" SSU tests on L5