############################################################### #### Baolin Wu, baolin@biostat.umn.edu; updated 2004/12/08 #### ############################################################### ## readin the normalized gene expression dataset tusher = read.table("http://www-stat.stanford.edu/~tibs/tusher/normave.txt") ## create the x expression matrix and the y group indicator gx = as.matrix( tusher[,3:10] ) y = c("U1A", "U1B", "I1A", "I1B", "U2A", "U2B", "I2A", "I2B") m = dim(gx)[1] ## total 7129 genes ## yA are four samples from the 1st cell line and yB from the 2nd yA = 1:4; yB = 5:8 ## id0 matrix contains 6 balanced permutations for the first group ## balance requires each permuted group contains two samples ## from each cell line id = expand.grid(1:4,1:4); id0 = as.matrix(id[id[,1]<id[,2], ]) ############################## ## SAM statistics analysis dib = matrix(0, m,36) s0 = 3.3 ## fudge factor b = 0 ## total 36 balanced permutations for(k1 in 1:6){ for(k2 in 1:6){ b = b+1 y0 = c(yA[id0[k1,]], yB[id0[k2,]]) y1 = setdiff(1:8, y0) ## permuted data gxb = gx[,c(y0,y1)] mu1 = rowMeans(gxb[,1:4]); mu2 = rowMeans(gxb[,5:8]) s21 = rowSums(gxb[,1:4]^2)-4*mu1^2; s22 = rowSums(gxb[,5:8]^2)-4*mu2^2 si = sqrt(s21/12+s22/12) dib[,b] = sort( (mu1-mu2)/(si+s0) ) } } di = dib[,1]; d0 = rowMeans(dib) plot(d0, di, pch=20, ylim=c(-10,10), xlim=c(-10,10)) abline(1.2,1, col=2); abline(-1.2,1, col=2) sum( abs(di-d0)>=1.2 ) c1 = min(di[di-d0>=1.2]); c0 = max(di[di-d0<=-1.2]) mean(colSums((dib>=c1)|(dib<=c0))) ############################################### ## lambda matching s0 Lam1 = sqrt(6)*s0 ## 8.083316 ## minimize cor to select optimal lambda gxb = gx[,c(1:2,5:6, 3:4,7:8)] ## raw data mu1 = rowMeans(gxb[,1:4]); mu2 = rowMeans(gxb[,5:8]) s21 = rowSums(gxb[,1:4]^2)-4*mu1^2; s22 = rowSums(gxb[,5:8]^2)-4*mu2^2 si = sqrt(s21/12+s22/12) Lambda = seq(5, 100, length=1e3) crL = sapply(Lambda, function(lambda){ mu12 = ifelse( abs(mu1-mu2)>lambda, mu1-mu2-sign(mu1-mu2)*lambda, 0) tmp = mu12/sqrt(2*si^2+lambda^2/3) idm = order(-si)[1] ## remove the biggest si (outlier) return( cor(tmp[-idm],si[-idm]) ) }) Lam2 = Lambda[order(abs(crL))[1]] ## 27.14 plot(Lambda, abs(crL), pch=20, xlab=expression(lambda), ylab="|R|") abline(v=Lam2, col=2, lty=2) ## maximize SSE/SSTO to select optimal lambda using lowess fitting Lambda = seq(5, 100, length=1e2) rL = sapply(Lambda, function(lambda){ mu12 = ifelse( abs(mu1-mu2)>lambda, mu1-mu2-sign(mu1-mu2)*lambda, 0) tmp = mu12/sqrt(2*si^2+lambda^2/3) a = lowess(si, tmp, f=2/3)$y; mean((tmp[order(si)]-a)^2)/var(tmp) ## SSE/SSTO }) Lam3 = Lambda[order(-rL)[1]] ## 32.83 plot(Lambda, sqrt(rL), pch=20, xlab=expression(lambda), ylab=expression(sqrt(SSE/SSTO))) abline(v=Lam3, col=2, lty=2) ## penalized t-statistics analysis ## run the analysis for different optimal lambda Lam = Lam1 ## Lam2, Lam3 pib = matrix(0, m,36) b = 0 for(k1 in 1:6){ for(k2 in 1:6){ b = b+1 y0 = c(yA[id0[k1,]], yB[id0[k2,]]) y1 = setdiff(1:8, y0) ## permuted data gxb = gx[,c(y0,y1)] mu1 = rowMeans(gxb[,1:4]); mu2 = rowMeans(gxb[,5:8]) s21 = rowSums(gxb[,1:4]^2)-4*mu1^2; s22 = rowSums(gxb[,5:8]^2)-4*mu2^2 si = sqrt(s21/12+s22/12) mu12 = ifelse( abs(mu1-mu2)>Lam, mu1-mu2-sign(mu1-mu2)*Lam, 0) pib[,b] = sort( mu12/sqrt(2*si^2+Lam^2/3) ) } } pi1 = pib[,1]; p0 = rowMeans(pib) crt = -sort(-abs(pi1-p0))[46] sum( abs(pi1-p0)>=crt ) c1 = min(pi1[pi1-p0>=crt]); c0 = max(pi1[pi1-p0<=-crt]) mean(colSums((pib>=c1)|(pib<=c0))) plot(p0, pi1, pch=20, ylim=c(-10,10), xlim=c(-10,10)) abline(1.2,1, col=2); abline(-1.2,1, col=2)
Penalized t-statistics vs. the permuted values: pdf plot 1 and pdf plot 2
Penalized F-statistics vs. the permuted values: pdf plot 1 and pdf plot 2
Comparison of penalized t/F-statistics with the SAM statistics: pdf plot
## R functions for penalized t/F tFL1 = function(x, y, lambda){ ycl = as.integer(as.factor(y)) n1 = sum(ycl==1); n2 = sum(ycl==2) mu1 = rowMeans(x[,ycl==1]); mu2 = rowMeans(x[,ycl==2]) si2 = (rowSums(x[,ycl==1]^2)-n1*mu1^2+rowSums(x[,ycl==2]^2)-n2*mu2^2)/(n1+n2-2) mu12 = mu1-mu2 nom = ifelse(abs(mu12)>lambda, mu12-sign(mu12)*lambda, 0) den = si2*(n1+n2)/n1/n2+lambda^2/(n1+n2-2) tL1 = nom/sqrt(den) nom = ifelse(abs(mu12)>lambda, mu12^2-lambda^2, 0) FL1 = nom/den return( cbind(tL1,FL1) ) } ## example x = matrix(rnorm(1000*20),1000,20) y = sample(letters[1:2], size=20, rep=TRUE) a = tFL1(x,y,lambda=0.25) hist(a[,1], prob=TRUE)