Differential gene expression detection using penalized regression model

R codes for the analysis

###############################################################
#### Baolin Wu, baolin@biostat.umn.edu; updated 2004/12/08 ####
###############################################################
## readin the normalized gene expression dataset
tusher = read.table("http://www-stat.stanford.edu/~tibs/tusher/normave.txt")
## create the x expression matrix and the y group indicator
gx = as.matrix( tusher[,3:10] )
y = c("U1A", "U1B", "I1A", "I1B", "U2A", "U2B", "I2A", "I2B")
m = dim(gx)[1] ## total 7129 genes
## yA are four samples from the 1st cell line and yB from the 2nd
yA = 1:4; yB = 5:8
## id0 matrix contains 6 balanced permutations for the first group
##  balance requires each permuted group contains two samples
##  from each cell line
id = expand.grid(1:4,1:4); id0 = as.matrix(id[id[,1]<id[,2], ])

##############################
## SAM statistics analysis
dib = matrix(0, m,36)
s0 = 3.3 ## fudge factor
b = 0 ## total 36 balanced permutations
for(k1 in 1:6){
  for(k2 in 1:6){
    b = b+1
    y0 = c(yA[id0[k1,]], yB[id0[k2,]])
    y1 = setdiff(1:8, y0)

    ## permuted data
    gxb = gx[,c(y0,y1)]
    mu1 = rowMeans(gxb[,1:4]); mu2 = rowMeans(gxb[,5:8])
    s21 = rowSums(gxb[,1:4]^2)-4*mu1^2; s22 = rowSums(gxb[,5:8]^2)-4*mu2^2
    si = sqrt(s21/12+s22/12)
    dib[,b] = sort( (mu1-mu2)/(si+s0) )
  }
}

di = dib[,1]; d0 = rowMeans(dib)
plot(d0, di, pch=20, ylim=c(-10,10), xlim=c(-10,10))
abline(1.2,1, col=2); abline(-1.2,1, col=2)

sum( abs(di-d0)>=1.2 )
c1 = min(di[di-d0>=1.2]); c0 = max(di[di-d0<=-1.2])
mean(colSums((dib>=c1)|(dib<=c0)))

###############################################
## lambda matching s0
Lam1 = sqrt(6)*s0 ## 8.083316
## minimize cor to select optimal lambda
gxb = gx[,c(1:2,5:6, 3:4,7:8)] ## raw data
mu1 = rowMeans(gxb[,1:4]); mu2 = rowMeans(gxb[,5:8])
s21 = rowSums(gxb[,1:4]^2)-4*mu1^2; s22 = rowSums(gxb[,5:8]^2)-4*mu2^2
si = sqrt(s21/12+s22/12)
Lambda = seq(5, 100, length=1e3)
crL = sapply(Lambda, function(lambda){
  mu12 = ifelse( abs(mu1-mu2)>lambda, mu1-mu2-sign(mu1-mu2)*lambda, 0)
  tmp = mu12/sqrt(2*si^2+lambda^2/3)
  idm = order(-si)[1] ## remove the biggest si (outlier)
  return( cor(tmp[-idm],si[-idm]) )
})
Lam2 = Lambda[order(abs(crL))[1]] ## 27.14
plot(Lambda, abs(crL), pch=20, xlab=expression(lambda), ylab="|R|")
abline(v=Lam2, col=2, lty=2)
## maximize SSE/SSTO to select optimal lambda using lowess fitting
Lambda = seq(5, 100, length=1e2)
rL = sapply(Lambda, function(lambda){
  mu12 = ifelse( abs(mu1-mu2)>lambda, mu1-mu2-sign(mu1-mu2)*lambda, 0)
  tmp = mu12/sqrt(2*si^2+lambda^2/3)
  a = lowess(si, tmp, f=2/3)$y; 
  mean((tmp[order(si)]-a)^2)/var(tmp) ## SSE/SSTO
})
Lam3 = Lambda[order(-rL)[1]] ## 32.83
plot(Lambda, sqrt(rL), pch=20, xlab=expression(lambda), 
     ylab=expression(sqrt(SSE/SSTO)))
abline(v=Lam3, col=2, lty=2)

## penalized t-statistics analysis
## run the analysis for different optimal lambda
Lam = Lam1 ## Lam2, Lam3
pib = matrix(0, m,36)
b = 0
for(k1 in 1:6){
  for(k2 in 1:6){
    b = b+1
    y0 = c(yA[id0[k1,]], yB[id0[k2,]])
    y1 = setdiff(1:8, y0)

    ## permuted data
    gxb = gx[,c(y0,y1)]
    mu1 = rowMeans(gxb[,1:4]); mu2 = rowMeans(gxb[,5:8])
    s21 = rowSums(gxb[,1:4]^2)-4*mu1^2; s22 = rowSums(gxb[,5:8]^2)-4*mu2^2
    si = sqrt(s21/12+s22/12)
    mu12 = ifelse( abs(mu1-mu2)>Lam, mu1-mu2-sign(mu1-mu2)*Lam, 0)
    pib[,b] = sort( mu12/sqrt(2*si^2+Lam^2/3) )
  }
}

pi1 = pib[,1]; p0 = rowMeans(pib)
crt = -sort(-abs(pi1-p0))[46]
sum( abs(pi1-p0)>=crt )
c1 = min(pi1[pi1-p0>=crt]); c0 = max(pi1[pi1-p0<=-crt])
mean(colSums((pib>=c1)|(pib<=c0)))

plot(p0, pi1, pch=20, ylim=c(-10,10), xlim=c(-10,10))
abline(1.2,1, col=2); abline(-1.2,1, col=2)

More analysis results

Penalized t-statistics vs. the permuted values: pdf plot 1 and pdf plot 2

Penalized F-statistics vs. the permuted values: pdf plot 1 and pdf plot 2

Comparison of penalized t/F-statistics with the SAM statistics: pdf plot

R functions for the penalized t/F-statistics

## R functions for penalized t/F
tFL1 = function(x, y, lambda){
  ycl = as.integer(as.factor(y))
  n1 = sum(ycl==1); n2 = sum(ycl==2)
  mu1 = rowMeans(x[,ycl==1]); mu2 = rowMeans(x[,ycl==2])
  si2 = (rowSums(x[,ycl==1]^2)-n1*mu1^2+rowSums(x[,ycl==2]^2)-n2*mu2^2)/(n1+n2-2)
  mu12 = mu1-mu2

  nom = ifelse(abs(mu12)>lambda, mu12-sign(mu12)*lambda, 0)
  den = si2*(n1+n2)/n1/n2+lambda^2/(n1+n2-2)
  tL1 = nom/sqrt(den)

  nom = ifelse(abs(mu12)>lambda, mu12^2-lambda^2, 0)
  FL1 = nom/den

  return( cbind(tL1,FL1) )
}

## example
x = matrix(rnorm(1000*20),1000,20)
y = sample(letters[1:2], size=20, rep=TRUE)
a = tFL1(x,y,lambda=0.25)
hist(a[,1], prob=TRUE)