A unified statistical framework for differential gene expression detection and sample classification using penalized linear regression models

R codes for the analysis of SRBCT microarray data

###############################################################
#### Baolin Wu, baolin@biostat.umn.edu; updated 2005/10/10 ####
###############################################################
## readin the normalized gene expression dataset
URL = "http://research.nhgri.nih.gov/microarray/Supplement/Images/supplemental_data"
srbct = read.table(URL, header=TRUE, sep="\t", skip=1, quote="")

## False positive estimation for the training data
train = log(srbct[,3:65])
m = dim(train)[1]
y = substring(names(train),1,3-(substring(names(train),3,3)=="."))

x = as.matrix(train); ycl = as.integer(as.factor(y))
ng = as.vector(table(ycl)); n = sum(ng)

yid = matrix(0, n, length(ng))
for(k in 1:length(ng)) yid[which(ycl==k),k] = 1/ng[k]
mug = x%*%yid; mu = rowMeans(x)

B = 1e3; set.seed(1027)
Bid = sapply(1:B, function(k) sample(1:n, size=n, rep=FALSE))

K = 3e2; pg = fp = rep(0,K)
Lam = seq(20,100, length=K)
for(k in 1:K){
  lambda = Lam[k]; thrh = lambda/2/ng
  tmp = t((mug-mu)^2)-thrh^2
  shid = colSums(tmp>0)
  pg[k] = sum(shid>0)

  a = apply(Bid, 2, function(id){
    bid = yid[id,]
    mug = x%*%bid
    tmp = t((mug-mu)^2)-thrh^2
    shid = colSums(tmp>0)
    return( sum(shid>0) )
  })
  fp[k] = mean(a)
}
## Number of significant genes vs. number of false positives
plot(pg, fp, xlab="R", ylab="FP")

### Empirical bayesian estimation of pi0
K = 3e2; pg = fp = rep(0,K)
Lam = seq(0,10, length=K)
for(k in 1:K){
  lambda = Lam[k]; thrh = lambda/2/ng
  tmp = t((mug-mu)^2)-thrh^2
  shid = colSums(tmp>0)
  pg[k] = sum(shid>0)
  
  a = apply(Bid, 2, function(id){
    bid = yid[id,]
    mug = x%*%bid
    tmp = t((mug-mu)^2)-thrh^2
    shid = colSums(tmp>0)
    return( sum(shid>0) )
  })
  fp[k] = mean(a)
}
## pi0[lambda] ~ lambda
rng = which(Lam>1.0)
plot(Lam[rng], (1-pg[rng]/2308)/(1-fp[rng]/2308), xlab=expression(lambda),
     ylab=expression(hat(pi)[0](lambda)))

R functions for the penalized F-statistics and shrunken-centroids

## R functions for penalized F/shrunken centroids
penF = function(ycl,x,lambda,standard=TRUE,scale=TRUE){
  y = as.integer(as.factor(ycl))
  ng = as.integer(table(y)); n = sum(ng); G = length(ng)
  yid = matrix(0, n,G)
  for(k in 1:G) yid[which(ycl==k),k] = 1/ng[k]
  mug = x%*%yid; mu = rowMeans(x)
  
  x2s = rowSums(x^2)
  si = sqrt(x2s - colSums(t((mug-mu)^2)*ng))
  
  if(standard){
    sh1 = sqrt((n-ng)/ng)/2
  } else{
    sh1 = ng/n/2
  }
  if(scale){
    thrh = outer(si, lambda*sh1, "*")
    tmp = abs(mug-mu)-thrh
    skct = ifelse(tmp>0,tmp,0)*sign(tmp)
    tmp = t((mug-mu)^2-thrh^2)*ng
    nom = colSums( ifelse(tmp>0,tmp,0) )/(G-1)
    den = x2s + colSums(t(thrh^2)*ng)/(n-G)
  } else{
    thrh = lambda*sh1
    tmp = t(t(abs(mug-mu))-thrh)
    skct = ifelse(tmp>0,tmp,0)*sign(tmp)
    tmp = (t((mug-mu)^2)-thrh^2)*ng
    nom = colSums( ifelse(tmp>0,tmp,0) )/(G-1)
    den = x2s + sum(thrh^2*ng)/(n-G)
  }

  return(list(FL1=nom/den, skct=skct))
}

Classification error for some public microarray data

For the 50 CVs, each time one-third of the samples was selected as the testing set. The classifier was trained on the training set using K-fold CV, which was then applied to the testing set.

Breast cancer microarray data

The data can be downloaded from http://data.cgt.duke.edu/west.php. K=5. This plot summaries the classification errors for the four methods compared in the paper.

Lymphoma microarray data

The data can be downloaded from http://llmpp.nih.gov/lymphoma. K=3. This plot summaries the classification errors for the four methods compared in the paper.

Brain tumor microarray data

The data can be downloaded from http://www.broad.mit.edu/cgi-bin/cancer/datasets.cgi. K=3. This plot summaries the classification errors for the four methods compared in the paper.