R codes for the analysis of SRBCT microarray data
URL = "http://research.nhgri.nih.gov/microarray/Supplement/Images/supplemental_data"
srbct = read.table(URL, header=TRUE, sep="\t", skip=1, quote="")
train = log(srbct[,3:65])
m = dim(train)[1]
y = substring(names(train),1,3-(substring(names(train),3,3)=="."))
x = as.matrix(train); ycl = as.integer(as.factor(y))
ng = as.vector(table(ycl)); n = sum(ng)
yid = matrix(0, n, length(ng))
for(k in 1:length(ng)) yid[which(ycl==k),k] = 1/ng[k]
mug = x%*%yid; mu = rowMeans(x)
B = 1e3; set.seed(1027)
Bid = sapply(1:B, function(k) sample(1:n, size=n, rep=FALSE))
K = 3e2; pg = fp = rep(0,K)
Lam = seq(20,100, length=K)
for(k in 1:K){
lambda = Lam[k]; thrh = lambda/2/ng
tmp = t((mug-mu)^2)-thrh^2
shid = colSums(tmp>0)
pg[k] = sum(shid>0)
a = apply(Bid, 2, function(id){
bid = yid[id,]
mug = x%*%bid
tmp = t((mug-mu)^2)-thrh^2
shid = colSums(tmp>0)
return( sum(shid>0) )
})
fp[k] = mean(a)
}
plot(pg, fp, xlab="R", ylab="FP")
K = 3e2; pg = fp = rep(0,K)
Lam = seq(0,10, length=K)
for(k in 1:K){
lambda = Lam[k]; thrh = lambda/2/ng
tmp = t((mug-mu)^2)-thrh^2
shid = colSums(tmp>0)
pg[k] = sum(shid>0)
a = apply(Bid, 2, function(id){
bid = yid[id,]
mug = x%*%bid
tmp = t((mug-mu)^2)-thrh^2
shid = colSums(tmp>0)
return( sum(shid>0) )
})
fp[k] = mean(a)
}
rng = which(Lam>1.0)
plot(Lam[rng], (1-pg[rng]/2308)/(1-fp[rng]/2308), xlab=expression(lambda),
ylab=expression(hat(pi)[0](lambda)))
R functions for the penalized F-statistics and shrunken-centroids
penF = function(ycl,x,lambda,standard=TRUE,scale=TRUE){
y = as.integer(as.factor(ycl))
ng = as.integer(table(y)); n = sum(ng); G = length(ng)
yid = matrix(0, n,G)
for(k in 1:G) yid[which(ycl==k),k] = 1/ng[k]
mug = x%*%yid; mu = rowMeans(x)
x2s = rowSums(x^2)
si = sqrt(x2s - colSums(t((mug-mu)^2)*ng))
if(standard){
sh1 = sqrt((n-ng)/ng)/2
} else{
sh1 = ng/n/2
}
if(scale){
thrh = outer(si, lambda*sh1, "*")
tmp = abs(mug-mu)-thrh
skct = ifelse(tmp>0,tmp,0)*sign(tmp)
tmp = t((mug-mu)^2-thrh^2)*ng
nom = colSums( ifelse(tmp>0,tmp,0) )/(G-1)
den = x2s + colSums(t(thrh^2)*ng)/(n-G)
} else{
thrh = lambda*sh1
tmp = t(t(abs(mug-mu))-thrh)
skct = ifelse(tmp>0,tmp,0)*sign(tmp)
tmp = (t((mug-mu)^2)-thrh^2)*ng
nom = colSums( ifelse(tmp>0,tmp,0) )/(G-1)
den = x2s + sum(thrh^2*ng)/(n-G)
}
return(list(FL1=nom/den, skct=skct))
}
Classification error for some public microarray data
For the 50 CVs, each time one-third of the samples was selected as the testing
set. The classifier was trained on the training set using K-fold CV, which was then
applied to the testing set.
Breast cancer microarray data
The data can be downloaded from
http://data.cgt.duke.edu/west.php. K=5.
This plot summaries the classification errors
for the four methods compared in the paper.
Lymphoma microarray data
The data can be downloaded from
http://llmpp.nih.gov/lymphoma. K=3.
This plot summaries the classification errors
for the four methods compared in the paper.
Brain tumor microarray data
The data can be downloaded from
http://www.broad.mit.edu/cgi-bin/cancer/datasets.cgi.
K=3.
This plot summaries the classification errors
for the four methods compared in the paper.