########################################
## Seme-supervised spectral clustering algorithm

SSSC <- function(X,CN,nodes_SS,labels_SS,K,alpha,beta){

## Input:
## 1) X: Data matrix (n by p).
## 2) CN: the number of clusters.
## 3) nodes_SS: the nodes with semi-supervised information.
## 4) labels_SS: the labels of nodes_SS in 3).
## 5) K: the local scale delta[i] can be estimated by the 
##       distance between the sample i and its K-th neighbour.
## 6) alpha: the first parameter in our Algorithm SSSC.
## 7) beta: the second parameter in our Algorithm SSSC.
## Output: 
## 1) label vector (n by 1) of clustering result of SSSC.

X=as.matrix(X)
n=nrow(X)
p=ncol(X)
MM=matrix(0,n,n)
CC=matrix(0,n,n)

if (length(nodes_SS)==n) 
{
labels_SS=labels_SS[order(nodes_SS)]
nodes_SS=1:n
return(labels_SS)
}

if (length(nodes_SS)==0)
{
d=as.matrix(dist(X))
delta=rep(0,n)
A=matrix(0,n,n)
for (i in 1:n)
    { delta[i]=sort(d[i,])[K+1]
    }
for (i in 1:n)
for (j in 1:n)
    { A[i,j]=exp(-(d[i,j])^2/(delta[i]*delta[j]))    
    }
diag(A)=0
D=rowSums(A)
D=diag(1/(sqrt(D)+10^(-20)))
L=D%*%A%*%D
eig=eigen(L,symmetric=TRUE)
U=eig$vectors[1:n,1:CN]
q=sqrt(rowSums(U^2))+10^(-20)
for (i in 1:n)
    { for (j in 1:CN) U[i,j]=U[i,j]/q[i]          
    }
return(kmeans(U,CN,nstart=1000,iter.max=10000)$cluster)
}

if (length(nodes_SS)>0 & length(nodes_SS)<n)
{
labels_SS=labels_SS[order(nodes_SS)]
nodes_SS=nodes_SS[order(nodes_SS)]
a=union(labels_SS,labels_SS)
nl=length(a)
SNum=rep(0,nl)
for (i in 1:nl) SNum[i]=length(which(labels_SS==a[i]))
sk=matrix(0,nl,max(SNum))
nodes_nolabel=1:n
for (k in 1:nl)
     { sk[k,1:SNum[k]]=nodes_SS[which(labels_SS==a[k])]  
       MM[sk[k,1:SNum[k]],sk[k,1:SNum[k]]]=1
       nodes_nolabel=setdiff(nodes_nolabel,sk[k,1:SNum[k]])   
     }
Num_nn=length(nodes_nolabel)
nodes_withlabel=setdiff(1:n,nodes_nolabel)
for (ki in 1:(nl-1))
for (kj in (ki+1):nl)
     { CC[sk[ki,1:SNum[ki]],sk[kj,1:SNum[kj]]]=1
       CC[sk[kj,1:SNum[kj]],sk[ki,1:SNum[ki]]]=1
     }
diag(MM)=0
diag(CC)=0

d=as.matrix(dist(X))
delta=rep(0,n)
A=matrix(0,n,n)
for (i in 1:n)
    { delta[i]=sort(d[i,])[K+1]
    }
for (i in 1:n)
for (j in 1:n)
    { A[i,j]=exp(-(d[i,j])^2/(delta[i]*delta[j]))
      if (MM[i,j]==1) A[i,j]=1
      if (CC[i,j]==1) A[i,j]=0      
    }
diag(A)=0

if (length(nodes_SS)!=n)
    { for (t in 1:Num_nn)
          { b=rep(0,nl)
            for (i in 1:nl) b[i]=max(A[nodes_nolabel[t],sk[i,1:SNum[i]]])
            bm=which.max(b)
            for (i in setdiff(1:nl,bm)) 
                { A[nodes_nolabel[t],sk[i,1:SNum[i]]]=A[nodes_nolabel[t],sk[i,1:SNum[i]]]/beta
                  A[sk[i,1:SNum[i]],nodes_nolabel[t]]=A[sk[i,1:SNum[i]],nodes_nolabel[t]]/beta
                }
            if (alpha==1) 
                { alpha=A[sk[bm,1:SNum[bm]],nodes_nolabel[t]]=max(b)
                  A[nodes_nolabel[t],sk[bm,1:SNum[bm]]]=max(b)
                }
          }
    }
if (length(nodes_SS)==n) A=MM

D=rowSums(A)
D=diag(1/(sqrt(D)+10^(-20)))
L=D%*%A%*%D
eig=eigen(L,symmetric=TRUE)
U=eig$vectors[1:n,1:CN]
q=sqrt(rowSums(U^2))+10^(-20)
for (i in 1:n)
    { for (j in 1:CN) U[i,j]=U[i,j]/q[i]          
    }
return(kmeans(U,CN,nstart=1000,iter.max=10000)$cluster)
}

}

########################################
## Given the real clustering labels ('label') and a semi-supervised 
## ratio ('SSR'), randomly select SSR*100% samples from each 
## cluster as the nodes with semi-supervised information (nodes_SS),
## based on which we use Algorithm SSSC to get better clustering.

test_SSSC <- function(X,CN,label,SSR,K,alpha,beta,seed){

## Input:
## 1) X: Data matrix (n by p).
## 2) CN: the number of clusters.
## 3) label: the real label of the data X.
## 4) SSR: the semi-supervised ratio.
## 5) K: the local scale delta[i] can be estimated by the 
##       distance between the sample i and its K-th neighbour.
## 6) alpha: the first parameter in our Algorithm SSSC.
## 7) beta: the second parameter in our Algorithm SSSC.
## 8) seed: the seed for randomly select SSR*100% samples from clusters.
## Output: 
## 1) label vector of clustering result of SSSC given a SSR.

n=length(label)
if (SSR==1)
    { nodes_SS=1:n
      labels_SS=label
    }
if (SSR<=0)
    { nodes_SS=NULL
      labels_SS=NULL
    }
if (SSR>0 & SSR<=1)
    { a=union(label,label)
      nl=length(a)
      size=rep(0,nl)
      for (i in 1:nl) size[i]=length(which(label==a[i]))
      SSNum=max(size)
      SNum=as.integer(size*SSR)
      SNum[SNum<=1]=2
      sk=matrix(0,nl,SSNum)
      nodes_SS=NULL
      labels_SS=NULL
      for (k in 1:nl)
          { set.seed(seed)
            nodes_SS=c(nodes_SS,sort(sample(which(label==a[k]),SNum[k])))  
            labels_SS=c(labels_SS,rep(a[k],SNum[k])) 
          }
    }
res=SSSC(X,CN,nodes_SS,labels_SS,K,alpha,beta)
return(res)
}

########################################
## PCA for dimension reduction

PCA <- function(H,n_PCs){

## Input:
## 1) H: sample covariance matrix (n by n) of data matrix (H=X*t(X)).
## 2) n_PCs: the number of PCs.
## Output: 
## 1) Y: the new data matrix (n by n_PCs) after dimension reduction of PCA.

n=nrow(H)
eig=eigen(H,symmetric=TRUE)
s=sort.list(eig$values,decreasing=TRUE)
U=eig$vectors[1:n,s]
v=eig$values[s]
V=diag(sqrt(v[1:n_PCs]))
Y=U[,1:n_PCs]%*%V 
return(Y)
}

########################################
##

data_analysis <- function(CN,n_PCs,SSR,nRep,K,alpha,beta){

## Input:
## 1) CN: the number of clusters.
## 2) n_PCs: the number of PCs.
## 3) SSR: the semi-supervised ratio.
## 4) nRep: the number of simulations (Note: seed in 1:nRep). 
## 5) K: the local scale delta[i] can be estimated by the 
##       distance between the sample i and its K-th neighbour.
## 6) alpha: the first parameter in our Algorithm SSSC.
## 7) beta: the second parameter in our Algorithm SSSC.
## Output: 
## 1) print the clustering result for the 1st simulation.
## 2) print the average Rand index between real sub-group 
##    label and cluster label obtained by 'SSSC' for all 
##    the 'nRep' simulations.

library(clues)
c12=c(rep(1,35), rep(2,2),rep(1,43),rep(2,65),rep(3,24),rep(4,32),rep(5,17),rep(4,11),rep(5,19),rep(6,90),rep(7,92), rep(8,25),rep(9,68), rep(10,84) )
H=as.matrix(read.table("H_data.sim"))
n=nrow(H)
#A=diag(n)-matrix(1,n,1)%*%matrix(1,1,n)/n
#H=A%*%H%*%t(A)
c12sort=1:n
c12sort=sort.list(c12)
H=H[c12sort,c12sort]
c12sort=c12[c12sort]
eig=eigen(H,symmetric=TRUE)
s=sort.list(eig$values,decreasing=TRUE)
U=eig$vectors[1:n,s]
v=eig$values[s]
V=diag(sqrt(v[1:n_PCs]))
x=U[,1:n_PCs]%*%V 
cc12=c(rep(1,35), rep(2,2),rep(1,43),rep(2,65),rep(3,24),rep(4,32),rep(5,17),rep(4,11),rep(5,19),rep(6,90),rep(7,92),  rep(8,25),rep(9,68), rep("a",84) ) 
s12=union(c12sort,c12sort)
r=rep(0,5)

for (seed in 1:nRep)
    { cl=test_SSSC(x,CN,c12sort,SSR,K,alpha,beta,seed)
      r=r+adjustedRand(cl, c12sort)
      if (seed==1) 
          { print(paste("The clustering results for 10 sub-groups v.s. ",CN, " clusters with top ",n_PCs, " PCs (K=", K," SSR=",SSR, " alpha=",alpha, " beta=", beta, "):", sep =""))
            print("1) The Rand index between subgroups and obtained clusters for the 1st simulation:")
            print(adjustedRand(cl, c12sort))
          } 
      c=union(cl,cl)    
      vs=matrix(0,10,CN,dimnames = list(c( "YRI","LWK","ASW","GBR","FIN","CEU", "TSI","CHS","CHB","JPT"),1:CN))
      for (ii in 1:10)
          { for (jj in 1:CN)
                { vs[ii,jj]=length(intersect(which(c12sort==s12[ii]),which(cl==c[jj])))
                }
          }
      if (seed==1) print("2) The numbers of subjects assigned to each cluster for the 1st simulation:")
      if (seed==1) print(vs)
    }
r=r/nRep
print("3) The average Rand index between subgroups and obtained clusters for all the simulations:")
print(r)
}

########################################