######################################## ## Seme-supervised spectral clustering algorithm SSSC <- function(X,CN,nodes_SS,labels_SS,K,alpha,beta){ ## Input: ## 1) X: Data matrix (n by p). ## 2) CN: the number of clusters. ## 3) nodes_SS: the nodes with semi-supervised information. ## 4) labels_SS: the labels of nodes_SS in 3). ## 5) K: the local scale delta[i] can be estimated by the ## distance between the sample i and its K-th neighbour. ## 6) alpha: the first parameter in our Algorithm SSSC. ## 7) beta: the second parameter in our Algorithm SSSC. ## Output: ## 1) label vector (n by 1) of clustering result of SSSC. X=as.matrix(X) n=nrow(X) p=ncol(X) MM=matrix(0,n,n) CC=matrix(0,n,n) if (length(nodes_SS)==n) { labels_SS=labels_SS[order(nodes_SS)] nodes_SS=1:n return(labels_SS) } if (length(nodes_SS)==0) { d=as.matrix(dist(X)) delta=rep(0,n) A=matrix(0,n,n) for (i in 1:n) { delta[i]=sort(d[i,])[K+1] } for (i in 1:n) for (j in 1:n) { A[i,j]=exp(-(d[i,j])^2/(delta[i]*delta[j])) } diag(A)=0 D=rowSums(A) D=diag(1/(sqrt(D)+10^(-20))) L=D%*%A%*%D eig=eigen(L,symmetric=TRUE) U=eig$vectors[1:n,1:CN] q=sqrt(rowSums(U^2))+10^(-20) for (i in 1:n) { for (j in 1:CN) U[i,j]=U[i,j]/q[i] } return(kmeans(U,CN,nstart=1000,iter.max=10000)$cluster) } if (length(nodes_SS)>0 & length(nodes_SS)0 & SSR<=1) { a=union(label,label) nl=length(a) size=rep(0,nl) for (i in 1:nl) size[i]=length(which(label==a[i])) SSNum=max(size) SNum=as.integer(size*SSR) SNum[SNum<=1]=2 sk=matrix(0,nl,SSNum) nodes_SS=NULL labels_SS=NULL for (k in 1:nl) { set.seed(seed) nodes_SS=c(nodes_SS,sort(sample(which(label==a[k]),SNum[k]))) labels_SS=c(labels_SS,rep(a[k],SNum[k])) } } res=SSSC(X,CN,nodes_SS,labels_SS,K,alpha,beta) return(res) } ######################################## ## PCA for dimension reduction PCA <- function(H,n_PCs){ ## Input: ## 1) H: sample covariance matrix (n by n) of data matrix (H=X*t(X)). ## 2) n_PCs: the number of PCs. ## Output: ## 1) Y: the new data matrix (n by n_PCs) after dimension reduction of PCA. n=nrow(H) eig=eigen(H,symmetric=TRUE) s=sort.list(eig$values,decreasing=TRUE) U=eig$vectors[1:n,s] v=eig$values[s] V=diag(sqrt(v[1:n_PCs])) Y=U[,1:n_PCs]%*%V return(Y) } ######################################## ## data_analysis <- function(CN,n_PCs,SSR,nRep,K,alpha,beta){ ## Input: ## 1) CN: the number of clusters. ## 2) n_PCs: the number of PCs. ## 3) SSR: the semi-supervised ratio. ## 4) nRep: the number of simulations (Note: seed in 1:nRep). ## 5) K: the local scale delta[i] can be estimated by the ## distance between the sample i and its K-th neighbour. ## 6) alpha: the first parameter in our Algorithm SSSC. ## 7) beta: the second parameter in our Algorithm SSSC. ## Output: ## 1) print the clustering result for the 1st simulation. ## 2) print the average Rand index between real sub-group ## label and cluster label obtained by 'SSSC' for all ## the 'nRep' simulations. library(clues) c12=c(rep(1,35), rep(2,2),rep(1,43),rep(2,65),rep(3,24),rep(4,32),rep(5,17),rep(4,11),rep(5,19),rep(6,90),rep(7,92), rep(8,25),rep(9,68), rep(10,84) ) H=as.matrix(read.table("H_data.sim")) n=nrow(H) #A=diag(n)-matrix(1,n,1)%*%matrix(1,1,n)/n #H=A%*%H%*%t(A) c12sort=1:n c12sort=sort.list(c12) H=H[c12sort,c12sort] c12sort=c12[c12sort] eig=eigen(H,symmetric=TRUE) s=sort.list(eig$values,decreasing=TRUE) U=eig$vectors[1:n,s] v=eig$values[s] V=diag(sqrt(v[1:n_PCs])) x=U[,1:n_PCs]%*%V cc12=c(rep(1,35), rep(2,2),rep(1,43),rep(2,65),rep(3,24),rep(4,32),rep(5,17),rep(4,11),rep(5,19),rep(6,90),rep(7,92), rep(8,25),rep(9,68), rep("a",84) ) s12=union(c12sort,c12sort) r=rep(0,5) for (seed in 1:nRep) { cl=test_SSSC(x,CN,c12sort,SSR,K,alpha,beta,seed) r=r+adjustedRand(cl, c12sort) if (seed==1) { print(paste("The clustering results for 10 sub-groups v.s. ",CN, " clusters with top ",n_PCs, " PCs (K=", K," SSR=",SSR, " alpha=",alpha, " beta=", beta, "):", sep ="")) print("1) The Rand index between subgroups and obtained clusters for the 1st simulation:") print(adjustedRand(cl, c12sort)) } c=union(cl,cl) vs=matrix(0,10,CN,dimnames = list(c( "YRI","LWK","ASW","GBR","FIN","CEU", "TSI","CHS","CHB","JPT"),1:CN)) for (ii in 1:10) { for (jj in 1:CN) { vs[ii,jj]=length(intersect(which(c12sort==s12[ii]),which(cl==c[jj]))) } } if (seed==1) print("2) The numbers of subjects assigned to each cluster for the 1st simulation:") if (seed==1) print(vs) } r=r/nRep print("3) The average Rand index between subgroups and obtained clusters for all the simulations:") print(r) } ########################################