#Stat Learning and Data Mining
#Example 9.1: Use K-means, model-based clustering, spectral clustering.

#to calculate Rand index:
library(clues)

data(iris)
#     This famous (Fisher's or Anderson's) iris data set gives the
#     measurements in centimeters of the variables sepal length and
#     width and petal length and width, respectively, for 50 flowers
#     from each of 3 species of iris.  The species are _Iris setosa_,
#     _versicolor_, and _virginica_.

iris[1:3,]
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#1          5.1         3.5          1.4         0.2  setosa
#2          4.9         3.0          1.4         0.2  setosa
#3          4.7         3.2          1.3         0.2  setosa

#data standardization:
x<-as.matrix(iris[,-5])
n=nrow(x); p=ncol(x)
mus<-apply(x, 2, mean)
sds<-apply(x, 2, sd)
for(i in 1:n)
  x[i,]<-(x[i,]-mus)/sds

y<-as.numeric(iris[,5])
y
#  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3
#[112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[149] 3 3

#show data:
pdf("C:/Users/panxx014/Documents/courses/7475/Examples/figs/ex9.1.pdf")
pairs(iris[1:4], main = "Anderson's Iris Data -- Original",
      pch = 21, bg = c("red", "green3", "blue")[unclass(iris$Species)])

pairs(x, main = "Anderson's Iris Data -- Standardized",
      pch = 21, bg = c("red", "green3", "blue")[unclass(iris$Species)])
dev.off()

########K-means:
set.seed(1)
kmeansres3 <- kmeans(x, 3, nstart=100)
kmeansres3
#K-means clustering with 3 clusters of sizes 50, 53, 47
#
#Cluster means:
#  Sepal.Length Sepal.Width Petal.Length Petal.Width
#1  -1.01119138  0.85041372   -1.3006301  -1.2507035
#2  -0.05005221 -0.88042696    0.3465767   0.2805873
#3   1.13217737  0.08812645    0.9928284   1.0141287
#
#Clustering vector:
#  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
# [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
#[112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3
#[149] 3 2
#
#Within cluster sum of squares by cluster:
#[1] 47.35062 44.08754 47.45019
# (between_SS / total_SS =  76.7 %)

#Rand index:
adjustedRand(kmeansres3$cluster, y)
#     Rand        HA        MA        FM   Jaccard 
#0.8322148 0.6201352 0.6252249 0.7452105 0.5938921 


####Try two clusters (since the second and third species are overlapped):
kmeansres2 <- kmeans(x, 2, nstart=100)
adjustedRand(kmeansres2$cluster, c(rep(1,50), rep(2,100) ))
#   Rand      HA      MA      FM Jaccard 
#      1       1       1       1       1 

######model-based clustering:
library(mclust)

#default: it searches for the best model with G=1,...,9,
res<-Mclust(x)
res
#'Mclust' model object:
# best model: ellipsoidal, varying volume, shape, and orientation (VVV) with 2 components

names(res)
# [1] "call"           "data"           "modelName"      "n"              "d"             
# [6] "G"              "BIC"            "bic"            "loglik"         "df"            
#[11] "hypvol"         "parameters"     "z"              "classification" "uncertainty"   

#parameter estimates (MLEs):
res$para
$Vinv
NULL

$pro
[1] 0.3333291 0.6666709

$mean
                   [,1]       [,2]
Sepal.Length -1.0111837  0.5055822
Sepal.Width   0.8504463 -0.4252151
Petal.Length -1.3006289  0.6503021
Petal.Width  -1.2507044  0.6253403

$variance
$variance$modelName
[1] "VVV"

$variance$d
[1] 4

$variance$G
[1] 2

$variance$sigma
, , 1

             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length   0.17757541  0.26937936  0.010964119 0.016040461
Sepal.Width    0.26937936  0.74114227  0.014896467 0.027429123
Petal.Length   0.01096412  0.01489647  0.009484405 0.004420546
Petal.Width    0.01604046  0.02742912  0.004420546 0.018733189

, , 2

             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length    0.6343547   0.3350867    0.3070669   0.2622096
Sepal.Width     0.3350867   0.5769960    0.1837455   0.2384840
Petal.Length    0.3070669   0.1837455    0.2165539   0.2124543
Petal.Width     0.2622096   0.2384840    0.2124543   0.3074577
......

#show model selection results:
 mclustBIC(x)

BIC:
        EII       VII        EEI        VEI        EVI        VVI        EEE        EEV
1 -1723.766 -1723.766 -1738.7979 -1738.7979 -1738.7979 -1738.7979 -1046.6559 -1046.6559
2 -1344.053 -1325.273 -1259.6457 -1172.9600 -1223.9860 -1074.2293  -904.7750  -873.6590
3 -1214.526 -1222.840 -1029.7318  -995.8345 -1014.5122  -961.3152  -849.6435  -873.0320
4 -1177.157 -1139.393 -1044.1093  -965.1341 -1054.0810  -957.3083  -862.7295  -884.9026
5 -1135.479 -1107.769  -958.6200  -905.0295  -983.5087  -917.7421  -821.5165  -920.3064
6 -1150.550 -1122.557  -940.5658  -891.8990  -953.9741  -923.7943  -854.8822  -920.0409
7 -1140.580 -1117.073  -888.3655  -883.5454  -920.8466  -920.4096  -830.1964  -908.1671
8 -1121.389 -1067.484  -908.0867  -894.3584  -957.0332  -943.9553  -852.4652  -960.3896
9 -1115.153 -1057.656  -895.2568  -888.5205  -953.9837  -966.2672  -860.9755 -1008.9819
         VEV        VVV
1 -1046.6559 -1046.6559
2  -802.5253  -790.6956
3  -832.3626  -797.5178
4  -861.8030         NA
5  -859.7821         NA
6  -877.6475         NA
7  -907.9526         NA
8  -960.9389 -1064.9436

#Rand index:
adjustedRand(res$class, y)
#     Rand        HA        MA        FM   Jaccard 
#0.7762864 0.5681159 0.5714286 0.7714543 0.5951417 

adjustedRand(res$class, c(rep(1,50), rep(2,100) ))
#   Rand      HA      MA      FM Jaccard 
#      1       1       1       1       1 

res3<-Mclust(x, G=3:9)
#Warning messages:
#1: In summary.mclustBIC(Bic, data, G = G, modelNames = modelNames) :
#  best model occurs at the min or max # of components considered
#2: In Mclust(x, G = 3:9) : optimal number of clusters occurs at min choice

res3
#'Mclust' model object:
# best model: ellipsoidal, varying volume, shape, and orientation (VVV) with 3 components

adjustedRand(res3$class, y)
#     Rand        HA        MA        FM   Jaccard 
#0.9574944 0.9038742 0.9051581 0.9355986 0.8789809 

######very useful to use plot:
plot(res)


#######################Use spectral clustering:
library(kernlab)

SCres <- specc(x, centers=3, kernel="rbfdot", kpar="automatic")

 SCres
Spectral Clustering object of class "specc" 

 Cluster memberships: 
 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
 
Gaussian Radial Basis kernel function. 
 Hyperparameter : sigma =  7.26257829977622 

Centers:  
           [,1]       [,2]       [,3]       [,4]
[1,]  0.4556221 -0.4863246  0.6241403  0.6027379
[2,] -1.0111914  0.8504137 -1.3006301 -1.2507035
[3,]  2.1214087  1.5509344  1.4966310  1.3565323

Cluster size:  
[1] 97 50  3

Within-cluster sum of squares:  
[1] 314.729336 365.106833   2.562621

##Rand:
adjustedRand(as.vector(SCres), y)
#     Rand        HA        MA        FM   Jaccard 
#0.7770917 0.5621364 0.5657505 0.7599789 0.5865560

adjustedRand(as.vector(SCres), c(rep(1,50), rep(2,100) ))
#     Rand        HA        MA        FM   Jaccard 
#0.9739597 0.9476272 0.9479231 0.9761529 0.9528745