#Stat Learning and Data Mining #Example 9.1: Use K-means, model-based clustering, spectral clustering. #to calculate Rand index: library(clues) data(iris) # This famous (Fisher's or Anderson's) iris data set gives the # measurements in centimeters of the variables sepal length and # width and petal length and width, respectively, for 50 flowers # from each of 3 species of iris. The species are _Iris setosa_, # _versicolor_, and _virginica_. iris[1:3,] # Sepal.Length Sepal.Width Petal.Length Petal.Width Species #1 5.1 3.5 1.4 0.2 setosa #2 4.9 3.0 1.4 0.2 setosa #3 4.7 3.2 1.3 0.2 setosa #data standardization: x<-as.matrix(iris[,-5]) n=nrow(x); p=ncol(x) mus<-apply(x, 2, mean) sds<-apply(x, 2, sd) for(i in 1:n) x[i,]<-(x[i,]-mus)/sds y<-as.numeric(iris[,5]) y # [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 # [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 # [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 #[112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 #[149] 3 3 #show data: pdf("C:/Users/panxx014/Documents/courses/7475/Examples/figs/ex9.1.pdf") pairs(iris[1:4], main = "Anderson's Iris Data -- Original", pch = 21, bg = c("red", "green3", "blue")[unclass(iris$Species)]) pairs(x, main = "Anderson's Iris Data -- Standardized", pch = 21, bg = c("red", "green3", "blue")[unclass(iris$Species)]) dev.off() ########K-means: set.seed(1) kmeansres3 <- kmeans(x, 3, nstart=100) kmeansres3 #K-means clustering with 3 clusters of sizes 50, 53, 47 # #Cluster means: # Sepal.Length Sepal.Width Petal.Length Petal.Width #1 -1.01119138 0.85041372 -1.3006301 -1.2507035 #2 -0.05005221 -0.88042696 0.3465767 0.2805873 #3 1.13217737 0.08812645 0.9928284 1.0141287 # #Clustering vector: # [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 # [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2 # [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3 #[112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3 #[149] 3 2 # #Within cluster sum of squares by cluster: #[1] 47.35062 44.08754 47.45019 # (between_SS / total_SS = 76.7 %) #Rand index: adjustedRand(kmeansres3$cluster, y) # Rand HA MA FM Jaccard #0.8322148 0.6201352 0.6252249 0.7452105 0.5938921 ####Try two clusters (since the second and third species are overlapped): kmeansres2 <- kmeans(x, 2, nstart=100) adjustedRand(kmeansres2$cluster, c(rep(1,50), rep(2,100) )) # Rand HA MA FM Jaccard # 1 1 1 1 1 ######model-based clustering: library(mclust) #default: it searches for the best model with G=1,...,9, res<-Mclust(x) res #'Mclust' model object: # best model: ellipsoidal, varying volume, shape, and orientation (VVV) with 2 components names(res) # [1] "call" "data" "modelName" "n" "d" # [6] "G" "BIC" "bic" "loglik" "df" #[11] "hypvol" "parameters" "z" "classification" "uncertainty" #parameter estimates (MLEs): res$para $Vinv NULL $pro [1] 0.3333291 0.6666709 $mean [,1] [,2] Sepal.Length -1.0111837 0.5055822 Sepal.Width 0.8504463 -0.4252151 Petal.Length -1.3006289 0.6503021 Petal.Width -1.2507044 0.6253403 $variance $variance$modelName [1] "VVV" $variance$d [1] 4 $variance$G [1] 2 $variance$sigma , , 1 Sepal.Length Sepal.Width Petal.Length Petal.Width Sepal.Length 0.17757541 0.26937936 0.010964119 0.016040461 Sepal.Width 0.26937936 0.74114227 0.014896467 0.027429123 Petal.Length 0.01096412 0.01489647 0.009484405 0.004420546 Petal.Width 0.01604046 0.02742912 0.004420546 0.018733189 , , 2 Sepal.Length Sepal.Width Petal.Length Petal.Width Sepal.Length 0.6343547 0.3350867 0.3070669 0.2622096 Sepal.Width 0.3350867 0.5769960 0.1837455 0.2384840 Petal.Length 0.3070669 0.1837455 0.2165539 0.2124543 Petal.Width 0.2622096 0.2384840 0.2124543 0.3074577 ...... #show model selection results: mclustBIC(x) BIC: EII VII EEI VEI EVI VVI EEE EEV 1 -1723.766 -1723.766 -1738.7979 -1738.7979 -1738.7979 -1738.7979 -1046.6559 -1046.6559 2 -1344.053 -1325.273 -1259.6457 -1172.9600 -1223.9860 -1074.2293 -904.7750 -873.6590 3 -1214.526 -1222.840 -1029.7318 -995.8345 -1014.5122 -961.3152 -849.6435 -873.0320 4 -1177.157 -1139.393 -1044.1093 -965.1341 -1054.0810 -957.3083 -862.7295 -884.9026 5 -1135.479 -1107.769 -958.6200 -905.0295 -983.5087 -917.7421 -821.5165 -920.3064 6 -1150.550 -1122.557 -940.5658 -891.8990 -953.9741 -923.7943 -854.8822 -920.0409 7 -1140.580 -1117.073 -888.3655 -883.5454 -920.8466 -920.4096 -830.1964 -908.1671 8 -1121.389 -1067.484 -908.0867 -894.3584 -957.0332 -943.9553 -852.4652 -960.3896 9 -1115.153 -1057.656 -895.2568 -888.5205 -953.9837 -966.2672 -860.9755 -1008.9819 VEV VVV 1 -1046.6559 -1046.6559 2 -802.5253 -790.6956 3 -832.3626 -797.5178 4 -861.8030 NA 5 -859.7821 NA 6 -877.6475 NA 7 -907.9526 NA 8 -960.9389 -1064.9436 #Rand index: adjustedRand(res$class, y) # Rand HA MA FM Jaccard #0.7762864 0.5681159 0.5714286 0.7714543 0.5951417 adjustedRand(res$class, c(rep(1,50), rep(2,100) )) # Rand HA MA FM Jaccard # 1 1 1 1 1 res3<-Mclust(x, G=3:9) #Warning messages: #1: In summary.mclustBIC(Bic, data, G = G, modelNames = modelNames) : # best model occurs at the min or max # of components considered #2: In Mclust(x, G = 3:9) : optimal number of clusters occurs at min choice res3 #'Mclust' model object: # best model: ellipsoidal, varying volume, shape, and orientation (VVV) with 3 components adjustedRand(res3$class, y) # Rand HA MA FM Jaccard #0.9574944 0.9038742 0.9051581 0.9355986 0.8789809 ######very useful to use plot: plot(res) #######################Use spectral clustering: library(kernlab) SCres <- specc(x, centers=3, kernel="rbfdot", kpar="automatic") SCres Spectral Clustering object of class "specc" Cluster memberships: 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 Gaussian Radial Basis kernel function. Hyperparameter : sigma = 7.26257829977622 Centers: [,1] [,2] [,3] [,4] [1,] 0.4556221 -0.4863246 0.6241403 0.6027379 [2,] -1.0111914 0.8504137 -1.3006301 -1.2507035 [3,] 2.1214087 1.5509344 1.4966310 1.3565323 Cluster size: [1] 97 50 3 Within-cluster sum of squares: [1] 314.729336 365.106833 2.562621 ##Rand: adjustedRand(as.vector(SCres), y) # Rand HA MA FM Jaccard #0.7770917 0.5621364 0.5657505 0.7599789 0.5865560 adjustedRand(as.vector(SCres), c(rep(1,50), rep(2,100) )) # Rand HA MA FM Jaccard #0.9739597 0.9476272 0.9479231 0.9761529 0.9528745