#PubH Stat Learning and Data Mining
#Example 2.1b: Use of knn() function and LOOCV; see Fig 2.4

#set a seed for the random number generator
set.seed(24)

#generate simulated data
#there are two classes: R and G
#G: 1) mGi iid N((1,0), I), i=1,...,10
#   2) mk iid from {mG1,...,mG10}, xGk ~ N(mk, I/5), k=1,...,100 --training
#   2') mk iid from {mG1,...,mG10}, yGk ~ N(mk, I/5), k=1,...,10000 --test

#training:
mG1<-rnorm(10, 1, 1)
mG2<-rnorm(10, 0, 1)
#draw the 100 centers as (x1,x2) pairs (sampling each coordinate
#independently would not match the mixture model described above):
idx<-sample(1:10, 100, replace=TRUE)
m1<-mG1[idx]
m2<-mG2[idx]
xG1<-rnorm(100, m1, sqrt(1/5))
xG2<-rnorm(100, m2, sqrt(1/5))
#test data (each of the 10 centers used exactly 1000 times):
#idx<-sample(1:10, 10000, replace=TRUE); m1<-mG1[idx]; m2<-mG2[idx]
m1<-rep(mG1, 1000)
m2<-rep(mG2, 1000)
xtG1<-rnorm(10000, m1, sqrt(1/5))
xtG2<-rnorm(10000, m2, sqrt(1/5))

#R: 1) mRi iid N((0,1), I), i=1,...,10
#   2) mk iid from {mR1,...,mR10}, xRk ~ N(mk, I/5), k=1,...,100 --training
#   2') mk iid from {mR1,...,mR10}, yRk ~ N(mk, I/5), k=1,...,10000 --test

#training:
mR1<-rnorm(10, 0, 1)
mR2<-rnorm(10, 1, 1)
idx<-sample(1:10, 100, replace=TRUE)
m1<-mR1[idx]
m2<-mR2[idx]
xR1<-rnorm(100, m1, sqrt(1/5))
xR2<-rnorm(100, m2, sqrt(1/5))
#test data:
#idx<-sample(1:10, 10000, replace=TRUE); m1<-mR1[idx]; m2<-mR2[idx]
m1<-rep(mR1, 1000)
m2<-rep(mR2, 1000)
xtR1<-rnorm(10000, m1, sqrt(1/5))
xtR2<-rnorm(10000, m2, sqrt(1/5))

#assemble design matrices and class labels (0=G, 1=R):
train<-cbind(c(xG1, xR1), c(xG2, xR2))
test<-cbind(c(xtG1, xtR1), c(xtG2, xtR2))
yTr<-c(rep(0, 100), rep(1, 100))
yTs<-c(rep(0, 10000), rep(1, 10000))

#plot training data:
pdf("ex2.1trdat.pdf")
plot(xR1, xR2, col="red", xlab="X1", ylab="X2", xlim=c(-3,3),
     ylim=c(-3,3), main="Example 2.1")
points(xG1, xG2, col="green")
dev.off()

###use knn() in the class package:
library(class)
ks<-seq(1, 199, by=2)
TRerr<-TSerr<-CVerr<-rep(0, length(ks))
for(i in 1:length(ks)){
  #training error: predict the training set from itself
  tr.res<-knn(train, train, cl=yTr, k=ks[i])
  TRerr[i]<-sum(tr.res!=yTr)
  #test error:
  ts.res<-knn(train, test, cl=yTr, k=ks[i])
  TSerr[i]<-sum(ts.res!=yTs)
  #(only) LOOCV:
  cv.res<-knn.cv(train, cl=yTr, k=ks[i])
  CVerr[i]<-sum(cv.res!=yTr)
}

##summarizing the results in a plot:
pdf("ex2.1b.pdf")
plot(ks, TRerr/200, xlab="k", type="b", main="Performance of kNN",
     ylab="Error rate", ylim=c(0,0.55), col="green")
lines(ks, TSerr/20000, type="b", col="red")
lines(ks, CVerr/200, type="b", col="blue")
#Bayes rule's error rate:
abline(h=0.1585)
legend(0, 0.5, c("training", "test", "CV", "Bayes"),
       col=c("green", "red", "blue", "black"),
       pch=c(1, 1, 1, NA), lty=c(1, 1, 1, 1))
dev.off()
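
##---added note (not in the original script)---
##The LOOCV curve is what one would typically use to choose k in practice;
##a one-line sketch, assuming the ks and CVerr computed above:
k.cv<-ks[which.min(CVerr)]   #k minimizing LOOCV error (ties: smallest such k)
k.cv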
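
##---added sketch (not in the original script)---
##The abline above hard-codes 0.1585 as the Bayes error rate. With equal
##priors, the Bayes rule for this simulation assigns a point to the class
##whose mixture density (10 Gaussian components with covariance I/5 around
##the simulated centers) is larger, so its error rate can be approximated
##on the test set. A minimal sketch, assuming the centers mG1, mG2, mR1,
##mR2 and the test data from above; mixdens is a hypothetical helper name:
mixdens<-function(x1, x2, c1, c2){
  d<-rep(0, length(x1))
  for(j in 1:10)
    d<-d+dnorm(x1, c1[j], sqrt(1/5))*dnorm(x2, c2[j], sqrt(1/5))
  d/10
}
pG<-mixdens(test[,1], test[,2], mG1, mG2)
pR<-mixdens(test[,1], test[,2], mR1, mR2)
bayes.pred<-as.numeric(pR>pG)   #0=G, 1=R
mean(bayes.pred!=yTs)           #empirical Bayes error; compare with 0.1585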