#PubH Stat Learning and Data Mining
#Example 2.1b: Use of knn() function and LOOCV; see Fig 2.4

#set a seed for the random number generator
set.seed(24)

#generate simulated data
#there are two classes: R and G
#G: 1) mGi iid N((1,0), I), i=1,...,10
#   2) mk iid from {mG1,...,mG10}, xGk ~ N(mk, I/5), k=1,...,100 --training
#   2') mk iid from {mG1,...,mG10}, yGk ~ N(mk, I/5), k=1,...,10000 --test

#training:
mG1<-rnorm(10, 1, 1)
mG2<-rnorm(10, 0, 1)
#draw the 100 centers as (x1,x2) pairs (sampling each coordinate
#independently would not match the mixture model described above):
idx<-sample(1:10, 100, replace=TRUE)
m1<-mG1[idx]
m2<-mG2[idx]
xG1<-rnorm(100, m1, sqrt(1/5))
xG2<-rnorm(100, m2, sqrt(1/5))
#test data (each of the 10 centers used exactly 1000 times):
#idx<-sample(1:10, 10000, replace=TRUE); m1<-mG1[idx]; m2<-mG2[idx]
m1<-rep(mG1, 1000)
m2<-rep(mG2, 1000)
xtG1<-rnorm(10000, m1, sqrt(1/5))
xtG2<-rnorm(10000, m2, sqrt(1/5))

#R: 1) mRi iid N((0,1), I), i=1,...,10
#   2) mk iid from {mR1,...,mR10}, xRk ~ N(mk, I/5), k=1,...,100 --training
#   2') mk iid from {mR1,...,mR10}, yRk ~ N(mk, I/5), k=1,...,10000 --test

#training:
mR1<-rnorm(10, 0, 1)
mR2<-rnorm(10, 1, 1)
idx<-sample(1:10, 100, replace=TRUE)
m1<-mR1[idx]
m2<-mR2[idx]
xR1<-rnorm(100, m1, sqrt(1/5))
xR2<-rnorm(100, m2, sqrt(1/5))
#test data:
#idx<-sample(1:10, 10000, replace=TRUE); m1<-mR1[idx]; m2<-mR2[idx]
m1<-rep(mR1, 1000)
m2<-rep(mR2, 1000)
xtR1<-rnorm(10000, m1, sqrt(1/5))
xtR2<-rnorm(10000, m2, sqrt(1/5))

#assemble design matrices and class labels (0=G, 1=R):
train<-cbind(c(xG1, xR1), c(xG2, xR2))
test<-cbind(c(xtG1, xtR1), c(xtG2, xtR2))
yTr<-c(rep(0, 100), rep(1, 100))
yTs<-c(rep(0, 10000), rep(1, 10000))

#plot training data:
pdf("ex2.1trdat.pdf")
plot(xR1, xR2, col="red", xlab="X1", ylab="X2", xlim=c(-3,3),
     ylim=c(-3,3), main="Example 2.1")
points(xG1, xG2, col="green")
dev.off()

###use knn() in the class package:
library(class)
ks<-seq(1, 199, by=2)
TRerr<-TSerr<-CVerr<-rep(0, length(ks))
for(i in 1:length(ks)){
  #training error: predict the training set from itself
  tr.res<-knn(train, train, cl=yTr, k=ks[i])
  TRerr[i]<-sum(tr.res!=yTr)
  #test error:
  ts.res<-knn(train, test, cl=yTr, k=ks[i])
  TSerr[i]<-sum(ts.res!=yTs)
  #(only) LOOCV:
  cv.res<-knn.cv(train, cl=yTr, k=ks[i])
  CVerr[i]<-sum(cv.res!=yTr)
}

##summarizing the results in a plot:
pdf("ex2.1b.pdf")
plot(ks, TRerr/200, xlab="k", type="b", main="Performance of kNN",
     ylab="Error rate", ylim=c(0,0.55), col="green")
lines(ks, TSerr/20000, type="b", col="red")
lines(ks, CVerr/200, type="b", col="blue")
#Bayes rule's error rate:
abline(h=0.1585)
legend(0, 0.5, c("training", "test", "CV", "Bayes"),
       col=c("green", "red", "blue", "black"),
       pch=c(1, 1, 1, NA), lty=c(1, 1, 1, 1))
dev.off()
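
##---added note (not in the original script)---
##The LOOCV curve is what one would typically use to choose k in practice;
##a one-line sketch, assuming the ks and CVerr computed above:
k.cv<-ks[which.min(CVerr)]   #k minimizing LOOCV error (ties: smallest such k)
k.cv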
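
##---added sketch (not in the original script)---
##The abline above hard-codes 0.1585 as the Bayes error rate. With equal
##priors, the Bayes rule for this simulation assigns a point to the class
##whose mixture density (10 Gaussian components with covariance I/5 around
##the simulated centers) is larger, so its error rate can be approximated
##on the test set. A minimal sketch, assuming the centers mG1, mG2, mR1,
##mR2 and the test data from above; mixdens is a hypothetical helper name:
mixdens<-function(x1, x2, c1, c2){
  d<-rep(0, length(x1))
  for(j in 1:10)
    d<-d+dnorm(x1, c1[j], sqrt(1/5))*dnorm(x2, c2[j], sqrt(1/5))
  d/10
}
pG<-mixdens(test[,1], test[,2], mG1, mG2)
pR<-mixdens(test[,1], test[,2], mR1, mR2)
bayes.pred<-as.numeric(pR>pG)   #0=G, 1=R
mean(bayes.pred!=yTs)           #empirical Bayes error; compare with 0.1585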