#PubH Stat Learning and Data Mining
#Example 2.1: comparing linear regression, kNN, Bayes and logistic regression; 
#             see Fig 2.4

#set a seed for the random number generator
set.seed(24)

#generate simulated data
#there are two classes: R and G
#G: 1) mGi iid N((1,0), I), i=1,...,10
#   2) mk iid from {mG1,...,mG10}, xGk ~ N(mk, I/5), k=1,...,100--training
#   2') mk iid from {mG1,...,mG10}, yGk ~ N(mk, I/5), k=1,...,10000--test

#training:
mG1<-rnorm(10, 1, 1)
mG2<-rnorm(10, 0, 1)
m1<-sample(mG1, 100, replace=T)
m2<-sample(mG2, 100, replace=T)
xG1<-rnorm(100, m1, sqrt(1/5))
xG2<-rnorm(100, m2, sqrt(1/5))
#test data:
#(instead of re-sampling the means, each of the 10 means is replicated 1000
# times below, giving a balanced test set of 10000 points)
#m1<-sample(mG1, 10000, replace=T)
#m2<-sample(mG2, 10000, replace=T)
m1<-rep(mG1, 1000)
m2<-rep(mG2, 1000)
xtG1<-rnorm(10000, m1, sqrt(1/5))
xtG2<-rnorm(10000, m2, sqrt(1/5))

#R: 1) mRi iid N((0,1), I), i=1,...,10
#   2) mk iid from {mR1,...,mR10}, xRk ~ N(mk, I/5), k=1,...,100--training
#   2') mk iid from {mR1,...,mR10}, yRk ~ N(mk, I/5), k=1,...,10000--test

#training:
mR1<-rnorm(10, 0, 1)
mR2<-rnorm(10, 1, 1)
m1<-sample(mR1, 100, replace=T)
m2<-sample(mR2, 100, replace=T)
xR1<-rnorm(100, m1, sqrt(1/5))
xR2<-rnorm(100, m2, sqrt(1/5))
#test data (again replicating each of the 10 means 1000 times):
#m1<-sample(mR1, 10000, replace=T)
#m2<-sample(mR2, 10000, replace=T)
m1<-rep(mR1, 1000)
m2<-rep(mR2, 1000)
xtR1<-rnorm(10000, m1, sqrt(1/5))
xtR2<-rnorm(10000, m2, sqrt(1/5))
 
#response: y=1 for R; =0 for G
yR<-rep(1,100); yG<-rep(0, 100)
#ytR<-rep(1, 10000); ytG<-rep(0, 10000)

#plot training data:
plot(xR1, xR2, col="red", xlab="X1", ylab="X2", xlim=c(-3,3),
     ylim=c(-3,3), main="Example 2.1")
points(xG1, xG2, col="green")
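
#optionally add a legend (a small addition; colors match the points above):
legend("topright", legend=c("R", "G"), col=c("red", "green"), pch=1)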

##############linear regression
data.raw<-data.frame(cbind(c(yG, yR), c(xG1, xR1), c(xG2, xR2)) )
colnames(data.raw)<-c("Y", "X1", "X2")
fit1<-lm( Y~X1 + X2, data=data.raw)
summary(fit1)

#Call:
#lm(formula = Y ~ X1 + X2, data = data.raw)

#Residuals:
#     Min       1Q   Median       3Q      Max 
#-0.75918 -0.29087 -0.03323  0.30858  0.89282 

#Coefficients:
#            Estimate Std. Error t value Pr(>|t|)    
#(Intercept)  0.52315    0.03189   16.40  < 2e-16 ***
#X1          -0.20295    0.02572   -7.89 2.05e-13 ***
#X2           0.15940    0.02028    7.86 2.45e-13 ***
#---
#Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 

#Residual standard error: 0.3791 on 197 degrees of freedom
#Multiple R-Squared: 0.4338,     Adjusted R-squared: 0.428 
#F-statistic: 75.46 on 2 and 197 DF,  p-value: < 2.2e-16 

#training error:
ypred<-predict(fit1)
onesTr<-rep(1, 100)
#rows 1-100 are G (Y=0) and rows 101-200 are R (Y=1):
#count G points predicted above 0.5 plus R points predicted below 0.5
lmerrTr<-sum(onesTr[ypred[1:100]>0.5])+sum(onesTr[ypred[101:200]<0.5])
#> lmerrTr
#[1] 39
#> 39/200
#[1] 0.195
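
#an equivalent, more compact calculation of the training error (a sketch;
#ties at exactly 0.5 could make the count differ):
mean(as.numeric(ypred>0.5) != data.raw$Y)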

#prediction:
new.d<-data.frame(cbind(c(xtG1, xtR1), c(xtG2, xtR2)))
colnames(new.d)<-c("X1", "X2")
ytpred<-predict(fit1, newdata=new.d)
length(ytpred)

#use manual calculations (with the rounded coefficients from summary(fit1)):
ytpred<-0.52315 - 0.20295*c(xtG1, xtR1) + 0.15940*c(xtG2, xtR2)
length(ytpred)
# [1] 20000

ythat<-as.numeric(ytpred>0.5)
lmerrTs<-sum(ythat != c(rep(0, 10000), rep(1, 10000)))
#> lmerrTs
#[1] 4276
#> 4276/20000
#[1] 0.2138
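
#for reference, the same test error can be computed from the fitted object,
#avoiding the rounded coefficients (a sketch; the count may differ slightly
#from 4276 due to rounding):
ytpred2<-predict(fit1, newdata=new.d)
sum(as.numeric(ytpred2>0.5) != c(rep(0, 10000), rep(1, 10000)))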


############# 1NN:
# brute-force implementation of nearest-neighbor classification on the test set
# question: for k=1, what is the training error knnerrTr?
nnerrTs<-0
for (i in 1:10000){
  #squared distances from the i-th G test point to all 200 training points;
  #training indices 1-100 are G and 101-200 are R, so order(ds)[1]>100 means
  #its nearest neighbor is an R point, i.e. an error
  ds<-(xtG1[i]-c(xG1, xR1))^2 + (xtG2[i]-c(xG2, xR2))^2
  if (order(ds)[1]>100) nnerrTs<-nnerrTs+1

  #likewise, an R test point is misclassified if its nearest neighbor is G
  ds<-(xtR1[i]-c(xG1, xR1))^2 + (xtR2[i]-c(xG2, xR2))^2
  if (order(ds)[1]<=100) nnerrTs<-nnerrTs+1
  }
 
#reminder: order() returns the indices that sort the vector
#> order(c(1,3, -1,2))
#[1] 3 1 4 2
#> nnerrTs
#[1] 4731
#> 4731/20000
#[1] 0.23655
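
#cross-check with knn() from the recommended 'class' package (a sketch;
#assumes the package is available; distance ties can change the count slightly):
library(class)
trainX<-cbind(c(xG1, xR1), c(xG2, xR2))
testX<-cbind(c(xtG1, xtR1), c(xtG2, xtR2))
knnHat<-knn(trainX, testX, cl=factor(c(yG, yR)), k=1)
sum(as.numeric(as.character(knnHat)) != c(rep(0, 10000), rep(1, 10000)))/20000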


#############Bayes rule:
#priors: pr(G)=pr(R)=1/2
#posterior prob of being in G:
#fG and fR below are the true class-conditional densities (equal-weight
#mixtures of 10 normals N(mk, I/5)), so with priors pr(G)=pr(R)=1/2 the
#posterior probability of G is fG/(fG+fR)
postpr<-function(x1, x2, mG1, mG2, mR1, mR2){
 fR<-fG<-rep(0, length(x1))
 for (i in 1:10){
   fR<-fR + dnorm(x1, mR1[i], sqrt(1/5))*dnorm(x2, mR2[i], sqrt(1/5))/10
   fG<-fG + dnorm(x1, mG1[i], sqrt(1/5))*dnorm(x2, mG2[i], sqrt(1/5))/10
   }
 fG/(fG+fR)
}

pr<-postpr(c(xtG1, xtR1), c(xtG2, xtR2), mG1, mG2, mR1, mR2)
#classify as R (=1) when the posterior probability of G is below 0.5
bayesHat<-as.numeric(pr<0.5)
bayeserrTs<-sum(bayesHat != c(rep(0, 10000), rep(1, 10000)))
#> bayeserrTs
#[1] 3170
#> 3170/20000
#[1] 0.1585
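
#for comparison, the Bayes rule's training error (a sketch; the value depends
#on the simulated training data):
prTr<-postpr(c(xG1, xR1), c(xG2, xR2), mG1, mG2, mR1, mR2)
sum(as.numeric(prTr<0.5) != c(yG, yR))/200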

#############Logistic regression
fit2<-glm(Y~X1 + X2, data=data.raw, family=binomial)
summary(fit2)

#Coefficients:
#            Estimate Std. Error z value Pr(>|z|)    
#(Intercept)   0.3198     0.2347   1.363    0.173    
#X1           -1.5069     0.2494  -6.042 1.52e-09 ***
#X2            1.1561     0.1960   5.897 3.70e-09 ***
#---
#Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 

#(Dispersion parameter for binomial family taken to be 1)

#    Null deviance: 277.26  on 199  degrees of freedom
#Residual deviance: 163.89  on 197  degrees of freedom
#AIC: 169.89


#Training error:
p<-predict(fit2, type="response")
logiterrTr<-sum(onesTr[p[1:100]>0.5])+sum(onesTr[p[101:200]<0.5])
#> logiterrTr
#[1] 40
#> 40/200
#[1] 0.2
#Test error (using the rounded coefficients from summary(fit2)):
pTs<-1/(1+exp(-(0.3198-1.5069*c(xtG1, xtR1)+1.1561*c(xtG2, xtR2))))
logitHat<-as.numeric(pTs>0.5)
logiterrTs<-sum(logitHat != c(rep(0, 10000), rep(1, 10000)))
logiterrTs/20000
#> logiterrTs
#[1] 4071
#> 4071/20000
#[1] 0.20355

#can you explain why:
pTs<-predict(fit2, newdata=new.d, family=binomial)
logitHat<-as.numeric(pTs>0.5)
logiterrTs<-sum(logitHat != c(rep(0, 10000), rep(1, 10000)))
logiterrTs/20000
#0.2408
logitHat<-as.numeric(pTs>0)
logiterrTs<-sum(logitHat != c(rep(0, 10000), rep(1, 10000)))
logiterrTs/20000
#[1] 0.20355
pTs<-predict(fit2, newdata=new.d, family=binomial, type="response")
logitHat<-as.numeric(pTs>0.5)
logiterrTs<-sum(logitHat != c(rep(0, 10000), rep(1, 10000)))
logiterrTs/20000
#[1] 0.20355
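
#a quick check relating the two scales (a sketch): predict() for a glm returns
#the linear predictor by default (type="link"); plogis() maps it to the fitted
#probabilities given by type="response"
linkTs<-predict(fit2, newdata=new.d)
respTs<-predict(fit2, newdata=new.d, type="response")
all.equal(as.numeric(plogis(linkTs)), as.numeric(respTs))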