# PubH Stat Learning and Data Mining
# Example 4.1: comparing linear/logit regression (with masking) and LDA/QDA;
# see also Fig 4.2-3

# set a seed for the random number generator (makes the simulation reproducible)
set.seed(24)

# generate simulated data:
# there are 100 obs in each of three classes: N((1,1)',I), N((4,4)',I), N((7,7)',I)
# two predictors:
x1 <- x2 <- rep(0, 300)
x1[1:100]   <- rnorm(100, 1, 1)
x2[1:100]   <- rnorm(100, 1, 1)
x1[101:200] <- rnorm(100, 4, 1)
x2[101:200] <- rnorm(100, 4, 1)
x1[201:300] <- rnorm(100, 7, 1)
x2[201:300] <- rnorm(100, 7, 1)

# response variables: indicators of class memberships
y1 <- y2 <- y3 <- rep(0, 300)
y1[1:100]   <- 1
y2[101:200] <- 1
y3[201:300] <- 1

############################### Linear models with LS
# fit one linear model per class indicator:
fit1 <- lm(y1 ~ x1 + x2)
fit2 <- lm(y2 ~ x1 + x2)
fit3 <- lm(y3 ~ x1 + x2)

# predicted class for the training data = model with the largest fitted value.
# Vectorized with max.col(); ties.method = "last" reproduces the original
# scalar loop, in which a later class overwrote an earlier one on an exact tie.
# (Also avoids partial matching: fit1$fit -> fit1$fitted.values.)
grp <- max.col(cbind(fit1$fitted.values,
                     fit2$fitted.values,
                     fit3$fitted.values),
               ties.method = "last")
grp
#  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 3 3 1 1 1 3 1 3 1
#[112] 1 3 3 3 1 2 3 3 1 3 1 1 1 2 3 2 1 1 3 1 2 3 3 3 3 3 3 3 1 3 3 3 3 3 1 3 1
#[149] 2 3 1 3 1 3 3 3 3 1 3 3 1 3 1 3 1 1 3 1 3 1 1 3 3 2 1 1 3 3 3 1 1 1 3 3 2
#[186] 1 3 2 1 3 3 1 1 1 1 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

# NOTE(review): ones100/ones200 are kept because later code still indexes them;
# the counts below use the simpler direct form sum(logical).
ones100 <- rep(1, 100)
ones200 <- rep(1, 200)

# false negatives for class 1: 0
sum(grp[1:100] != 1)
# false positives for class 1: 40
sum(grp[101:300] == 1)
# false negatives for class 2: 91  (class 2 is almost completely masked!)
sum(grp[101:200] != 2)
# false positives for class 2: 0
# false positives for class 2: 0
sum(grp[c(1:100, 201:300)] == 2)
# false negatives for class 3: 0
sum(grp[201:300] != 3)
# false positives for class 3: 51
sum(grp[1:200] == 3)

### what's going on? draw a plot:
# NOTE(review): setwd() to a machine-specific path is fragile; kept only to
# preserve the original output location — prefer a full path in pdf() instead.
setwd("C:/Users/panxx014/Documents/courses/7475/Examples/figs")
pdf("ex4.1.pdf")
plot(x1, x2, type = "n", main = "Fig 4.2: Masking")
points(x1[1:100],   x2[1:100],   col = "red")
points(x1[101:200], x2[101:200], col = "green")
points(x1[201:300], x2[201:300], col = "blue")

# draw boundaries:
coef(fit1)
#(Intercept)          x1          x2
# 0.92778849 -0.07397153 -0.07705970
coef(fit2)
# (Intercept)           x1           x2
# 0.336721456 -0.008150703  0.007432402
coef(fit3)
#(Intercept)          x1          x2
#-0.26450994  0.08212224  0.06962730

# Boundary between classes j and k is where fitted_j = fitted_k, i.e.
# d1 + d2*x1 + d3*x2 = 0 with d = coef(fitj) - coef(fitk),
# hence x2 = -d1/d3 - (d2/d3)*x1.  Computed from the fits instead of the
# original hard-coded rounded constants (e.g. abline(a=0.591/0.0845, ...)).
ls_boundary <- function(fa, fb, ...) {
  d <- coef(fa) - coef(fb)
  abline(a = -d[1] / d[3], b = -d[2] / d[3], ...)
}
# boundary b/w class 1 & 2: fit1.eq = fit2.eq
ls_boundary(fit1, fit2, col = "red")
# boundary b/w class 1 & 3: fit1.eq = fit3.eq
ls_boundary(fit1, fit3, col = "blue")
# boundary b/w class 3 & 2: fit3.eq = fit2.eq
ls_boundary(fit3, fit2, col = "green")
dev.off()

########## adding some higher order terms may help:
# fit linear models with quadratic and interaction terms:
fit1q <- lm(y1 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2))
fit2q <- lm(y2 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2))
fit3q <- lm(y3 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2))

# predicted class = model with the largest fitted value (vectorized;
# ties.method = "last" matches the original loop's tie behavior):
grp <- max.col(cbind(fit1q$fitted.values,
                     fit2q$fitted.values,
                     fit3q$fitted.values),
               ties.method = "last")
grp
# (pasted console output below was not commented out in the original — fixed)
#  [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
#[112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#[149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
#[186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 (continues)
# (continuation of the pasted grp output — was not commented out; fixed)
# 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

####################### LDA:
library(MASS)
# true class labels 1/2/3 for the 300 observations:
y <- c(rep(1, 100), rep(2, 100), rep(3, 100))
# there is a parameter called prior that specifies prior class probabilities;
# by default it is taken as the sample proportions:
lda.fit <- lda(cbind(x1, x2), y)

# training error:
predict(lda.fit, cbind(x1, x2))$class
#  [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1
#[112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#[149] 2 2 2 2 2 3 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
#[186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

# you can also do CV (only LOOCV is supported):
lda.fitCV <- lda(cbind(x1, x2), y, CV = TRUE)
names(lda.fitCV)
#[1] "class"     "posterior" "call"
lda.fitCV$class
# (pasted console output below was not commented out in the original — fixed)
#  [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 1
#[112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#[149] 2 2 2 2 2 3 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
#[186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

############################## QDA
qda.fit <- qda(cbind(x1, x2), y)
# training error:
predict(qda.fit, cbind(x1, x2))$class
# (QDA training predictions; the first pasted row lacked a '#' — fixed)
#  [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1
#[112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#[149] 2 2 2 2 2 3 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
#[186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

############################## Logistic regression:
## Because we have 3 classes, we should use a multinomial logit model,
## which is available from R package "nnet" or "mlogit" or "glmnet".
# install.packages("glmnet")
library(glmnet)
# lambda = 0 gives the unpenalized multinomial logit fit:
mlogits <- glmnet(cbind(x1, x2), cbind(y1, y2, y3),
                  family = "multinomial", lambda = 0)

# estimates for the intercept terms:
mlogits$a0
#         s0
#y1  232.2588
#y2  220.7611
#y3 -453.0199
# estimates of the slopes:
mlogits$beta
#$y1
#2 x 1 sparse Matrix of class "dgCMatrix"
#          s0
#x1 -2.808626
#x2 -2.388110
#
#$y2
#2 x 1 sparse Matrix of class "dgCMatrix"
#   s0
#x1  .
#x2  .
#
#$y3
#2 x 1 sparse Matrix of class "dgCMatrix"
#         s0
#x1 45.34291
#x2 77.95167

#### predict:
## estimated class probabilities:
predict(mlogits, cbind(x1, x2), type = "response")
## estimated class labels:
predict(mlogits, cbind(x1, x2), type = "class")

##########################
## As a simple alternative, one can fit the binary-logit models separately.
## In many contexts (including this one?), based on my limited observations
## and some theory in the literature (Begg and Gray, Biometrika, 1984, 11-18),
## the results are close.
# binary responses for the 1-vs-2 and 3-vs-2 comparisons:
y12 <- c(rep(1, 100), rep(0, 100))
y32 <- c(rep(0, 100), rep(1, 100))
logit.fit12 <- glm(y12 ~ x1[c(1:100, 101:200)] + x2[c(1:100, 101:200)],
                   family = binomial)
logit.fit32 <- glm(y32 ~ x1[101:300] + x2[101:300], family = binomial)
#Warning messages:
#1: glm.fit: algorithm did not converge
#2: glm.fit: fitted probabilities numerically 0 or 1 occurred
##### a problem with logistic regression: complete separation of classes 2 and 3!
coef(logit.fit12)
#          (Intercept) x1[c(1:100, 101:200)] x2[c(1:100, 101:200)]
#            11.502694             -2.809551             -2.389577
coef(logit.fit32)
#(Intercept) x1[101:300] x2[101:300]
# -2632.4446    177.4203    304.5236

# stopping early (maxit = 5) shows how the estimates diverge with iterations:
logit.fit32b <- glm(y32 ~ x1[101:300] + x2[101:300], family = binomial, maxit = 5)
coef(logit.fit32b)
#(Intercept) x1[101:300] x2[101:300]
# -29.859320    2.587056    2.788535

# calculate class probabilities:
pr12 <- predict(logit.fit12,
                as.data.frame(cbind(x1[c(1:100, 101:200)], x2[c(1:100, 101:200)])),
                type = "response")
pr32 <- predict(logit.fit32,
                as.data.frame(cbind(x1[c(101:300)], x2[c(101:300)])),
                type = "response")

############### masking with logistic regression if fitting 2 binary logits with
############### 1 vs 3 and 2 vs 3?
## Probably caused by bad estimates from nonconvergence!
## I tried a few examples w/o complete separation, masking did NOT appear!
### Note: in addition to LR, penalized logistic regression is one simple way to
### deal with the "class separation" problem!

############### Alternatively, fitting multinomial logistic reg by "nnet":
library(nnet)
dat2 <- as.data.frame(cbind(x1, x2, y))
# removed stray "> " console prompt (a syntax error when sourced), and renamed
# the result so it no longer clobbers the lm object fit2 from earlier:
fit.mn <- multinom(y ~ x1 + x2, data = dat2)
#initial  value 329.583687
#iter  10 value 23.518688
#iter  20 value 14.748500
#iter  30 value 14.667039
#iter  40 value 14.488509
#iter  50 value 14.375186
#iter  60 value 14.213717
#iter  70 value 14.158340
#iter  80 value 14.115121
#iter  90 value 14.058838
#iter 100 value 13.831918
#final  value 13.831918
#stopped after 100 iterations
summary(fit.mn)
#Call:
#multinom(formula = y ~ x1 + x2, data = dat2)
#
#Coefficients:
#  (Intercept)        x1        x2
#2   -11.36164  2.771975  2.362145
#3  -118.67872 10.001403 14.666813
#
#Std.
# (tail of the pasted summary(multinom) output — was not commented out; fixed)
#Errors:
#  (Intercept)        x1        x2
#2    2.576279 0.7503928 0.7222844
#3   75.726472 4.9954560 9.2398613
#
#Residual Deviance: 27.66384
#AIC: 39.66384