# PubH Stat Learning and Data Mining
# Example 4.1: comparing linear/logit regression (with masking) and LDA/QDA;
# see also Fig 4.2-3

# set a seed for the random number generator (makes the simulation reproducible)
set.seed(24)

# generate simulated data:
# there are 100 obs in each of three classes: N((1,1)',I), N((4,4)',I), N((7,7)',I)
# two predictors:
x1 <- x2 <- rep(0, 300)
x1[1:100]   <- rnorm(100, 1, 1)
x2[1:100]   <- rnorm(100, 1, 1)
x1[101:200] <- rnorm(100, 4, 1)
x2[101:200] <- rnorm(100, 4, 1)
x1[201:300] <- rnorm(100, 7, 1)
x2[201:300] <- rnorm(100, 7, 1)

# response variables: indicators of class memberships
y1 <- y2 <- y3 <- rep(0, 300)
y1[1:100]   <- 1
y2[101:200] <- 1
y3[201:300] <- 1

############################### Linear models with LS
# fit one linear model per class indicator:
fit1 <- lm(y1 ~ x1 + x2)
fit2 <- lm(y2 ~ x1 + x2)
fit3 <- lm(y3 ~ x1 + x2)

# predicted class for the training data = model with the largest fitted value.
# Vectorized with max.col(); ties.method = "last" reproduces the original
# scalar loop, in which a later class overwrote an earlier one on an exact tie.
# (Also avoids partial matching: fit1$fit -> fit1$fitted.values.)
grp <- max.col(cbind(fit1$fitted.values,
                     fit2$fitted.values,
                     fit3$fitted.values),
               ties.method = "last")
grp
#  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 3 3 1 1 1 3 1 3 1
#[112] 1 3 3 3 1 2 3 3 1 3 1 1 1 2 3 2 1 1 3 1 2 3 3 3 3 3 3 3 1 3 3 3 3 3 1 3 1
#[149] 2 3 1 3 1 3 3 3 3 1 3 3 1 3 1 3 1 1 3 1 3 1 1 3 3 2 1 1 3 3 3 1 1 1 3 3 2
#[186] 1 3 2 1 3 3 1 1 1 1 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

# NOTE(review): ones100/ones200 are kept because later code still indexes them;
# the counts below use the simpler direct form sum(logical).
ones100 <- rep(1, 100)
ones200 <- rep(1, 200)

# false negatives for class 1: 0
sum(grp[1:100] != 1)
# false positives for class 1: 40
sum(grp[101:300] == 1)
# false negatives for class 2: 91  (class 2 is almost completely masked!)
sum(grp[101:200] != 2)
# false positives for class 2: 0
# false positives for class 2: 0
sum(grp[c(1:100, 201:300)] == 2)
# false negatives for class 3: 0
sum(grp[201:300] != 3)
# false positives for class 3: 51
sum(grp[1:200] == 3)

### what's going on? draw a plot:
# NOTE(review): setwd() to a machine-specific path is fragile; kept only to
# preserve the original output location — prefer a full path in pdf() instead.
setwd("C:/Users/panxx014/Documents/courses/7475/Examples/figs")
pdf("ex4.1.pdf")
plot(x1, x2, type = "n", main = "Fig 4.2: Masking")
points(x1[1:100],   x2[1:100],   col = "red")
points(x1[101:200], x2[101:200], col = "green")
points(x1[201:300], x2[201:300], col = "blue")

# draw boundaries:
coef(fit1)
#(Intercept)          x1          x2
# 0.92778849 -0.07397153 -0.07705970
coef(fit2)
# (Intercept)           x1           x2
# 0.336721456 -0.008150703  0.007432402
coef(fit3)
#(Intercept)          x1          x2
#-0.26450994  0.08212224  0.06962730

# Boundary between classes j and k is where fitted_j = fitted_k, i.e.
# d1 + d2*x1 + d3*x2 = 0 with d = coef(fitj) - coef(fitk),
# hence x2 = -d1/d3 - (d2/d3)*x1.  Computed from the fits instead of the
# original hard-coded rounded constants (e.g. abline(a=0.591/0.0845, ...)).
ls_boundary <- function(fa, fb, ...) {
  d <- coef(fa) - coef(fb)
  abline(a = -d[1] / d[3], b = -d[2] / d[3], ...)
}
# boundary b/w class 1 & 2: fit1.eq = fit2.eq
ls_boundary(fit1, fit2, col = "red")
# boundary b/w class 1 & 3: fit1.eq = fit3.eq
ls_boundary(fit1, fit3, col = "blue")
# boundary b/w class 3 & 2: fit3.eq = fit2.eq
ls_boundary(fit3, fit2, col = "green")
dev.off()

########## adding some higher order terms may help:
# fit linear models with quadratic and interaction terms:
fit1q <- lm(y1 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2))
fit2q <- lm(y2 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2))
fit3q <- lm(y3 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2))

# predicted class = model with the largest fitted value (vectorized;
# ties.method = "last" matches the original loop's tie behavior):
grp <- max.col(cbind(fit1q$fitted.values,
                     fit2q$fitted.values,
                     fit3q$fitted.values),
               ties.method = "last")
grp
# (pasted console output below was not commented out in the original — fixed)
#  [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
#[112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#[149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
#[186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 (continues)
# (continuation of the pasted grp output — was not commented out; fixed)
# 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

####################### LDA:
library(MASS)
# true class labels 1/2/3 for the 300 observations:
y <- c(rep(1, 100), rep(2, 100), rep(3, 100))
# there is a parameter called prior that specifies prior class probabilities;
# by default it is taken as the sample proportions:
lda.fit <- lda(cbind(x1, x2), y)

# training error:
predict(lda.fit, cbind(x1, x2))$class
#  [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1
#[112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#[149] 2 2 2 2 2 3 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
#[186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

# you can also do CV (only LOOCV is supported):
lda.fitCV <- lda(cbind(x1, x2), y, CV = TRUE)
names(lda.fitCV)
#[1] "class"     "posterior" "call"
lda.fitCV$class
# (pasted console output below was not commented out in the original — fixed)
#  [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 1
#[112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#[149] 2 2 2 2 2 3 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
#[186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

############################## QDA
qda.fit <- qda(cbind(x1, x2), y)
# training error:
predict(qda.fit, cbind(x1, x2))$class
# (QDA training predictions; the first pasted row lacked a '#' — fixed)
#  [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
# [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1
#[112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#[149] 2 2 2 2 2 3 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
#[186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[223] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[260] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#[297] 3 3 3 3

############################## Logistic regression:
## Because we have 3 classes, we should use a multinomial logit model,
## which is available from R package "nnet" or "mlogit" or "glmnet".
# install.packages("glmnet")
library(glmnet)
# lambda = 0 gives the unpenalized multinomial logit fit:
mlogits <- glmnet(cbind(x1, x2), cbind(y1, y2, y3),
                  family = "multinomial", lambda = 0)

# estimates for the intercept terms:
mlogits$a0
#         s0
#y1  232.2588
#y2  220.7611
#y3 -453.0199
# estimates of the slopes:
mlogits$beta
#$y1
#2 x 1 sparse Matrix of class "dgCMatrix"
#          s0
#x1 -2.808626
#x2 -2.388110
#
#$y2
#2 x 1 sparse Matrix of class "dgCMatrix"
#   s0
#x1  .
#x2  .
#
#$y3
#2 x 1 sparse Matrix of class "dgCMatrix"
#         s0
#x1 45.34291
#x2 77.95167

#### predict:
## estimated class probabilities:
predict(mlogits, cbind(x1, x2), type = "response")
## estimated class labels:
predict(mlogits, cbind(x1, x2), type = "class")

##########################
## As a simple alternative, one can fit the binary-logit models separately.
## In many contexts (including this one?), based on my limited observations
## and some theory in the literature (Begg and Gray, Biometrika, 1984, 11-18),
## the results are close.
# binary responses for the 1-vs-2 and 3-vs-2 comparisons:
y12 <- c(rep(1, 100), rep(0, 100))
y32 <- c(rep(0, 100), rep(1, 100))
logit.fit12 <- glm(y12 ~ x1[c(1:100, 101:200)] + x2[c(1:100, 101:200)],
                   family = binomial)
logit.fit32 <- glm(y32 ~ x1[101:300] + x2[101:300], family = binomial)
#Warning messages:
#1: glm.fit: algorithm did not converge
#2: glm.fit: fitted probabilities numerically 0 or 1 occurred
##### a problem with logistic regression: complete separation of classes 2 and 3!
coef(logit.fit12)
#          (Intercept) x1[c(1:100, 101:200)] x2[c(1:100, 101:200)]
#            11.502694             -2.809551             -2.389577
coef(logit.fit32)
#(Intercept) x1[101:300] x2[101:300]
# -2632.4446    177.4203    304.5236

# stopping early (maxit = 5) shows how the estimates diverge with iterations:
logit.fit32b <- glm(y32 ~ x1[101:300] + x2[101:300], family = binomial, maxit = 5)
coef(logit.fit32b)
#(Intercept) x1[101:300] x2[101:300]
# -29.859320    2.587056    2.788535

# calculate class probabilities:
pr12 <- predict(logit.fit12,
                as.data.frame(cbind(x1[c(1:100, 101:200)], x2[c(1:100, 101:200)])),
                type = "response")
pr32 <- predict(logit.fit32,
                as.data.frame(cbind(x1[c(101:300)], x2[c(101:300)])),
                type = "response")

############### masking with logistic regression if fitting 2 binary logits with
############### 1 vs 3 and 2 vs 3?
## Probably caused by bad estimates from nonconvergence!
## I tried a few examples w/o complete separation, masking did NOT appear!
### Note: in addition to LR, penalized logistic regression is one simple way to
### deal with the "class separation" problem!

############### Alternatively, fitting multinomial logistic reg by "nnet":
library(nnet)
dat2 <- as.data.frame(cbind(x1, x2, y))
# removed stray "> " console prompt (a syntax error when sourced), and renamed
# the result so it no longer clobbers the lm object fit2 from earlier:
fit.mn <- multinom(y ~ x1 + x2, data = dat2)
#initial  value 329.583687
#iter  10 value 23.518688
#iter  20 value 14.748500
#iter  30 value 14.667039
#iter  40 value 14.488509
#iter  50 value 14.375186
#iter  60 value 14.213717
#iter  70 value 14.158340
#iter  80 value 14.115121
#iter  90 value 14.058838
#iter 100 value 13.831918
#final  value 13.831918
#stopped after 100 iterations
summary(fit.mn)
#Call:
#multinom(formula = y ~ x1 + x2, data = dat2)
#
#Coefficients:
#  (Intercept)        x1        x2
#2   -11.36164  2.771975  2.362145
#3  -118.67872 10.001403 14.666813
#
#Std.
# (tail of the pasted summary(multinom) output — was not commented out; fixed)
#Errors:
#  (Intercept)        x1        x2
#2    2.576279 0.7503928 0.7222844
#3   75.726472 4.9954560 9.2398613
#
#Residual Deviance: 27.66384
#AIC: 39.66384