#PubH Stat Learning and Data Mining
#Example 2.1: comparing linear regression, kNN, Bayes and logistic regression;
#             see Fig 2.4

#set a seed for the random number generator
set.seed(24)

#generate simulated data
#there are two classes: R and G
#G: 1)  mGi iid N((1,0), I), i=1,...,10
#   2)  mk iid from {mG1,...,mG10}, xGk ~ N(mk, I/5), k=1,...,100    -- training
#   2') mk iid from {mG1,...,mG10}, yGk ~ N(mk, I/5), k=1,...,10000  -- test

#training:
mG1<-rnorm(10, 1, 1)
mG2<-rnorm(10, 0, 1)
m1<-sample(mG1, 100, replace=T)
m2<-sample(mG2, 100, replace=T)
xG1<-rnorm(100, m1, sqrt(1/5))
xG2<-rnorm(100, m2, sqrt(1/5))

#test data: each of the 10 centers is used exactly 1000 times
#(the iid sampling version is commented out):
#m1<-sample(mG1, 10000, replace=T)
#m2<-sample(mG2, 10000, replace=T)
m1<-rep(mG1, 1000)
m2<-rep(mG2, 1000)
xtG1<-rnorm(10000, m1, sqrt(1/5))
xtG2<-rnorm(10000, m2, sqrt(1/5))

#R: 1)  mRi iid N((0,1), I), i=1,...,10
#   2)  mk iid from {mR1,...,mR10}, xRk ~ N(mk, I/5), k=1,...,100    -- training
#   2') mk iid from {mR1,...,mR10}, yRk ~ N(mk, I/5), k=1,...,10000  -- test

#training:
mR1<-rnorm(10, 0, 1)
mR2<-rnorm(10, 1, 1)
m1<-sample(mR1, 100, replace=T)
m2<-sample(mR2, 100, replace=T)
xR1<-rnorm(100, m1, sqrt(1/5))
xR2<-rnorm(100, m2, sqrt(1/5))

#test data:
#m1<-sample(mR1, 10000, replace=T)
#m2<-sample(mR2, 10000, replace=T)
m1<-rep(mR1, 1000)
m2<-rep(mR2, 1000)
xtR1<-rnorm(10000, m1, sqrt(1/5))
xtR2<-rnorm(10000, m2, sqrt(1/5))

#response: y=1 for R; y=0 for G
yR<-rep(1,100); yG<-rep(0, 100)
#ytR<-rep(1, 10000); ytG<-rep(0, 10000)

#plot training data:
plot(xR1, xR2, col="red", xlab="X1", ylab="X2", xlim=c(-3,3), ylim=c(-3,3),
     main="Example 2.1")
points(xG1, xG2, col="green")

############# linear regression
fit1<-lm(c(yG, yR)~c(xG1, xR1)+c(xG2, xR2))
summary(fit1)
#Call:
#lm(formula = c(yG, yR) ~ c(xG1, xR1) + c(xG2, xR2))
#
#Residuals:
#     Min       1Q   Median       3Q      Max
#-0.75918 -0.29087 -0.03323  0.30858  0.89282
#
#Coefficients:
#            Estimate Std. Error t value Pr(>|t|)
#(Intercept)  0.52315    0.03189   16.40  < 2e-16 ***
#c(xG1, xR1) -0.20295    0.02572   -7.89 2.05e-13 ***
#c(xG2, xR2)  0.15940    0.02028    7.86 2.45e-13 ***
#---
#Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
#
#Residual standard error: 0.3791 on 197 degrees of freedom
#Multiple R-Squared: 0.4338,     Adjusted R-squared: 0.428
#F-statistic: 75.46 on 2 and 197 DF,  p-value: < 2.2e-16

#training error: a G point (first 100) is an error if its fitted value > 0.5;
#an R point (last 100) is an error if its fitted value < 0.5
ypred<-predict(fit1)
onesTr<-rep(1, 100)
lmerrTr<-sum(onesTr[ypred[1:100]>0.5])+sum(onesTr[ypred[101:200]<0.5])
#> lmerrTr
#[1] 39
#> 39/200
#[1] 0.195

#prediction on the test data:
#the following does not work: it returns only 200 predictions, because the model
#was fit with c(xG1, xR1) etc. inside the formula (no data frame), so predict()
#cannot match the columns of newdata:
# ytpred<-predict(fit1, newdata=as.data.frame(cbind(c(xtG1, xtR1), c(xtG2, xtR2))))
# length(ytpred)
# [1] 200
#use manual calculations with the fitted coefficients instead:
ytpred<-0.52315 - 0.20295*c(xtG1, xtR1) + 0.15940*c(xtG2, xtR2)
length(ytpred)
# [1] 20000
ythat<-as.numeric(ytpred>0.5)
lmerrTs<-sum(ythat != c(rep(0, 10000), rep(1, 10000)))
#> lmerrTs
#[1] 4276
#> 4276/20000
#[1] 0.2138

############# 1NN:
# brute force implementation:
# for k=1, knnerrTr=???
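#(added note, not part of the original script) with k=1 and distinct training
#points, the training error is 0: each training point is its own nearest neighbor.
#A hedged cross-check of the brute-force test error below, using knn() from the
#'class' package; the objects Xtr, Xts, yTr, yTs, nnHat are illustrative names
#that do not appear elsewhere in the script:
library(class)
Xtr<-cbind(c(xG1, xR1), c(xG2, xR2))          #200 x 2 training predictors
Xts<-cbind(c(xtG1, xtR1), c(xtG2, xtR2))      #20000 x 2 test predictors
yTr<-factor(c(yG, yR))                        #training labels: 0=G, 1=R
yTs<-c(rep(0, 10000), rep(1, 10000))          #true test labels
nnHat<-knn(Xtr, Xts, cl=yTr, k=1)
sum(as.numeric(as.character(nnHat)) != yTs)   #should agree with nnerrTs below, up to tie-breaking
#the brute-force computation of the 1NN test error follows: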
nnerrTs<-0
for (i in 1:10000){
  #squared distances from the i-th G test point to all 200 training points
  #(training points 1-100 are G, 101-200 are R)
  ds<-(xtG1[i]-c(xG1, xR1))^2 + (xtG2[i]-c(xG2, xR2))^2
  if (order(ds)[1]>100) nnerrTs<-nnerrTs+1       #nearest training point is R: an error
  #squared distances from the i-th R test point to all 200 training points
  ds<-(xtR1[i]-c(xG1, xR1))^2 + (xtR2[i]-c(xG2, xR2))^2
  if (order(ds)[1]<=100) nnerrTs<-nnerrTs+1      #nearest training point is G: an error
}
#order() returns the permutation that sorts a vector, so order(ds)[1] is the
#index of the nearest training point, e.g.:
#> order(c(1,3, -1,2))
#[1] 3 1 4 2
#> nnerrTs
#[1] 4731
#> 4731/20000
#[1] 0.23655

############# Bayes rule:
#priors: pr(G)=pr(R)=1/2
#posterior prob of being in G: each class-conditional density is a mixture of
#the 10 normals N(mk, I/5) with equal weights 1/10
postpr<-function(x1, x2, mG1, mG2, mR1, mR2){
  fR<-fG<-rep(0, length(x1))
  for (i in 1:10){
    fR<-fR + dnorm(x1, mR1[i], sqrt(1/5))*dnorm(x2, mR2[i], sqrt(1/5))/10
    fG<-fG + dnorm(x1, mG1[i], sqrt(1/5))*dnorm(x2, mG2[i], sqrt(1/5))/10
  }
  fG/(fG+fR)
}
pr<-postpr(c(xtG1, xtR1), c(xtG2, xtR2), mG1, mG2, mR1, mR2)
bayesHat<-as.numeric(pr<0.5)    #classify as R (=1) when the posterior prob of G < 0.5
bayeserrTs<-sum(bayesHat != c(rep(0, 10000), rep(1, 10000)))
#> bayeserrTs
#[1] 3170
#> 3170/20000
#[1] 0.1585

############# Logistic regression
fit2<-glm(c(yG, yR)~c(xG1, xR1)+c(xG2, xR2), family=binomial)
summary(fit2)
#Coefficients:
#            Estimate Std. Error z value Pr(>|z|)
#(Intercept)   0.3198     0.2347   1.363    0.173
#c(xG1, xR1)  -1.5069     0.2494  -6.042 1.52e-09 ***
#c(xG2, xR2)   1.1561     0.1960   5.897 3.70e-09 ***
#---
#Signif. codes:  0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
#
#(Dispersion parameter for binomial family taken to be 1)
#
#    Null deviance: 277.26 on 199 degrees of freedom
#Residual deviance: 163.89 on 197 degrees of freedom
#AIC: 169.89

#Training error:
p<-predict(fit2, type="response")
logiterrTr<-sum(onesTr[p[1:100]>0.5])+sum(onesTr[p[101:200]<0.5])
#> logiterrTr
#[1] 40
#> 40/200
#[1] 0.2

#Test error: manual calculation with the fitted coefficients
#(same newdata issue as for the linear model above)
pTs<-1/(1+exp(-(0.3198-1.5069*c(xtG1, xtR1)+1.1561*c(xtG2, xtR2))))
logitHat<-as.numeric(pTs>0.5)
logiterrTs<-sum(logitHat != c(rep(0, 10000), rep(1, 10000)))
#> logiterrTs
#[1] 4071
#> 4071/20000
#[1] 0.20355
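#(added sketch, not part of the original script) the manual test-set predictions
#above were needed because fit1 and fit2 use c(xG1, xR1) etc. inside the formula,
#so predict(..., newdata=) cannot match columns. A hedged alternative is to refit
#with named data-frame columns; the names y, x1, x2, dTr, dTs, fit1b, fit2b are
#illustrative assumptions, not part of the original script:
dTr<-data.frame(y=c(yG, yR), x1=c(xG1, xR1), x2=c(xG2, xR2))
dTs<-data.frame(x1=c(xtG1, xtR1), x2=c(xtG2, xtR2))
yTs<-c(rep(0, 10000), rep(1, 10000))
fit1b<-lm(y~x1+x2, data=dTr)
sum(as.numeric(predict(fit1b, newdata=dTs)>0.5) != yTs)
#should essentially reproduce lmerrTs (the manual version used rounded coefficients)
fit2b<-glm(y~x1+x2, family=binomial, data=dTr)
sum(as.numeric(predict(fit2b, newdata=dTs, type="response")>0.5) != yTs)
#should essentially reproduce logiterrTs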