## Table 4.1: snoring and heart disease t41 = matrix( c(24,35,21,30, 1355,603,192,224), 4,2 ) t41 n = sum(t41) nip = rowSums(t41); npj = colSums(t41) ## testing for independence chisq.test(t41) prop.test(t41[,1], nip) ## score/wald/Lr trio u41 = outer(nip, npj)/n sct = sum((t41-u41)^2/u41) sct; 1-pchisq(sct,1) wd = sum((t41-u41)^2/t41) wd; 1-pchisq(wd,1) Lr = sum(2*t41*log(t41/u41)) Lr; 1-pchisq(Lr,1) ## Trend test: incorporating ordinal score to improve power x = c(0,2,4,5); y = 1:0 nom = sum(outer(x,y)*t41) - sum(x*nip)*sum(y*npj)/n den2 = (sum(x^2*nip)-sum(x*nip)^2/n)*(sum(y^2*npj)-sum(y*npj)^2/n) r = nom/sqrt(den2) (n-1)*r^2 1-pchisq((n-1)*r^2,1) prop.trend.test(t41[,1], nip, score = x) n*r^2
## regression approach: y ~ x x1 = rep(x, nip) y1 = rep( rep(y,4), as.vector(t(t41)) ) cor(x1,y1); r summary(lm(y1 ~ x1)) ## summary(lm(x1 ~ y1)) r^2 r^2/((1-r^2)/(n-2)) ## F-statistics ## prop regression: Pr(y|x) ~ x summary(lm(t41[,1]/nip ~ x))
## Logit/Probit link glm1 = glm(y1 ~ x1, family=binomial(link = "logit")) summary(glm1) glm2 = glm(y1 ~ x1, family=binomial(link = "probit")) summary(glm2) ### identity link: not implemented ## glm(y1 ~ x1, family=binomial(link = "identity")) ### IWLS method: local quadratic approximation ## function value fn = function(ab, score,tab){ prb = ab[1] + ab[2]*score sum(log(prb)*tab[,1]) + sum(log(1-prb)*tab[,2]) } ## Gradient vector gn = function(ab, score,tab){ prb = ab[1] + ab[2]*score ga = sum(tab[,1]/prb) - sum(tab[,2]/(1-prb)) gb = sum(tab[,1]*score/prb) - sum(tab[,2]*score/(1-prb)) return( c(ga,gb) ) } ## Hessian matrix hn = function(ab, score,tab){ prb = ab[1] + ab[2]*score h11 = -sum(tab[,1]/prb^2) - sum(tab[,2]/(1-prb)^2) h22 = -sum(tab[,1]*score^2/prb^2) - sum(tab[,2]*score^2/(1-prb)^2) h12 = -sum(tab[,1]*score/prb^2) - sum(tab[,2]*score/(1-prb)^2) return( matrix(c(h11,h12,h12,h22), 2,2) ) } ## starting value ab0 = c(0.01,0) ## Iteration for(k in 1:10){ f2 = hn(ab0, x,t41) f1 = gn(ab0, x,t41) dta = as.vector(solve(f2,f1)) ab0 = ab0-dta cat("Iter:", k, ", Diff=", sqrt(sum(dta^2)), ", Est: ", ab0, "\n") } ifm = hn(ab0, x,t41) vm = -solve(ifm) vm ## independence testing: ab0[2]=0 ab0[2]^2/vm[4] ## compared to Logit/Probit link testing results summary(glm1)[[12]][2,3]^2 summary(glm2)[[12]][2,3]^2 ## Visual fitting comparison: predicted probabilities glm1$coef glm2$coef reg1 = lm(t41[,1]/nip ~ x) summary(reg1) summary(lm(y1 ~ x1)) Lgf = function(x, cef){ xb = cef[1]+x*cef[2] 1/(1+exp(-xb)) } Pbf = function(x, cef){ xb = cef[1]+x*cef[2] pnorm(xb) } curve(Lgf(x,glm1$coef), 0,5, xlab="Snoring", ylab=expression(hat(pi)), ylim=c(0,0.13) ) points(x, t41[,1]/nip, pch=20, cex=2) curve(Pbf(x,glm2$coef), add=TRUE, col=2, lty=2) abline(reg1, col=3, lty=3) abline(ab0, col=4, lty=4) legend(0,0.135, c("Logit","Probit","OLS","IWLS"), col=1:4, lty=1:4, bty="n")
## readin dataset crab = read.table("http://umn.edu/~baolin/teaching/ph5467/crab.txt", header=TRUE) ## attach dataframe for variable access attach(crab) ## grouping observations with similar x wt = cut(width, breaks=c(min(width),23:29+0.25,max(width)), include.lowest=TRUE ) ## transformed into integer index wi = as.integer(wt) table(wi) ## group mean for the predictor/response wavg = tapply(width, wi, mean) yavg = tapply(satell, wi, mean) ## replace predictor with the grouped mean x = wavg[wi]
## log linear model for count data, tracing the algorithm convergence a1 = glm(satell ~ width, family=poisson(link="log"), trace=TRUE) summary(a1) ## identity link for count data: specify starting value a2 = glm(satell ~ width, family=poisson(link="identity"), trace=TRUE, start=a1$coef) summary(a2)
## transform original data into events rate ## 66 unique x values and sort them length(unique(width)) ux = sort(unique(width)) ## calculate the sum of y and the number of observations sy = tapply(satell, width, sum) tx = tapply(satell, width, length) ## or using table(width) ## log linear model with offset: identical to a1 a3 = glm(sy ~ ux, family=poisson(link="log"), offset=log(tx)); a3$coef summary(a3) ## numerical iteration for solving identity link for event rate modeling ## Gradient vector: local first order gn = function(ab, sy,tx,ux){ xb = ab[1]+ab[2]*ux ga = sum( -tx + sy/xb ) gb = sum( -tx*ux + sy*ux/xb ) return( c(ga,gb) ) } ## Hessian matrix: local second order hn = function(ab, sy,tx,ux){ xb = ab[1]+ab[2]*ux h11 = -sum( sy/xb^2 ) h22 = -sum( sy*ux^2/xb^2 ) h12 = -sum( sy*ux/xb^2 ) return( matrix(c(h11,h12,h12,h22), 2,2) ) } ## starting value very important! ab0 = c(-10,0.5) for(k in 1:10){ f2 = hn(ab0, sy,tx,ux) f1 = gn(ab0, sy,tx,ux) dta = as.vector(solve(f2,f1)) ab0 = ab0-dta cat("Iter:", k, ", Diff=", sqrt(sum(dta^2)), ", Est: ", ab0, "\n") } ## observed fisher information for variance calculation ifm = hn(ab0, sy,tx,ux) vm = -solve(ifm) vm ab0[2]; sqrt(vm[4])
### goodness of fit inference: not quite appropriate ypr = a3$fit G2 = 2*sum(sy*log(sy/ypr),na.rm=TRUE) G2; 1-pchisq(G2,64) X2 = sum( (sy-ypr)^2/ypr ) X2; 1-pchisq(X2,64) ### grouped data: better approximation ## sum of observed counts sy = tapply(satell, width, sum) id = tapply(wi, width, mean)
## sum of grouped counts and predicted counts syg = tapply(sy, id, sum) yprg = tapply(ypr, id, sum) G2 = 2*sum(syg*log(syg/yprg),na.rm=TRUE) G2; 1-pchisq(G2,6) X2 = sum( (syg-yprg)^2/yprg ) X2; 1-pchisq(X2,6) ## model event rate wavg = tapply(width, wi, mean) ysum = tapply(satell, wi, sum) txg = tapply(wi, wi, length) ## correct way to specify the model a5 = glm(ysum ~ wavg, family=poisson(link="log"), offset=log(txg)) a5$coef ## not right a6 = glm(ysum ~ wavg, family=poisson(link="log")); a6$coef ypr = a5$fit G2 = 2*sum(ysum*log(ysum/ypr),na.rm=TRUE) G2; 1-pchisq(G2,6) X2 = sum( (ysum-ypr)^2/ypr ) X2; 1-pchisq(X2,6) ## direct modeling: replace x by the local average x = wavg[wi] a7 = glm(satell ~ x, family=poisson(link="log")); a7$coef a8 = glm(satell ~ x, family=poisson(link="identity"), start=c(-11,0.5)) a8$coef