Biostat CDA - Code for Chapter 04

Snoring and heart disease data: binomial modeling

  1. Independence testing: contingency table approach
     ## Table 4.1: snoring and heart disease
     t41 = matrix( c(24,35,21,30, 1355,603,192,224), 4,2 )
     t41
     n = sum(t41)
     nip = rowSums(t41); npj = colSums(t41)
     ## testing for independence
     chisq.test(t41)
     prop.test(t41[,1], nip)
     ## score/Wald/LR trio of chi-squared statistics, each on (4-1)*(2-1) = 3 df
     u41 = outer(nip, npj)/n
     sct = sum((t41-u41)^2/u41)
     sct; 1-pchisq(sct,3)
     wd = sum((t41-u41)^2/t41)
     wd; 1-pchisq(wd,3)
     Lr = sum(2*t41*log(t41/u41))
     Lr; 1-pchisq(Lr,3)
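     ## sanity check (a sketch): the score statistic should reproduce chisq.test,
     ## which applies no continuity correction for tables larger than 2x2
     cst = chisq.test(t41)
     all.equal(sct, unname(cst$statistic)); cst$parameter ## df = 3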
     ## Trend test: incorporating ordinal scores to improve power
     x = c(0,2,4,5); y = 1:0 ## snoring scores; disease indicator (yes=1, no=0)
     nom = sum(outer(x,y)*t41) - sum(x*nip)*sum(y*npj)/n
     den2 = (sum(x^2*nip)-sum(x*nip)^2/n)*(sum(y^2*npj)-sum(y*npj)^2/n)
     r = nom/sqrt(den2)
     (n-1)*r^2
     1-pchisq((n-1)*r^2,1)
     prop.trend.test(t41[,1], nip, score = x)
     n*r^2
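     ## note (a sketch): prop.trend.test reports the score-test version n*r^2,
     ## whereas M^2 = (n-1)*r^2 above is the version used in the text
     ptt = prop.trend.test(t41[,1], nip, score = x)
     all.equal(unname(ptt$statistic), n*r^2)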
    
  2. Independence testing: linear regression modeling approach
     ## regression approach: y ~ x
     x1 = rep(x, nip)
     y1 = rep( rep(y,4), as.vector(t(t41)) )
     cor(x1,y1); r
     summary(lm(y1 ~ x1)) ## summary(lm(x1 ~ y1))
     r^2
     r^2/((1-r^2)/(n-2)) ## F-statistic for the slope
     ## proportion regression: Pr(y=1|x) ~ x, a linear probability model on the
     ## group proportions (a weighted variant follows)
     summary(lm(t41[,1]/nip ~ x))
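     ## variant (a sketch): weight each group proportion by its sample size nip,
     ## since the group sizes differ widely
     summary(lm(t41[,1]/nip ~ x, weights = nip))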
    
  3. Independence testing: GLM approach
     ## Logit/Probit link
     glm1 = glm(y1 ~ x1, family=binomial(link = "logit"))
     summary(glm1)
     glm2 = glm(y1 ~ x1, family=binomial(link = "probit"))
     summary(glm2)
     ### identity link: not available for the binomial family in R's glm()
     ## glm(y1 ~ x1, family=binomial(link = "identity")) ## throws an error
     ### so maximize the likelihood directly via Newton-Raphson, i.e. a local
     ### quadratic approximation (the hand-rolled analogue of glm's IWLS)
     ## log-likelihood value: pi = a + b*score; tab[,1] = events, tab[,2] = non-events
     fn = function(ab, score,tab){
       prb = ab[1] + ab[2]*score
       sum(log(prb)*tab[,1]) + sum(log(1-prb)*tab[,2])
     }
     ## Gradient vector
     gn = function(ab, score,tab){
       prb = ab[1] + ab[2]*score
       ga = sum(tab[,1]/prb) - sum(tab[,2]/(1-prb))
       gb = sum(tab[,1]*score/prb) - sum(tab[,2]*score/(1-prb))
       return( c(ga,gb) )
     }
     ## Hessian matrix
     hn = function(ab, score,tab){
       prb = ab[1] + ab[2]*score
       h11 = -sum(tab[,1]/prb^2) - sum(tab[,2]/(1-prb)^2)
       h22 = -sum(tab[,1]*score^2/prb^2) - sum(tab[,2]*score^2/(1-prb)^2)
       h12 = -sum(tab[,1]*score/prb^2) - sum(tab[,2]*score/(1-prb)^2)
       return( matrix(c(h11,h12,h12,h22), 2,2) )
     }
     ## starting value
     ab0 = c(0.01,0)
     ## Iteration
     for(k in 1:10){
       f2 = hn(ab0, x,t41)
       f1 = gn(ab0, x,t41)
       dta = as.vector(solve(f2,f1))
       ab0 = ab0-dta
       cat("Iter:", k, ", Diff=", sqrt(sum(dta^2)), ", Est: ", ab0, "\n")
     }
     ifm = hn(ab0, x,t41)
     vm = -solve(ifm)
     vm
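     ## cross-check (a sketch): maximize the same log-likelihood fn() with optim;
     ## control = list(fnscale = -1) turns the minimizer into a maximizer
     opt = optim(c(0.01,0), fn, gn, score=x, tab=t41,
                 method="BFGS", control=list(fnscale=-1))
     rbind(newton = ab0, optim = opt$par)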
     ## Wald test of independence: H0: slope ab0[2] = 0
     ab0[2]^2/vm[4] ## vm[4] = vm[2,2] is the slope variance
     ## compare with the squared Wald z-statistics from the logit/probit fits
     summary(glm1)$coef[2,3]^2
     summary(glm2)$coef[2,3]^2
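     ## complement (a sketch): likelihood-ratio test of the slope for the logit fit
     anova(glm1, test = "Chisq")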
     ## Visual fitting comparison: predicted probabilities
     glm1$coef
     glm2$coef
     reg1 = lm(t41[,1]/nip ~ x)
     summary(reg1)
     summary(lm(y1 ~ x1))
     Lgf = function(x, cef){ ## fitted probability under the logit link
        xb = cef[1]+x*cef[2]
        1/(1+exp(-xb))
     }
     Pbf = function(x, cef){ ## fitted probability under the probit link
        xb = cef[1]+x*cef[2]
        pnorm(xb)
     }
     curve(Lgf(x,glm1$coef), 0,5, xlab="Snoring", 
           ylab=expression(hat(pi)), ylim=c(0,0.13) )
     points(x, t41[,1]/nip, pch=20, cex=2)
     curve(Pbf(x,glm2$coef), add=TRUE, col=2, lty=2)
     abline(reg1, col=3, lty=3)
     abline(ab0, col=4, lty=4)
     legend(0,0.135, c("Logit","Probit","OLS","IWLS"),
            col=1:4, lty=1:4, bty="n")
    

Horseshoe crab data: Poisson modeling

[Figure: a plot summarizing some of the analysis results.]

  1. necessary data preprocessing
    ## read in the dataset
    crab = read.table("http://umn.edu/~baolin/teaching/ph5467/crab.txt", header=TRUE)
    ## attach dataframe for variable access
    attach(crab)
    ## grouping observations with similar x
    wt = cut(width, breaks=c(min(width),23:29+0.25,max(width)), include.lowest=TRUE )
    ## convert the factor to an integer group index
    wi = as.integer(wt)
    table(wi)
    ## group means of the predictor and response
    wavg = tapply(width, wi, mean)
    yavg = tapply(satell, wi, mean)
    ## replace predictor with the grouped mean
    x = wavg[wi]
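    ## quick look at the grouped data (a sketch): size, mean width, mean count per group
    cbind(n = as.vector(table(wi)), width = wavg, satell = yavg)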
    
  2. model count data with a Poisson distribution
    ## log-linear model for the counts; trace=TRUE prints the IWLS iterations
    a1 = glm(satell ~ width, family=poisson(link="log"), trace=TRUE)
    summary(a1)
    ## identity link for count data: specify starting value
    a2 = glm(satell ~ width, family=poisson(link="identity"), trace=TRUE, start=a1$coef)
    summary(a2)
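    ## visual comparison (a sketch): grouped means with the two fitted mean curves
    plot(wavg, yavg, pch=20, cex=2, xlab="Width", ylab="Mean satellites")
    curve(exp(a1$coef[1] + a1$coef[2]*x), add=TRUE)  ## log link
    abline(a2$coef, col=2, lty=2)                    ## identity link
    legend("topleft", c("Log link","Identity link"), col=1:2, lty=1:2, bty="n")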
    
  3. model event rate data with offset
     ## transform the original data into event rates
     ## there are 66 unique width values; sort them
     length(unique(width))
     ux = sort(unique(width))
     ## sum of satellites and number of observations at each unique width
     sy = tapply(satell, width, sum)
     tx = tapply(satell, width, length) ## or using table(width)
     ## log-linear model with offset: same fit as a1 (checked below)
     a3 = glm(sy ~ ux, family=poisson(link="log"), offset=log(tx)); a3$coef
     summary(a3)
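     ## check (a sketch): the offset fit reproduces the individual-level fit a1
     rbind(a1 = a1$coef, a3 = a3$coef)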
     ## Newton-Raphson for the identity-link rate model: mu_i = tx_i*(a + b*ux_i)
     ## Gradient vector: local first order
     gn = function(ab, sy,tx,ux){
       xb = ab[1]+ab[2]*ux
       ga = sum( -tx + sy/xb )
       gb = sum( -tx*ux + sy*ux/xb )
       return( c(ga,gb) )
     }
     ## Hessian matrix: local second order
     hn = function(ab, sy,tx,ux){
       xb = ab[1]+ab[2]*ux
       h11 = -sum( sy/xb^2 )
       h22 = -sum( sy*ux^2/xb^2 )
       h12 = -sum( sy*ux/xb^2 )
       return( matrix(c(h11,h12,h12,h22), 2,2) )
     }
     ## starting value very important: a + b*ux must stay positive
     ab0 = c(-10,0.5)
     for(k in 1:10){
       f2 = hn(ab0, sy,tx,ux)
       f1 = gn(ab0, sy,tx,ux)
       dta = as.vector(solve(f2,f1))
       ab0 = ab0-dta
       cat("Iter:", k, ", Diff=", sqrt(sum(dta^2)), ", Est: ", ab0, "\n")
     }
     ## observed Fisher information for the variance estimate
     ifm = hn(ab0, sy,tx,ux)
     vm = -solve(ifm)
     vm
     ab0[2]; sqrt(vm[4]) ## slope estimate and its standard error
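     ## alternative (a sketch): absorb the exposure into the design matrix,
     ## since mu_i = tx_i*(a + b*ux_i) = a*tx_i + b*(tx_i*ux_i);
     ## the same careful starting values are needed
     a4 = glm(sy ~ 0 + tx + I(tx*ux), family=poisson(link="identity"),
              start=c(-10,0.5))
     rbind(newton = ab0, glm = as.vector(a4$coef))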
    
  4. model inference using raw data
     ### goodness-of-fit on the 66 raw cells: not quite appropriate, since many
     ### expected counts are small and the chi-squared approximation (df = 66 - 2 = 64) is poor
     ypr = a3$fit
     G2 = 2*sum(sy*log(sy/ypr),na.rm=TRUE)
     G2; 1-pchisq(G2,64)
     X2 = sum( (sy-ypr)^2/ypr )
     X2; 1-pchisq(X2,64)
     ### grouped data: better approximation
     ## sum of observed counts
     sy = tapply(satell, width, sum)
     id = tapply(wi, width, mean) ## group index for each unique width value
    
  5. model inference with grouped data
     ## sum of grouped counts and predicted counts
     syg = tapply(sy, id, sum)
     yprg = tapply(ypr, id, sum)
     G2 = 2*sum(syg*log(syg/yprg),na.rm=TRUE)
     G2; 1-pchisq(G2,6)
     X2 = sum( (syg-yprg)^2/yprg )
     X2; 1-pchisq(X2,6)
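     ## side-by-side view (a sketch): observed vs fitted counts in the 8 groups
     round(cbind(observed = syg, fitted = yprg), 1)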
     ## model the event rates within the 8 width groups
     wavg = tapply(width, wi, mean)
     ysum = tapply(satell, wi, sum)
     txg = tapply(wi, wi, length) ## group sizes; same as table(wi)
     ## correct way to specify the model
     a5 = glm(ysum ~ wavg, family=poisson(link="log"), offset=log(txg))
     a5$coef
     ## incorrect: omits the log(txg) offset
     a6 = glm(ysum ~ wavg, family=poisson(link="log")); a6$coef
     ypr = a5$fit
     G2 = 2*sum(ysum*log(ysum/ypr),na.rm=TRUE)
     G2; 1-pchisq(G2,6)
     X2 = sum( (ysum-ypr)^2/ypr )
     X2; 1-pchisq(X2,6)
     ## direct modeling: replace x by the local average
     x = wavg[wi]
     a7 = glm(satell ~ x, family=poisson(link="log")); a7$coef
     a8 = glm(satell ~ x, family=poisson(link="identity"), start=c(-11,0.5))
     a8$coef