Biostat CDA - Code for Chapter 04

Snoring and heart disease data: binomial modeling

  1. Independence testing: contingency table approach
     ## Table 4.1: snoring and heart disease
     t41 = matrix( c(24,35,21,30, 1355,603,192,224), 4,2 )
     t41
     n = sum(t41)
     nip = rowSums(t41); npj = colSums(t41)
     ## testing for independence
     chisq.test(t41)
     prop.test(t41[,1], nip)
     ## score/Wald/LR trio of chi-squared statistics, each on (4-1)*(2-1) = 3 df
     u41 = outer(nip, npj)/n
     sct = sum((t41-u41)^2/u41)
     sct; 1-pchisq(sct,3)
     wd = sum((t41-u41)^2/t41)
     wd; 1-pchisq(wd,3)
     Lr = sum(2*t41*log(t41/u41))
     Lr; 1-pchisq(Lr,3)
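     ## sanity check (a sketch): the score statistic should reproduce chisq.test,
     ## which applies no continuity correction for tables larger than 2x2
     cst = chisq.test(t41)
     all.equal(sct, unname(cst$statistic)); cst$parameter ## df = 3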
     ## Trend test: incorporating ordinal scores to improve power
     x = c(0,2,4,5); y = 1:0 ## snoring scores; disease indicator (yes=1, no=0)
     nom = sum(outer(x,y)*t41) - sum(x*nip)*sum(y*npj)/n
     den2 = (sum(x^2*nip)-sum(x*nip)^2/n)*(sum(y^2*npj)-sum(y*npj)^2/n)
     r = nom/sqrt(den2)
     (n-1)*r^2
     1-pchisq((n-1)*r^2,1)
     prop.trend.test(t41[,1], nip, score = x)
     n*r^2
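     ## note (a sketch): prop.trend.test reports the score-test version n*r^2,
     ## whereas M^2 = (n-1)*r^2 above is the version used in the text
     ptt = prop.trend.test(t41[,1], nip, score = x)
     all.equal(unname(ptt$statistic), n*r^2)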
    
  2. Independence testing: linear regression modeling approach
     ## regression approach: y ~ x
     x1 = rep(x, nip)
     y1 = rep( rep(y,4), as.vector(t(t41)) )
     cor(x1,y1); r
     summary(lm(y1 ~ x1)) ## summary(lm(x1 ~ y1))
     r^2
     r^2/((1-r^2)/(n-2)) ## F-statistic for the slope
     ## proportion regression: Pr(y=1|x) ~ x, a linear probability model on the
     ## group proportions (a weighted variant follows)
     summary(lm(t41[,1]/nip ~ x))
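     ## variant (a sketch): weight each group proportion by its sample size nip,
     ## since the group sizes differ widely
     summary(lm(t41[,1]/nip ~ x, weights = nip))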
    
  3. Independence testing: GLM approach
     ## Logit/Probit link
     glm1 = glm(y1 ~ x1, family=binomial(link = "logit"))
     summary(glm1)
     glm2 = glm(y1 ~ x1, family=binomial(link = "probit"))
     summary(glm2)
     ### identity link: not available for the binomial family in R's glm()
     ## glm(y1 ~ x1, family=binomial(link = "identity")) ## throws an error
     ### so maximize the likelihood directly via Newton-Raphson, i.e. a local
     ### quadratic approximation (the hand-rolled analogue of glm's IWLS)
     ## log-likelihood value: pi = a + b*score; tab[,1] = events, tab[,2] = non-events
     fn = function(ab, score,tab){
       prb = ab[1] + ab[2]*score
       sum(log(prb)*tab[,1]) + sum(log(1-prb)*tab[,2])
     }
     ## Gradient vector
     gn = function(ab, score,tab){
       prb = ab[1] + ab[2]*score
       ga = sum(tab[,1]/prb) - sum(tab[,2]/(1-prb))
       gb = sum(tab[,1]*score/prb) - sum(tab[,2]*score/(1-prb))
       return( c(ga,gb) )
     }
     ## Hessian matrix
     hn = function(ab, score,tab){
       prb = ab[1] + ab[2]*score
       h11 = -sum(tab[,1]/prb^2) - sum(tab[,2]/(1-prb)^2)
       h22 = -sum(tab[,1]*score^2/prb^2) - sum(tab[,2]*score^2/(1-prb)^2)
       h12 = -sum(tab[,1]*score/prb^2) - sum(tab[,2]*score/(1-prb)^2)
       return( matrix(c(h11,h12,h12,h22), 2,2) )
     }
     ## starting value
     ab0 = c(0.01,0)
     ## Iteration
     for(k in 1:10){
       f2 = hn(ab0, x,t41)
       f1 = gn(ab0, x,t41)
       dta = as.vector(solve(f2,f1))
       ab0 = ab0-dta
       cat("Iter:", k, ", Diff=", sqrt(sum(dta^2)), ", Est: ", ab0, "\n")
     }
     ifm = hn(ab0, x,t41)
     vm = -solve(ifm)
     vm
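     ## cross-check (a sketch): maximize the same log-likelihood fn() with optim;
     ## control = list(fnscale = -1) turns the minimizer into a maximizer
     opt = optim(c(0.01,0), fn, gn, score=x, tab=t41,
                 method="BFGS", control=list(fnscale=-1))
     rbind(newton = ab0, optim = opt$par)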
     ## Wald test of independence: H0: slope ab0[2] = 0
     ab0[2]^2/vm[4] ## vm[4] = vm[2,2] is the slope variance
     ## compare with the squared Wald z-statistics from the logit/probit fits
     summary(glm1)$coef[2,3]^2
     summary(glm2)$coef[2,3]^2
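     ## complement (a sketch): likelihood-ratio test of the slope for the logit fit
     anova(glm1, test = "Chisq")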
     ## Visual fitting comparison: predicted probabilities
     glm1$coef
     glm2$coef
     reg1 = lm(t41[,1]/nip ~ x)
     summary(reg1)
     summary(lm(y1 ~ x1))
     Lgf = function(x, cef){ ## fitted probability under the logit link
        xb = cef[1]+x*cef[2]
        1/(1+exp(-xb))
     }
     Pbf = function(x, cef){ ## fitted probability under the probit link
        xb = cef[1]+x*cef[2]
        pnorm(xb)
     }
     curve(Lgf(x,glm1$coef), 0,5, xlab="Snoring", 
           ylab=expression(hat(pi)), ylim=c(0,0.13) )
     points(x, t41[,1]/nip, pch=20, cex=2)
     curve(Pbf(x,glm2$coef), add=TRUE, col=2, lty=2)
     abline(reg1, col=3, lty=3)
     abline(ab0, col=4, lty=4)
     legend(0,0.135, c("Logit","Probit","OLS","IWLS"),
            col=1:4, lty=1:4, bty="n")
    

Horseshoe crab data: Poisson modeling

[Figure: a plot summarizing some of the analysis results.]

  1. necessary data preprocessing
    ## read in the dataset
    crab = read.table("http://umn.edu/~baolin/teaching/ph5467/crab.txt", header=TRUE)
    ## attach dataframe for variable access
    attach(crab)
    ## grouping observations with similar x
    wt = cut(width, breaks=c(min(width),23:29+0.25,max(width)), include.lowest=TRUE )
    ## convert the factor to an integer group index
    wi = as.integer(wt)
    table(wi)
    ## group means of the predictor and response
    wavg = tapply(width, wi, mean)
    yavg = tapply(satell, wi, mean)
    ## replace predictor with the grouped mean
    x = wavg[wi]
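    ## quick look at the grouped data (a sketch): size, mean width, mean count per group
    cbind(n = as.vector(table(wi)), width = wavg, satell = yavg)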
    
  2. model count data with a Poisson distribution
    ## log-linear model for the counts; trace=TRUE prints the IWLS iterations
    a1 = glm(satell ~ width, family=poisson(link="log"), trace=TRUE)
    summary(a1)
    ## identity link for count data: specify starting value
    a2 = glm(satell ~ width, family=poisson(link="identity"), trace=TRUE, start=a1$coef)
    summary(a2)
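    ## visual comparison (a sketch): grouped means with the two fitted mean curves
    plot(wavg, yavg, pch=20, cex=2, xlab="Width", ylab="Mean satellites")
    curve(exp(a1$coef[1] + a1$coef[2]*x), add=TRUE)  ## log link
    abline(a2$coef, col=2, lty=2)                    ## identity link
    legend("topleft", c("Log link","Identity link"), col=1:2, lty=1:2, bty="n")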
    
  3. model event rate data with offset
     ## transform the original data into event rates
     ## there are 66 unique width values; sort them
     length(unique(width))
     ux = sort(unique(width))
     ## sum of satellites and number of observations at each unique width
     sy = tapply(satell, width, sum)
     tx = tapply(satell, width, length) ## or using table(width)
     ## log-linear model with offset: same fit as a1 (checked below)
     a3 = glm(sy ~ ux, family=poisson(link="log"), offset=log(tx)); a3$coef
     summary(a3)
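     ## check (a sketch): the offset fit reproduces the individual-level fit a1
     rbind(a1 = a1$coef, a3 = a3$coef)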
     ## Newton-Raphson for the identity-link rate model: mu_i = tx_i*(a + b*ux_i)
     ## Gradient vector: local first order
     gn = function(ab, sy,tx,ux){
       xb = ab[1]+ab[2]*ux
       ga = sum( -tx + sy/xb )
       gb = sum( -tx*ux + sy*ux/xb )
       return( c(ga,gb) )
     }
     ## Hessian matrix: local second order
     hn = function(ab, sy,tx,ux){
       xb = ab[1]+ab[2]*ux
       h11 = -sum( sy/xb^2 )
       h22 = -sum( sy*ux^2/xb^2 )
       h12 = -sum( sy*ux/xb^2 )
       return( matrix(c(h11,h12,h12,h22), 2,2) )
     }
     ## starting value very important: a + b*ux must stay positive
     ab0 = c(-10,0.5)
     for(k in 1:10){
       f2 = hn(ab0, sy,tx,ux)
       f1 = gn(ab0, sy,tx,ux)
       dta = as.vector(solve(f2,f1))
       ab0 = ab0-dta
       cat("Iter:", k, ", Diff=", sqrt(sum(dta^2)), ", Est: ", ab0, "\n")
     }
     ## observed Fisher information for the variance estimate
     ifm = hn(ab0, sy,tx,ux)
     vm = -solve(ifm)
     vm
     ab0[2]; sqrt(vm[4]) ## slope estimate and its standard error
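     ## alternative (a sketch): absorb the exposure into the design matrix,
     ## since mu_i = tx_i*(a + b*ux_i) = a*tx_i + b*(tx_i*ux_i);
     ## the same careful starting values are needed
     a4 = glm(sy ~ 0 + tx + I(tx*ux), family=poisson(link="identity"),
              start=c(-10,0.5))
     rbind(newton = ab0, glm = as.vector(a4$coef))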
    
  4. model inference using raw data
     ### goodness-of-fit on the 66 raw cells: not quite appropriate, since many
     ### expected counts are small and the chi-squared approximation (df = 66 - 2 = 64) is poor
     ypr = a3$fit
     G2 = 2*sum(sy*log(sy/ypr),na.rm=TRUE)
     G2; 1-pchisq(G2,64)
     X2 = sum( (sy-ypr)^2/ypr )
     X2; 1-pchisq(X2,64)
     ### grouped data: better approximation
     ## sum of observed counts
     sy = tapply(satell, width, sum)
     id = tapply(wi, width, mean) ## group index for each unique width value
    
  5. model inference with grouped data
     ## sum of grouped counts and predicted counts
     syg = tapply(sy, id, sum)
     yprg = tapply(ypr, id, sum)
     G2 = 2*sum(syg*log(syg/yprg),na.rm=TRUE)
     G2; 1-pchisq(G2,6)
     X2 = sum( (syg-yprg)^2/yprg )
     X2; 1-pchisq(X2,6)
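     ## side-by-side view (a sketch): observed vs fitted counts in the 8 groups
     round(cbind(observed = syg, fitted = yprg), 1)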
     ## model the event rates within the 8 width groups
     wavg = tapply(width, wi, mean)
     ysum = tapply(satell, wi, sum)
     txg = tapply(wi, wi, length) ## group sizes; same as table(wi)
     ## correct way to specify the model
     a5 = glm(ysum ~ wavg, family=poisson(link="log"), offset=log(txg))
     a5$coef
     ## incorrect: omits the log(txg) offset
     a6 = glm(ysum ~ wavg, family=poisson(link="log")); a6$coef
     ypr = a5$fit
     G2 = 2*sum(ysum*log(ysum/ypr),na.rm=TRUE)
     G2; 1-pchisq(G2,6)
     X2 = sum( (ysum-ypr)^2/ypr )
     X2; 1-pchisq(X2,6)
     ## direct modeling: replace x by the local average
     x = wavg[wi]
     a7 = glm(satell ~ x, family=poisson(link="log")); a7$coef
     a8 = glm(satell ~ x, family=poisson(link="identity"), start=c(-11,0.5))
     a8$coef