Biostat CDA - Codes for Chapter 05

Analysis of nesting horseshoe crabs: Logistic regression

## readin the data and exploratory data analysis
crab = read.table("http://umn.edu/~baolin/teaching/ph5467/crab.txt", header=TRUE)
attach(crab)
x0 = crab[, c(4,1:3,5)]
pairs(x0); cor(x0)

## create grouped data for weight using "cut" function
wt = cut(width, breaks=c(min(width), 23:29+0.25, max(width)), include.lowest=TRUE )
levels(wt)
wi = as.integer(wt)
table(wi)
wavg = tapply(width, wi, mean)
yavg = tapply(ys, wi, mean)
x = wavg[wi]

### Model estimation: IRWLS, fisher scoring, newton-raphson, local quadratic approx
## create binary response
ys = 1*(crab[,4]>0)
## Logistic regression: GLM with logit link
a2 = glm(ys ~ width, family=binomial)
## identity link: linear probability model
a3 = glm(ys ~ width, family=binomial("identity")) ## not implemented
## Numerical estimation procedure
## Gradient vector
gn = function(ab, y,x){
  pr = ab[1]+ab[2]*x
  ga = sum( y/pr - (1-y)/(1-pr) )
  gb = sum( y*x/pr - (1-y)*x/(1-pr) )
  return( c(ga,gb) )
}
## Hessian matrix
hn = function(ab, y,x){
  pr = ab[1]+ab[2]*x
  h11 = -sum( y/pr^2 + (1-y)/(1-pr)^2 )
  h22 = -sum( y*x^2/pr^2 + (1-y)*x^2/(1-pr)^2 )
  h12 = -sum( y*x/pr^2 + (1-y)*x/(1-pr)^2 )
  return( matrix(c(h11,h12,h12,h22), 2,2) )
}
## iteration
## starting value: linear fitting of the grouped proportion:  a good approximation
a4 = lm(yavg ~ wavg)
ab0 = a4$coef
for(k in 1:10){
  f2 = hn(ab0, ys,width)
  f1 = gn(ab0, ys,width)
  dta = as.vector(solve(f2,f1))
  ab0 = ab0-dta
  cat("Iter:", k, ", Diff=", sqrt(sum(dta^2)), ", Est: ", ab0, "\n")
}

### Model interpretation
## rate of change: d pi/d x
rt2 = a2$coef[2]*a2$fit*(1-a2$fit)
## median effective level
EL1 = -a2$coef[1]/a2$coef[2]
EL2 = (0.5-ab0[1])/ab0[2]

### Parameter inference
## Wald
cf2 = summary(a2)$coef
cf2[2,1]+c(-1,1)*qnorm(0.975)*cf2[2,2]
exp(.Last.value)
## LR
a2$null.dev - a2$dev
## Pi inference
cv2 = summary(a2)$cov.uns
xm = mean(width)
prm = cf2[1]+cf2[2]*xm
s2m = as.vector(c(1,xm)%*%cv2%*%c(1,xm))
ci = prm+c(-1,1)*s2m*qnorm(0.975)
1/(1+exp(-ci))
1/(1+exp(-prm))

### Model diagnostics
## grouped data
x = wavg[wi]
a5 = glm(ys ~ x, family=binomial)
y1sum = tapply(ys, x, sum)
y0sum = tapply(1-ys, x, sum)
p1s = tapply(a5$fit, x, sum)
p0s = tapply(1-a5$fit, x, sum)
X21 = sum( (y1sum-p1s)^2/p1s + (y0sum-p0s)^2/p0s )
G21 = 2*sum(y1sum*log(y1sum/p1s), na.rm=TRUE) + 2*sum(y0sum*log(y0sum/p0s), na.rm=TRUE)
X21; G21

## Grouping raw data: heuristic inference
p1s = tapply(a2$fit, x, sum)
p0s = tapply(1-a2$fit, x, sum)
X2 = sum( (y1sum-p1s)^2/p1s + (y0sum-p0s)^2/p0s )
G2 = 2*sum(y1sum*log(y1sum/p1s), na.rm=TRUE) + 2*sum(y0sum*log(y0sum/p0s), na.rm=TRUE)
X2; G2
qchisq(0.95,6); pchisq(4,6)
## Grouping fitted values
pr = a2$fit
pri = cut(pr, breaks=quantile(pr, prob=0:8/8), include.lowest=TRUE)
table(pri)
y1sum = tapply(ys, pri, sum)
y0sum = tapply(1-ys, pri, sum)
p1s = tapply(a5$fit, pri, sum)
p0s = tapply(1-a5$fit, pri, sum)
X2 = sum( (y1sum-p1s)^2/p1s + (y0sum-p0s)^2/p0s )
G2 = 2*sum(y1sum*log(y1sum/p1s), na.rm=TRUE) + 2*sum(y0sum*log(y0sum/p0s), na.rm=TRUE)
X2; G2
qchisq(0.95,6); pchisq(5,6)

Logistic regression for Ix2 contigency table

## Application: independence testing
## Chisq.test for independence
tb1 = table(wt, ys)
chisq.test(tb1, correct = FALSE)
## <==> model X2
a6 = glm(ys ~ 1, family=binomial)
log(mean(ys)/mean(1-ys))
p1s = tapply(a6$fit, wt, sum)
p0s = tapply(1-a6$fit, wt, sum)
X20 = sum( (tb1[,2]-p1s)^2/p1s + (tb1[,1]-p0s)^2/p0s )
G20 = 2*sum(tb1[,2]*log(tb1[,2]/p1s), na.rm=TRUE) 
       + 2*sum(tb1[,1]*log(tb1[,1]/p0s), na.rm=TRUE)
X20; G20
X20-X21; G20-G21

### Pearson residuals
a5 = glm(ys ~ x, family=binomial)
y1sum = tapply(ys, wt, sum)
p5s = tapply(a5$fit, wt, sum)
y0sum = tapply(1-ys, wt, sum)
p0s = tapply(1-a5$fit, wt, sum)
e5 = (y1sum-p5s)/sqrt(p5s*(1-p5s/(y1sum+y0sum)))
p6s = tapply(a6$fit, wt, sum)
p0s = tapply(1-a6$fit, wt, sum)
e6 = (y1sum-p6s)/sqrt(p6s*(1-p6s/(y1sum+y0sum)))
## adjusted residuals
X = cbind(1, wavg)
pr5 = tapply(a5$fit, wt, mean)
W = diag(pr5*(1-pr5)*(y1sum+y0sum))
H = sqrt(W)%*%X%*%solve(t(X)%*%W%*%X, t(X)%*%sqrt(W))
e5a = e5/sqrt(1-diag(H))
tb2 = cbind(width=y1sum, NumCases=y1sum+y0sum, NumYes=y1sum, 
            Fit1=p6s, Res1=e6, Fit2=p5s, Res2=e5,AdjRes2=e5a)
round(tb2, 2)

GLM model for 2x2xK contingency tables

### Table 5.5
## CMH test: conditional independence
Z1 = c(14,93,32,81); Z2 = c(11,52,12,43)
t55 = rbind(Z1, Z2)
n = rowSums(t55)
u11 = (t55[,1]+t55[,2])*(t55[,1]+t55[,3])/n
v11 = (t55[,1]+t55[,2])/n*(t55[,1]+t55[,3])/n*(t55[,3]+t55[,4])/(n-1)*(t55[,2]+t55[,4])
cmh = (sum(u11)-sum(t55[,1]))^2/sum(v11)
cmh
1-pchisq(cmh,1)
## MH common odds ratio estimation
n = rowSums(t55)
mh = sum(t55[,1]*t55[,4]/n)/sum(t55[,2]*t55[,3]/n)
mh
## Breslow-Day test: homogeneity
bd = apply(t55, 1, function(x){
  n1p = x[1]+x[2]; np1 = x[1]+x[3]
  np2 = x[2]+x[4]
  b2 = mh-1; b1 = -(n1p+np1)*mh-(np2-n1p); b0 = n1p*np1*mh
  u11 = (-b1+sqrt(b1^2-4*b2*b0))/2/b2
  v11 = (-b1-sqrt(b1^2-4*b2*b0))/2/b2
  if( all(np1-v11>0,n1p-v11>0,np2-n1p+v11>0) ){
    u11 = v11
  }  
  return((u11-x[1])^2*(np1/u11/(np1-u11)+np2/(n1p-u11)/(np2-n1p+u11)))
})
sum(bd)
1-pchisq(sum(bd),1)
### Modeling approach to Homogeneity testing
X = rep( rep(c(1,1,0,0),2), c(t55[1,],t55[2,]) )
Y = rep( rep(c(1,0,1,0),2), c(t55[1,],t55[2,]) )
Z = rep( 1:0, n )
a7 = glm(Y ~ X+Z, family=binomial)
## G2: LHR
pr1 = tapply(a7$fit, 2*X+Z, sum)
pr0 = tapply(1-a7$fit, 2*X+Z, sum)
y1 = tapply(Y, 2*X+Z, sum)
y0 = tapply(1-Y, 2*X+Z, sum)
2*sum(y1*log(y1/pr1)) + 2*sum(y0*log(y0/pr0))
## X2: score test
sum( (y1-pr1)^2/pr1 + (y0-pr0)^2/pr0 )
## Wald statistics
a8 = glm(Y ~ X+Z+X*Z, family=binomial)
summary(a8)$coef
summary(a8)$coef[4,3]^2
### Modeling approach to Conditional independence testing
## Wald statistics
summary(a7)$coef
summary(a7)$coef[2,3]^2
## G2: LHR
a9 = glm(Y ~ Z, family=binomial)
p1 = tapply(a9$fit, 2*X+Z, sum)
p0 = tapply(1-a9$fit, 2*X+Z, sum)
2*sum(y1*log(pr1/p1)) + 2*sum(y0*log(pr0/p0))
### Goodness of fit
2*sum(y1*log(y1/p1)) + 2*sum(y0*log(y0/p0))
## LHR G2
u11 = (t55[,1]+t55[,2])*(t55[,1]+t55[,3])/n
u12 = (t55[,1]+t55[,2])*(t55[,2]+t55[,4])/n
u21 = (t55[,3]+t55[,4])*(t55[,1]+t55[,3])/n
u22 = (t55[,3]+t55[,4])*(t55[,2]+t55[,4])/n
u = cbind(u11,u12,u21,u22)
G2 = 2*sum(t55*log(t55/u))
G2
1-pchisq(G2,2)