PubH 8401 Linear Model -- additional notes


Some references on EM algorithm and its applications
  1. AP Dempster, NM Laird and DB Rubin (1977). Maximum Likelihood from Incomplete Data via the EM Algorithm. JRSSB, 39(1), 1-38. http://www.jstor.org/stable/2984875
  2. Thomas Louis (1982). Finding the Observed Information Matrix when Using the EM Algorithm. JRSSB, 44(2), 226-233. http://www.jstor.org/stable/2345828
  3. CF Jeff Wu (1983). On the Convergence Properties of the EM Algorithm. The Annals of Statistics, 11(1), 95-103. http://www.jstor.org/stable/2240463
  4. C. Fraley and A. E. Raftery (2002). Model-based clustering, discriminant analysis, and density estimation. Journal of the American Statistical Association, 97(458), 611-631. http://www.jstor.org/stable/3085676
Genetic data example with EM estimation
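This is the classic genetic linkage example analyzed in Dempster, Laird and Rubin (1977) (reference 1 above): the counts yn = (y1,y2,y3,y4) = (125,18,20,34) are multinomial with cell probabilities (1/2 + theta/4, (1-theta)/4, (1-theta)/4, theta/4). EM augments the first cell as y1 = y11 + y12 with y12 | y ~ Binomial(y1, (theta/4)/(1/2 + theta/4)), so the E-step gives E[y12 | y, theta] = y1*theta/(2+theta), and the complete-data M-step yields the update used in f() below: theta.new = (E[y12] + y4) / (E[y12] + y2 + y3 + y4).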
## data
yn = c(125,18,20,34)
## score equation: f1 is the EM fixed-point form, f2 below is the expanded polynomial (f1 = -f2)
f1 = function(tha, yn){
  tha*(tha*yn[1]+(yn[2]+yn[3]+yn[4])*(2+tha)) - (tha*yn[1]+yn[4]*(2+tha))
}
uniroot(f1, 0:1, yn=yn)

f2 = function(tha, yn){
  -sum(yn)*tha^2 + (yn[1]-2*yn[2]-2*yn[3]-yn[4])*tha + 2*yn[4]
}
uniroot(f2, 0:1, yn=yn, tol=1e-9)
th0 = 0.6268215
## direct likelihood
lik = function(tha, yn){
  yn[1]*log(2+tha) + (yn[2]+yn[3])*log(1-tha) + yn[4]*log(tha)
}
optimize(lik, 0:1, maximum=TRUE, yn=yn, tol=1e-8)
## EM
f = function(tha, itMax=1e2,eps=1e-8, yn){
  it = 0
  dif = 1
  while( (it<itMax)&(dif>eps) ){
    th1 = (tha*yn[1]+yn[4]*(2+tha))/(tha*yn[1]+(yn[2]+yn[3]+yn[4])*(2+tha))
    it = it+1
    dif = abs(tha-th1)
    cat(it, ": diff=", dif, ", th1=", th1, ", th1-th0=", th1-th0, ", cvr=", (th1-th0)/(tha-th0), "\n")
    tha = th1
  }
  return(tha)
}
f(0.1, yn=yn)

## observed information at the MLE via the EM identity (complete minus missing information; Louis 1982)
yn[1]/th0/(2+th0) + yn[4]/th0^2 + (yn[2]+yn[3])/(1-th0)^2 - yn[1]*2/th0/(2+th0)^2
## 377.5169
## observed information directly from the observed-data log-likelihood
yn[1]/(2+th0)^2 + (yn[2]+yn[3])/(1-th0)^2 + yn[4]/th0^2
## 377.5169

Read textbook Chapter 14 for technical details of LMM EM estimation, and Chapters 6/14 for technical details of ML/REML estimation.

Newton-Raphson/Fisher-scoring for LMM

Some references on LMM estimation
  1. Nan Laird and James Ware (1982). Random-Effects Models for Longitudinal Data. Biometrics, 38(4), 963-974. http://www.jstor.org/stable/2529876
  2. Mary Lindstrom and Douglas Bates (1988). Newton-Raphson and EM Algorithms for Linear Mixed-Effects Models for Repeated-Measures Data. JASA, 83(404), 1014-1022. http://www.jstor.org/stable/2290128
Sample R/SAS codes for MLE/REML of LMM with EM algorithms
## y: n,m
## x: n,m,p
## z: n,m,k
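## lmm.reml.ecm / lmm.mle.ecm: ECM-type iterations that alternate a GLS update of the
##   fixed effects (rba) with EM-type updates of the random-effect covariance V and the
##   residual variance s2; the REML version carries extra correction terms (the a0/a1
##   pieces below) that account for the uncertainty in the estimated fixed effects.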
lmm.reml.ecm = function(y, x, z, itMax=1e3,eps=1e-5){
  n = dim(y)[1]
  m = dim(y)[2]
  p = dim(x)[3]
  k = dim(z)[3]

  y1 = as.vector(y)
  xm = matrix(x, m*n,p)
  obj = lm(y1 ~ xm-1)
  rba = matrix(obj$coef, ncol=1)*runif(1)
  s2 = var(y1)
  res = matrix(obj$res, n,m)
  u = matrix(0, n,k)
  for(i in 1:n) u[i,] = lm(res[i,] ~ z[i,,]-1)$coef
  V = cov(u)*runif(1)+diag(k)*0.5

  it = 0; dst = 1
  while( (it<itMax)&(dst>eps) ){
    res = y-matrix(xm%*%rba, ncol=m)
    ## rba
    W1 = matrix(0, p,p)
    W0 = rep(0,p)
    for(i in 1:n){
      Vit = z[i,,]%*%V%*%t(z[i,,]) + s2*diag(m)
      Vii = solve(Vit)
      W1 = W1 + t(x[i,,])%*%Vii%*%x[i,,]
      W0 = W0 + t(x[i,,])%*%Vii%*%t(y[i,,drop=FALSE])
    }
    W1i = solve(W1)
    rbau = W1i%*%W0
    ## V, s2
    Vu = matrix(0,k,k)
    s2u = 0
    for(i in 1:n){
      Vit = z[i,,]%*%V%*%t(z[i,,]) + s2*diag(m)
      Vii = solve(Vit)
      ei = Vii%*%t(res[i,,drop=FALSE])
      uih = V%*%t(z[i,,])%*%ei
      a0 = V%*%t(z[i,,])%*%Vii%*%x[i,,]
      Vu = Vu + uih%*%t(uih)/n + (V-V%*%t(z[i,,])%*%Vii%*%z[i,,]%*%V)/n + a0%*%W1i%*%t(a0)/n
      a1 = Vii%*%x[i,,]%*%W1i%*%t(x[i,,])%*%Vii
      s2u = s2u + s2^2*sum(ei^2)/n/m + s2*sum(diag(Vii%*%z[i,,]%*%V%*%t(z[i,,])))/n/m + s2^2*sum(diag(a1))/n/m
    }
    ## diff
    dst = mean(abs(Vu-V))/3 + mean(abs(rbau-rba))/3 + mean(abs(s2u-s2))/3
    it = it+1
    ## update
    V = Vu; rba = rbau; s2 = s2u[1]
    cat(it, ":", dst, "\n")
  }

  return( list(V=V,rba=rba[,1],s2=s2, covb=W1i) )
}
lmm.mle.ecm = function(y, x, z, itMax=1e3,eps=1e-5){
  n = dim(y)[1]
  m = dim(y)[2]
  p = dim(x)[3]
  k = dim(z)[3]

  yv = as.vector(y)
  xm = matrix(x, m*n,p)
  obj = lm(yv ~ xm-1)
  rba = matrix(obj$coef, ncol=1)*runif(1)
  s2 = var(yv)
  res = matrix(obj$res, n,m)
  u = matrix(0, n,k)
  for(i in 1:n) u[i,] = lm(res[i,] ~ z[i,,]-1)$coef
  V = cov(u)*runif(1)+diag(k)*0.5

  it = 0; dst = 1
  while( (it<itMax)&(dst>eps) ){
    res = y-matrix(xm%*%rba, ncol=m)
    ## rba
    W1 = matrix(0, p,p)
    W0 = rep(0,p)
    for(i in 1:n){
      Vit = z[i,,]%*%V%*%t(z[i,,]) + s2*diag(m)
      Vii = solve(Vit)
      W1 = W1 + t(x[i,,])%*%Vii%*%x[i,,]
      W0 = W0 + t(x[i,,])%*%Vii%*%t(y[i,,drop=FALSE])
    }
    W1i = solve(W1)
    rbau = W1i%*%W0
    ## V, s2
    Vu = matrix(0,k,k)
    s2u = 0
    for(i in 1:n){
      Vit = z[i,,]%*%V%*%t(z[i,,]) + s2*diag(m)
      Vii = solve(Vit)
      ei = Vii%*%t(res[i,,drop=FALSE])
      uih = V%*%t(z[i,,])%*%ei
      a0 = V%*%t(z[i,,])%*%Vii%*%x[i,,]
      Vu = Vu + uih%*%t(uih)/n + (V-V%*%t(z[i,,])%*%Vii%*%z[i,,]%*%V)/n
      s2u = s2u + s2^2*sum(ei^2)/n/m + s2*sum(diag(Vii%*%z[i,,]%*%V%*%t(z[i,,])))/n/m
    }
    ## diff
    dst = mean(abs(Vu-V))/3 + mean(abs(rbau-rba))/3 + mean(abs(s2u-s2))/3
    it = it+1
    ## update
    V = Vu; rba = rbau; s2 = s2u[1]
    cat(it, ":", dst, "\n")
  }

  return( list(V=V,rba=rba[,1],s2=s2, covb=W1i) )
}

Read textbook Chapter 8. Fisher scoring/NR for MLE of LMM.

## likelihood calculation
flik = function(y,x,z, rba,Dz,s2){
  m = dim(x)[1]
  n = dim(x)[2]

  lik = 0
  for(i in 1:m){
    ## rba
    V = z[i,,]%*%Dz%*%t(z[i,,]) + s2*diag(n)
    Vinv = solve(V)
    res = t(y)[,i]-x[i,,]%*%rba
    lik = lik -n/2*log(2*pi)-log(det(V))/2-t(res)%*%Vinv%*%res/2
  }
  return(lik)
}
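## gv.fs/hm.fs: score vector and expected (Fisher-information) Hessian of the marginal
##   log-likelihood, split into the fixed-effect block (rba) and the variance-parameter
##   block (s2 followed by the upper triangle of D); they drive the Fisher-scoring
##   iterations in lmm.mle.fs below.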
gv.fs = function(y,x,z, rba,Dz,s2){
  m = dim(x)[1]
  n = dim(x)[2]
  p = dim(x)[3]
  k = dim(z)[3]

  a0 = diag(k)
  id = row(a0)<=col(a0)

  g1 = matrix(0, p,1)
  g2 = 0
  g3 = rep(0, k*(k+1)/2)

  for(i in 1:m){
    ## rba
    V = z[i,,]%*%Dz%*%t(z[i,,]) + s2*diag(n)
    Vinv = solve(V)
    res = t(y)[,i]-x[i,,]%*%rba
    ## rba
    g1 = g1 + t(x[i,,])%*%Vinv%*%res
    ## s2
    a1 = Vinv - Vinv%*%res%*%t(res)%*%Vinv
    g2 = g2 - sum(diag(a1))/2
    ## D
    a2 = t(z[i,,])%*%a1%*%z[i,,]
    diag(a2) = diag(a2)/2
    g3 = g3 - a2[id]
  }
  return( list(rba=g1,V=c(g2,g3)) )
}
hm.fs = function(y,x,z, rba,Dz,s2){
  m = dim(x)[1]
  n = dim(x)[2]
  p = dim(x)[3]
  k = dim(z)[3]
  kv = k*(k+1)/2

  a0 = diag(k)
  id = row(a0)<=col(a0)
  ri = row(a0)[id]
  ci = col(a0)[id]
  rc = cbind(ri,ci)
  i1 = rep(1:kv, kv)
  i2 = rep(1:kv, each=kv)
  j13 = cbind(rc[i1,1], rc[i2,1])
  j24 = cbind(rc[i1,2], rc[i2,2])
  j14 = cbind(rc[i1,1], rc[i2,2])
  j23 = cbind(rc[i1,2], rc[i2,1])
  r0 = 1*(rc[,1] == rc[,2])+1
  r1 = r0[i1]
  r2 = r0[i2]

  h1 = matrix(0, p,p)
  h2 = matrix(0, 1+kv, 1+kv)

  for(i in 1:m){
    ## rba
    V = z[i,,]%*%Dz%*%t(z[i,,]) + s2*diag(n)
    Vinv = solve(V)
    res = t(y)[,i]-x[i,,]%*%rba ## n,1
    ## rba
    h1 = h1 - t(x[i,,])%*%Vinv%*%x[i,,]
    ## s2
    h2[1,1] = h2[1,1] - sum(Vinv^2)/2
    ## s2,D
    a1 = t(z[i,,])%*%Vinv%*%Vinv%*%z[i,,]
    h2[1,-1] = h2[1,-1] - 2*a1[id]/r0/2
    ## D,D
    Vz = t(z[i,,])%*%Vinv%*%z[i,,]
    a2 = 2*(Vz[j13]*Vz[j24] + Vz[j14]*Vz[j23])/r1/r2
    h2[-1,-1] = h2[-1,-1] - a2/2
  }
  h2[-1,1] = h2[1,-1]

  return( list(rba=h1,V=h2) )
}
lmm.mle.fs = function(y, x, z, itMax=1e3,eps=1e-5, verbose=FALSE){
  m = dim(y)[1]
  n = dim(y)[2]
  p = dim(x)[3]
  k = dim(z)[3]

  xm = matrix(x, m*n,p)
  obj = lm(as.vector(y) ~ xm-1)
  rba1 = matrix(obj$coef, ncol=1)
  s21 = sum(obj$res^2)/(m*n-p) ## OLS residual variance as a starting value for s2
  res = matrix(obj$res, m,n)
  u = matrix(0, m,k)
  for(i in 1:m) u[i,] = lm(res[i,] ~ z[i,,]-1)$coef
  D1 = cov(u)*0.9+diag(k)*0.1
  id = row(D1)<=col(D1)

  it = 0; dst = 1
  while( (it<itMax)&(dst>eps) ){
    ## current parameter values
    Dz = D1; rba = rba1; s2 = s21
    ## gv, hm
    dr1 = gv.fs(y,x,z, rba,Dz,s2)
    dr2 = hm.fs(y,x,z, rba,Dz,s2)
    ## Fisher-scoring update: fixed effects, then (s2, vech(D)) jointly
    rba1 = rba - solve(dr2$rba, dr1$rba)
    a1 = c(s2, Dz[id]) - solve(dr2$V, dr1$V)
    s21 = a1[1]
    D1[id] = a1[-1]
    D1[!id] = t(D1)[!id]
    ## diff
    dst = mean(abs(D1-Dz))/3 + mean(abs(rba1-rba))/3 + abs(s21-s2)/3
    it = it+1
    if(verbose) cat(it, ":", dst, det(dr2$V), det(dr2$rba), "\n")
  }

  return( list(D=D1,rba=rba1[,1],s2=s21) )
}

## generate data and validate the procedures
library(mvtnorm)
library(lme4)
## conditional model generation
n = 10; m = 9; p = 2; k = 2
x = array(rnorm(n*m*p), c(n,m,p))
z = array(rnorm(n*m*k), c(n,m,k))
u = matrix(rnorm(n*k), n,k)
zu = matrix(0, n,m)
for(i in 1:n) zu[i,] = z[i,,]%*%u[i,]
y = apply(x, 1:2, sum) + zu + rnorm(n*m,0,1.5)

## REML/EM
a0 = lmm.reml.ecm(y,x,z, eps=1e-8)
lr = rep(0, dim(y)[1])
for(i in 1:dim(y)[1]){
  lr[i] = dmvnorm(y[i,], x[i,,]%*%a0$rba, a0$s2*diag(dim(y)[2])+z[i,,]%*%a0$V%*%t(z[i,,]), log=TRUE)
}
sum(lr)

y2 = as.vector(y)
x2 = matrix(x, ncol=dim(x)[3])
z2 = matrix(z, ncol=dim(z)[3])
id = rep(1:dim(y)[1], dim(y)[2])
b0 = lmer(y2 ~ 0 + x2 + (0+z2|id), REML=TRUE)
b0v = VarCorr(b0)
sig = attr(b0v, "sc")
V = b0v$id
lr1 = rep(0, dim(y)[1])
for(i in 1:dim(y)[1]){
  lr1[i] = dmvnorm(y[i,], x[i,,]%*%fixef(b0), sig^2*diag(dim(y)[2])+z[i,,]%*%V%*%t(z[i,,]), log=TRUE)
}
sum(lr1)

## MLE/EM
a1 = lmm.mle.ecm(y,x,z, eps=1e-8)
lr = rep(0, dim(y)[1])
for(i in 1:dim(y)[1]){
  lr[i] = dmvnorm(y[i,], x[i,,]%*%a1$rba, a1$s2*diag(dim(y)[2])+z[i,,]%*%a1$V%*%t(z[i,,]), log=TRUE)
}
sum(lr)

b1 = lmer(y2 ~ 0 + x2 + (0+z2|id), REML=FALSE)
b1v = VarCorr(b1)
sig = attr(b1v, "sc")
V = b1v$id
lr1 = rep(0, dim(y)[1])
for(i in 1:dim(y)[1]){
  lr1[i] = dmvnorm(y[i,], x[i,,]%*%fixef(b1), sig^2*diag(dim(y)[2])+z[i,,]%*%V%*%t(z[i,,]), log=TRUE)
}
sum(lr1)

write.table(data.frame(y2,x2,z2,id), file="a.txt", sep="\t", row.names=FALSE)

## SAS
proc import datafile="a.txt" out=aaa dbms=tab replace;
   getnames=yes;
   datarows=2;
run;

proc mixed data=aaa method=REML;
  class id;
  model y = x1 x2 / noint S chisq;
  random z1 z2 / type=un subject=id;
run;

proc mixed data=aaa method=ML;
  class id;
  model y = x1 x2 / noint S chisq;
  random z1 z2 / type=un subject=id;
run;
LMM application example
The TLC example is from the book, "Applied Longitudinal Analysis", by Garrett Fitzmaurice, Nan Laird and James Ware. The data can be downloaded from http://biosun1.harvard.edu/~fitzmaur/ala.
SAS codes for the TLC example
Download the data from http://umn.edu/~baolin/teaching/linmods/tlc1.dat, and put it somewhere on your computer.
data tlc;
   infile "tlc1.dat";
   input id trt$ lead0 lead1 lead4 lead6;
   y=lead0; time=0; output;
   y=lead1; time=1; output;
   y=lead4; time=4; output;
   y=lead6; time=6; output;
   drop lead0 lead1 lead4 lead6;
run;
title Treatment of Lead Exposed Children (TLC) Trial;
/* sorting data */
proc sort data = tlc;
  by trt  descending time;
run;
/* factor model */
proc mixed order=data data=tlc;
  class id trt time;
  model y = trt time trt*time / s chisq;
  repeated time / type=un subject=id r;
run;
/* create linear splines */
data tlc2;
   set tlc;
   if time<=1 then time0 = time-1;
    else time0 = 0;
   if time>1 then time1 = time-1;
    else time1 = 0;
run;
/* splines model */
proc mixed data=tlc2;
  class id time trt;
  model y = time0 time1 trt trt*time0 trt*time1 / s chisq;
  repeated time / type=un subject=id r;
run;

Read textbook Chapters 7 and 14 for approximate/conditional/MLE inference of GLMM.

Some references on GLMM inference
  1. NE Breslow and DG Clayton (1993). Approximate Inference in Generalized Linear Mixed Models. JASA, 88(421), 9-25. http://www.jstor.org/stable/2290687 (PQL, Laplace approximation)
  2. Xihong Lin and Norman Breslow (1996). Bias Correction in Generalized Linear Mixed Models With Multiple Components of Dispersion. JASA, 91(435), 1007-1016. http://www.jstor.org/stable/2291720 (reduce PQL bias)
  3. Robert Schall (1991). Estimation in Generalized Linear Models with Random Effects. Biometrika, 78(4), 719-727. http://www.jstor.org/stable/2336923 (LMM approximation)

Read textbook Chapter 14 for MCMC inference of GLMM; a minimal Gibbs-sampler sketch for the logistic GLMM follows the references below.

Some references on GLMM inference
  1. Charles McCulloch (1997). Maximum Likelihood Algorithms for Generalized Linear Mixed Models. JASA, 92(437), 162-170. http://www.jstor.org/stable/2291460
  2. Charles McCulloch (1994). Maximum Likelihood Variance Components Estimation for Binary Data. JASA, 89(425), 330-335. http://www.jstor.org/stable/2291229
  3. Alan Gelfand and Adrian Smith (1990). Sampling-Based Approaches to Calculating Marginal Densities. JASA, 85(410), 398-409. http://www.jstor.org/stable/2289776
  4. George Casella and Edward George (1992). Explaining the Gibbs Sampler. The American Statistician, 46(3), 167-174. http://www.jstor.org/stable/2685208
  5. AFM Smith and GO Roberts (1993). Bayesian Computation Via the Gibbs Sampler and Related Markov Chain Monte Carlo Methods. JRSSB, 55(1), 3-23. http://www.jstor.org/stable/2346063
  6. James Booth, James Hobert and Wolfgang Jank (2001). A survey of Monte Carlo algorithms for maximizing the likelihood of a two-stage hierarchical model. Statistical Modelling, 1(4), 333-349. Sage link
  7. James Booth and James Hobert (1999). Maximizing Generalized Linear Mixed Model Likelihoods with an Automated Monte Carlo EM Algorithm. JRSSB, 61(1), 265-285. http://www.jstor.org/stable/2680750
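
For illustration, here is a minimal Metropolis-within-Gibbs sketch (not from the textbook or the references above) for the random-intercept logistic GLMM, logit Pr(y_{ij}=1|u_i) = b0 + b1*x_{ij} + u_i with u_i ~ N(0, s2u). It assumes the n x m matrices y and x simulated in the GLMM section further below, flat priors on (b0, b1), an inverse-gamma(0.01, 0.01) prior on s2u, random-walk Metropolis steps for (b0, b1) and each u_i, and a conjugate inverse-gamma draw for s2u.
glmm.gibbs = function(y, x, nIter=2000, sd.prop=0.2){
  n = nrow(y)
  ## conditional (given u) Bernoulli log-likelihood; u (length n) recycles over columns
  loglik = function(b0, b1, u){
    eta = b0 + b1*x + u
    sum(y*eta - log(1+exp(eta)))
  }
  b0 = 0; b1 = 0; u = rep(0, n); s2u = 1
  out = matrix(0, nIter, 3, dimnames=list(NULL, c("b0","b1","s2u")))
  for(it in 1:nIter){
    ## random-walk Metropolis for the fixed effects (flat priors)
    b0n = b0 + rnorm(1, 0, sd.prop)
    if(log(runif(1)) < loglik(b0n,b1,u)-loglik(b0,b1,u)) b0 = b0n
    b1n = b1 + rnorm(1, 0, sd.prop)
    if(log(runif(1)) < loglik(b0,b1n,u)-loglik(b0,b1,u)) b1 = b1n
    ## random-walk Metropolis for each random intercept
    for(i in 1:n){
      ui = u[i] + rnorm(1, 0, sd.prop)
      eta0 = b0 + b1*x[i,] + u[i]
      eta1 = b0 + b1*x[i,] + ui
      la = sum(y[i,]*(eta1-eta0) - log(1+exp(eta1)) + log(1+exp(eta0))) +
        dnorm(ui,0,sqrt(s2u),log=TRUE) - dnorm(u[i],0,sqrt(s2u),log=TRUE)
      if(log(runif(1)) < la) u[i] = ui
    }
    ## conjugate inverse-gamma draw for the random-intercept variance
    s2u = 1/rgamma(1, 0.01+n/2, 0.01+sum(u^2)/2)
    out[it,] = c(b0,b1,s2u)
  }
  return(out)
}
## e.g. posterior means after discarding a burn-in:
##   sam = glmm.gibbs(y, x); colMeans(sam[-(1:500),])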

Read textbook Chapter 14 (Page 320-343) for a review of computation for mixed effects model.

Sample R codes for exploring GLM/GLMM/Marginal GLM with MLE and approximate inference
### GLM: MLE estimation (simulation studies)
## logistic regression model
##   logit Pr(Y=1|X) = 1 + 2*X, X ~ N(0,1)

##  numerical optimization
set.seed(2008)
n = 1e2
x = rnorm(n)
pr = 1/(1+exp(-1-2*x))
y = 1*(runif(n)<pr)
( yfit = glm(y ~ x, family=binomial) )

ylik = function(rpar, x,y){
  pr = 1/(1+exp(-rpar[1]-rpar[2]*x))
  tmp = log(1-pr) + y*log(pr/(1-pr))
  return( -sum(tmp) )
}
( a0 = optim(0:1, ylik, x=x,y=y) )

library(numDeriv)
ymle = function(rpar){
  ylik(rpar, x=x,y=y)
}
( a1 = hessian(ymle, x=a0$par) )
solve(a1)
summary(yfit)$cov.un

##  NR/Fisher scoring: Textbook page 104-106
gv = function(rpar, x,y){
  pr = 1/(1+exp(-rpar[1]-rpar[2]*x))
  g1 = sum(y-pr)
  g2 = sum( x*(y-pr) )
  return( c(g1,g2) )
}
hm = function(rpar, x,y){
  pr = 1/(1+exp(-rpar[1]-rpar[2]*x))
  h11 = -sum(pr*(1-pr))
  h12 = -sum(x*pr*(1-pr))
  h22 = -sum(x^2*pr*(1-pr))
  return( matrix(c(h11,h12,h12,h22),2,2) )
}
rpar0 = 0:1
it = 0; dif = 1
while( (it<1e2)&(dif>1e-6) ){
  rpar1 = rpar0 - solve(hm(rpar0,x=x,y=y), gv(rpar0,x=x,y=y))
  it = it+1
  dif = mean(abs(rpar1-rpar0))
  cat(it, ": diff=", dif, ", par=", rpar1, "\n")
  rpar0 = rpar1
}
(a2 = -hm(rpar0, x=x,y=y))
solve(a2)
summary(yfit)$cov.un


## GLMM:
## logistic regression model
##   logit Pr(y_{ij}=1|u_i,x_{ij}) = 1 + u_i + 2*x_{ij},
##            u_i ~ N(0,1)
## simulate x_{ij} ~ N(0,1)
set.seed(2008)
n = 1e2; m = 5
x = matrix(rnorm(n*m), n,m)
u = rnorm(n)
pr = 1/(1+exp(-1-u-2*x))
y = matrix(runif(n*m), n,m)
y = 1*(y<pr)

xi = as.vector(x)
yi = as.vector(y)
id = rep(1:n,m)
write.table(data.frame(x=xi,y=yi,id=id), file="xy08.txt",
            quote=FALSE, row.names=FALSE, col.names=FALSE)


## approximate inference: marginal likelihood by numerically integrating out the random intercept
mlik = function(rpar, x=x, y=y){
  n = dim(x)[1]
  ai = rep(0,n)
  for(i in 1:n){
    fi = function(mu){
      pr = 1/(1+exp(-rpar[1]-mu-rpar[2]*x[i,]))
      prod(y[i,]*pr + (1-y[i,])*(1-pr))*dnorm(mu, 0, rpar[3])
    }
    ## ai[i] = integrate(fi, -Inf,Inf,stop.on.error=FALSE)$value
    ## ai[i] = integrate(fi, -Inf,Inf)$value
    ## a0 = mean(rpar[1]+rpar[2]*x[i,])
    ## ai[i] = integrate(fi, a0-25,a0+25)$value
    a1 = range(rpar[1]+rpar[2]*x[i,])
    ai[i] = integrate(fi, a1[1]-25,a1[2]+25)$value
    ## x0 = seq(a0-15, a0+15, length=1e3)
    ## ai[i] = sum(fi(x0)*30/1e3)
  }
  return(-sum(log(ai)))
}

mlik(c(1,2,1), x=x, y=y)
( a0 = optim(c(0.8,1.8,0.9), mlik, x=x,y=y) )
mlik(c(0.8785, 1.9203, 0.73573), x=x,y=y)

##
set.seed(2008)
mu0 = rnorm(1e3)
amlik = function(rpar, x=x, y=y){
  n = dim(x)[1]
  ai = rep(0,n)
  for(i in 1:n){
    lik = sapply(mu0, function(mu){
      pr = 1/(1+exp(-rpar[1]-rpar[2]*x[i,]-mu*rpar[3]))
      prod(y[i,]*pr + (1-y[i,])*(1-pr))
    })
    ai[i] = mean(lik)
  }
  return(-sum(log(ai)))
}

amlik(c(1,2,1), x=x, y=y)
( a0 = optim(c(0.8,1.8,0.9), amlik, x=x,y=y) )
amlik(c(0.8785, 1.9203, 0.73573), x=x,y=y)



## linearization: method similar to proc glimmix, with syntax similar to proc nlmixed
##   approximate inference: the binomial likelihood is approximated by (nonlinear) least squares
library(nlme)
nlme(yi ~ 1/(1+exp(-beta.0-xi*beta.xi)), fixed = beta.0 + beta.xi ~ 1,
     random = beta.0 ~ 1, groups = ~ id, start=c(beta.0=0,beta.xi=1))

## PQL/Laplace
library(MASS)
glmmPQL(yi ~ xi, random = ~ 1|id, family = binomial)
## glmmPQL might be sensitive to the random-effects structure.
##  With a large number of random effects, or crossed random effects,
##  the program may have numerical difficulty converging.
## Laplace approximation (nAGQ=1) and adaptive Gaussian quadrature (nAGQ=20)
library(lme4)
glmer(yi ~ xi + (1|id), family=binomial, nAGQ = 1)
glmer(yi ~ xi + (1|id), family=binomial, nAGQ = 20)
## alternative nlmer modeling approach: binomial likelihood approximated with LS
f = function(x, beta.0, beta.x){
  a0 = exp(-beta.0-x*beta.x)
  res = 1/(1+a0)
  gr1 = a0/(1+a0)^2
  gr2 = x*a0/(1+a0)^2

  grv <- array(0, c(length(res), 2), list(NULL, c("beta.0", "beta.x")))
  grv[, "beta.0"] <- gr1
  grv[, "beta.x"] <- gr2
  attr(res, "gradient") <- grv
  res
}
xy = data.frame(x=xi,y=yi, id=id)
nlmer(y ~ f(x, beta.0, beta.x) ~ beta.0|id, data=xy, start=c(beta.0=1, beta.x=2), nAGQ=10)
## can only handle random intercept.
library(glmmML)
xy.agq = glmmML(yi ~ xi, cluster = id, family = binomial, method="ghq", n.points=20)
summary(xy.agq)



## GEE1: assuming constant correlation matrix
## note: gee/ordgee expect the rows of each cluster to be contiguous, so sort by id first
library(geepack)
library(gee)
ii = order(id)
xy.gee1 = gee(yi[ii] ~ xi[ii], id=id[ii], family="binomial", scale.fix=TRUE, corstr="unstructured")
summary(xy.gee1)

## GEE1 augmented: ALR, assuming a constant log OR
xy.alr = ordgee(ordered(yi[ii]) ~ xi[ii], id=id[ii], waves=rep(1:5,n), int.const=TRUE,
                corstr="unstructured")
summary(xy.alr)
Sample SAS codes for GLMM/Marginal GLM inference
data xy;
   infile "xy08.txt";
   input x y id;
run;

Title Marginal inference with GEE assuming constant correlation matrix;
/*
  1. Based on the idea of GEE1 (Zeger and Liang 1986).
  2. It is relatively easy to estimate the correlation matrix based on the residuals,
     with sample moments/average.
  3. There are some implicit constraints on the correlation between discrete variables.
*/
proc genmod descending;
   class id;
   model y = x / dist=bin link=logit type3 wald;
   repeated subject=id / type=un CORRW;
run;
Title Marginal inference with GEE assuming constant OR to model the correlation matrix;
/*
  1. Based on the alternating logistic regression model (ALR, Carey et al. 1993,
     often referred to as GEE1 augmented.)
  2. The intuition is that it is more natural to use OR to model the dependence
     between discrete variables.
*/
proc genmod descending;
   class id;
   model y = x / dist=bin link=logit type3 wald;
   repeated subject=id / logor=fullclust;
run;

Title MLE by approximating nonlinear GLM with linear model;
/* 1. Approximate the (generally nonlinear) GLM with a linear model, reducing it to an LMM,
      which can be solved very efficiently (with iterations).
   2. The approximation depends on the current parameter/random-effects estimates,
      so the approximating LMM changes from iteration to iteration; it is a doubly
      iterative procedure. The whole procedure is often referred to as
      pseudo-likelihood (PL) or restricted PL (RPL), depending on
      whether we adjust for the degrees of freedom in the LMM.
   3. It may fail to converge, or converge to a wrong solution.
   4. Can in principle handle any number of random effects (like proc mixed),
      but a large number of random effects makes convergence harder/slower
      and more likely to end at a wrong solution.
   5. It is important to set the "maxopt" option, which controls the number of outer
      LMM approximations.
*/
proc glimmix maxopt=200;
  class id;
  model y = x / dist=bin S;
  random int / type=un subject=id;
run;
Title Conditional inference with GLMM: MLE by approximating integration with finite sums;
/* 1. Adaptive Gaussian quadrature: similar to the Laplace approximation (which uses only one point),
      but averaged over multiple points to improve the overall approximation accuracy.
   2. Can only handle a very small number of random effects.
   3. It is important to set the "qpoints" option, which controls the number of adaptive
      Gaussian quadrature points.
*/
proc nlmixed qpoints=20;
  parms B_int = 1
        B_x = 2
        sig = 1.0;
  eta = B_int + B_x*x;
  pr = 1/(1 + exp(- b0 - eta));
  model y ~ binomial(1, pr);
  random b0 ~ normal( 0, sig*sig ) subject=id;
run;

Statistical analysis of matched pair data
See textbook Page 202.
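
As a small illustration (hypothetical data, not necessarily the textbook's example), matched-pair binary outcomes can be analyzed with McNemar's test on the discordant pairs, or with conditional logistic regression stratified on the pair, which conditions the pair-specific intercepts out of the likelihood.
## hypothetical matched pairs: 50 pairs, one exposed (expo=1) and one unexposed member,
## with a shared pair-specific effect inducing the within-pair correlation
set.seed(1)
pair = rep(1:50, each=2)
expo = rep(0:1, 50)
b = rnorm(50)
yy = rbinom(100, 1, plogis(-0.5 + expo + b[pair]))
## McNemar's test on the 2x2 table of (exposed outcome, unexposed outcome) over pairs
mcnemar.test(table(yy[expo==1], yy[expo==0]))
## conditional logistic regression, stratifying on pair
library(survival)
summary(clogit(yy ~ expo + strata(pair)))
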
Some GEE references
  1. Kung-Yee Liang, Scott Zeger (1986). Longitudinal Data Analysis Using Generalized Linear Models. Biometrika, 73(1), 13-22. http://www.jstor.org/stable/2336267
  2. Scott Zeger, Kung-Yee Liang (1986). Longitudinal Data Analysis for Discrete and Continuous Outcomes, Biometrics, 42(1), 121-130. http://www.jstor.org/stable/2531248
  3. Scott Zeger; Kung-Yee Liang; Paul Albert (1988). Models for Longitudinal Data: A Generalized Estimating Equation Approach. Biometrics, 44(4), 1049-1060. http://www.jstor.org/stable/2531734
  4. Paul Burton, Lyle Gurrin, Peter Sly (1998). Extending the simple linear regression model to account for correlated responses: An introduction to generalized estimating equations and multi-level mixed modelling. Statistics in Medicine, 17(11), 1261-1291. Wiley link

GEE1 improved

Some references
  1. Kung-Yee Liang, Scott Zeger and Bahjat Qaqish (1992). Multivariate regression analyses for categorical data. JRSSB, 54(1), 3-40. http://www.jstor.org/stable/2345947
  2. Vincent Carey, Scott Zeger and Peter Diggle (1993). Modelling multivariate binary data with alternating logistic regressions. Biometrika, 80(3), 517-526. doi:10.1093/biomet/80.3.517
  3. Lue Ping Zhao; Ross Prentice (1990). Correlated Binary Regression Using a Quadratic Exponential Model. Biometrika, 77(3), 642-648. http://www.jstor.org/stable/2337004
  4. Ross Prentice; Lue Ping Zhao (1991). Estimating Equations for Parameters in Means and Covariances of Multivariate Discrete and Continuous Responses. Biometrics, 47(3), 825-839. http://www.jstor.org/stable/2532642
  5. Lue Ping Zhao; Ross Prentice; Steven Self (1992). Multivariate Mean Parameter Estimation by Using a Partly Exponential Model. JRSSB, 54(3), 805-811. http://www.jstor.org/stable/2345860