LINEAR REGRESSION WITH CENSORED OBSERVATIONS

Suppose you have a scale on which you weigh people in a clinical study.  The
scale goes up to 200 kg.  Some people in your study weigh more than 200 kg.
When they stand on the scale, the needle goes up to 200 and stops.  You know
that they weigh 200 kg or more, but you don't know their exact weight.  That
is, observations above 200 kg are censored.

You are interested in other variables that may predict weight: for example,
gender, family income, or daily caloric intake.  The model is

     W = b0 + b1*gender + b2*familyincome + b3*calories + error,

where you assume error ~ N(0, sigma^2).

The likelihood contribution of an uncensored observation is the normal
density:

     (1/(sigma*sqrt(2*pi))) * exp(-(W - E(W|X))^2 / (2*sigma^2)).

The likelihood contribution of a censored observation is the probability of
exceeding the censoring level:

     1 - NORMALCDF((Censorlevel - E(W|X))/sigma).

In the absence of censoring, the MLE is the same as the least-squares
estimate.  With censoring, however, the likelihood terms for the censored
observations must be included.  This can be accomplished in SAS as in the
following program (a simulated dataset):

==============================================================================
options linesize = 80 ;
footnote "~john-c/5421/ceiling.sas &sysdate &systime" ;

FILENAME GRAPH 'gsas.grf' ;
LIBNAME loc '' ;
OPTIONS LINESIZE = 80 MPRINT ;
GOPTIONS RESET = GLOBAL ROTATE = PORTRAIT FTEXT = SWISSB DEVICE = PSCOLOR
         GACCESS = SASGASTD GSFNAME = GRAPH GSFMODE = REPLACE GUNIT = PCT
         BORDER CBACK = WHITE HTITLE = 2 HTEXT = 1 ;

*===================================================================== ;

data ceiling ;
  n = 200 ;
  sigma = 2 ;
  xlow = 1 ;
  xhigh = 5 ;
  ceiling = 5 ;
  b0 = 1 ;
  b1 = 1 ;
  seed = 20131202 ;
  do i = 1 to n ;
    x = xlow + (xhigh - xlow) * ranuni(seed) ;
    y = b0 + b1 * x + sigma * rannor(seed) ;
    w = y ;
    if y > ceiling then w = ceiling ;
    output ;
  end ;
run ;

symbol1 v = 'o' w = 1 h = 2 c = black ;
* symbol1 v = 'o' w = 1 h = 2 c = red ;

proc gplot data = ceiling ;
  plot w * x ;
  title1 h = 2 "W vs. X ..."
  ;
run ;

proc reg data = ceiling ;
  model y = x ;
  title1 'PROC REG for y vs. x ...' ;
run ;

proc reg data = ceiling ;
  model w = x ;
  title1 'PROC REG for w vs. x ...' ;
run ;

proc nlp data = ceiling cov = 1 pcov phes ;
  max loglike ;
  parms b0 = .5, b1 = .5, sigma = .5 ;
  ey = b0 + b1 * x ;
  loglike = -log(sigma) - (y - ey)**2/(2 * sigma * sigma) ;
  title 'PROC NLP on untruncated data ... ' ;
run ;

proc nlp data = ceiling cov = 1 pcov phes ;
  max loglike ;
  parms b0 = .5, b1 = .5, sigma = .5 ;
  ew = b0 + b1 * x ;
  loglike = -log(sigma) - (w - ew)**2/(2 * sigma * sigma) ;
  title 'PROC NLP on truncated data ... Wrong LogLikelihood ' ;
run ;

proc nlp data = ceiling cov = 1 pcov phes ;
  max loglike ;
  parms b0 = .5, b1 = .5, sigma = .5 ;
  ey = b0 + b1 * x ;
  loglike = -log(sigma) - (y - ey)**2/(2 * sigma * sigma) ;
  ceiling = 5 ;
  if y > ceiling then do ;
    zceiling = (ceiling - (b0 + b1*x))/sigma ;
    like = 1 - probnorm(zceiling) ;
    loglike = log(like) ;
  end ;
  title 'PROC NLP taking truncation into account ... Correct LogLikelihood' ;
run ;
==============================================================================

PROC REG for y vs. x ...                       19:32 Monday, December 2, 2013

The REG Procedure
Model: MODEL1
Dependent Variable: y

Number of Observations Read    200
Number of Observations Used    200

                       Analysis of Variance
                            Sum of        Mean
Source            DF       Squares      Square    F Value    Pr > F
Model              1     228.46770   228.46770      59.17    <.0001
Error            198     764.56068     3.86142
Corrected Total  199     993.02837

Root MSE           1.96505    R-Square    0.2301
Dependent Mean     4.28538    Adj R-Sq    0.2262
Coeff Var         45.85468

                      Parameter Estimates
                    Parameter    Standard
Variable     DF      Estimate       Error    t Value    Pr > |t|
Intercept     1       1.30770     0.41130       3.18      0.0017
x             1       0.94647     0.12305       7.69      <.0001

PROC REG for w vs. x ...
The REG Procedure
Model: MODEL1
Dependent Variable: w

Number of Observations Read    200
Number of Observations Used    200

                       Analysis of Variance
                            Sum of        Mean
Source            DF       Squares      Square    F Value    Pr > F
Model              1      97.67918    97.67918      51.53    <.0001
Error            198     375.29411     1.89542
Corrected Total  199     472.97329

Root MSE           1.37674    R-Square    0.2065
Dependent Mean     3.70643    Adj R-Sq    0.2025
Coeff Var         37.14478

                      Parameter Estimates
                    Parameter    Standard
Variable     DF      Estimate       Error    t Value    Pr > |t|
Intercept     1       1.75942     0.28816       6.11      <.0001
x             1       0.61886     0.08621       7.18      <.0001

PROC NLP on untruncated data ...

PROC NLP: Nonlinear Maximization
Gradient is computed using analytic formulas.
Hessian is computed using analytic formulas.

Optimization Start
Parameter Estimates
                                Gradient Objective
N    Parameter    Estimate              Function
1    b0           0.500000           1769.871302
2    b1           0.500000           6023.646987
3    sigma        0.500000          13954

Value of Objective Function = -3449.947476

Hessian Matrix
               b0              b1            sigma
b0           -800    -2516.871453    -7079.485206
b1  -2516.871453    -8938.468025    -24094.58795
sigma -7079.485206   -24094.58795    -85325.8459

Determinant = -15851960488
Matrix has Only Negative Eigenvalues
Newton-Raphson Ridge Optimization
Without Parameter Scaling

Parameter Estimates             3
Functions (Observations)      200

Optimization Start
Active Constraints              0    Objective Function      -3449.947476
Max Abs Gradient Element    13954.30765

                         Objective     Obj Fun    Max Abs Grad            Actual/Pred
Iter  Restarts  Calls     Function      Change         Element     Ridge       Change
   1      0       8         -3328        122.0        18129.5      32.00       0.0556
   2      0       9         -1710       1618.1         7489.2      64.00        1.278
   3      0      10    -950.10461        759.7         3097.5      16.00        1.301
   4      0      11    -560.85443        389.3         1267.2      4.000        1.300
   5      0      12    -366.94593        193.9          505.9      1.000        1.293
   6      0      13    -278.20655      88.7394          192.1      0.250        1.277
   7      0      14    -244.21419      33.9924        65.4363     0.0625        1.245
   8      0      15    -235.21503       8.9992        17.2580          0        1.186
   9      0      16    -234.12417       1.0909         2.3651          0        1.094
  10      0      17    -234.09842       0.0257         0.0637          0        1.018
  11      0      18    -234.09840     0.000019       0.000049          0        1.001
  12      0      19    -234.09840     1.19E-11       2.99E-11          0        1.016
(active constraints were 0 at every iteration)

Optimization Results
Iterations                     12    Function Calls                     20
Hessian Calls                  13    Active Constraints                  0
Objective Function   -234.0984025    Max Abs Gradient Element 2.988831E-11
Ridge                           0    Actual Over Pred Change  1.0161993883

GCONV convergence criterion satisfied.
Optimization Results
Parameter Estimates
                                   Approx                     Approx
N    Parameter    Estimate        Std Err      t Value      Pr > |t|
1    b0           1.307703       0.431442     3.031004      0.002765
2    b1           0.946470       0.127524     7.421914   3.38457E-12
3    sigma        1.955199       0.093066    21.008829  3.455779E-52

Gradient of Objective Function
1    b0       -3.2796E-13
2    b1      -1.13154E-12
3    sigma    2.988831E-11

Value of Objective Function = -234.0984025

Hessian Matrix
                  b0              b1           sigma
b0       -52.3176265    -164.5959258    3.320677E-13
b1      -164.5959258    -584.5492895    1.151523E-12
sigma   3.320677E-13    1.151523E-12     -104.635253

Determinant = 365220.20602
Matrix has Only Negative Eigenvalues

Covariance Matrix 1: M = (NOBS/d) inv(G) JJ(f) inv(G)
                  b0              b1           sigma
b0      0.1861423842    -0.052080475    -0.000631801
b1      -0.052080475    0.0162623102    0.0003672379
sigma   -0.000631801    0.0003672379    0.0086612038

Factor sigm = 1.0152284264
Determinant = 2.7185069E-6
Matrix has 3 Positive Eigenvalue(s)

Approximate Correlation Matrix of Parameter Estimates
                  b0              b1           sigma
b0                 1    -0.946588407    -0.015735068
b1      -0.946588407               1    0.0309433408
sigma   -0.015735068    0.0309433408               1

Determinant = 0.1036870844
Matrix has 3 Positive Eigenvalue(s)

PROC NLP on truncated data ... Wrong LogLikelihood

PROC NLP: Nonlinear Maximization
Gradient is computed using analytic formulas.
Hessian is computed using analytic formulas.
Optimization Start
Parameter Estimates
                                Gradient Objective
N    Parameter    Estimate              Function
1    b0           0.500000           1306.706122
2    b1           0.500000           4232.275767
3    sigma        0.500000           6899.882542

Value of Objective Function = -1686.341199

Hessian Matrix
                  b0              b1           sigma
b0              -800    -2516.871453    -5226.824486
b1      -2516.871453    -8938.468025    -16929.10307
sigma   -5226.824486    -16929.10307    -42999.29525

Determinant = -7034293471
Matrix has Only Negative Eigenvalues

Newton-Raphson Ridge Optimization
Without Parameter Scaling

Parameter Estimates             3
Functions (Observations)      200

Optimization Start
Active Constraints              0    Objective Function      -1686.341199
Max Abs Gradient Element    6899.8825419

                         Objective     Obj Fun    Max Abs Grad            Actual/Pred
Iter  Restarts  Calls     Function      Change         Element     Ridge       Change
   1      0       9         -1599      87.3081         9087.7      128.0       0.0788
   2      0      10    -802.13363        796.9         3727.7      256.0        1.274
   3      0      11    -445.06723        357.1         1518.2      64.00        1.296
   4      0      12    -273.52248        171.5          602.2      16.00        1.290
   5      0      13    -197.64914      75.8733          225.5      4.000        1.272
   6      0      14    -170.12571      27.5234        74.4551      1.000        1.237
   7      0      15    -163.58283       6.5429        18.1338      0.250        1.171
   8      0      16    -162.94866       0.6342         2.0209     0.0625        1.076
   9      0      17    -162.93927      0.00939         0.0335          0        1.011
  10      0      18    -162.93927     2.641E-6       9.625E-6          0        1.000
(active constraints were 0 at every iteration)

Optimization Results
Iterations                     10    Function Calls                     19
Hessian Calls                  11    Active Constraints                  0
Objective Function    -162.939265    Max Abs Gradient Element 9.6248153E-6
Ridge                           0    Actual Over Pred Change  1.0001913257

ABSGCONV convergence criterion satisfied.
Optimization Results
Parameter Estimates
                                   Approx                     Approx
N    Parameter    Estimate        Std Err      t Value      Pr > |t|
1    b0           1.759424       0.343587     5.120752   0.000000722
2    b1           0.618865       0.090766     6.818241   1.100852E-10
3    sigma        1.369843       0.083547    16.396054   1.138705E-38

Gradient of Objective Function
1    b0      -0.000000160
2    b1      -0.000000723
3    sigma    0.000009625

Value of Objective Function = -162.939265

Hessian Matrix
                  b0              b1           sigma
b0      -106.5830808    -335.3198918    2.3396232E-7
b1      -335.3198918    -1190.861824    1.0556667E-6
sigma   2.3396232E-7    1.0556667E-6    -213.1661826

Determinant = 3087987.6083
Matrix has Only Negative Eigenvalues

Covariance Matrix 1: M = (NOBS/d) inv(G) JJ(f) inv(G)
                  b0              b1           sigma
b0      0.1180520228    -0.030207371    -0.010241355
b1      -0.030207371    0.0082384712     0.001906023
sigma   -0.010241355     0.001906023    0.0069801227

Factor sigm = 1.0152284264
Determinant = 3.0572862E-7
Matrix has 3 Positive Eigenvalue(s)

Approximate Correlation Matrix of Parameter Estimates
                  b0              b1           sigma
b0                 1     -0.96861881    -0.356770704
b1       -0.96861881               1    0.2513467613
sigma   -0.356770704    0.2513467613               1

Determinant = 0.0450352914
Matrix has 3 Positive Eigenvalue(s)

PROC NLP taking truncation into account ... Correct LogLikelihood

PROC NLP: Nonlinear Maximization
Gradient is computed using analytic formulas.
Hessian is computed using analytic formulas.
Optimization Start
Parameter Estimates
                                Gradient Objective
N    Parameter    Estimate              Function
1    b0           0.500000           1331.599252
2    b1           0.500000           4327.932234
3    sigma        0.500000           7170.264249

Value of Objective Function = -1921.399196

WARNING: There is a large relative difference between the analytic gradient
of the objective function and the gradient computed by finite difference
approximation.  This may indicate that the specification of the gradient is
wrong, or that the derivatives are too small for accurate finite difference
computation.

Hessian Matrix
                  b0              b1           sigma
b0      -782.1585048    -2472.505163     -5160.40358
b1      -2472.505163    -8786.464221     -16873.1232
sigma    -5160.40358     -16873.1232    -42998.25038

Determinant = -6549853041
Matrix has Only Negative Eigenvalues

Newton-Raphson Ridge Optimization
Without Parameter Scaling

Parameter Estimates             3
Functions (Observations)      200

Optimization Start
Active Constraints              0    Objective Function      -1921.399196
Max Abs Gradient Element    7170.2642486

                         Objective     Obj Fun    Max Abs Grad            Actual/Pred
Iter  Restarts  Calls     Function      Change         Element     Ridge       Change
   1      0       9         -1757        164.6         9492.9      128.0        0.140
   2      0      10    -941.56920        815.2         3894.4      64.00        1.268
   3      0      11    -562.80185        378.8         1609.9      16.00        1.304
   4      0      12    -365.79772        197.0          657.5      4.000        1.301
   5      0      13    -267.28264      98.5151          261.4      1.000        1.292
   6      0      14    -222.57190      44.7107        98.1765      0.250        1.274
   7      0      15    -205.97308      16.5988        32.5790     0.0625        1.239
   8      0      16    -201.91865       4.0544         8.0278          0        1.173
   9      0      17    -201.50942       0.4092         0.9223          0        1.078
  10      0      18    -201.50289      0.00653         0.0165          0        1.012
  11      0      19    -201.50288     2.146E-6       5.522E-6          0        1.000
(active constraints were 0 at every iteration)

Optimization Results
Iterations                     11    Function Calls                     20
Hessian Calls                  12    Active Constraints                  0
Objective Function   -201.5028832    Max Abs Gradient Element 5.5218154E-6
Ridge                           0    Actual Over Pred Change  1.0002238547

ABSGCONV convergence criterion satisfied.

Optimization Results
Parameter Estimates
                                   Approx                     Approx
N    Parameter    Estimate        Std Err      t Value      Pr > |t|
1    b0           1.430046       0.433063     3.302163      0.001139
2    b1           0.884780       0.132169     6.694286   2.203781E-10
3    sigma        1.897667       0.128332    14.787118   8.933459E-34

Gradient of Objective Function
1    b0      -9.613822E-9
2    b1      -9.236474E-8
3    sigma    0.000005522

Value of Objective Function = -201.5028832

Hessian Matrix
                  b0              b1           sigma
b0       -48.9193994    -149.0675813    15.331672465
b1      -149.0675813    -515.7835227    56.060243687
sigma   15.331672465    56.060243687    -68.93940002

Determinant = 188818.93101
Matrix has Only Negative Eigenvalues

Covariance Matrix 1: M = (NOBS/d) inv(G) JJ(f) inv(G)
                  b0              b1           sigma
b0      0.1875439127    -0.053653485    -0.007291765
b1      -0.053653485    0.0174687632    0.0042921861
sigma   -0.007291765    0.0042921861    0.0164692192

Factor sigm = 1.0152284264
Determinant = 5.5204631E-6
Matrix has 3 Positive Eigenvalue(s)

Approximate Correlation Matrix of Parameter Estimates
                  b0              b1           sigma
b0                 1    -0.937379323    -0.131203276
b1      -0.937379323               1    0.2530526904
sigma   -0.131203276    0.2530526904               1

Determinant = 0.1023145443
Matrix has 3 Positive Eigenvalue(s)
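The censored log-likelihood that the correct PROC NLP step maximizes can be
sketched in Python (a minimal illustration, not part of the SAS program; the
helper names loglik_obs and normal_cdf are invented here).  Note that the SAS
code drops the additive constant -log(sqrt(2*pi)) from the uncensored terms,
which does not change the maximizer; the sketch below keeps the full density.

```python
import math

def normal_cdf(z):
    # Phi(z) computed from the error function, so no external library is needed
    return 0.5 * (1.0 + math.erf(z / math.sqrt(2.0)))

def loglik_obs(w, mu, sigma, ceiling):
    """Log-likelihood contribution of one observation:
    normal log-density if w is below the ceiling,
    log survival probability if w is censored at the ceiling."""
    if w < ceiling:
        return (-math.log(sigma * math.sqrt(2.0 * math.pi))
                - (w - mu) ** 2 / (2.0 * sigma ** 2))
    # censored: P(W >= ceiling) = 1 - Phi((ceiling - mu)/sigma)
    return math.log(1.0 - normal_cdf((ceiling - mu) / sigma))

# A censored observation whose conditional mean sits exactly at the ceiling
# exceeds the ceiling with probability 1/2, so its contribution is log(0.5).
print(loglik_obs(5.0, 5.0, 2.0, ceiling=5.0))  # log(0.5), about -0.6931
```

Summing loglik_obs over all 200 observations with mu = b0 + b1*x reproduces
the objective function that PROC NLP maximizes in the final step; the first
branch matches the "untruncated" loglike statement (up to the constant) and
the second branch matches the probnorm correction for censored cases.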