/*          Program for finding the distortion function in 2-D
                    using the calculus of variations
                           by Cavan Reilly

   This function uses the calculus of variations to find the minimum of
         I(p_1(x_1,x_2),p_2(x_1,x_2))=\int \int [y(x_1,x_2)-
              \hat y(p_1(x_1,x_2),p_2(x_1,x_2))]^2 +
              lambda_1(dp_1/dx_1 - 1)^2 + lambda_2(dp_2/dx_2 - 1)^2 dx_1 dx_2
with respect to p_1 and p_2.  Using the methods in
Renardy and Rogers (1993) section 9.2 we obtain a system of p.d.e.s.

We numerically solve this system using an implementation of Brandts's
full approximation storage algorithm appropriate for ``weakly coupled''
systems (see Hackbusch (1985) section 11.1 and Press et al. (1992)
section 19.6). Since the order of the equations is the same, we can
use the same restriction and prolongation operators for both equations.
   As in the 1-D version, we obtain a smooth observation and prediction
surface via interpolation between sites (here bicubic splines or bilinear
interpolation). This program assumes observations and predictions are on
the unit square, [0,1]^2. The global variable nGlob controls the dimension
of the grid on which we have observations/predictions, so the user must
alter this. The global variable nfGrid controls the dimension of the finest
grid in the multi-grid implementation, so it must be of the form n=2^j+1
(here 33 or 65 is usually adequate).
   We assume the observations are in a file called smpld.dat while the
predictions are in a file called pred.dat. The program puts p_1
evaluated over the n by n grid in the file pdeOut1.dat, p_2 in the file
pdeOut2.dat, and \hat y(p_1(i,j),p_2(i,j)) in pdeOut3.dat. We can use
the plotting function "morph2Dplt" in S to create graphs of these
functions. This function is as follows. Since this function does 3
graphs you first want to do something like > par(mfrow=c(2,3)).
> morph2Dplt
function(n, y, yhat)
{
	dd <- dim(y)[1]
	b1 <- matrix(scan("pdeOut1.dat"), byrow = T, ncol = n)
	b2 <- matrix(scan("pdeOut2.dat"), byrow = T, ncol = n)
	b3 <- matrix(scan("pdeOut3.dat"), byrow = T, ncol = n)
	tt1 <- seq(0, 1, len = n)
	tt2 <- seq(0, 1, len = dd)
	contour(tt1, tt1, b1)
	contour(tt1, tt1, b2)
	contour(tt2, tt2, y)
	contour(tt2, tt2, yhat, add = T, col = 3)
	contour(tt1, tt1, b3, add = T, col = 2)
}
  When the program starts, the user is asked for lambda_1 and lambda_2.
If either is choosen too small, the program may fail to find a solution
since the differential operator we are working with undergoes a singular
perturbation at lambda_k=0 (since we know any admissable solution has
h(-dp_k/dx_k)=0 for k=1,2). Unfortunately this failure is not always
graceful (i.e. the error handler does not handle the error), but usually
the user knows because the root finding algorithm on the coursest grid
fails (e.g. the Jacobian in Newton's algorithm is singular, so the LU
decomposition does not work).

References
Hackbusch, W. (1985). {\sl Multi-Grid Methods and Applications}.
  Springer-Verlag, New York.
Press, W., Teukolsky, S., Vetterling, W., and Flannery, B. (1992).
  {\sl Numerical Recipes in C}. Cambridge University Press. Cambridge.
Renardy, M. and Rogers, R. (1993). {\sl An Introduction to Partial
  Differential Equations}. Springer-Verlag. New York.
*/

#include <math.h>
#include <iostream.h>
#include <fstream.h>

#define FREE_ARG char*

/* useful macros */
static double dsqrarg;
#define DSQR(a) ((dsqrarg=(a)) == 0.0 ? 0.0 : dsqrarg*dsqrarg)

static double dmaxarg1,dmaxarg2;
#define DMAX(a,b) (dmaxarg1=(a),dmaxarg2=(b),(dmaxarg1) > (dmaxarg2) ?\
   (dmaxarg1) : (dmaxarg2))

/* function declarations-note that many functions are declared within
   these functions, but these are the primary ones */
int solnChk(double **u1,double **u2,int n);
void invrtFcn(double **u1,double **u2,double *x);
void mgfas(double **u1,double **u2,int n,int maxcyc);
double quad2d(double (*func)(double,double), double x1,double x2);
double qtrap(double (*func)(double),double ,double b);
double intFcn1(double x1,double x2);
double intFcn2(double x1,double x2);
double intFcn3(double x1,double x2);
void newt(double *x,int n,int *check, void (*vecfunc)(int, double *,
   double *,double **,double **),double **rhs1,double **rhs2);
void newtOrig(double *x,int n,int *check, void (*vecfunc)(int, double *,
   double *));
void ludcmp(double **a,int n,int *indx, double *d);
void lubksb(double **a,int n,int *indx,double *b);
void bilin(double *x1a,double *x2a,double **ya,int m,int n,
   double x1,double x2,double *y);
void spline(double *x, double *y, int n, double yp1, double ypn,
   double *y2);
void splint(double *xa, double *ya, double *y2a, int n, double x,
   double *y);
void splie2(double *x1a,double *x2a,double **ya,int m,int n,double **y2a);
void splin2(double *x1a,double *x2a,double **ya,double **y2a,int m,int n,
   double x1,double x2,double *y);
double *dvector(long nl,long nh);
void free_dvector(double *v,long nl,long nh);
int *ivector(long nl,long nh);
void free_ivector(int *v,long nl, long nh);
double **dmatrix(long nrl,long nrh,long ncl,long nch);
void free_dmatrix(double **m, long nrl,long nrh,long ncl,long nch);
void nrerror(char error_text[]);

/* global variables */
/* nfGrid gives the number of rows/col.s in finest grid, so it should be of
      form  n=2^j+1 so that coursest grid is 3 by 3 */
/* nGlob gives the dimension of the grid on which we have obs.-so
   we have nGlob^2 observations */
/* INTERP=1 for bilinear interpolation and INTERP=0 for bicubic splines,
INTERPINT is same deal but for the integration routines-seperate control
becuase the bicubic splines can really slow down the integration since
we use the trapezoidaial method (since we assume no smoothness for
observations and predictions */
/* epsGlb controls lower bound for derivative of deformation and
alfaGlb controls the upper bound for the derivative */
/* max cycle controls how many cycles of smoothing */
int NPRE=1,NPOST=2,maxcyc=2,nGlob=41,nfGrid=65,INTERP=1,INTERPINT=1,INVERT=0;
double lambda1=1.0e5,lambda2=1.0e5,tau=1.0e6,epsGlb=1.0e-1,alfaGlb=10.0,*xGlob,
   *x1loc,*x2loc,*ux1,*ux2,**yGlob,**yHat,**yGlobS,**yHatS,**u1Glob,**u2Glob;

int main()
{
   /* this file holds observed values */
//   ifstream filer1("c:\\Spluswin\\home\\smpld.dat");
   ifstream filer1("c:\\Spluswin\\home\\pred.dat");
   if(!filer1){
      cerr << "Failure to open filer1\n";
      getch();
      exit(EXIT_FAILURE);
   }
/* this file holds the predictions-one for each observation */
//   ifstream filer2("c:\\Spluswin\\home\\pred.dat");
   ifstream filer2("c:\\Spluswin\\home\\smpld.dat");
   if(!filer2){
      cerr << "Failure to open filer2\n";
      getch();
      exit(EXIT_FAILURE);
   }
/* output file for first component of distortion function */
   ofstream filew1("c:\\Spluswin\\home\\pdeOut1.dat");
   if(!filew1){
      cerr << "Failure to open filew1\n";
      getch();
      exit(EXIT_FAILURE);
   }
/* output file for second component of distortion function */
   ofstream filew2("c:\\Spluswin\\home\\pdeOut2.dat");
   if(!filew2){
      cerr << "Failure to open filew2\n";
      getch();
      exit(EXIT_FAILURE);
   }
/* output file for distorted predictions */
   ofstream filew3("c:\\Spluswin\\home\\pdeOut3.dat");
   if(!filew3){
      cerr << "Failure to open filew3\n";
      getch();
      exit(EXIT_FAILURE);
   }
/* output file for observations defromed with inverse of optimal deformation*/
   ofstream filew4("c:\\Spluswin\\home\\pdeOut4.dat");
   if(!filew4){
      cerr << "Failure to open filew4\n";
      getch();
      exit(EXIT_FAILURE);
   }
   int i,j,k1,k2,ALWD;
   double temp,preIntMise,pstIntMise;
   cout << "give value for lambda1 (trade-off parameter in x-direction):";
   cin >> lambda1;
//   cout << "give value for lambda2 (trade-off parameter in y-direction):";
//   cin >> lambda2;
   lambda2=lambda1;
   x1loc=dvector(1,nGlob);
   x2loc=dvector(1,nGlob);
   ux1=dvector(1,nfGrid);
   ux2=dvector(1,nfGrid);
   u1Glob=dmatrix(1,nfGrid,1,nfGrid);
   u2Glob=dmatrix(1,nfGrid,1,nfGrid);
   yGlob=dmatrix(1,nGlob,1,nGlob);
   yHat=dmatrix(1,nGlob,1,nGlob);
   yGlobS=dmatrix(1,nGlob,1,nGlob);
   yHatS=dmatrix(1,nGlob,1,nGlob);
   /*this gives data values*/
   filer1.seekg(0);
   for(i=1;i<=nGlob*nGlob;i++){
      k1=((i-1)/nGlob);
      k2=i-nGlob*k1;
      filer1 >> yGlob[k1+1][k2];
   }
/* this gets predictions */
   filer2.seekg(0);
   for(i=1;i<=nGlob*nGlob;i++){
      k1=((i-1)/nGlob);
      k2=i-nGlob*k1;
      filer2 >> yHat[k1+1][k2];
   }
   for(i=1;i<=nGlob;i++) x1loc[i]=x2loc[i]=(i-1.0)/(nGlob-1.0);
   /* we have a homogeneous problem on [0,1]^2 */
   splie2(x1loc,x2loc,yGlob,nGlob,nGlob,yGlobS);
   splie2(x1loc,x2loc,yHat,nGlob,nGlob,yHatS);
   for(i=1;i<=nfGrid;i++){
      for(j=1;j<=nfGrid;j++){
         ux1[i]=ux2[i]=(i-1.0)/(nfGrid-1.0);
         u1Glob[i][j]=0.0;
         u2Glob[i][j]=0.0;
      }
   }
/* solve the system of p.d.e.s using FAS algorithm */
   mgfas(u1Glob,u2Glob,nfGrid,maxcyc);
   ALWD=solnChk(u1Glob,u2Glob,nfGrid);
   if(ALWD==1) cout << "nonallowable solution: deformation gradient not p.d." << endl;
   else cout << "solution is in allowable set of deformations" << endl;
   for(i=1;i<=nfGrid;i++){
      for(j=1;j<=nfGrid;j++){
         filew1 << u1Glob[i][j] << " ";
         filew2 << u2Glob[i][j] << " ";
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1Glob[i][j],
            u2Glob[i][j],&temp);
         filew3 << temp << " ";
      }
   }
   if(INVERT==1){
      xGlob=dvector(1,2);
      for(i=1;i<=nGlob;i++){
         for(j=1;j<=nGlob;j++){
            xGlob[1]=x1loc[i];   xGlob[2]=x2loc[j];
            invrtFcn(u1Glob,u2Glob,xGlob);
            bilin(x1loc,x2loc,yGlob,nGlob,nGlob,xGlob[1],
               xGlob[2],&temp);
            filew4 << temp << " ";
         }
      }
      free_dvector(xGlob,1,2);
   }
   cout << "starting the integration" << endl;
   preIntMise=quad2d(intFcn1,0.0,1.0);
   pstIntMise=quad2d(intFcn2,0.0,1.0);
   /* since we assume data one [0,1]^2 no matter how many observations the
   integrated squared errror s already standardized */
   cout << "RMISE before transformation=" << sqrt(preIntMise) << endl;
   cout << "RMISE after transformation=" << sqrt(pstIntMise) << endl;
   cout << "standardized deformation penalty=" << sqrt(quad2d(intFcn3,0.0,1.0)) << endl;
   cout << "press any key to end";
   getch();
   free_dvector(x1loc,1,nGlob);
   free_dvector(x2loc,1,nGlob);
   free_dvector(ux1,1,nfGrid);
   free_dvector(ux2,1,nfGrid);
   free_dmatrix(u1Glob,1,nfGrid,1,nfGrid);
   free_dmatrix(u2Glob,1,nfGrid,1,nfGrid);
   free_dmatrix(yGlob,1,nGlob,1,nGlob);
   free_dmatrix(yHat,1,nGlob,1,nGlob);
   free_dmatrix(yGlobS,1,nGlob,1,nGlob);
   free_dmatrix(yHatS,1,nGlob,1,nGlob);
   return 0;
}

void invrtFcn(double **u1,double **u2,double *x)
{
   /* invert a function which maps R^2 into R^2 and this map is given by look up
   in a table and interpolation, u1 and u2 hold matrices with the value of
   the components of the 2-D map and x is value for which we want f^{-1} */
   void func1(int n,double *x,double *f);
   int chck;
   double *initVal;
   initVal=dvector(1,2);
   initVal[1]=x[1];    initVal[2]=x[2];
   newtOrig(initVal,2,&chck,func1);
   if(chck) nrerror("unable to find root in routine invrtFcn");
   x[1]=initVal[1];      x[2]=initVal[2];
   free_dvector(initVal,1,2);
}

void func1(int n,double *x,double *f)
{
   double a1,a2;
   bilin(ux1,ux2,u1Glob,nfGrid,nfGrid,x[1],x[2],&a1);
   bilin(ux1,ux2,u2Glob,nfGrid,nfGrid,x[1],x[2],&a2);
   f[1]=a1-xGlob[1];
   f[2]=a2-xGlob[2];
}

int solnChk(double **u1,double **u2,int n)
{
   int i,j,res=0;
   double del=1.0/(n-1.0);
   for(i=2;i<=n-1;i++){
      for(j=2;j<=n-1;j++){
         if(((u1[i+1][j]-u1[i-1][j])*(u2[i][j+1]-u2[i][j-1])/
            DSQR(2.0*del)-(u1[i][j+1]-u1[i][j-1])*(u2[i+1][j]-
            u2[i-1][j])/DSQR(2.0*del))<0.0)
            res=1;
      }
   }
   return res;
}

/* routines for FAS algorithm */
void mgfas(double **u1,double **u2,int n,int maxcyc)
{
   const int NGMAX=15;
   double ALPHA=0.33;
   double anorm2(double **a,int n);
   void copy(double **aout,double **ain,int n);
   void interp(double **uf,double **uc,int nf);
   void lop1(double **out,double **u1,double **u2,int n);
   void lop2(double **out,double **u1,double **u2,int n);
   void matadd(double **a,double **b,double **c,int n);
   void matsub(double **a,double **b,double **c,int n);
   void relax2C1(double **u1,double **u2,double **rhs,int n);
   void relax2C2(double **u1,double **u2,double **rhs,int n);
   void rstrct(double **uc,double **uf,int nc);
   void slvsm2(double **u1,double **u2,double **rhs1,double **rhs2);
   unsigned int j,jcycle,jj,jm1,jpost,jpre,nf,ng=0,ngrid,nn;
   double **irho1[NGMAX+1],**irhs1[NGMAX+1],**itau1[NGMAX+1],
      **itemp1[NGMAX+1],**iu1[NGMAX+1],**irho2[NGMAX+1],
      **irhs2[NGMAX+1],**itau2[NGMAX+1],**itemp2[NGMAX+1],**iu2[NGMAX+1];
   double res1,res2,trerr1,trerr2;
   nn=n;
   while(nn >>= 1) ng++;
   if(n != 1+(1L << ng)) nrerror("n-1 must bw a power of 2 in mgfas.");
   if(ng > NGMAX) nrerror("increase NGMAX in mglin.");
   nn=n/2+1;
   ngrid=ng-1;
   irho1[ngrid]=dmatrix(1,nn,1,nn);
   rstrct(irho1[ngrid],u1,nn);
   irho2[ngrid]=dmatrix(1,nn,1,nn);
   rstrct(irho2[ngrid],u2,nn);
   while(nn > 3){
      nn=nn/2+1;
      irho1[--ngrid]=dmatrix(1,nn,1,nn);
      rstrct(irho1[ngrid],irho1[ngrid+1],nn);
      irho2[ngrid]=dmatrix(1,nn,1,nn);
      rstrct(irho2[ngrid],irho2[ngrid+1],nn);
   }
   cout << "starting iteration 1 of the nested iterations." << endl;
   nn=3;
   iu1[1]=dmatrix(1,nn,1,nn);
   irhs1[1]=dmatrix(1,nn,1,nn);
   itau1[1]=dmatrix(1,nn,1,nn);
   itemp1[1]=dmatrix(1,nn,1,nn);
   iu2[1]=dmatrix(1,nn,1,nn);
   irhs2[1]=dmatrix(1,nn,1,nn);
   itau2[1]=dmatrix(1,nn,1,nn);
   itemp2[1]=dmatrix(1,nn,1,nn);
   slvsm2(iu1[1],iu2[1],irho1[1],irho2[1]);
   free_dmatrix(irho1[1],1,nn,1,nn);
   free_dmatrix(irho2[1],1,nn,1,nn);
   ngrid=ng;
   for(j=2;j<=ngrid;j++){
      cout << "starting iteration " << j <<
         " of the nested iterations." << endl;
      nn=2*nn-1;
      iu1[j]=dmatrix(1,nn,1,nn);
      irhs1[j]=dmatrix(1,nn,1,nn);
      itau1[j]=dmatrix(1,nn,1,nn);
      itemp1[j]=dmatrix(1,nn,1,nn);
      iu2[j]=dmatrix(1,nn,1,nn);
      irhs2[j]=dmatrix(1,nn,1,nn);
      itau2[j]=dmatrix(1,nn,1,nn);
      itemp2[j]=dmatrix(1,nn,1,nn);
      interp(iu1[j],iu1[j-1],nn);
      interp(iu2[j],iu2[j-1],nn);
      copy(irhs1[j],(j != ngrid ? irho1[j] : u1),nn);
      copy(irhs2[j],(j != ngrid ? irho2[j] : u2),nn);
      for(jcycle=1;jcycle<=maxcyc;jcycle++){
         nf=nn;
         for(jj=j;jj>=2;jj--){
            for(jpre=1;jpre<=NPRE;jpre++){
               relax2C1(iu1[jj],iu2[jj],irhs1[jj],nf);
               relax2C2(iu1[jj],iu2[jj],irhs2[jj],nf);
            }
            lop1(itemp1[jj],iu1[jj],iu2[jj],nf);
            lop2(itemp2[jj],iu1[jj],iu2[jj],nf);
            nf=nf/2+1;
            jm1=jj-1;
            rstrct(itemp1[jm1],itemp1[jj],nf);
            rstrct(itemp2[jm1],itemp2[jj],nf);
            rstrct(iu1[jm1],iu1[jj],nf);
            rstrct(iu2[jm1],iu2[jj],nf);
            lop1(itau1[jm1],iu1[jm1],iu2[jm1],nf);
            lop2(itau2[jm1],iu1[jm1],iu2[jm1],nf);
            matsub(itau1[jm1],itemp1[jm1],itau1[jm1],nf);
            matsub(itau2[jm1],itemp2[jm1],itau2[jm1],nf);
            if(jj==j){
               trerr1=ALPHA*anorm2(itau1[jm1],nf);
               trerr2=ALPHA*anorm2(itau2[jm1],nf);
            }
            rstrct(irhs1[jm1],irhs1[jj],nf);
            rstrct(irhs2[jm1],irhs2[jj],nf);
            matadd(irhs1[jm1],itau1[jm1],irhs1[jm1],nf);
            matadd(irhs2[jm1],itau2[jm1],irhs2[jm1],nf);
         }
         slvsm2(iu1[1],iu2[1],irhs1[1],irhs2[1]);
         nf=3;
         for(jj=2;jj<=j;jj++){
            jm1=jj-1;
            rstrct(itemp1[jm1],iu1[jj],nf);
            rstrct(itemp2[jm1],iu2[jj],nf);
            matsub(iu1[jm1],itemp1[jm1],itemp1[jm1],nf);
            matsub(iu2[jm1],itemp2[jm1],itemp2[jm1],nf);
            nf=2*nf-1;
            interp(itau1[jj],itemp1[jm1],nf);
            interp(itau2[jj],itemp2[jm1],nf);
            matadd(iu1[jj],itau1[jj],iu1[jj],nf);
            matadd(iu2[jj],itau2[jj],iu2[jj],nf);
            for(jpost=1;jpost<=NPOST;jpost++){
               relax2C1(iu1[jj],iu2[jj],irhs1[jj],nf);
               relax2C2(iu1[jj],iu2[jj],irhs2[jj],nf);
            }
         }
         lop1(itemp1[j],iu1[j],iu2[j],nf);
         lop2(itemp2[j],iu1[j],iu2[j],nf);
         matsub(itemp1[j],irhs1[j],itemp1[j],nf);
         matsub(itemp2[j],irhs2[j],itemp2[j],nf);
         res1=anorm2(itemp1[j],nf);
         res2=anorm2(itemp2[j],nf);
         if(res1 < trerr1 && res2 < trerr2) break;
      }
   }
   copy(u1,iu1[ngrid],n);
   copy(u2,iu2[ngrid],n);
   for(nn=n,j=ng;j>=1;j--,nn=nn/2+1){
      free_dmatrix(itemp1[j],1,nn,1,nn);
      free_dmatrix(itau1[j],1,nn,1,nn);
      free_dmatrix(irhs1[j],1,nn,1,nn);
      free_dmatrix(iu1[j],1,nn,1,nn);
      if(j != ng && j != 1) free_dmatrix(irho1[j],1,nn,1,nn);
      free_dmatrix(itemp2[j],1,nn,1,nn);
      free_dmatrix(itau2[j],1,nn,1,nn);
      free_dmatrix(irhs2[j],1,nn,1,nn);
      free_dmatrix(iu2[j],1,nn,1,nn);
      if(j != ng && j != 1) free_dmatrix(irho2[j],1,nn,1,nn);
   }
}

void slvsm2(double **u1,double **u2,double **rhs1,double **rhs2)
{
/* solve the nonlinear difference equations on a 3 by 3 grid */
   void fcnTo0(int n,double *x,double *f,double **rhs1,double **rhs2);
   int i,chck;
   double *initVal;
   initVal=dvector(1,2);
   initVal[1]=0.5;  initVal[2]=0.5;
   newt(initVal,2,&chck,fcnTo0,rhs1,rhs2);
   if(chck) nrerror("unable to find root at coursest level");
   u1[2][2]=initVal[1];
   u2[2][2]=initVal[2];
   /* put in the boundary conditions */
   for(i=1;i<=3;i++){
      u1[1][i]=0.0; /* set all values on y-axis to zero */
      u1[3][i]=1.0;
      u1[i][1]=u1[i][3]=(i-1.0)/(3.0-1.0);
   }
   for(i=1;i<=3;i++){
      u2[i][1]=0.0; /* set all values on x-axis to zero */
      u2[i][3]=1.0;
      u2[1][i]=u2[3][i]=(i-1.0)/(3.0-1.0);
   }
   free_dvector(initVal,1,2);
}

void fcnTo0(int n,double *x,double *f,double **rhs1,double **rhs2)
{
/* here we calculate the function we must zero in order to solve the
   difference equations on the coursest grid */
   double del=0.5,h=1.0e-5,temp1,temp2,temp3,temp4;
   if(INTERP==1){
      bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1],x[2],&temp1);
      bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1]+h,x[2],&temp2);
      bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1],x[2]+h,&temp3);
      bilin(x1loc,x2loc,yGlob,nGlob,nGlob,0.5,0.5,&temp4);
   } else{
      splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1],x[2],&temp1);
      splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1]+h,x[2],&temp2);
      splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1],x[2]+h,&temp3);
      splin2(x1loc,x2loc,yGlob,yGlobS,nGlob,nGlob,0.5,0.5,&temp4);
   }
   f[1]=lambda1*(1.0-2.0*x[1])/(del*del)-
      (temp2-temp1)*(temp1-temp4)/h-rhs1[2][2];
   f[2]=lambda2*(1.0-2.0*x[2])/(del*del)-
      (temp3-temp1)*(temp1-temp4)/h-rhs2[2][2];
}

/* we use 2 relaxation functions */
void relax2C1(double **u1,double **u2,double **rhs,int n)
{
   int i,ipass,isw,j,jsw=1;
   double h=1.0e-5,del=1.0/(n-1),temp1,temp2,temp3,temp4,temp5,temp6,
      res1,res2;
   /* do a newton step on the nonlinear problem to get a nonlinear
      smoother to use in a manner analogous to Gauss-Seidel-so must
      differentiate the discretized pde with respect to arg. updated */
   for(ipass=1;ipass<=2;ipass++,jsw=3-jsw){
      isw=jsw;
      for(j=2;j<n;j++,isw=3-isw){
         for(i=isw+1;i<n;i+=2){
            if(INTERP==1){
               bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j],
                  u2[i][j],&temp1);
               bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j]+h,
                  u2[i][j],&temp2);
               bilin(x1loc,x2loc,yGlob,nGlob,nGlob,
                  (i-1.0)/(n-1.0),(j-1.0)/(n-1.0),&temp3);
               bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j]+2.0*h,
                  u2[i][j],&temp6);
            } else{
               splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j],
                  u2[i][j],&temp1);
               splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j]+h,
                  u2[i][j],&temp2);
               splin2(x1loc,x2loc,yGlob,yGlobS,nGlob,nGlob,
                  (i-1.0)/(n-1.0),(j-1.0)/(n-1.0),&temp3);
               splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j]+2.0*h,
                  u2[i][j],&temp6);
            }
            if(((u1[i+1][j]-u1[i-1][j])/(2.0*del))<epsGlb){
               temp4=12.0*DSQR(epsGlb-(u1[i+1][j]-u1[i-1][j])/(2.0*del));
               temp5=24.0*(epsGlb-(u1[i+1][j]-u1[i-1][j])/(2.0*del));
            }
            else temp4=temp5=0.0;
            if(((u1[i+1][j]-u1[i-1][j])/(2.0*del))>alfaGlb){
               temp4+=12.0*DSQR((u1[i+1][j]-u1[i-1][j])/(2.0*del)-alfaGlb);
               temp5+=24.0*((u1[i+1][j]-u1[i-1][j])/(2.0*del)-alfaGlb);
            }
            res1=(lambda1+tau*temp4)*(u1[i+1][j]-2.0*u1[i][j]+u1[i-1][j])/
               (del*del)-(temp2-temp1)*(temp1-temp3)/h-rhs[i][j];
            res2=tau*temp5*(u1[i+1][j]-2.0*u1[i][j]+u1[i-1][j])/
               (del*del*del)-2.0*(lambda1+tau*temp4)/(del*del)-
               (temp6-2.0*temp2+temp1)*(temp1-temp3)/(h*h)-
               DSQR((temp2-temp1)/h);
            u1[i][j] -= res1/res2;
         }
      }
   }
}

void relax2C2(double **u1,double **u2,double **rhs,int n)
{
   int i,ipass,isw,j,jsw=1;
   double h=1.0e-5,del=1.0/(n-1),temp1,temp2,temp3,temp4,temp5,temp6,
      res1,res2;
   /* do a newton step on the nonlinear problem to get a nonlinear
      smoother to use in a manner analogous to Gauss-Seidel-so must
      differentiate the discretized pde with respect to arg. updated */
   for(ipass=1;ipass<=2;ipass++,jsw=3-jsw){
      isw=jsw;
      for(j=2;j<n;j++,isw=3-isw){
         for(i=isw+1;i<n;i+=2){
            if(INTERP==1){
               bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j],
                  u2[i][j],&temp1);
               bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j],
                  u2[i][j]+h,&temp2);
               bilin(x1loc,x2loc,yGlob,nGlob,nGlob,
                  (i-1.0)/(n-1.0),(j-1.0)/(n-1.0),&temp3);
               bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j],
                  u2[i][j]+2.0*h,&temp6);
            } else{
               splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j],
                  u2[i][j],&temp1);
               splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j],
                  u2[i][j]+h,&temp2);
               splin2(x1loc,x2loc,yGlob,yGlobS,nGlob,nGlob,
                  (i-1.0)/(n-1.0),(j-1.0)/(n-1.0),&temp3);
               splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j],
                  u2[i][j]+2.0*h,&temp6);
            }
            if(((u2[i][j+1]-u2[i][j-1])/(2.0*del))<epsGlb){
               temp4=12.0*DSQR(epsGlb-(u2[i][j+1]-u2[i][j-1])/(2.0*del));
               temp5=24.0*(epsGlb-(u2[i][j+1]-u2[i][j-1])/(2.0*del));
            }
            else temp4=temp5=0.0;
            if(((u2[i][j+1]-u2[i][j-1])/(2.0*del))>alfaGlb){
               temp4+=12.0*DSQR((u2[i][j+1]-u2[i][j-1])/(2.0*del)-alfaGlb);
               temp5+=24.0*((u2[i][j+1]-u2[i][j-1])/(2.0*del)-alfaGlb);
            }
            res1=(lambda2+tau*temp4)*(u2[i][j+1]-2.0*u2[i][j]+u2[i][j-1])/
               (del*del)-(temp2-temp1)*(temp1-temp3)/h-rhs[i][j];
            res2=tau*temp5*(u2[i][j+1]-2.0*u2[i][j]+u2[i][j-1])/
               (del*del*del)-2.0*(lambda2+tau*temp4)/(del*del)-
               (temp6-2.0*temp2+temp1)*(temp1-temp3)/(h*h)-
               DSQR((temp2-temp1)/h);
            u2[i][j] -= res1/res2;
         }
      }
   }
}

void lop1(double **out,double **u1,double **u2,int n)
{
   int i,j;
   double h=1.0e-5,del=1.0/(n-1),temp1,temp2,temp3,temp4;
   /*  compute the value of the differential operator at each
      grid location */
   for(j=2;j<n;j++){
      for(i=2;i<n;i++){
         if(INTERP==1){
            bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j],
               u2[i][j],&temp1);
            bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j]+h,
               u2[i][j],&temp2);
            bilin(x1loc,x2loc,yGlob,nGlob,nGlob,(i-1.0)/(n-1.0),
               (j-1.0)/(n-1.0),&temp3);
         } else{
            splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j],
               u2[i][j],&temp1);
            splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j]+h,
               u2[i][j],&temp2);
            splin2(x1loc,x2loc,yGlob,yGlobS,nGlob,nGlob,(i-1.0)/(n-1.0),
               (j-1.0)/(n-1.0),&temp3);
         }
         if(((u1[i+1][j]-u1[i-1][j])/(2.0*del))<epsGlb)
            temp4=12.0*DSQR(epsGlb-(u1[i+1][j]-u1[i-1][j])/(2.0*del));
         else temp4=0.0;
         if(((u1[i+1][j]-u1[i-1][j])/(2.0*del))>alfaGlb)
            temp4+=12.0*DSQR((u1[i+1][j]-u1[i-1][j])/(2.0*del)-alfaGlb);
         out[i][j]=(lambda1+tau*temp4)*(u1[i+1][j]-2.0*u1[i][j]+
            u1[i-1][j])/(del*del)-(temp2-temp1)*(temp1-temp3)/h;
      }
   }
   /* put in the boundary conditions */
   for(i=1;i<=n;i++){
      out[1][i]=0.0; /* for p_1, set all values on x-axis to zero */
      out[n][i]=1.0;
      out[i][1]=out[i][n]=(i-1.0)/(n-1.0);
   }
}

void lop2(double **out,double **u1,double **u2,int n)
{
   int i,j;
   double h=1.0e-5,del=1.0/(n-1),temp1,temp2,temp3,temp4;
   /*  compute the value of the differential operator at each
      grid location */
   for(j=2;j<n;j++){
      for(i=2;i<n;i++){
         if(INTERP==1){
            bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j],
               u2[i][j],&temp1);
            bilin(x1loc,x2loc,yHat,nGlob,nGlob,u1[i][j],
               u2[i][j]+h,&temp2);
            bilin(x1loc,x2loc,yGlob,nGlob,nGlob,(i-1.0)/(n-1.0),
               (j-1.0)/(n-1.0),&temp3);
         } else{
            splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j],
               u2[i][j],&temp1);
            splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,u1[i][j],
               u2[i][j]+h,&temp2);
            splin2(x1loc,x2loc,yGlob,yGlobS,nGlob,nGlob,(i-1.0)/(n-1.0),
               (j-1.0)/(n-1.0),&temp3);
         }
         if(((u2[i][j+1]-u2[i][j-1])/(2.0*del))<epsGlb)
            temp4=12.0*DSQR(epsGlb-(u2[i][j+1]-u2[i][j-1])/(2.0*del));
         else temp4=0.0;
         if(((u2[i][j+1]-u2[i][j-1])/(2.0*del))>alfaGlb)
            temp4+=12.0*DSQR((u2[i][j+1]-u2[i][j-1])/(2.0*del)-alfaGlb);
         out[i][j]=(lambda2+tau*temp4)*(u2[i][j+1]-2.0*u2[i][j]+
            u2[i][j-1])/(del*del)-(temp2-temp1)*(temp1-temp3)/h;
      }
   }
   /* put in the boundary conditions */
   for(i=1;i<=n;i++){
      out[i][1]=0.0; /* for p_2, set all values on y-axis to zero */
      out[i][n]=1.0;
      out[1][i]=out[n][i]=(i-1.0)/(n-1.0);
   }
}

void rstrct(double **uc,double **uf,int nc)
/* the fine to course grid restriction function */
{
   int ic,iif,jc,jf,ncc=2*nc-1;
   for(jf=3,jc=2;jc<nc;jc++,jf+=2){
      for(iif=3,ic=2;ic<nc;ic++,iif+=2){
         uc[ic][jc]=0.5*uf[iif][jf]+0.125*(uf[iif+1][jf]+uf[iif-1][jf]
            +uf[iif][jf+1]+uf[iif][jf-1]);
      }
   }
   for(jc=1,ic=1;ic<=nc;ic++,jc+=2){
      uc[ic][1]=uf[jc][1];
      uc[ic][nc]=uf[jc][ncc];
   }
   for(jc=1,ic=1;ic<=nc;ic++,jc+=2){
      uc[1][ic]=uf[1][jc];
      uc[nc][ic]=uf[ncc][jc];
   }
}

void interp(double **uf,double **uc,int nf)
/* the course to fine interpolation function */
{
   int ic,iif,jc,jf,nc;
   nc=nf/2+1;
   for(jc=1,jf=1;jc<=nc;jc++,jf+=2)
      for(ic=1;ic<=nc;ic++) uf[2*ic-1][jf]=uc[ic][jc];
   for(jf=1;jf<=nf;jf+=2)
      for(iif=2;iif<nf;iif+=2)
         uf[iif][jf]=0.5*(uf[iif+1][jf]+uf[iif-1][jf]);
   for(jf=2;jf<nf;jf+=2)
      for(iif=1;iif<=nf;iif++)
         uf[iif][jf]=0.5*(uf[iif][jf+1]+uf[iif][jf-1]);
}

/* 2-D integration routines */
static double xsav;
static double (*nrfunc)(double,double);

double quad2d(double (*func)(double,double), double x1,double x2)
{
   double f1(double x);
   nrfunc=func;
   return qtrap(f1,x1,x2);
}

double f1(double x)
{
   double f2(double y);
   xsav=x;
   return qtrap(f2,0.0,1.0);
}

double f2(double y)
{
   return (*nrfunc)(xsav,y);
}

/* functions for integration using the trapezoidal rule */
double qtrap(double (*func)(double),double a,double b)
{
   double trapzd(double (*func)(double),double a,double b,int n);
   int j,JMAX=20;
   /* this EPS controls accuarcy of numerical integral */
   double s,olds,EPS=1.0e-3;
   olds= -1.0e30;
   for(j=1;j<=JMAX;j++){
      s=trapzd(func,a,b,j);
      if(fabs(s-olds) < EPS*fabs(olds)) return s;
      if(s==0.0 && olds == 0.0 && j>6) return s;
      olds=s;
   }
   nrerror("too many steps in routine qtrap");
   return 0.0;
}

#define FUNC(x) ((*func)(x))

double trapzd(double (*func)(double),double a,double b,int n)
{
   double x,tnm,sum,del;
   /*static*/ double s;
   int it,j;
   if(n==1){
      return (s=0.5*(b-a)*(FUNC(a)+FUNC(b)));
   } else{
      for(it=1,j=1;j<n-1;j++) it <<= 1;
      tnm=it;
      del=(b-a)/tnm;
      x=a+0.5*del;
      for(sum=0.0,j=1;j<=it;j++,x+=del) sum += FUNC(x);
      s=0.5*(s+(b-a)*sum/tnm);
      return s;
   }
}

/* functions for integration to find MISE */
double intFcn1(double x1,double x2)
{
   double temp1,temp2;
   if(INTERPINT==1){
      bilin(x1loc,x2loc,yGlob,nGlob,nGlob,x1,x2,&temp1);
      bilin(x1loc,x2loc,yHat,nGlob,nGlob,x1,x2,&temp2);
   } else{
      splin2(x1loc,x2loc,yGlob,yGlobS,nGlob,nGlob,x1,x2,&temp1);
      splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x1,x2,&temp2);
   }
   return DSQR(temp1-temp2);
}

double intFcn2(double x1,double x2)
{
   double temp1,temp2,temp3,temp4;
   bilin(ux1,ux2,u1Glob,nfGrid,nfGrid,x1,x2,&temp2);
   bilin(ux1,ux2,u2Glob,nfGrid,nfGrid,x1,x2,&temp3);
   if(INTERPINT==1){
      bilin(x1loc,x2loc,yGlob,nGlob,nGlob,x1,x2,&temp1);
      bilin(x1loc,x2loc,yHat,nGlob,nGlob,temp2,temp3,&temp4);
   } else{
      splin2(x1loc,x2loc,yGlob,yGlobS,nGlob,nGlob,x1,x2,&temp1);
      splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,temp2,temp3,&temp4);
   }
   return DSQR(temp1-temp4);
}

double intFcn3(double x1,double x2)
{
/* use this to find the quantification of the deformation, PENLTY determines
   which oe to use */
   int PENLTY=1;
   double res,temp1,temp2,temp3,temp4,h=1.0e-4;
   if(PENLTY==0){
      bilin(ux1,ux2,u1Glob,nfGrid,nfGrid,x1,x2,&temp1);
      bilin(ux1,ux2,u2Glob,nfGrid,nfGrid,x1,x2,&temp2);
      res=DSQR(temp1-x1)+DSQR(temp2-x2);
   }
   if(PENLTY==1){
      bilin(ux1,ux2,u1Glob,nfGrid,nfGrid,x1-h,x2,&temp1);
      bilin(ux1,ux2,u1Glob,nfGrid,nfGrid,x1+h,x2,&temp2);
      bilin(ux1,ux2,u2Glob,nfGrid,nfGrid,x1,x2-h,&temp3);
      bilin(ux1,ux2,u2Glob,nfGrid,nfGrid,x1,x2+h,&temp4);
      res=DSQR((temp2-temp1)/(2.0*h)-1.0)+DSQR((temp4-temp3)/(2.0*h)-1.0);
   }
   return res;
}

/* routines for newton algorithm for meeting boundary conditions:
   these routines find roots of vector equations using a globally
   convergent implementation of newton's algorithm-uses descent
   directions and line searches with backtrakcing along these
   directions*/
/* global variables to communicate with fmin */
int nn;
double *fvec;
double **rhs1g,**rhs2g;
void (*nrfuncv)(int n,double *v,double *f,double **rhs1,double **rhs2);

#define FREERETURN {free_dvector(fvec,1,n);free_dvector(xold,1,n);\
   free_dvector(p,1,n);free_dvector(ptemp,1,n);free_dvector(g,1,n);free_dmatrix(fjac,1,n,1,n);\
   free_ivector(indx,1,n);return;}

void newt(double *x,int n,int *check, void (*vecfunc)(int, double *,
   double *,double **,double **),double **rhs1,double **rhs2)
{
   double fmin(double *x);
   void lnsrch(int n,double *xold,double fold,double *g,double *p,
      double *x,double *f,double stpmax,int *check,
      double (*func)(double *));
   /* MAXITS is max number of iterations-probably do not need to mess
      with it */
   int MAXITS=200000;
   /* TOLF is tolerance for diff. in function values, TOLX is
      tolerance for diff. in x values-these can make a difference */
   double TOLF=1.0e-4,TOLMIN=1.0e-6,TOLX=1.0e-7,STPMX=100.0;
   int i,its,j,*indx;
   double d,den,f,fold,stpmax,sum,temp,test,det,h=1.0e-4,temp0,temp1,
      temp2,temp3,temp4,temp5,temp6,temp7,temp8,temp9,**fjac,*g,*p,
      *ptemp,*xold;
   indx=ivector(1,n);
   fjac=dmatrix(1,n,1,n);
   g=dvector(1,n);
   p=dvector(1,n);
   ptemp=dvector(1,n);
   xold=dvector(1,n);
   fvec=dvector(1,n);
   nn=n;
   rhs1g=rhs1;
   rhs2g=rhs2;
   nrfuncv=vecfunc;
   f=fmin(x);
   test=0.0;
   for(i=1;i<=n;i++)
      if(fabs(fvec[i]) > test) test=fabs(fvec[i]);
   if(test < 0.01*TOLF){
      *check=0;
      FREERETURN
   }
   for(sum=0.0,i=1;i<=n;i++) sum+=DSQR(x[i]);
   stpmax=STPMX*DMAX(sqrt(sum),(double)n);
   if(INTERP==1)
      bilin(x1loc,x2loc,yGlob,nGlob,nGlob,0.5,0.5,&temp0);
   else
      splin2(x1loc,x2loc,yGlob,yGlobS,nGlob,nGlob,0.5,0.5,&temp0);
   for(its=1;its<=MAXITS;its++){
      if(INTERP==1){
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1],x[2],&temp1);
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1]+h,x[2],&temp2);
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1],x[2]+h,&temp3);
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1]-h,x[2],&temp4);
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1],x[2]-h,&temp5);
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1]+h,x[2]+h,&temp6);
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1]+h,x[2]-h,&temp7);
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1]-h,x[2]+h,&temp8);
         bilin(x1loc,x2loc,yHat,nGlob,nGlob,x[1]-h,x[2]-h,&temp9);
      } else{
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1],x[2],&temp1);
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1]+h,x[2],&temp2);
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1],x[2]+h,&temp3);
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1]-h,x[2],&temp4);
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1],x[2]-h,&temp5);
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1]+h,x[2]+h,&temp6);
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1]+h,x[2]-h,&temp7);
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1]-h,x[2]+h,&temp8);
         splin2(x1loc,x2loc,yHat,yHatS,nGlob,nGlob,x[1]-h,x[2]-h,&temp9);
      }
      fjac[1][1]=(temp2-2.0*temp1+temp4)*(temp0-temp1)/(4.0*h*h)-
         DSQR((temp2-temp4)/(2.0*h))-8.0*lambda1;
      fjac[1][2]=fjac[2][1]=(temp6-temp7-temp8+temp9)*(temp0-temp1)/
         (4.0*h*h)-(temp2-temp4)*(temp3-temp5)/(4.0*h*h);
      fjac[2][2]=(temp3-2.0*temp1+temp5)*(temp0-temp1)/(4.0*h*h)-
         DSQR((temp3-temp5)/(2.0*h))-8.0*lambda2;
//      cout << fjac[1][1] << " " << fjac[1][2] << endl
//         << fjac[1][2] << " " << fjac[2][2] << endl << endl;
      det=fjac[1][1]*fjac[2][2]-fjac[1][2]*fjac[2][1];
      for(i=1;i<=n;i++){
         for(sum=0.0,j=1;j<=n;j++) sum += fjac[j][i]*fvec[j];
         g[i]=sum;
      }
      for(i=1;i<=n;i++) xold[i]=x[i];
      fold=f;
      for(i=1;i<=n;i++) ptemp[i] = -fvec[i];
      p[1]=(ptemp[1]*fjac[2][2]-ptemp[2]*fjac[1][2])/det;
      p[2]=(ptemp[2]*fjac[1][1]-ptemp[1]*fjac[2][1])/det;
      lnsrch(n,xold,fold,g,p,x,&f,stpmax,check,fmin);
      test=0.0;
      for(i=1;i<=n;i++)
         if(fabs(fvec[i]) > test) test=fabs(fvec[i]);
      if(test <  TOLF){
         *check=0;
         FREERETURN
      }
      if(*check){
         test=0.0;
         den=DMAX(f,0.5*n);
         for(i=1;i<=n;i++){
            temp=fabs(g[i])*DMAX(fabs(x[i]),1.0)/den;
            if(temp > test) test=temp;
         }
         *check=(test < TOLMIN ? 1 : 0);
         FREERETURN
      }
      test=0.0;
      for(i=1;i<=n;i++){
         temp=(fabs(x[i]-xold[i]))/DMAX(fabs(x[i]),1.0);
         if(temp > test) test=temp;
      }
      if(test < TOLX) FREERETURN
   }
   nrerror("MAXITS exceeded in newt");
}

void (*nrfuncvOrig)(int n,double *v,double *f);

#define FREERETURNOrig {free_dvector(fvec,1,n);free_dvector(xold,1,n);\
   free_dvector(p,1,n);free_dvector(g,1,n);free_dmatrix(fjac,1,n,1,n);\
   free_ivector(indx,1,n);return;}

void newtOrig(double *x,int n,int *check, void (*vecfunc)(int, double *,
   double *))
{
   double fminOrig(double *x);
   void lnsrch(int n,double *xold,double fold,double *g,double *p,
      double *x,double *f,double stpmax,int *check,
      double (*func)(double *));
   void fdjac(int n,double *x,double *fvec,double **df,
      void (*vecfunc)(int,double *,double *));
   /* MAXITS is max number of iterations-probably do not need to mess
      with it */
   int MAXITS=200;
   /* TOLF is tolerance for diff. in function values, TOLX is
      tolerance for diff. in x values-these probably O.K. */
   double TOLF=1.0e-6,TOLMIN=1.0e-6,TOLX=1.0e-7,STPMX=100.0;
   int i,its,j,*indx;
   double d,den,f,fold,stpmax,sum,temp,test,**fjac,*g,*p,*xold;
   indx=ivector(1,n);
   fjac=dmatrix(1,n,1,n);
   g=dvector(1,n);
   p=dvector(1,n);
   xold=dvector(1,n);
   fvec=dvector(1,n);
   nn=n;
   nrfuncvOrig=vecfunc;
   f=fminOrig(x);
   test=0.0;
   for(i=1;i<=n;i++)
      if(fabs(fvec[i]) > test) test=fabs(fvec[i]);
//   cout << test << " ";
//   getch();
   if(test < 0.1*TOLF){
      *check=0;
      FREERETURNOrig
   }
   for(sum=0.0,i=1;i<=n;i++) sum+=DSQR(x[i]);
   stpmax=STPMX*DMAX(sqrt(sum),(double)n);
   for(its=1;its<=MAXITS;its++){
      fdjac(n,x,fvec,fjac,vecfunc);
      for(i=1;i<=n;i++){
         for(sum=0.0,j=1;j<=n;j++) sum += fjac[j][i]*fvec[j];
         g[i]=sum;
      }
      for(i=1;i<=n;i++) xold[i]=x[i];
      fold=f;
      for(i=1;i<=n;i++) p[i] = -fvec[i];
      ludcmp(fjac,n,indx,&d);
      lubksb(fjac,n,indx,p);
      lnsrch(n,xold,fold,g,p,x,&f,stpmax,check,fminOrig);
      test=0.0;
//      cout << fvec[1] << " ";
      for(i=1;i<=n;i++){
//         cout << fabs(fvec[i]) << " ";
         if(fabs(fvec[i]) > test) test=fabs(fvec[i]);
      }
//      cout << test << " ";
//      getch();
      if(test <  TOLF){
         *check=0;
         FREERETURNOrig
      }
      if(*check){
         test=0.0;
         den=DMAX(f,0.5*n);
         for(i=1;i<=n;i++){
            temp=fabs(g[i])*DMAX(fabs(x[i]),1.0)/den;
            if(temp > test) test=temp;
         }
         *check=(test < TOLMIN ? 1 : 0);
         FREERETURNOrig
      }
      test=0.0;
      for(i=1;i<=n;i++){
         temp=(fabs(x[i]-xold[i]))/DMAX(fabs(x[i]),1.0);
         if(temp > test) test=temp;
      }
      if(test < TOLX) FREERETURNOrig
   }
   nrerror("MAXITS exceeded in newtOrig");
}

void lnsrch(int n,double *xold,double fold,double *g,double *p,
   double *x,double *f,double stpmax,int *check,double (*func)(double *))
{
   int i;
   double ALF=1.0e-4,TOLX=1.0e-7;
   double a,alam,alam2,alamin,b,disc,f2,fold2,rhs1,rhs2,slope,
      sum,temp,test,tmplam;
   *check=0;
   for(sum=0.0,i=1;i<=n;i++) sum += p[i]*p[i];
   sum=sqrt(sum);
   if(sum > stpmax)
      for(i=1;i<=n;i++) p[i] *= stpmax/sum;
   for(slope=0.0,i=1;i<=n;i++)
      slope += g[i]*p[i];
   test=0.0;
   for(i=1;i<=n;i++){
      temp=fabs(p[i])/DMAX(fabs(xold[i]),1.0);
      if(temp > test) test=temp;
   }
   alamin=TOLX/test;
   alam=1.0;
   for(;;){
      for(i=1;i<=n;i++) x[i]=xold[i]+alam*p[i];
      *f=(*func)(x);
      if(alam < alamin){
         for(i=1;i<=n;i++) x[i]=xold[i];
         *check=1;
         return;
      } else if(*f <= fold+ALF*alam*slope) return;
      else{
         if(alam == 1.0)
            tmplam = -slope/(2.0*(*f-fold-slope));
         else{
            rhs1 = *f-fold-alam*slope;
            rhs2= f2-fold2-alam2*slope;
            a=(rhs1/(alam*alam)-rhs2/(alam2*alam2))/(alam-alam2);
            b=(-alam2*rhs1/(alam*alam)+alam*rhs2/(alam2*alam2))/
               (alam-alam2);
            if(a==0.0) tmplam = -slope/(2.0*b);
            else{
               disc=b*b-3.0*a*slope;
               if(disc < 0.0) nrerror("Roundoff problem in lnsrch.");
               else tmplam=(-b+sqrt(disc))/(3.0*a);
            }
            if(tmplam>0.5*alam)
               tmplam=0.5*alam;
         }
      }
      alam2=alam;
      f2 = *f;
      fold2=fold;
      alam=DMAX(tmplam,0.1*alam);
   }
}

extern int nn;
extern double *fvec;
extern double **rhs1g,**rhs2g;
extern void (*nrfuncv)(int n,double *v,double *f,double **rhs1,
   double **rhs2);
extern void (*nrfuncvOrig)(int n,double *v,double *f);

double fmin(double *x)
{
   int i;
   double sum;
   (*nrfuncv)(nn,x,fvec,rhs1g,rhs2g);
   for(sum=0.0,i=1;i<=nn;i++) sum += DSQR(fvec[i]);
   return 0.5*sum;
}

double fminOrig(double *x)
{
   int i;
   double sum;
   (*nrfuncvOrig)(nn,x,fvec);
   for(sum=0.0,i=1;i<=nn;i++) sum += DSQR(fvec[i]);
   return 0.5*sum;
}

/* LU decomposition functions for newt */
void ludcmp(double **a,int n,int *indx, double *d)
{
   int i,imax,j,k;
   double big,dum,sum,temp;
   double TINY=1.0e-20;
   double *vv;
   vv=dvector(1,n);
   *d=1.0;
   for(i=1;i<=n;i++){
      big=0.0;
      for(j=1;j<=n;j++)
         if((temp=fabs(a[i][j]))>big) big=temp;
      if(big==0.0) nrerror("Singular Matrix in routine ludcmp");
      vv[i]=1.0/big;
   }
   for(j=1;j<=n;j++){
      for(i=1;i<j;i++){
         sum=a[i][j];
         for(k=1;k<i;k++) sum -= a[i][k]*a[k][j];
         a[i][j]=sum;
      }
      big=0.0;
      for(i=j;i<=n;i++){
         sum=a[i][j];
         for(k=1;k<j;k++)
            sum -= a[i][k]*a[k][j];
         a[i][j]=sum;
         if((dum=vv[i]*fabs(sum))>=big){
            big=dum;
            imax=i;
         }
      }
      if(j!=imax){
         for(k=1;k<=n;k++){
            dum=a[imax][k];
            a[imax][k]=a[j][k];
            a[j][k]=dum;
         }
         *d = -(*d);
         vv[imax]=vv[j];
      }
      indx[j]=imax;
      if(a[j][j]==0.0) a[j][j]=TINY;
      if(j!=n){
         dum=1.0/(a[j][j]);
         for(i=j+1;i<=n;i++) a[i][j] *= dum;
      }
   }
   free_dvector(vv,1,n);
}

void lubksb(double **a,int n,int *indx,double *b)
{
   int i,ii=0,ip,j;
   double sum;
   for(i=1;i<=n;i++){
      ip=indx[i];
      sum=b[ip];
      b[ip]=b[i];
      if(ii)
         for(j=ii;j<=i-1;j++) sum -= a[i][j]*b[j];
      else if(sum) ii=i;
      b[i]=sum;
   }
   for(i=n;i>=1;i--){
      sum=b[i];
      for(j=i+1;j<=n;j++) sum -= a[i][j]*b[j];
      b[i]=sum/a[i][i];
   }
}

void fdjac(int n,double *x,double *fvec,double **df,
  void (*vecfunc)(int,double *,double *))
{
   int i,j;
   double EPS=1.0e-4,h,temp,*f;
   f=dvector(1,n);
   for(j=1;j<=n;j++){
      temp=x[j];
      h=EPS*fabs(temp);
      if(h==0) h=EPS;
      x[j]=temp+h;
      h=x[j]-temp;
      (*vecfunc)(n,x,f);
      x[j]=temp;
      for(i=1;i<=n;i++)
         df[i][j]=(f[i]-fvec[i])/h;
   }
   free_dvector(f,1,n);
}

void locate(double *xx,int n,double x,int *j)
{
   int ju,jm,jl,ascnd;
   jl=0;
   ju=n+1;
   ascnd=(xx[n] >= xx[1]);
   while(ju-jl > 1){
      jm=(ju+jl) >> 1;
      if(x >= xx[jm] ==ascnd) jl=jm;
      else ju=jm;
   }
   if(x == xx[1]) *j=1;
   else if(x==xx[n]) *j=n-1;
   else *j=jl;
}

void bilin(double *x1a,double *x2a,double **ya,int m,int n,
   double x1,double x2,double *y)
{
   /*  do bilinear interpolation  */
   void locate(double *xx,int n,double x,int *j);
   int i,j;
   double t,u;
   locate(x1a,n,x1,&i);
   locate(x2a,n,x2,&j);
//   cout << i << " " << j << endl;
   if(i>n-1) i=n-1;
   if(j>n-1) j=n-1;
   if(i<1) i=1;
   if(j<1) j=1;
   t=(x1-x1a[i])/(x1a[i+1]-x1a[i]);
   u=(x2-x2a[j])/(x2a[j+1]-x2a[j]);
   *y=(1.0-t)*((1.0-u)*ya[i][j]+u*ya[i][j+1])+t*((1.0-u)*ya[i+1][j]+
      u*ya[i+1][j+1]);
}

/* routines or bicubic splines */
void spline(double *x, double *y, int n, double yp1, double ypn,
   double *y2)
{
   int i,k;
   double p,qn,sig,un,*u;
   u=dvector(1,n-1);
   if(yp1 > 0.99e30)
      y2[1]=u[1]=0.0;
   else{
      y2[1]=-0.5;
      u[1]=(3.0/(x[2]-x[1]))*((y[2]-y[1])/(x[2]-x[1])-yp1);
   }
   for(i=2;i<=n-1;i++){
      sig=(x[i]-x[i-1])/(x[i+1]-x[i-1]);
      p=sig*y2[i-1]+2.0;
      y2[i]=(sig-1.0)/p;
      u[i]=(y[i+1]-y[i])/(x[i+1]-x[i])-(y[i]-y[i-1])/(x[i]-x[i-1]);
      u[i]=(6.0*u[i]/(x[i+1]-x[i-1])-sig*u[i-1])/p;
   }
   if(ypn > 0.99e30)
      qn=un=0.0;
   else{
      qn=0.5;
      un=(3.0/(x[n]-x[n-1]))*(ypn-(y[n]-y[n-1])/(x[n]-x[n-1]));
   }
   y2[n]=(un-qn*u[n-1])/(qn*y2[n-1]+1.0);
   for(k=n-1;k>=1;k--)
      y2[k]=y2[k]*y2[k+1]+u[k];
   free_dvector(u,1,n-1);
}

void splint(double *xa, double *ya, double *y2a, int n, double x,
   double *y)
{
   int klo,khi,k;
   double h,b,a;
   klo=1;
   khi=n;
   while(khi-klo > 1){
      k=(khi+klo) >> 1;
      if(xa[k] > x) khi=k;
      else klo=k;
   }
   h=xa[khi]-xa[klo];
   if(h == 0.0) nrerror("Bad xa input to routine splint:ties!");
   a=(xa[khi]-x)/h;
   b=(x-xa[klo])/h;
   *y=a*ya[klo]+b*ya[khi]+((a*a*a-a)*y2a[klo]+
      (b*b*b-b)*y2a[khi])*(h*h)/6.0;
}

void splie2(double *x1a,double *x2a,double **ya,int m,int n,double **y2a)
{
   int j;
   for(j=1;j<=m;j++)
      spline(x2a,ya[j],n,1.0e30,1.0e30,y2a[j]);
}

void splin2(double *x1a,double *x2a,double **ya,double **y2a,int m,int n,
   double x1,double x2,double *y)
{
   int j;
   double *ytmp,*yytmp;
   ytmp=dvector(1,m);
   yytmp=dvector(1,m);
   for(j=1;j<=m;j++)
      splint(x2a,ya[j],y2a[j],n,x2,&yytmp[j]);
   spline(x1a,yytmp,m,1.0e30,1.0e30,ytmp);
   splint(x1a,yytmp,ytmp,m,x1,y);
   free_dvector(yytmp,1,m);
   free_dvector(ytmp,1,m);
}

/* simple matrix algebra functions */
void matadd(double **a,double **b,double **c,int n)
{
   int i,j;
   for(j=1;j<=n;j++)
      for(i=1;i<=n;i++)
         c[i][j]=a[i][j]+b[i][j];
}

void matsub(double **a,double **b,double **c,int n)
{
   int i,j;
   for(j=1;j<=n;j++)
      for(i=1;i<=n;i++)
         c[i][j]=a[i][j]-b[i][j];
}

double anorm2(double **a,int n)
{
   int i,j;
   double sum=0.0;
   for(j=1;j<=n;j++)
      for(i=1;i<=n;i++)
         sum += a[i][j]*a[i][j];
   return sqrt(sum)/n;
}

/* copy 2-D array from one location to another */
void copy(double **aout,double **ain,int n)
{
   int i,j;
   for(i=1;i<=n;i++)
      for(j=1;j<=n;j++)
         aout[j][i]=ain[j][i];
}

/* memory allocation functions */
double *dvector(long nl,long nh)
/* allocate a vector with index range v[nl...nh] */
{
   double *v;
   v=(double *)malloc((size_t) ((nh-nl+2)*sizeof(double)));
   if (!v) nrerror("allocation failure in dvector()");
   return v-nl+1;
}

void free_dvector(double *v, long nl, long nh)
/* free allocated memory */
{
   free((FREE_ARG) (v+nl-1));
}

int *ivector(long nl,long nh)
/* allocate a vector with index range v[nl...nh] */
{
   int *v;
   v=(int *)malloc((size_t) ((nh-nl+2)*sizeof(int)));
   if (!v) nrerror("allocation failure in dvector()");
   return v-nl+1;
}

void free_ivector(int *v, long nl, long nh)
/* free allocated memory */
{
   free((FREE_ARG) (v+nl-1));
}

double **dmatrix(long nrl,long nrh,long ncl,long nch)
/* allocate a double matrix */
{
   long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;
   double **m;
   /* allocate pointers to rows */
   m=(double **) malloc((size_t)((nrow+1)*sizeof(double*)));
   if (!m) nrerror("allocation failure 1 in dmatrix()");
   m += 1;
   m -= nrl;
   /* allocate rows and set pointers to them */
   m[nrl]=(double *) malloc((size_t)((nrow*ncol+1)*sizeof(double)));
   if (!m[nrl]) nrerror("allocation failure 2 in dmatrix()");
   m[nrl] += 1;
   m[nrl] -= ncl;
   for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;
   /* return pointer to array of pointers to rows */
   return m;
}

void free_dmatrix(double **m, long nrl,long nrh,long ncl,long nch)
/* free allocated memory */
{
   free((FREE_ARG) (m[nrl]+ncl-1));
   free((FREE_ARG) (m+nrl-1));
}

/* error handler */
void nrerror(char error_text[])
{
   fprintf(stderr,"Run-time error...\n");
   fprintf(stderr,"%s\n",error_text);
   fprintf(stderr,"...now exiting to system...\n");
   getch();
   exit(1);
}