/* this is an implementation of the Gibbs sampler for the block motif 
model of Liu et al. that uses a call to the Metropolis algorithm to 
sample a set of parameters from a log linear model: expects that the 
data is a matrix that holds numbers from 1 to size of alphabet that 
represent the particular element of that alphabet, e.g. A=1, C=2, G=3, 
T=4 */

#include <math.h>
#pragma hdrstop
#include <condefs.h>
#include <conio.h>
#include <iostream.h>
#include <fstream.h>

#include "C:\Documents and Settings\Owner\Desktop\matrices.txt"
#include "C:\Documents and Settings\Owner\Desktop\randNo.txt"
#include "C:\Documents and Settings\Owner\Desktop\sorting.txt"
#include "C:\Documents and Settings\Owner\Desktop\math.txt"
#include "C:\Documents and Settings\Owner\Desktop\minimize.txt"

#pragma argsused
#define FREE_ARG char*

void gibbsDriver(long *setSeed,int lookInt,int noChn,int numIter,int numburn,
   int thinfac,int numpar,double **initValu,double ***simres);
void gibbsSmplr(long *setSeed,int *mtf);
int smpl(double *unnrmlzPst,int len,long *setSeed);
double minFcn(double *curDev,double **data);
void metrop1(long *setSeed,int numpar,double *curDev,double **data);
void loglinSmpl(long *setSeed,int *mtf,double *beta);
void gibbsSmplrLL(long *setSeed,int *mtf, double *beta);

/* nSeq is the number of sequences, wGlob is the width of the motif, maxLen
is the length of the longest sequence and pGlob is the size of the alphabet over
which you are searching */
int nSeq=10,wGlob=8,maxLen=100,pGlob=4,*beta0,**beta,*seqLen,**yGlob;
int noPar=nSeq+1+(pGlob-1)*wGlob,numBpar=1+(pGlob-1)*wGlob,
   tLen=pow(pGlob,wGlob);
double acpt=0.0,*pp,**tryV,**desMat;

int main()
{
   /* this file holds data */
   ifstream filer1("C:\\Documents and Settings\\Owner\\Desktop\\seqData.txt");
   if(!filer1){
      cerr << "Failure to open filer1\n";
      getch();
      exit(EXIT_FAILURE);
   }
   /* this file holds the length of each sequence */
   ifstream filer2("C:\\Documents and Settings\\Owner\\Desktop\\seqLen.txt");
   if(!filer2){
      cerr << "Failure to open filer2\n";
      getch();
      exit(EXIT_FAILURE);
   }
   /* initial values, one for each sequence-reg coef sampled first */
   ifstream filer3("C:\\Documents and Settings\\Owner\\Desktop\\blkMtfIV.txt");
   if(!filer3){
      cerr << "Failure to open filer3\n";
      exit(EXIT_FAILURE);
   }
   /* covariance matrix for metropolis step */
   ifstream filer4("C:\\Documents and Settings\\Owner\\Desktop\\covMat2.txt");
   if(!filer4){
      cerr << "Failure to open filer4\n";
      exit(EXIT_FAILURE);
   }
   /* file that holds the design matrix for the log linear model */
   ifstream filer5("C:\\Documents and Settings\\Owner\\Desktop\\desMat2.txt");
   if(!filer5){
      cerr << "Failure to open filer5\n";
      exit(EXIT_FAILURE);
   }
   /* samples from the posterior */
   ofstream filew1("C:\\Documents and Settings\\Owner\\Desktop\\blkMtfSmp.txt");
   if(!filew1){
      cerr << "Failure to open filew1\n";
      getch();
      exit(EXIT_FAILURE);
   }
   /* numIter is numiter per chain in Gibbs, numburn is number of burn in iterations,
   noChn is number of chains to use in MCMC, thinfac inicates how much thinning
   of chain output-use 1 to have none, and lookInt indicates how frequently
   convergence diagnostic statistics should be computed */
   int numIter=40000,numburn=20000,noChn=1,lookInt=1,thinfac=10,ii=1,i,j,
      k,l,iter;
   long setSeed=-8623;
   double **initValu,***simres;
   seqLen=ivector(1,nSeq);
   beta0=ivector(1,wGlob);
   pp=dvector(1,numBpar);
   beta=imatrix(1,wGlob,1,pGlob);
   yGlob=imatrix(1,nSeq,1,maxLen);
   initValu=dmatrix(1,noPar,1,noChn);
   tryV=dmatrix(1,numBpar,1,numBpar);
   desMat=dmatrix(1,tLen,1,numBpar);
   simres=d3tensor(1,noChn,1,(numIter-numburn)/thinfac,1,noPar);
   filer2.seekg(0);
   for(i=1;i<=nSeq;i++){
      filer2 >> seqLen[i];
   }
   filer1.seekg(0);
   for(i=1;i<=nSeq;i++){
      for(j=1;j<=seqLen[i];j++){
         filer1 >> yGlob[i][j];
      }
   }
   filer3.seekg(0);
   for(i=1;i<=noChn;i++){
      for(j=1;j<=noPar;j++){
         filer3 >> initValu[j][i];
      }
   }
   filer4.seekg(0);
   for(i=1;i<=numBpar;i++){
      for(j=1;j<=numBpar;j++){
         filer4 >> tryV[i][j];
         tryV[i][j]*=0.1;
      }
   }
   choldc(tryV,numBpar,pp);
   for(i=1;i<=tLen;i++){
      for(j=1;j<=numBpar;j++){
         filer5 >> desMat[i][j];
      }
    }
   /* specify priors via pseudo-counts, here just vague */
   for(i=1;i<=pGlob;i++){
      beta0[i]=1;
      for(j=1;j<=wGlob;j++){
         beta[j][i]=1;
      }
   }
   /* draw samples from the posterior */
   gibbsDriver(&setSeed,lookInt,noChn,numIter,numburn,thinfac,noPar,initValu,
      simres);
   /* write to file */
   for(i=1;i<=noChn;i++){
      for(j=1;j<=(numIter-numburn)/thinfac;j++){
            for(k=1;k<=noPar;k++){
               filew1 << simres[i][j][k] << " ";
            }
      }
   }
   cout << "press any key to finish" << endl;
   getch();
   free_ivector(seqLen,1,nSeq);
   free_ivector(beta0,1,wGlob);
   free_dvector(pp,1,numBpar);
   free_imatrix(beta,1,wGlob,1,pGlob);
   free_dmatrix(initValu,1,noPar,1,noChn);
   free_imatrix(yGlob,1,nSeq,1,maxLen);
   free_dmatrix(tryV,1,numBpar,1,numBpar);
   free_dmatrix(desMat,1,tLen,1,numBpar);
   free_d3tensor(simres,1,noChn,1,(numIter-numburn)/thinfac,1,noPar);
}

void gibbsSmplrLL(long *setSeed,double *par)
{
   /* this runs a single iteration of the Gibbs sampler for block motif model
   run with a log linear model, first it samples the block of betas for the glm
   then it samples each starting position given beta and all other start sites*/
   int i,j,k,k1,mtfOld,*mtf;
   double eta1,etak1,gy1,gyk1,*res1,*regCoef,*table;
   mtf=ivector(1,nSeq);
   regCoef=dvector(1,numBpar);
   table=dvector(1,tLen);
   for(i=1;i<=nSeq;i++) mtf[i]=int(par[i]);
   for(i=nSeq+1;i<=noPar;i++) regCoef[i-nSeq]=par[i];
   /* sample the regression coefficients */
   loglinSmpl(setSeed,mtf,regCoef);
   /* now sample the sites */
   /* first construct the table using the existing set of motif start sites */
   for(i=1;i<=tLen;i++) table[i]=0.0;
   for(i=1;i<=nSeq;i++){
      k=1;
      for(j=1;j<=wGlob;j++)
         k+=(yGlob[i][mtf[i]+j-1]-1)*pow(pGlob,j-1);
      table[k]+=1.0;
   }
   for(i=1;i<=nSeq;i++){
      res1=dvector(1,seqLen[i]-wGlob+1);
      /* calculate posterior for each position relative to posterior
      for position 1 */
      k=1;
      for(j=1;j<=wGlob;j++)
         k+=(yGlob[i][j]-1)*pow(pGlob,j-1);
      eta1=0.0;
      for(j=1;j<=numBpar;j++)
         eta1+=desMat[k][j]*regCoef[j];
         if(loggam(table[k]+1.0) > 300.0){
            cout << table[k] << endl;
            nrerror("gy1 problem");
      }
      gy1=exp(loggam(table[k]+1.0));
      res1[1]=1.0;
      for(j=2;j<=seqLen[i]-wGlob+1;j++){
         k1=1;
         for(k=1;k<=wGlob;k++)
            k1+=(yGlob[i][j+k-1]-1)*pow(pGlob,k-1);
         etak1=0.0;
         for(k=1;k<=numBpar;k++)
            etak1+=desMat[k1][k]*regCoef[k];
            if(loggam(table[k1]+1.0) > 300.0){
               nrerror("gyk1 problem");
            }
         gyk1=exp(loggam(table[k1]+1.0));
         if(etak1-eta1 > 300.0) nrerror("gyk1 problem");
         res1[j]=exp(etak1-eta1)*gy1/gyk1;
      }
      /* sample a motif position */
      mtfOld=mtf[i];
      mtf[i]=smpl(res1,seqLen[i]-wGlob+1,setSeed);
      /* now adjust the table to account for this motif changing */
      k=1;
      for(j=1;j<=wGlob;j++)
         k+=(yGlob[i][mtfOld+j-1]-1)*pow(pGlob,j-1);
      table[k]-=1.0;
      k=1;
      for(j=1;j<=wGlob;j++)
         k+=(yGlob[i][mtf[i]+j-1]-1)*pow(pGlob,j-1);
      table[k]+=1.0;
      free_dvector(res1,1,seqLen[i]-wGlob+1);
   }
   for(i=1;i<=nSeq;i++) par[i]=double(mtf[i]);
   for(i=nSeq+1;i<=noPar;i++) par[i]=regCoef[i-nSeq];
   free_ivector(mtf,1,nSeq);
   free_dvector(regCoef,1,numBpar);
   free_dvector(table,1,tLen);
}

void loglinSmpl(long *setSeed,int *mtf,double *beta)
{
/* this draws from post. of log lin model */
   int i,j;
   double **data;
   data=dmatrix(1,nSeq,1,wGlob);
/* first use mtf to pick out the data from yGlob */
   for(i=1;i<=nSeq;i++){
      for(j=1;j<=wGlob;j++){
         data[i][j]=yGlob[i][mtf[i]+j-1];
      }
   }
   /* now call the metropolis algorithm for a log linear model */
   metrop1(setSeed,numBpar,beta,data);
   free_dmatrix(data,1,nSeq,1,wGlob);
}

void metrop1(long *setSeed,int numpar,double *curDev,double **data)
{
/* does one iteration of the metropolis algorithm tryV
   is the varaince matrix for the normal jumping kernel, tryV
   is a global matrix which at first is the covariance matrix but
   gets turned into its inverse in main (after read in from external
   file) */
   int i,j;
   double sum,temp1,temp2,*mm,*canDev;
   mm=dvector(1,numpar);
   canDev=dvector(1,numpar);
   temp1=minFcn(curDev,data);
   for(i=1;i<=numpar;i++) mm[i]=gasdev(setSeed);
   for(i=1;i<=numpar;i++){
      sum=0.0;
      for(j=1;j<i;j++) sum+=tryV[i][j]*mm[j];
      canDev[i]=curDev[i]+sum+pp[i]*mm[i];
   }
   temp2=minFcn(canDev,data);
   if(log(ran1(setSeed)) < temp1-temp2){
      for(j=1;j<=numpar;j++) curDev[j]=canDev[j];
      acpt+=1.0;
   }
   free_dvector(mm,1,numpar);
   free_dvector(canDev,1,numpar);
}

double minFcn(double *dev,double **data)
{
/* this computes the negative log likelihood at dev for a log linear model
with a design matrix that is global and read in to the program in main
data is the seq data for positions within the current motif regions */
   int i,j,k;
   double res1,res2,*eta;
   eta=dvector(1,tLen);
   res1=0;
   for(i=1;i<=tLen;i++){
      eta[i]=0;
      for(j=1;j<=numBpar;j++){
         eta[i]+=desMat[i][j]*dev[j];
      }
      if(eta[i] > 300.0){
         cout << "eta=" << eta[i] << endl;
         for(k=1;k<=numBpar;k++) cout << dev[k] << " ";
         nrerror("eta problem");
      }
      res1+=exp(eta[i]);
   }
   res2=0;
   for(i=1;i<=nSeq;i++){
      k=1;
      for(j=1;j<=wGlob;j++)
         k+=(data[i][j]-1)*pow(pGlob,j-1);
      res2+=eta[k];
   }
   free_dvector(eta,1,tLen);
   return res1-res2;
}

void gibbsDriver(long *setSeed,int lookInt,int noChn,int numIter,int numburn,
   int thinfac,int numpar,double **initValu,double ***simres)
{
/* runs the Gibbs sampler-simres holds sim. results in form chain by
   iteration by parameter, and initValu a is matrix of initial values
   for the chain, this finds convergence diagnostic stat and reports
   it-stops if converrgence taken place if you want*/
   int ii,jj,i,j;
   double RSTOP=1.01,*par,*Rstat,**dev;
   Rstat=dvector(1,numpar);
   par=dvector(1,numpar);
   dev=dmatrix(1,numpar,1,noChn);
   for(ii=1;ii<=noChn;ii++){
      for(i=1;i<=numpar;i++){
         dev[i][ii]=initValu[i][ii];
      }
   }
   for(i=1;i<=numIter;i++){
      if(i%100==0) cout << "starting iteration number " << i << endl;
      for(ii=1;ii<=noChn;ii++){
//         cout << "starting chain number " << ii << endl;
         for(j=1;j<=numpar;j++) par[j]=dev[j][ii];
         gibbsSmplrLL(setSeed,par);
         for(j=1;j<=numpar;j++) dev[j][ii]=double(par[j]);
         if(i > numburn){
            if((i-numburn)%thinfac==0){
               for(j=1;j<=numpar;j++){
                  simres[ii][(i-numburn)/thinfac][j]=dev[j][ii];
               }
            }
         }
      }
      if(i%lookInt==0 && i > numburn && noChn > 1){
         sqrtR(numpar,Rstat,noChn,(i-numburn)/thinfac,numpar,simres);
         cout << "convergence diagnostic statistics after " << i << " iterations" << endl;
         jj=0;
/* able or disable early return by commenting in/out */
         for(j=1;j<=numpar;j++){
//            if(Rstat[j] < RSTOP) jj+=1;
            cout << Rstat[j] << " ";
         }
         cout << endl;
      }
//      if(jj==numpar) break;
   }
   cout << "acceptance rate for metropolis=" << acpt/(numIter*noChn) << endl;
   free_dvector(Rstat,1,numpar);
   free_dvector(par,1,numpar);
   free_dmatrix(dev,1,numpar,1,noChn);
}

int smpl(double *unnrmlzPst,int len,long *setSeed)
{
   /*  draws a sample from the set 1,2,...,len with
   unnormalized probabilities given by the vector
   unnrmlzPst */
   int i,ret=len;
   double qTot,u,cumsum;
   qTot=0.0;
   cumsum=0.0;
   for(i=1;i<=len;i++) qTot+=unnrmlzPst[i];
   if(qTot <= 1.0e-50) nrerror("error in smpl");
   u=ran1(setSeed);
   for(i=1;i<=len;i++){
      cumsum+=unnrmlzPst[i]/qTot;
      if(cumsum >= u){
         ret=i;
         break;
      }
   }
   return ret;
}

/* this is stuff for simple block motif sampler */

void gibbsSmplr(long *setSeed,int *mtf)
{
   /* this runs a single iteration of the Gibbs sampler */
   int i,j,k1,k2,*h1,**h2,q;
   double *res1;
   h1=ivector(1,pGlob);
   h2=imatrix(1,wGlob,1,pGlob);
   for(i=1;i<=nSeq;i++){
      res1=dvector(1,seqLen[i]-wGlob+1);
      /* given the current set of motifs, count the frequency of each
      letter in the background area and at each motif position */
      for(j=1;j<=pGlob;j++){
         h1[j]=beta0[j];
         for(k1=1;k1<=nSeq;k1++){
            for(k2=1;k2<=seqLen[k1];k2++){
               if(k1!=i){
                  if(k2<mtf[k1] || k2>=mtf[k1]+wGlob){
                     if(yGlob[k1][k2]==j) h1[j]+=1;
                  }
                }
                else{
                     if(yGlob[k1][k2]==j) h1[j]+=1;
                }
            }
         }
      }
      for(j=1;j<=pGlob;j++){
         for(k1=1;k1<=wGlob;k1++){
            h2[k1][j]=beta[k1][q];
            for(k2=1;k2<=nSeq;k2++){
               if(k2!=i){
                  if(yGlob[k2][mtf[k2]+k1-1]==j) h2[k1][j]+=1;
               }
            }
         }
      }
      for(j=1;j<=seqLen[i]-wGlob+1;j++){
         res1[j]=1;
         /* now scan all windows of width wGlob to assess agreement with
         the counts given by all other motifs relative to the background
         distribution */
         for(k1=1;k1<=wGlob;k1++){
            res1[j]*=(double(h2[k1][yGlob[i][j+k1-1]])/double(h1[yGlob[i][j+k1-1]]));
         }
      }
      /* sample a motif position using the results from scanning the sequence */
      mtf[i]=smpl(res1,seqLen[i]-wGlob+1,setSeed);
      free_dvector(res1,1,seqLen[i]-wGlob+1);
   }
   free_ivector(h1,1,pGlob);
   free_imatrix(h2,1,wGlob,1,pGlob);
}