/* this is an implementation of the Gibbs sampler for the block motif model of Liu et al. that uses a call to the Metropolis algorithm to sample a set of parameters from a log linear model: expects that the data is a matrix that holds numbers from 1 to size of alphabet that represent the particular element of that alphabet, e.g. A=1, C=2, G=3, T=4 */ #include #pragma hdrstop #include #include #include #include #include "C:\Documents and Settings\Owner\Desktop\matrices.txt" #include "C:\Documents and Settings\Owner\Desktop\randNo.txt" #include "C:\Documents and Settings\Owner\Desktop\sorting.txt" #include "C:\Documents and Settings\Owner\Desktop\math.txt" #include "C:\Documents and Settings\Owner\Desktop\minimize.txt" #pragma argsused #define FREE_ARG char* void gibbsDriver(long *setSeed,int lookInt,int noChn,int numIter,int numburn, int thinfac,int numpar,double **initValu,double ***simres); void gibbsSmplr(long *setSeed,int *mtf); int smpl(double *unnrmlzPst,int len,long *setSeed); double minFcn(double *curDev,double **data); void metrop1(long *setSeed,int numpar,double *curDev,double **data); void loglinSmpl(long *setSeed,int *mtf,double *beta); void gibbsSmplrLL(long *setSeed,int *mtf, double *beta); /* nSeq is the number of sequences, wGlob is the width of the motif, maxLen is the length of the longest sequence and pGlob is the size of the alphabet over which you are searching */ int nSeq=10,wGlob=8,maxLen=100,pGlob=4,*beta0,**beta,*seqLen,**yGlob; int noPar=nSeq+1+(pGlob-1)*wGlob,numBpar=1+(pGlob-1)*wGlob, tLen=pow(pGlob,wGlob); double acpt=0.0,*pp,**tryV,**desMat; int main() { /* this file holds data */ ifstream filer1("C:\\Documents and Settings\\Owner\\Desktop\\seqData.txt"); if(!filer1){ cerr << "Failure to open filer1\n"; getch(); exit(EXIT_FAILURE); } /* this file holds the length of each sequence */ ifstream filer2("C:\\Documents and Settings\\Owner\\Desktop\\seqLen.txt"); if(!filer2){ cerr << "Failure to open filer2\n"; getch(); exit(EXIT_FAILURE); } /* initial values, one for each sequence-reg coef sampled first */ ifstream filer3("C:\\Documents and Settings\\Owner\\Desktop\\blkMtfIV.txt"); if(!filer3){ cerr << "Failure to open filer3\n"; exit(EXIT_FAILURE); } /* covariance matrix for metropolis step */ ifstream filer4("C:\\Documents and Settings\\Owner\\Desktop\\covMat2.txt"); if(!filer4){ cerr << "Failure to open filer4\n"; exit(EXIT_FAILURE); } /* file that holds the design matrix for the log linear model */ ifstream filer5("C:\\Documents and Settings\\Owner\\Desktop\\desMat2.txt"); if(!filer5){ cerr << "Failure to open filer5\n"; exit(EXIT_FAILURE); } /* samples from the posterior */ ofstream filew1("C:\\Documents and Settings\\Owner\\Desktop\\blkMtfSmp.txt"); if(!filew1){ cerr << "Failure to open filew1\n"; getch(); exit(EXIT_FAILURE); } /* numIter is numiter per chain in Gibbs, numburn is number of burn in iterations, noChn is number of chains to use in MCMC, thinfac inicates how much thinning of chain output-use 1 to have none, and lookInt indicates how frequently convergence diagnostic statistics should be computed */ int numIter=40000,numburn=20000,noChn=1,lookInt=1,thinfac=10,ii=1,i,j, k,l,iter; long setSeed=-8623; double **initValu,***simres; seqLen=ivector(1,nSeq); beta0=ivector(1,wGlob); pp=dvector(1,numBpar); beta=imatrix(1,wGlob,1,pGlob); yGlob=imatrix(1,nSeq,1,maxLen); initValu=dmatrix(1,noPar,1,noChn); tryV=dmatrix(1,numBpar,1,numBpar); desMat=dmatrix(1,tLen,1,numBpar); simres=d3tensor(1,noChn,1,(numIter-numburn)/thinfac,1,noPar); filer2.seekg(0); for(i=1;i<=nSeq;i++){ filer2 >> seqLen[i]; } filer1.seekg(0); for(i=1;i<=nSeq;i++){ for(j=1;j<=seqLen[i];j++){ filer1 >> yGlob[i][j]; } } filer3.seekg(0); for(i=1;i<=noChn;i++){ for(j=1;j<=noPar;j++){ filer3 >> initValu[j][i]; } } filer4.seekg(0); for(i=1;i<=numBpar;i++){ for(j=1;j<=numBpar;j++){ filer4 >> tryV[i][j]; tryV[i][j]*=0.1; } } choldc(tryV,numBpar,pp); for(i=1;i<=tLen;i++){ for(j=1;j<=numBpar;j++){ filer5 >> desMat[i][j]; } } /* specify priors via pseudo-counts, here just vague */ for(i=1;i<=pGlob;i++){ beta0[i]=1; for(j=1;j<=wGlob;j++){ beta[j][i]=1; } } /* draw samples from the posterior */ gibbsDriver(&setSeed,lookInt,noChn,numIter,numburn,thinfac,noPar,initValu, simres); /* write to file */ for(i=1;i<=noChn;i++){ for(j=1;j<=(numIter-numburn)/thinfac;j++){ for(k=1;k<=noPar;k++){ filew1 << simres[i][j][k] << " "; } } } cout << "press any key to finish" << endl; getch(); free_ivector(seqLen,1,nSeq); free_ivector(beta0,1,wGlob); free_dvector(pp,1,numBpar); free_imatrix(beta,1,wGlob,1,pGlob); free_dmatrix(initValu,1,noPar,1,noChn); free_imatrix(yGlob,1,nSeq,1,maxLen); free_dmatrix(tryV,1,numBpar,1,numBpar); free_dmatrix(desMat,1,tLen,1,numBpar); free_d3tensor(simres,1,noChn,1,(numIter-numburn)/thinfac,1,noPar); } void gibbsSmplrLL(long *setSeed,double *par) { /* this runs a single iteration of the Gibbs sampler for block motif model run with a log linear model, first it samples the block of betas for the glm then it samples each starting position given beta and all other start sites*/ int i,j,k,k1,mtfOld,*mtf; double eta1,etak1,gy1,gyk1,*res1,*regCoef,*table; mtf=ivector(1,nSeq); regCoef=dvector(1,numBpar); table=dvector(1,tLen); for(i=1;i<=nSeq;i++) mtf[i]=int(par[i]); for(i=nSeq+1;i<=noPar;i++) regCoef[i-nSeq]=par[i]; /* sample the regression coefficients */ loglinSmpl(setSeed,mtf,regCoef); /* now sample the sites */ /* first construct the table using the existing set of motif start sites */ for(i=1;i<=tLen;i++) table[i]=0.0; for(i=1;i<=nSeq;i++){ k=1; for(j=1;j<=wGlob;j++) k+=(yGlob[i][mtf[i]+j-1]-1)*pow(pGlob,j-1); table[k]+=1.0; } for(i=1;i<=nSeq;i++){ res1=dvector(1,seqLen[i]-wGlob+1); /* calculate posterior for each position relative to posterior for position 1 */ k=1; for(j=1;j<=wGlob;j++) k+=(yGlob[i][j]-1)*pow(pGlob,j-1); eta1=0.0; for(j=1;j<=numBpar;j++) eta1+=desMat[k][j]*regCoef[j]; if(loggam(table[k]+1.0) > 300.0){ cout << table[k] << endl; nrerror("gy1 problem"); } gy1=exp(loggam(table[k]+1.0)); res1[1]=1.0; for(j=2;j<=seqLen[i]-wGlob+1;j++){ k1=1; for(k=1;k<=wGlob;k++) k1+=(yGlob[i][j+k-1]-1)*pow(pGlob,k-1); etak1=0.0; for(k=1;k<=numBpar;k++) etak1+=desMat[k1][k]*regCoef[k]; if(loggam(table[k1]+1.0) > 300.0){ nrerror("gyk1 problem"); } gyk1=exp(loggam(table[k1]+1.0)); if(etak1-eta1 > 300.0) nrerror("gyk1 problem"); res1[j]=exp(etak1-eta1)*gy1/gyk1; } /* sample a motif position */ mtfOld=mtf[i]; mtf[i]=smpl(res1,seqLen[i]-wGlob+1,setSeed); /* now adjust the table to account for this motif changing */ k=1; for(j=1;j<=wGlob;j++) k+=(yGlob[i][mtfOld+j-1]-1)*pow(pGlob,j-1); table[k]-=1.0; k=1; for(j=1;j<=wGlob;j++) k+=(yGlob[i][mtf[i]+j-1]-1)*pow(pGlob,j-1); table[k]+=1.0; free_dvector(res1,1,seqLen[i]-wGlob+1); } for(i=1;i<=nSeq;i++) par[i]=double(mtf[i]); for(i=nSeq+1;i<=noPar;i++) par[i]=regCoef[i-nSeq]; free_ivector(mtf,1,nSeq); free_dvector(regCoef,1,numBpar); free_dvector(table,1,tLen); } void loglinSmpl(long *setSeed,int *mtf,double *beta) { /* this draws from post. of log lin model */ int i,j; double **data; data=dmatrix(1,nSeq,1,wGlob); /* first use mtf to pick out the data from yGlob */ for(i=1;i<=nSeq;i++){ for(j=1;j<=wGlob;j++){ data[i][j]=yGlob[i][mtf[i]+j-1]; } } /* now call the metropolis algorithm for a log linear model */ metrop1(setSeed,numBpar,beta,data); free_dmatrix(data,1,nSeq,1,wGlob); } void metrop1(long *setSeed,int numpar,double *curDev,double **data) { /* does one iteration of the metropolis algorithm tryV is the varaince matrix for the normal jumping kernel, tryV is a global matrix which at first is the covariance matrix but gets turned into its inverse in main (after read in from external file) */ int i,j; double sum,temp1,temp2,*mm,*canDev; mm=dvector(1,numpar); canDev=dvector(1,numpar); temp1=minFcn(curDev,data); for(i=1;i<=numpar;i++) mm[i]=gasdev(setSeed); for(i=1;i<=numpar;i++){ sum=0.0; for(j=1;j 300.0){ cout << "eta=" << eta[i] << endl; for(k=1;k<=numBpar;k++) cout << dev[k] << " "; nrerror("eta problem"); } res1+=exp(eta[i]); } res2=0; for(i=1;i<=nSeq;i++){ k=1; for(j=1;j<=wGlob;j++) k+=(data[i][j]-1)*pow(pGlob,j-1); res2+=eta[k]; } free_dvector(eta,1,tLen); return res1-res2; } void gibbsDriver(long *setSeed,int lookInt,int noChn,int numIter,int numburn, int thinfac,int numpar,double **initValu,double ***simres) { /* runs the Gibbs sampler-simres holds sim. results in form chain by iteration by parameter, and initValu a is matrix of initial values for the chain, this finds convergence diagnostic stat and reports it-stops if converrgence taken place if you want*/ int ii,jj,i,j; double RSTOP=1.01,*par,*Rstat,**dev; Rstat=dvector(1,numpar); par=dvector(1,numpar); dev=dmatrix(1,numpar,1,noChn); for(ii=1;ii<=noChn;ii++){ for(i=1;i<=numpar;i++){ dev[i][ii]=initValu[i][ii]; } } for(i=1;i<=numIter;i++){ if(i%100==0) cout << "starting iteration number " << i << endl; for(ii=1;ii<=noChn;ii++){ // cout << "starting chain number " << ii << endl; for(j=1;j<=numpar;j++) par[j]=dev[j][ii]; gibbsSmplrLL(setSeed,par); for(j=1;j<=numpar;j++) dev[j][ii]=double(par[j]); if(i > numburn){ if((i-numburn)%thinfac==0){ for(j=1;j<=numpar;j++){ simres[ii][(i-numburn)/thinfac][j]=dev[j][ii]; } } } } if(i%lookInt==0 && i > numburn && noChn > 1){ sqrtR(numpar,Rstat,noChn,(i-numburn)/thinfac,numpar,simres); cout << "convergence diagnostic statistics after " << i << " iterations" << endl; jj=0; /* able or disable early return by commenting in/out */ for(j=1;j<=numpar;j++){ // if(Rstat[j] < RSTOP) jj+=1; cout << Rstat[j] << " "; } cout << endl; } // if(jj==numpar) break; } cout << "acceptance rate for metropolis=" << acpt/(numIter*noChn) << endl; free_dvector(Rstat,1,numpar); free_dvector(par,1,numpar); free_dmatrix(dev,1,numpar,1,noChn); } int smpl(double *unnrmlzPst,int len,long *setSeed) { /* draws a sample from the set 1,2,...,len with unnormalized probabilities given by the vector unnrmlzPst */ int i,ret=len; double qTot,u,cumsum; qTot=0.0; cumsum=0.0; for(i=1;i<=len;i++) qTot+=unnrmlzPst[i]; if(qTot <= 1.0e-50) nrerror("error in smpl"); u=ran1(setSeed); for(i=1;i<=len;i++){ cumsum+=unnrmlzPst[i]/qTot; if(cumsum >= u){ ret=i; break; } } return ret; } /* this is stuff for simple block motif sampler */ void gibbsSmplr(long *setSeed,int *mtf) { /* this runs a single iteration of the Gibbs sampler */ int i,j,k1,k2,*h1,**h2,q; double *res1; h1=ivector(1,pGlob); h2=imatrix(1,wGlob,1,pGlob); for(i=1;i<=nSeq;i++){ res1=dvector(1,seqLen[i]-wGlob+1); /* given the current set of motifs, count the frequency of each letter in the background area and at each motif position */ for(j=1;j<=pGlob;j++){ h1[j]=beta0[j]; for(k1=1;k1<=nSeq;k1++){ for(k2=1;k2<=seqLen[k1];k2++){ if(k1!=i){ if(k2=mtf[k1]+wGlob){ if(yGlob[k1][k2]==j) h1[j]+=1; } } else{ if(yGlob[k1][k2]==j) h1[j]+=1; } } } } for(j=1;j<=pGlob;j++){ for(k1=1;k1<=wGlob;k1++){ h2[k1][j]=beta[k1][q]; for(k2=1;k2<=nSeq;k2++){ if(k2!=i){ if(yGlob[k2][mtf[k2]+k1-1]==j) h2[k1][j]+=1; } } } } for(j=1;j<=seqLen[i]-wGlob+1;j++){ res1[j]=1; /* now scan all windows of width wGlob to assess agreement with the counts given by all other motifs relative to the background distribution */ for(k1=1;k1<=wGlob;k1++){ res1[j]*=(double(h2[k1][yGlob[i][j+k1-1]])/double(h1[yGlob[i][j+k1-1]])); } } /* sample a motif position using the results from scanning the sequence */ mtf[i]=smpl(res1,seqLen[i]-wGlob+1,setSeed); free_dvector(res1,1,seqLen[i]-wGlob+1); } free_ivector(h1,1,pGlob); free_imatrix(h2,1,wGlob,1,pGlob); }