// 25 Mar 2009: MPI double precision, finally!  Needs more testing...
// See some comments and define's below...

// 14 Oct 2008: Important correction to d_choldc_strip made

// I am now trying to "transpose" everything and make the code
// return L^T in order to see if the memory related slowdowns go away...
// It appears they do!

/* (C) Steven Gratton 2008.
   This program computes the Cholesky factorization of a matrix.
   Basic program layout inspired by the examples in the NVIDIA CUDA (TM) SDK.
   Basic algorithm based on that in Numerical Recipes for F90, 
   that referencing Golub and Van Loan, Matrix Computations, Johns
   Hopkins University Press.
   I came up with the parallelization scheme for the G80 myself, but
   (unsurprisingly) it is rather similar to those discussed for general 
   parallel computing, see e.g. SCALAPACK documentation.  I
*/

/* This code is in an unfinished state and as such should not be used
   for anything serious or be redistributed.  

   I plan to make a "proper" version available in the near future, 
   hopefully supporting double precision when that is available in hardware.

   Any comments/suggestions are welcome; please see my homepage at
   www.ast.cam.ac.uk/~stg20 for contact details.
*/


// You may need to increase stack size to get the program to run,
// try "ulimit -s unlimited" or similar on linux.
// block indices follow matrix conventions
// i.e. (bp,bq)
// refers to the bp'th block matrix in the bq'th column
// thread x,y indices are always chosen to coalesce global memory reads 
// for large matrices you may want to use diaginitialize...


// WARNING: various options will break if BLOCK_SIZE is changed from 16
// MAT_SIZE should be a multiple of BLOCK_SIZE
// MAT_BLOCKS should be a multiple of NUM_GPUS
// Set the number of processes "-np ..." in the argument to mpirun to NUM_GPUS
// Set the number of slots per node in the mpi hostfile to NUM_GPUS_PER_NODE

#define MAT_SIZE (16*1024)
#define BLOCK_SIZE 16
#define MAT_BLOCKS (MAT_SIZE/BLOCK_SIZE)
#define NUM_GPUS 2
#define NUM_GPUS_PER_NODE 2

// warning: "UNROLL" only set up for a block size of 16!
#define COMPARE
#define NOCPUCMP
#define UNROLL
//#define PRINTMATRIX
//#define FULLINIT
//#define PRINTINPUTMATRIX
//#define MATDISP
//#define CPU



#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>


#include "mpi.h"


int nprocs,myrank;

//a simple test case...
void diaginitialize(double (*x)[MAT_SIZE])
{
  int i,j;

  for (i=0;i<MAT_SIZE;i++)
    {
      for (j=0;j<MAT_SIZE;j++)
	{
	  x[i][j]=0.;
	}
    }    

  for (j=0;j<MAT_SIZE;j++)
    {
      x[j][j]=j+1.;
    }
}



// generates a random positive definite symmetric matrix
// by adding up the outer products of many (>MAT_SIZE) vectors
void initialize(double (*x)[MAT_SIZE])
{
  double tmp[MAT_SIZE];
  int i,j,a,b;

  for (i=0;i<MAT_SIZE;i++)
    {
      for (j=0;j<MAT_SIZE;j++)
	{
	  x[i][j]=0.;
	  //	    printf("Zeroing: x[%d][%d]=%f.\n",i,j,x[i][j]);
	}
    
    }

  for (a=0;a<5*MAT_SIZE;a++)
    {
      for (b=0;b<MAT_SIZE;b++)
	{
	  tmp[b]=(1.*(double)rand()) / RAND_MAX;
	  //	    printf("tmp[%d] for vector %d = %f.\n",b,a,tmp[b]);
	}
      for (i=0;i<MAT_SIZE;i++)
	{
	  for (j=0;j<MAT_SIZE;j++)
	    {
	      x[i][j]+=tmp[i]*tmp[j];
	      //	    printf("Adding: x[%d][%d]+=%f.\n",i,j,x[i][j]);
	    }	
	}
      for (i=0;i<MAT_SIZE;i++)
	{
	  for (j=0;j<MAT_SIZE;j++)
	    {
	      //	    printf("Final: x[%d][%d]=%f.\n",i,j,x[i][j]);
	    }	
	}
    }
}

void diagonalinitialize(double (*d)[MAT_SIZE],double(*x)[MAT_SIZE])
{
  for (int k=0;k<MAT_SIZE;k++)
    {
      d[0][k]=x[k][k];
    }
}

void cpucholdc(double (*x)[MAT_SIZE],double (*d)[MAT_SIZE])
{
  printf("*Please provide your own cpu implementation...*\n");
}



// this is a small kernel that Cholesky decomposes the current "top left" 
// block of the matrix...

__global__ void d_choldc_topleft(double (*m)[MAT_SIZE],
				 int boffset)
{
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  __shared__ double topleft[BLOCK_SIZE][BLOCK_SIZE+1];

  topleft[ty][tx]=m[ty+(boffset/NUM_GPUS)*BLOCK_SIZE][tx+BLOCK_SIZE*boffset];

  __syncthreads();

  double diagelem,fac;


  // in this loop tx labels column, ty row
  for(int k=0;k<BLOCK_SIZE;k++)
    {
      __syncthreads();
      fac=1./sqrt(topleft[k][k]);
      __syncthreads();
      if ((ty==k)&&(tx>=k)) 
	{
	  topleft[ty][tx]=(topleft[ty][tx])*fac;
	}
      __syncthreads();

      if ((tx>=ty)&&(ty>k)) 
	{
	  topleft[ty][tx]=topleft[ty][tx]-topleft[k][ty]*topleft[k][tx]; 
	}

    }

  __syncthreads();


  if (tx>=ty) {
    m[ty+(boffset/NUM_GPUS)*BLOCK_SIZE][tx+BLOCK_SIZE*boffset] 
      =topleft[ty][tx];
  }

}








// this kernel updates the strip below the "topleft" block
__global__ void d_choldc_strip(double (*m)[MAT_SIZE],
			       int blockoffset)
{

  // +1 since blockoffset labels the "topleft" position
  // and boff is the working position...
  int boffx = blockIdx.x+blockoffset+1; 
  int tx = threadIdx.x;
  int ty = threadIdx.y;

    
  __shared__ double topleftt[BLOCK_SIZE][BLOCK_SIZE+1];
  __shared__ double workingmat[BLOCK_SIZE][BLOCK_SIZE+1];

  // deliberately transposed...
  topleftt[tx][ty]=
    m[ty+(blockoffset/NUM_GPUS)*BLOCK_SIZE][tx+blockoffset*BLOCK_SIZE];
  workingmat[ty][tx]=
    m[ty+(blockoffset/NUM_GPUS)*BLOCK_SIZE][tx+boffx*BLOCK_SIZE];

  __syncthreads();

  // now we forward-substitute for the new strip-elements...
  // one thread per column (a bit inefficient I'm afraid)

  if(ty==0)
    {
      for (int k=0;k<BLOCK_SIZE;k++)
	{
	  double dotprod=0.;
	  for (int m=0;m<k;m++)
	    {
	      dotprod+=topleftt[k][m]*workingmat[m][tx];
	    }
	  workingmat[k][tx]=(workingmat[k][tx]-dotprod)/topleftt[k][k];
	}
    }

  __syncthreads();

  // correction here!
  m[ty+(blockoffset/NUM_GPUS)*BLOCK_SIZE][tx+boffx*BLOCK_SIZE]
    =workingmat[ty][tx];
 
}




__global__ void d_choldc_diagupdate(double (*m)[MAT_SIZE],
				    double (*wsm)[MAT_SIZE],
				    int startingblock)  
{
  int boffx = NUM_GPUS*blockIdx.x+startingblock; 

  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // the +1's stop shared memory bank conflicts when accessing down columns
  // There are already no shared bank conflicts when accessing by row

  __shared__ double topt[BLOCK_SIZE][BLOCK_SIZE+1];

  // deliberately transposed...
  topt[tx][ty]=wsm[ty][tx+boffx*BLOCK_SIZE];

  __syncthreads();

  // ty,tx'th thread works out (ty,tx) cmpt of the product...
  double matrixprod=0.;
     

  // C'=C-top^T top = C topt topt^T ...
  if(tx>=ty)  
    {

#ifdef UNROLL

      matrixprod+=topt[ty][0]*topt[tx][0];
      matrixprod+=topt[ty][1]*topt[tx][1];
      matrixprod+=topt[ty][2]*topt[tx][2];
      matrixprod+=topt[ty][3]*topt[tx][3];
      matrixprod+=topt[ty][4]*topt[tx][4];
      matrixprod+=topt[ty][5]*topt[tx][5];
      matrixprod+=topt[ty][6]*topt[tx][6];
      matrixprod+=topt[ty][7]*topt[tx][7];
      matrixprod+=topt[ty][8]*topt[tx][8];
      matrixprod+=topt[ty][9]*topt[tx][9];
      matrixprod+=topt[ty][10]*topt[tx][10];
      matrixprod+=topt[ty][11]*topt[tx][11];
      matrixprod+=topt[ty][12]*topt[tx][12];
      matrixprod+=topt[ty][13]*topt[tx][13];
      matrixprod+=topt[ty][14]*topt[tx][14];
      matrixprod+=topt[ty][15]*topt[tx][15];


#else

      for (int kk=0;kk<BLOCK_SIZE;kk++)
	{
	  matrixprod+=topt[ty][kk]*topt[tx][kk];
	}
    
#endif



      m[ty+(boffx/NUM_GPUS)*BLOCK_SIZE][tx+boffx*BLOCK_SIZE]-=matrixprod; 
    }
}






// this kernel takes the results of the above ones and applies them to the 
//rest of the matrix...
__global__ void d_choldc_hiupdate(double (*m)[MAT_SIZE],
				  double (*wsm)[MAT_SIZE],
				  int startingblock)  
{

  int tx = threadIdx.x;
  int ty = threadIdx.y;

  int boffy=NUM_GPUS*blockIdx.x+startingblock;
  int boffx=boffy+1;

  // the +1's stop shared memory bank conflicts when accessing down columns
  // There are already no shared bank conflicts when accessing by row

  __shared__ double leftt[BLOCK_SIZE][BLOCK_SIZE+1];
  __shared__ double rightt[BLOCK_SIZE][BLOCK_SIZE+1];


  // now read in the data, always from top right

  int tmpx,tmpy;

  tmpy=__mul24(boffy,BLOCK_SIZE);

  // note the tmpy in the latter term to ensure we get the
  // correct common matrix for the row
  leftt[tx][ty]=wsm[ty][tx+tmpy];

  for (;boffx<MAT_BLOCKS;boffx++){


    tmpx=__mul24(boffx,BLOCK_SIZE);



    rightt[tx][ty]=wsm[ty][tx+tmpx];

    __syncthreads();



 
    double matrixprod=0.;

    // ty,tx'th thread works out (ty,tx) cmpt of the product...
#ifdef UNROLL


    matrixprod+=leftt[ty][0]*rightt[tx][0];
    matrixprod+=leftt[ty][1]*rightt[tx][1];
    matrixprod+=leftt[ty][2]*rightt[tx][2];
    matrixprod+=leftt[ty][3]*rightt[tx][3];
    matrixprod+=leftt[ty][4]*rightt[tx][4];
    matrixprod+=leftt[ty][5]*rightt[tx][5];
    matrixprod+=leftt[ty][6]*rightt[tx][6];
    matrixprod+=leftt[ty][7]*rightt[tx][7];
    matrixprod+=leftt[ty][8]*rightt[tx][8];
    matrixprod+=leftt[ty][9]*rightt[tx][9];
    matrixprod+=leftt[ty][10]*rightt[tx][10];
    matrixprod+=leftt[ty][11]*rightt[tx][11];
    matrixprod+=leftt[ty][12]*rightt[tx][12];
    matrixprod+=leftt[ty][13]*rightt[tx][13];
    matrixprod+=leftt[ty][14]*rightt[tx][14];
    matrixprod+=leftt[ty][15]*rightt[tx][15];


#else

    for (int kk=0;kk<BLOCK_SIZE;kk++)
      {
	matrixprod+=leftt[ty][kk]*rightt[tx][kk];
      }
    
#endif

    __syncthreads();

    m[ty+(boffy/NUM_GPUS)*BLOCK_SIZE][tx+tmpx]-=matrixprod;

  }

}



void cpumatdisp(double (*mat)[MAT_SIZE],double (*diag)[MAT_SIZE])
{
  int i,j;
  printf("CPU output: \n");

  for (j=0;j<MAT_SIZE;j++)
    {
      for (i=0;i<MAT_SIZE;i++)
	{
	  printf("%7.4f ",mat[j][i]);
	}
      printf("\n");
    }
  printf("\n");
  for (i=0;i<MAT_SIZE;i++)
    {
      printf("%7.4f ",diag[0][i]);
    }

  printf("\n");
}


void matdisp(double (*matptr)[MAT_SIZE])
{
  double mat[MAT_SIZE][MAT_SIZE];

  unsigned int mat_size=MAT_SIZE*MAT_SIZE*sizeof(double);

  int i,j;
  cudaError_t error;
  cudaThreadSynchronize();

  //    printf("In matdisp, matptr=%p.\n\n",matptr);

  cudaMemcpy(mat,matptr, mat_size,cudaMemcpyDeviceToHost);
  //	error=cudaGetLastError();
  //	printf("In mat disp, Error code %d: %s.\n",error,cudaGetErrorString(error));

  cudaThreadSynchronize();

  printf("\n");

  for (j=0;j<MAT_SIZE;j++)
    {
      for (i=0;i<MAT_SIZE;i++)
	{
	  printf("%7.4f ",mat[j][i]);
	}
      printf("\n");
    }

  printf("\n");


  cudaThreadSynchronize();
}
    


void choldc(double (*mat)[MAT_SIZE])
{
  volatile clock_t gputime;

  double workingblockrow[BLOCK_SIZE][MAT_SIZE];

  double (*d_mat)[MAT_SIZE];
  unsigned int mat_size=(MAT_SIZE/NUM_GPUS)*MAT_SIZE*sizeof(double);

  double (*d_workingblockrow)[MAT_SIZE];
  unsigned int workingblockrow_size=BLOCK_SIZE*MAT_SIZE*sizeof(double);

  cudaError_t error; 


  dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
  dim3 stripgrid;
  dim3 diaggrid;
  dim3 higrid;

  int worker,workstartrow;

  int j=MAT_BLOCKS; // doesn't change
  int i=MAT_BLOCKS; // counts down; the no. of block cols still to do 

  int tmp1,tmp2,tmp3,tmp4; // for indexing calculations
  int pushforward;
  int firstdiagblock; 

  cudaMalloc((void**) &d_mat,mat_size);
  cudaMemcpy(d_mat, mat, mat_size,cudaMemcpyHostToDevice);

  cudaMalloc((void**) &d_workingblockrow,workingblockrow_size);


  error=cudaGetLastError();
  printf("     Error code from process %i is %d: %s.\n",
	 myrank,error,cudaGetErrorString(error)); 

  gputime=clock();

#ifdef MATDISP
  matdisp(d_mat);
#endif

  while(i>2)
    {
      worker=(j-i)%nprocs;
      workstartrow=((j-i)/nprocs)*BLOCK_SIZE;

      //      printf("worker for i=%i is %i.\n",i,worker);

      stripgrid.x=i-1;
      stripgrid.y=1;
      stripgrid.z=1;

      // tmp3 shall contain the total no of blockrows a process will handle 
      tmp1=(MAT_BLOCKS)/nprocs; 
      tmp2=(MAT_BLOCKS)%nprocs; 
      if (myrank<tmp2){
	tmp3=tmp1+1;
      }
      else{
	tmp3=tmp1;
      }

      // tmp4 shall contain the number of irrelevant blockrows
      // for a process...
      tmp1=(j-i+1)/nprocs; 
      tmp2=(j-i+1)%nprocs; 
      if (myrank<tmp2){
	tmp4=tmp1+1;
      }
      else{
	tmp4=tmp1;
      }

      diaggrid.x=tmp3-tmp4;
      diaggrid.y=1;
      diaggrid.z=1;

      // now subtracting 1 off the total because hiupdate
      // doesn't operate on the last blockrow 
      tmp1=(MAT_BLOCKS-1)/nprocs; 
      tmp2=(MAT_BLOCKS-1)%nprocs; 
      if (myrank<tmp2){
	tmp3=tmp1+1;
      }
      else{
	tmp3=tmp1;
      }

      higrid.x=tmp3-tmp4;
      higrid.y=1;
      higrid.z=1;

      if(myrank==worker){
	//	printf("rank %i about to work...\n",myrank);
	d_choldc_topleft<<<1,threads>>>(d_mat,j-i);
	d_choldc_strip<<<stripgrid,threads>>>(d_mat,j-i);

	// the following could possibly be replaced by an appropriate
	// call to cudaMemcpy2D, saving communication...
	cudaMemcpy(workingblockrow,d_mat[workstartrow],
		   workingblockrow_size,
		   cudaMemcpyDeviceToHost);

      }

      MPI_Bcast(workingblockrow,BLOCK_SIZE*MAT_SIZE,MPI_DOUBLE
		,worker,MPI_COMM_WORLD);
      cudaMemcpy(d_workingblockrow,workingblockrow,workingblockrow_size,
		 cudaMemcpyHostToDevice);

      if(myrank>worker)  firstdiagblock=j-i+myrank-worker;
      if(myrank==worker) firstdiagblock=j-i+nprocs;
      if(myrank<worker)  firstdiagblock=j-i+nprocs+myrank-worker;

      if(diaggrid.x>0){
	d_choldc_diagupdate<<<diaggrid,threads>>>(d_mat,
						  d_workingblockrow,
						  firstdiagblock);
      }
      if(higrid.x>0){
	d_choldc_hiupdate<<<higrid,threads>>>(d_mat,
					      d_workingblockrow,
					      firstdiagblock);
      }
      i--;
    }

  if(j>1)
    {
      worker=(j-2)%nprocs;
      workstartrow=((j-2)/nprocs)*BLOCK_SIZE;

      if(myrank==worker){
	//	printf("rank %i about to work in part %i...\n",myrank,i);
	
	d_choldc_topleft<<<1,threads>>>(d_mat,j-2);
	d_choldc_strip<<<1,threads>>>(d_mat,j-2);

	cudaMemcpy(workingblockrow,d_mat[workstartrow]
		   ,workingblockrow_size,
		   cudaMemcpyDeviceToHost);
      }

      MPI_Bcast(workingblockrow,BLOCK_SIZE*MAT_SIZE,MPI_DOUBLE
		,worker,MPI_COMM_WORLD);
      cudaMemcpy(d_workingblockrow,workingblockrow,workingblockrow_size,
		 cudaMemcpyHostToDevice);

      if(myrank>worker)  firstdiagblock=j-i+myrank-worker;
      if(myrank==worker) firstdiagblock=j-i+nprocs;
      if(myrank<worker)  firstdiagblock=j-i+nprocs+myrank-worker;

      // tmp3 shall contain the total no of blockrows a process will handle 
      tmp1=(MAT_BLOCKS)/nprocs; 
      tmp2=(MAT_BLOCKS)%nprocs; 
      if (myrank<tmp2){
	tmp3=tmp1+1;
      }
      else{
	tmp3=tmp1;
      }

      // tmp4 shall contain the number of irrelevant blockrows
      // for a process...
      tmp1=(j-i+1)/nprocs; 
      tmp2=(j-i+1)%nprocs; 
      if (myrank<tmp2){
	tmp4=tmp1+1;
      }
      else{
	tmp4=tmp1;
      }

      diaggrid.x=tmp3-tmp4;
      diaggrid.y=1;
      diaggrid.z=1;


      if(diaggrid.x>0){
	d_choldc_diagupdate<<<diaggrid,threads>>>(d_mat,
						  d_workingblockrow,
						  firstdiagblock);
      }
      i--;
    }

  //  printf("rank %i out of the if...\n",myrank);


  worker=(j-1)%nprocs;
  if(myrank==worker){
    // printf("rank %i about to work in the last topleft\n",myrank);
    d_choldc_topleft<<<1,threads>>>(d_mat,j-1);
  }

  cudaThreadSynchronize();
  gputime=clock()-gputime;

  printf("kernel time from process %i=%f s.\n",myrank,gputime/1.e6f);

  cudaMemcpy(mat,d_mat, mat_size,cudaMemcpyDeviceToHost);
 
  cudaFree(d_mat);
  cudaFree(d_workingblockrow);
 
  cudaThreadSynchronize();

  error=cudaGetLastError();
  printf("     Error code from process %i is %d: %s.\n",
	 myrank,error,cudaGetErrorString(error)); 

}







int main(int argc, char** argv)
{
  double x[(MAT_SIZE/NUM_GPUS)][MAT_SIZE];
  double *bigxptr=NULL;
  double *bigx=NULL;
  double *xptr=NULL;
  int i,j,devcnt,mydev;

  int tmp;

  clock_t maincputime,maingputime;

  cudaError_t error;

  MPI_Init(&argc,&argv);

  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

  printf("Hi from rank %i.\n",myrank);

  if(nprocs!=NUM_GPUS) {
    printf("disagreement between NUM_GPUS and nprocs, from rank %i.\n",
	   myrank);
    MPI_Abort(MPI_COMM_WORLD,-1);
    MPI_Finalize();
    return -1;
  }

  cudaGetDeviceCount(&devcnt);
  if(devcnt!=NUM_GPUS_PER_NODE) {
    printf("disagreement between NUM_GPUS_PER_NODE and devcnt on rank %i.\n",
	   myrank);
    MPI_Abort(MPI_COMM_WORLD,-1);
    MPI_Finalize();
    return -1;
  }
  mydev=myrank%devcnt;
  cudaSetDevice(mydev);
  error=cudaGetLastError();
  printf("     Error code from process %i is %d: %s.\n",
	 myrank,error,cudaGetErrorString(error)); 
  cudaGetDevice(&tmp);
  printf("     cuda device for process %i is %i.\n",myrank,tmp);

  if (myrank==0) {
    printf("In from proc 0.\n");
    bigx=(double*)calloc(sizeof(double),MAT_SIZE*MAT_SIZE);
     if (bigx==NULL) printf("Problem with allocation.\n");
   memset(bigx,0,sizeof(double)*MAT_SIZE*MAT_SIZE);

    printf("Initializing test matrix...\n");

    for (i=0;i < MAT_SIZE; i++) {
      bigx[i*MAT_SIZE+i]=i+1.;
    }

    //    for (i=0;i<MAT_SIZE;i+=100){
    // printf("m[%i][%i]=%f\n",i,i,bigx[i*MAT_SIZE+i]);
    //}
  }

  for(i=0;i<MAT_SIZE/(NUM_GPUS*BLOCK_SIZE);i++){
    if(myrank==0){
      bigxptr=&(bigx[NUM_GPUS*BLOCK_SIZE*i*MAT_SIZE]);
      //    printf("%p, %f, %f.\n",bigxptr,*bigxptr,bigxptr[3*MAT_SIZE+3]);
    }
    else{
      bigxptr=NULL;
    }
    xptr=&(x[BLOCK_SIZE*i][0]);
    //    printf("From rank %d, %p, %f, %f.\n",myrank,xptr,*xptr,xptr[3]);
    MPI_Scatter((void*)bigxptr,
		BLOCK_SIZE*MAT_SIZE,
		MPI_DOUBLE,
		(void*)xptr,BLOCK_SIZE*MAT_SIZE,MPI_DOUBLE,
		0,MPI_COMM_WORLD);
    
  }

  //  printf("From rank %d, %f, %f.\n",myrank,x[16][32],x[16][33]);

  printf("Cholesky factorizing...\n");

#ifdef COMPARE
  maincputime=clock();
#ifdef NOCPUCMP
#else
  cpucholdc(x,diagonal);
#endif
  maincputime=clock()-maincputime;


#ifdef WARMUP
  printf("gpu warmup...\n");
  maingputime=clock();
  choldc(x);
  maingputime=clock()-maingputime;

  printf("maingputime=%u, maincputime=%u.\n",maingputime,maincputime);

#ifdef FULLINIT
  initialize(x);
#else
  diaginitialize(x);
#endif

  diagonalinitialize(diagonal,x);
#endif

  printf("gpu proper...\n");
  maingputime=clock();
  choldc(x);
  maingputime=clock()-maingputime;

  printf("maingputime=%u, maincputime=%u.\n",maingputime,maincputime);



#else

#ifdef CPU
  cpucholdc(x,diagonal);
#ifdef MATDISP
  cpumatdisp(x,diagonal);
#endif

#else
  choldc(x,diagonal);
#endif
#endif

  //  printf("x[%d][%d]=%f.\n",MAT_SIZE-1,MAT_SIZE-1,x[MAT_SIZE-1][MAT_SIZE-1]);

  for(i=0;i<MAT_SIZE/(NUM_GPUS*BLOCK_SIZE);i++){
    if(myrank==0){
      bigxptr=&(bigx[NUM_GPUS*BLOCK_SIZE*i*MAT_SIZE]);
      //    printf("%p, %f, %f.\n",bigxptr,*bigxptr,bigxptr[3*MAT_SIZE+3]);
    }
    else{
      bigxptr=NULL;
    }
    xptr=&(x[BLOCK_SIZE*i][0]);
    //    printf("From rank %d, %p, %f, %f.\n",myrank,xptr,*xptr,xptr[3]);
    MPI_Gather((void*)xptr,
		BLOCK_SIZE*MAT_SIZE,
		MPI_DOUBLE,
		(void*)bigxptr,BLOCK_SIZE*MAT_SIZE,MPI_DOUBLE,
		0,MPI_COMM_WORLD);
    
  }


  if (myrank==0) {
  for (i=0;i<MAT_SIZE;i+=100)
    {
      printf("d[%d]=%f.\n",i,bigx[i*MAT_SIZE+i]);
    }

free(bigx);
  }

  MPI_Finalize();

  return 0;
}