// 25 Mar 2009: MPI double precision, finally! Needs more testing... // See some comments and define's below... // 14 Oct 2008: Important correction to d_choldc_strip made // I am now trying to "transpose" everything and make the code // return L^T in order to see if the memory related slowdowns go away... // It appears they do! /* (C) Steven Gratton 2008. This program computes the Cholesky factorization of a matrix. Basic program layout inspired by the examples in the NVIDIA CUDA (TM) SDK. Basic algorithm based on that in Numerical Recipes for F90, that referencing Golub and Van Loan, Matrix Computations, Johns Hopkins University Press. I came up with the parallelization scheme for the G80 myself, but (unsurprisingly) it is rather similar to those discussed for general parallel computing, see e.g. SCALAPACK documentation. I */ /* This code is in an unfinished state and as such should not be used for anything serious or be redistributed. I plan to make a "proper" version available in the near future, hopefully supporting double precision when that is available in hardware. Any comments/suggestions are welcome; please see my homepage at www.ast.cam.ac.uk/~stg20 for contact details. */ // You may need to increase stack size to get the program to run, // try "ulimit -s unlimited" or similar on linux. // block indices follow matrix conventions // i.e. (bp,bq) // refers to the bp'th block matrix in the bq'th column // thread x,y indices are always chosen to coalesce global memory reads // for large matrices you may want to use diaginitialize... // WARNING: various options will break if BLOCK_SIZE is changed from 16 // MAT_SIZE should be a multiple of BLOCK_SIZE // MAT_BLOCKS should be a multiple of NUM_GPUS // Set the number of processes "-np ..." in the argument to mpirun to NUM_GPUS // Set the number of slots per node in the mpi hostfile to NUM_GPUS_PER_NODE #define MAT_SIZE (16*1024) #define BLOCK_SIZE 16 #define MAT_BLOCKS (MAT_SIZE/BLOCK_SIZE) #define NUM_GPUS 2 #define NUM_GPUS_PER_NODE 2 // warning: "UNROLL" only set up for a block size of 16! #define COMPARE #define NOCPUCMP #define UNROLL //#define PRINTMATRIX //#define FULLINIT //#define PRINTINPUTMATRIX //#define MATDISP //#define CPU #include #include #include #include #include "mpi.h" int nprocs,myrank; //a simple test case... void diaginitialize(double (*x)[MAT_SIZE]) { int i,j; for (i=0;iMAT_SIZE) vectors void initialize(double (*x)[MAT_SIZE]) { double tmp[MAT_SIZE]; int i,j,a,b; for (i=0;i=k)) { topleft[ty][tx]=(topleft[ty][tx])*fac; } __syncthreads(); if ((tx>=ty)&&(ty>k)) { topleft[ty][tx]=topleft[ty][tx]-topleft[k][ty]*topleft[k][tx]; } } __syncthreads(); if (tx>=ty) { m[ty+(boffset/NUM_GPUS)*BLOCK_SIZE][tx+BLOCK_SIZE*boffset] =topleft[ty][tx]; } } // this kernel updates the strip below the "topleft" block __global__ void d_choldc_strip(double (*m)[MAT_SIZE], int blockoffset) { // +1 since blockoffset labels the "topleft" position // and boff is the working position... int boffx = blockIdx.x+blockoffset+1; int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double topleftt[BLOCK_SIZE][BLOCK_SIZE+1]; __shared__ double workingmat[BLOCK_SIZE][BLOCK_SIZE+1]; // deliberately transposed... topleftt[tx][ty]= m[ty+(blockoffset/NUM_GPUS)*BLOCK_SIZE][tx+blockoffset*BLOCK_SIZE]; workingmat[ty][tx]= m[ty+(blockoffset/NUM_GPUS)*BLOCK_SIZE][tx+boffx*BLOCK_SIZE]; __syncthreads(); // now we forward-substitute for the new strip-elements... // one thread per column (a bit inefficient I'm afraid) if(ty==0) { for (int k=0;k=ty) { #ifdef UNROLL matrixprod+=topt[ty][0]*topt[tx][0]; matrixprod+=topt[ty][1]*topt[tx][1]; matrixprod+=topt[ty][2]*topt[tx][2]; matrixprod+=topt[ty][3]*topt[tx][3]; matrixprod+=topt[ty][4]*topt[tx][4]; matrixprod+=topt[ty][5]*topt[tx][5]; matrixprod+=topt[ty][6]*topt[tx][6]; matrixprod+=topt[ty][7]*topt[tx][7]; matrixprod+=topt[ty][8]*topt[tx][8]; matrixprod+=topt[ty][9]*topt[tx][9]; matrixprod+=topt[ty][10]*topt[tx][10]; matrixprod+=topt[ty][11]*topt[tx][11]; matrixprod+=topt[ty][12]*topt[tx][12]; matrixprod+=topt[ty][13]*topt[tx][13]; matrixprod+=topt[ty][14]*topt[tx][14]; matrixprod+=topt[ty][15]*topt[tx][15]; #else for (int kk=0;kk2) { worker=(j-i)%nprocs; workstartrow=((j-i)/nprocs)*BLOCK_SIZE; // printf("worker for i=%i is %i.\n",i,worker); stripgrid.x=i-1; stripgrid.y=1; stripgrid.z=1; // tmp3 shall contain the total no of blockrows a process will handle tmp1=(MAT_BLOCKS)/nprocs; tmp2=(MAT_BLOCKS)%nprocs; if (myrank>>(d_mat,j-i); d_choldc_strip<<>>(d_mat,j-i); // the following could possibly be replaced by an appropriate // call to cudaMemcpy2D, saving communication... cudaMemcpy(workingblockrow,d_mat[workstartrow], workingblockrow_size, cudaMemcpyDeviceToHost); } MPI_Bcast(workingblockrow,BLOCK_SIZE*MAT_SIZE,MPI_DOUBLE ,worker,MPI_COMM_WORLD); cudaMemcpy(d_workingblockrow,workingblockrow,workingblockrow_size, cudaMemcpyHostToDevice); if(myrank>worker) firstdiagblock=j-i+myrank-worker; if(myrank==worker) firstdiagblock=j-i+nprocs; if(myrank0){ d_choldc_diagupdate<<>>(d_mat, d_workingblockrow, firstdiagblock); } if(higrid.x>0){ d_choldc_hiupdate<<>>(d_mat, d_workingblockrow, firstdiagblock); } i--; } if(j>1) { worker=(j-2)%nprocs; workstartrow=((j-2)/nprocs)*BLOCK_SIZE; if(myrank==worker){ // printf("rank %i about to work in part %i...\n",myrank,i); d_choldc_topleft<<<1,threads>>>(d_mat,j-2); d_choldc_strip<<<1,threads>>>(d_mat,j-2); cudaMemcpy(workingblockrow,d_mat[workstartrow] ,workingblockrow_size, cudaMemcpyDeviceToHost); } MPI_Bcast(workingblockrow,BLOCK_SIZE*MAT_SIZE,MPI_DOUBLE ,worker,MPI_COMM_WORLD); cudaMemcpy(d_workingblockrow,workingblockrow,workingblockrow_size, cudaMemcpyHostToDevice); if(myrank>worker) firstdiagblock=j-i+myrank-worker; if(myrank==worker) firstdiagblock=j-i+nprocs; if(myrank0){ d_choldc_diagupdate<<>>(d_mat, d_workingblockrow, firstdiagblock); } i--; } // printf("rank %i out of the if...\n",myrank); worker=(j-1)%nprocs; if(myrank==worker){ // printf("rank %i about to work in the last topleft\n",myrank); d_choldc_topleft<<<1,threads>>>(d_mat,j-1); } cudaThreadSynchronize(); gputime=clock()-gputime; printf("kernel time from process %i=%f s.\n",myrank,gputime/1.e6f); cudaMemcpy(mat,d_mat, mat_size,cudaMemcpyDeviceToHost); cudaFree(d_mat); cudaFree(d_workingblockrow); cudaThreadSynchronize(); error=cudaGetLastError(); printf(" Error code from process %i is %d: %s.\n", myrank,error,cudaGetErrorString(error)); } int main(int argc, char** argv) { double x[(MAT_SIZE/NUM_GPUS)][MAT_SIZE]; double *bigxptr=NULL; double *bigx=NULL; double *xptr=NULL; int i,j,devcnt,mydev; int tmp; clock_t maincputime,maingputime; cudaError_t error; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&myrank); printf("Hi from rank %i.\n",myrank); if(nprocs!=NUM_GPUS) { printf("disagreement between NUM_GPUS and nprocs, from rank %i.\n", myrank); MPI_Abort(MPI_COMM_WORLD,-1); MPI_Finalize(); return -1; } cudaGetDeviceCount(&devcnt); if(devcnt!=NUM_GPUS_PER_NODE) { printf("disagreement between NUM_GPUS_PER_NODE and devcnt on rank %i.\n", myrank); MPI_Abort(MPI_COMM_WORLD,-1); MPI_Finalize(); return -1; } mydev=myrank%devcnt; cudaSetDevice(mydev); error=cudaGetLastError(); printf(" Error code from process %i is %d: %s.\n", myrank,error,cudaGetErrorString(error)); cudaGetDevice(&tmp); printf(" cuda device for process %i is %i.\n",myrank,tmp); if (myrank==0) { printf("In from proc 0.\n"); bigx=(double*)calloc(sizeof(double),MAT_SIZE*MAT_SIZE); if (bigx==NULL) printf("Problem with allocation.\n"); memset(bigx,0,sizeof(double)*MAT_SIZE*MAT_SIZE); printf("Initializing test matrix...\n"); for (i=0;i < MAT_SIZE; i++) { bigx[i*MAT_SIZE+i]=i+1.; } // for (i=0;i