// (c) 2008 Steven Gratton
// Guided by examples from the AMD Stream SDK

// Note the assignment of the same resource
// to both an input and output buffer of a kernel
// in order to allow the factorization to 
// be done in place 

// Performance is better than the code using
// a global buffer but still needs improving...

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

#include "cal.h"
#include "calcl.h"

#include "cal_ext.h"
#include "cal_ext_counter.h"

#include "newcholforweb.h"

// Minimal IL program text (il_ps_2_0 header, ret_dyn, end).  It is only
// referenced from commented-out lines in main(), where it can be substituted
// for the real kernel sources.
std::string ILcheck=
  "il_ps_2_0\n"
  "ret_dyn\n"
  "end\n";

using namespace std;

// Entry points of the CAL counter extension.  They start out unset and are
// filled in at run time by the calExtGetProc() calls in main().
static PFNCALCTXCREATECOUNTER  calCtxCreateCounterExt;
static PFNCALCTXDESTROYCOUNTER calCtxDestroyCounterExt;
static PFNCALCTXBEGINCOUNTER   calCtxBeginCounterExt;
static PFNCALCTXENDCOUNTER     calCtxEndCounterExt;
static PFNCALCTXGETCOUNTER     calCtxGetCounterExt;


// Fill the n x n row-major matrix `mat` with the test pattern used by main():
// element (i,i) is i+1 and every off-diagonal element is 0.1.
//   n   - number of rows and columns
//   mat - output buffer holding at least n*n floats
void maketestmat(int n,float* mat)
{
  const float offdiag=.1f;
  for (int row=0;row<n;++row){
    float* const line=mat+row*n;
    for (int col=0;col<n;++col){
      line[col]=(col==row)?(float)(row+1):offdiag;
    }
  }
}

// Deal the rows of a row-major matrix out to four buffers: row 4*g+k of `mat`
// becomes row g of split<k>.  Only the first 4*(height/4) rows are copied.
//   width, height - number of columns / rows of `mat`
//   mat           - source matrix (height rows of width floats)
//   split0..3     - destinations, each (height/4) rows of width floats
void splitmat(int width,int height,float* mat,float* split0,float* split1,float* split2,float* split3)
{
  float* const dest[4]={split0,split1,split2,split3};
  const int groups=height/4;
  for (int g=0;g<groups;++g){
    for (int k=0;k<4;++k){
      const float* src=mat+width*(4*g+k);
      float* out=dest[k]+g*width;
      for (int j=0;j<width;++j){
        out[j]=src[j];
      }
    }
  }
}

// Inverse of splitmat(): row g of split<k> is written to row 4*g+k of `mat`.
// Only the first 4*(height/4) rows of `mat` are written.
//   width, height - number of columns / rows of `mat`
//   mat           - destination matrix (height rows of width floats)
//   split0..3     - sources, each (height/4) rows of width floats
void unsplitmat(int width,int height,float* mat,float* split0,float* split1,float* split2,float* split3)
{
  const float* const src[4]={split0,split1,split2,split3};
  const int groups=height/4;
  for (int g=0;g<groups;++g){
    for (int k=0;k<4;++k){
      const float* in=src[k]+g*width;
      float* out=mat+width*(4*g+k);
      for (int j=0;j<width;++j){
        out[j]=in[j];
      }
    }
  }
}

// Print a row-major matrix to stdout, one matrix row per line, each value as
// "%10.6f ".  Two extra spaces follow every fourth column and a blank line
// follows every fourth row; a final blank line ends the output.
//   width, height - number of columns / rows of `mat`
//   mat           - matrix to print (height rows of width floats)
void dispmat(int width,int height,float* mat)
{
  for (int row=0;row<height;++row){
    const float* line=mat+row*width;
    for (int col=0;col<width;++col){
      printf("%10.6f ",line[col]);
      if (((col+1)%4)==0) printf("  ");
    }
    printf("\n");
    if (((row+1)%4)==0) printf("\n");
  }
  printf("\n");
}

// Print the diagonal elements mat[i*width+i] of a row-major matrix to stdout,
// each as "%10.6f ".  Two extra spaces follow every fourth value and a line
// break follows every 32nd value; a final newline ends the output.
//   width, height - number of columns / rows of `mat`; the smaller of the two
//                   is the number of diagonal entries printed
//   mat           - matrix whose diagonal is printed
void dispdiag(int width,int height,float* mat)
{
  const int count=(height<width)?height:width;
  for (int k=0;k<count;++k){
    printf("%10.6f ",mat[k*width+k]);
    if (((k+1)%4)==0) printf("  ");
    if (((k+1)%32)==0) printf("\n");
  }
  printf("\n");
}

// Print the transpose of a row-major matrix to stdout: output line c lists
// column c of `mat`, i.e. mat[r*width+c] for r = 0..height-1, each value as
// "%10.6f ".  Two extra spaces follow every fourth value on a line and a blank
// line follows every fourth output line; a final blank line ends the output.
//
// The stored layout is the same as for dispmat(): `height` rows of `width`
// floats.  The loop bounds used to be swapped relative to the index
// expression (stored row ran to `width`, stored column to `height`), which
// printed the wrong elements for non-square matrices and read past the end of
// the buffer when width > height.  Output for square matrices is unchanged.
//   width, height - number of columns / rows of the stored matrix
//   mat           - matrix to print transposed
void dispmattr(int width,int height,float* mat)
{
  for (int col=0;col<width;col++){
    for (int row=0;row<height;row++){
      printf("%10.6f ",mat[row*width+col]);
      if (!((row+1)&0x3)) printf("  ");
    }
    printf("\n");
    if (!((col+1)&0x3)) printf("\n");
  }
  printf("\n");
}


void copytogpu(int width,int height,float* cpumat,CALresource gpumat)
{
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  cout << "pitch=" << gpupitch << endl;
  for (int i = 0; i < height; ++i)
    {
      float* tmp = &gpuptr[i * gpupitch*4];
      for(int j = 0; j < width; ++j)
	{
	  tmp[j]=cpumat[i*width+j];
	}
    }
  calResUnmap(gpumat);
}

void zeroongpu(int width,int height,CALresource gpumat)
{
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  memset(gpuptr,0,height*gpupitch*4*sizeof(float));
  calResUnmap(gpumat);
}

void copytocpu(int width,int height,float* cpumat,CALresource gpumat)
{
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  for (int i = 0; i < height; ++i)
    {
      float* tmp = &gpuptr[i * gpupitch*4];
      for(int j = 0; j < width; ++j)
	{
	  cpumat[i*width+j]=tmp[j];
	}
    }
  calResUnmap(gpumat);
}



int main(int argc, char** argv)
{
  int n=8192;
  int n4=n/4;

  if((n%4)!=0) {
    cout << "Sorry, only matrices with size a multiple \
 of 4 are supported." << endl; return 1;}

  float* cpumat=new float[n*n];
  float* cpupart0=new float[n*n4];
  float* cpupart1=new float[n*n4];
  float* cpupart2=new float[n*n4];
  float* cpupart3=new float[n*n4];

  maketestmat(n,cpumat);
  //  dispmat(n,n,cpumat);
  splitmat(n,n,cpumat,cpupart0,cpupart1,cpupart2,cpupart3);
  //  dispmat(n,n4,cpupart0);
  //dispmat(n,n4,cpupart1);
  //dispmat(n,n4,cpupart2);
  //dispmat(n,n4,cpupart3);
  //unsplitmat(n,n,cpumat,cpupart0,cpupart1,cpupart2,cpupart3);
  //dispmat(n,n,cpumat);

  std::string kernel0 = choltopleft;
  std::string kernel1 = cholstrip;
  std::string kernel2 = cholcopy;
  std::string kernel3 = cholupdate;

  // kernel0=ILcheck;
  // kernel1=ILcheck;
  // kernel2=ILcheck;
  // kernel3=ILcheck;

  calInit();
  CALuint numDevices = 0;
  calDeviceGetCount(&numDevices);

  cout << "Num devices =" << numDevices << endl;

  CALdevice device = 0;
  calDeviceOpen(&device, 0);

  CALdeviceinfo info;
  calDeviceGetInfo(&info, 0);






  if (calExtSupported((CALextid)CAL_EXT_COUNTERS) != CAL_RESULT_OK)
    {
      return 1;
    }
        
  if (calExtGetProc((CALextproc*)&calCtxCreateCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxCreateCounter"))
    {
      return 1;
    }

  if (calExtGetProc((CALextproc*)&calCtxDestroyCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxDestroyCounter"))
    {
      return 1;
    }
    
  if (calExtGetProc((CALextproc*)&calCtxBeginCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxBeginCounter"))
    {
      return 1;
    }
    
  if (calExtGetProc((CALextproc*)&calCtxEndCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxEndCounter"))
    {
      return 1;
    }

  if (calExtGetProc((CALextproc*)&calCtxGetCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxGetCounter"))
    {
      return 1;
    }

  CALcontext ctx = 0;
  calCtxCreate(&ctx, device);

  CALobject obj0 = NULL;
  CALimage image0 = NULL;
  CALlanguage lang0 = CAL_LANGUAGE_IL;

  if (calclCompile(&obj0, lang0, kernel0.c_str(), info.target) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel0 compilation failed. Exiting.\n");
      return 1;
    }
  else
    {
      cout << "kernel0 compiled fine" << endl;
    };

  if (calclLink(&image0, &obj0, 1) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel0 linking failed. Exiting.\n");
      return 1;
    }

  CALobject obj1 = NULL;
  CALimage image1 = NULL;
  CALlanguage lang1 = CAL_LANGUAGE_IL;

  if (calclCompile(&obj1, lang1, kernel1.c_str(), info.target) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel1 compilation failed. Exiting.\n");
      return 1;
    }
  else
    {
      cout << "kernel1 compiled fine" << endl;
    };
  if (calclLink(&image1, &obj1, 1) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel1 linking failed. Exiting.\n");
      return 1;
    }

  CALobject obj2 = NULL;
  CALimage image2 = NULL;
  CALlanguage lang2 = CAL_LANGUAGE_IL;

  if (calclCompile(&obj2, lang2, kernel2.c_str(), info.target) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel2 compilation failed. Exiting.\n");
      return 1;
    }
  else
    {
      cout << "kernel2 compiled fine" << endl;
    };
  if (calclLink(&image2, &obj2, 1) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel2 linking failed. Exiting.\n");
      return 1;
    }

  CALobject obj3 = NULL;
  CALimage image3 = NULL;
  CALlanguage lang3 = CAL_LANGUAGE_IL;

  if (calclCompile(&obj3, lang3, kernel3.c_str(), info.target) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel3 compilation failed. Exiting.\n");
      return 1;
    }
  else
    {
      cout << "kernel3 compiled fine" << endl;
    };
  if (calclLink(&image3, &obj3, 1) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel3 linking failed. Exiting.\n");
      return 1;
    }

  cout << "after compiles..." << endl;


  CALresource part0=0;
  if(calResAllocLocal2D(&part0, device,n4,n4, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("part0 resource allocation failed.\n");
    }
  else
    {
      cout << "part0 fine." << endl;
    }


  CALresource part1=0;
  if(calResAllocLocal2D(&part1, device, n4,n4, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("part1 resource allocation failed.\n");
    }
  else
    {
      cout << "part1 fine." << endl;
    }

  CALresource part2=0;
  if(calResAllocLocal2D(&part2, device, n4,n4, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("part2 resource allocation failed.\n");
    }
  else
    {
      cout << "part2 fine." << endl;
    }
  
  CALresource part3=0;
  if(calResAllocLocal2D(&part3, device, n4,n4, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("part3 resource allocation failed.\n");
    }
  else
    {
      cout << "part3 fine." << endl;
    }
  
  /*
    cout << "to the gpu and back..." << endl;
    copytogpu(n,n4,cpupart0,part0);
    copytocpu(n,n4,cpupart0,part0);
    dispmat(n,n4,cpupart0);
  */

  copytogpu(n,n4,cpupart0,part0);
  copytogpu(n,n4,cpupart1,part1);
  copytogpu(n,n4,cpupart2,part2);
  copytogpu(n,n4,cpupart3,part3);

  CALmem part0mem=0;
  CALmem part1mem=0;
  CALmem part2mem=0;
  CALmem part3mem=0;

  calCtxGetMem(&part0mem, ctx, part0);
  calCtxGetMem(&part1mem, ctx, part1);
  calCtxGetMem(&part2mem, ctx, part2);
  calCtxGetMem(&part3mem, ctx, part3);

  CALresource extra0=0;
  if(calResAllocLocal2D(&extra0, device,n4,1, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("extra0 resource allocation failed.\n");
    }
  else
    {
      cout << "extra0 fine." << endl;
    }


  CALresource extra1=0;
  if(calResAllocLocal2D(&extra1, device, n4,1, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("extra1 resource allocation failed.\n");
    }
  else
    {
      cout << "extra1 fine." << endl;
    }

  CALresource extra2=0;
  if(calResAllocLocal2D(&extra2, device, n4,1, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("extra2 resource allocation failed.\n");
    }
  else
    {
      cout << "extra2 fine." << endl;
    }

  CALresource extra3=0;
  if(calResAllocLocal2D(&extra3, device, n4,1, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("extra3 resource allocation failed.\n");
    }
  else
    {
      cout << "extra3 fine." << endl;
    }

  zeroongpu(n,1,extra0);
  zeroongpu(n,1,extra1);
  zeroongpu(n,1,extra2);
  zeroongpu(n,1,extra3);


  CALmem extra0mem=0;
  CALmem extra1mem=0;
  CALmem extra2mem=0;
  CALmem extra3mem=0;

  calCtxGetMem(&extra0mem, ctx, extra0);
  calCtxGetMem(&extra1mem, ctx, extra1);
  calCtxGetMem(&extra2mem, ctx, extra2);
  calCtxGetMem(&extra3mem, ctx, extra3);

  CALresource const2=0;
  if(calResAllocLocal1D(&const2, device, 1, 
			CAL_FORMAT_FLOAT_1, 0)
     !=CAL_RESULT_OK) 
    {
      printf("const2 resource allocation failed.\n");
    }
  else
    {
      cout << "const2 fine." << endl;
    }

  CALmem const2mem=0;

  calCtxGetMem(&const2mem,ctx,const2);

  CALmodule module0 = 0;
  CALmodule module1 = 0;
  CALmodule module2 = 0;
  CALmodule module3 = 0;

  calModuleLoad(&module0, ctx, image0);
  calModuleLoad(&module1, ctx, image1);
  calModuleLoad(&module2, ctx, image2);
  calModuleLoad(&module3, ctx, image3);

  CALfunc func0 = 0;
  CALfunc func1 = 0;
  CALfunc func2 = 0;
  CALfunc func3 = 0;

  CALname inname0formod0=0;
  CALname inname1formod0=0;
  CALname inname2formod0=0;
  CALname inname3formod0=0;

  CALname inname0formod1=0;
  CALname inname1formod1=0;
  CALname inname2formod1=0;
  CALname inname3formod1=0;

  CALname inname0formod2=0;
  CALname inname1formod2=0;
  CALname inname2formod2=0;
  CALname inname3formod2=0;

  CALname inname0formod3=0;
  CALname inname1formod3=0;
  CALname inname2formod3=0;
  CALname inname3formod3=0;
  CALname inname4formod3=0;
  CALname inname5formod3=0;
  CALname inname6formod3=0;
  CALname inname7formod3=0;

  CALname outname0formod0=0;
  CALname outname1formod0=0;
  CALname outname2formod0=0;
  CALname outname3formod0=0;

  CALname outname0formod1=0;
  CALname outname1formod1=0;
  CALname outname2formod1=0;
  CALname outname3formod1=0;

  CALname outname0formod2=0;
  CALname outname1formod2=0;
  CALname outname2formod2=0;
  CALname outname3formod2=0;

  CALname outname0formod3=0;
  CALname outname1formod3=0;
  CALname outname2formod3=0;
  CALname outname3formod3=0;

  CALname constname0formod2=0;

  calModuleGetEntry(&func0, ctx, module0, "main");
  calModuleGetEntry(&func1, ctx, module1, "main");
  calModuleGetEntry(&func2, ctx, module2, "main");
  calModuleGetEntry(&func3, ctx, module3, "main");

  calModuleGetName(&inname0formod0, ctx, module0, "i0");
  calModuleGetName(&inname1formod0, ctx, module0, "i1");
  calModuleGetName(&inname2formod0, ctx, module0, "i2");
  calModuleGetName(&inname3formod0, ctx, module0, "i3");

  calModuleGetName(&inname0formod1, ctx, module1, "i0");
  calModuleGetName(&inname1formod1, ctx, module1, "i1");
  calModuleGetName(&inname2formod1, ctx, module1, "i2");
  calModuleGetName(&inname3formod1, ctx, module1, "i3");

  calModuleGetName(&inname0formod2, ctx, module2, "i0");
  calModuleGetName(&inname1formod2, ctx, module2, "i1");
  calModuleGetName(&inname2formod2, ctx, module2, "i2");
  calModuleGetName(&inname3formod2, ctx, module2, "i3");

  calModuleGetName(&inname0formod3, ctx, module3, "i0");
  calModuleGetName(&inname1formod3, ctx, module3, "i1");
  calModuleGetName(&inname2formod3, ctx, module3, "i2");
  calModuleGetName(&inname3formod3, ctx, module3, "i3");
  calModuleGetName(&inname4formod3, ctx, module3, "i4");
  calModuleGetName(&inname5formod3, ctx, module3, "i5");
  calModuleGetName(&inname6formod3, ctx, module3, "i6");
  calModuleGetName(&inname7formod3, ctx, module3, "i7");

  calModuleGetName(&outname0formod0, ctx, module0, "o0");
  calModuleGetName(&outname1formod0, ctx, module0, "o1");
  calModuleGetName(&outname2formod0, ctx, module0, "o2");
  calModuleGetName(&outname3formod0, ctx, module0, "o3");

  calModuleGetName(&outname0formod1, ctx, module1, "o0");
  calModuleGetName(&outname1formod1, ctx, module1, "o1");
  calModuleGetName(&outname2formod1, ctx, module1, "o2");
  calModuleGetName(&outname3formod1, ctx, module1, "o3");

  calModuleGetName(&outname0formod2, ctx, module2, "o0");
  calModuleGetName(&outname1formod2, ctx, module2, "o1");
  calModuleGetName(&outname2formod2, ctx, module2, "o2");
  calModuleGetName(&outname3formod2, ctx, module2, "o3");

  calModuleGetName(&outname0formod3, ctx, module3, "o0");
  calModuleGetName(&outname1formod3, ctx, module3, "o1");
  calModuleGetName(&outname2formod3, ctx, module3, "o2");
  calModuleGetName(&outname3formod3, ctx, module3, "o3");

  calModuleGetName(&constname0formod2, ctx, module2, "cb0");

  calCtxSetMem(ctx, inname0formod0, part0mem);
  calCtxSetMem(ctx, inname1formod0, part1mem);
  calCtxSetMem(ctx, inname2formod0, part2mem);
  calCtxSetMem(ctx, inname3formod0, part3mem);
  
  calCtxSetMem(ctx, inname0formod1, part0mem);
  calCtxSetMem(ctx, inname1formod1, part1mem);
  calCtxSetMem(ctx, inname2formod1, part2mem);
  calCtxSetMem(ctx, inname3formod1, part3mem);
  
  calCtxSetMem(ctx, inname0formod2, part0mem);
  calCtxSetMem(ctx, inname1formod2, part1mem);
  calCtxSetMem(ctx, inname2formod2, part2mem);
  calCtxSetMem(ctx, inname3formod2, part3mem);
  
  calCtxSetMem(ctx, inname0formod3, part0mem);
  calCtxSetMem(ctx, inname1formod3, part1mem);
  calCtxSetMem(ctx, inname2formod3, part2mem);
  calCtxSetMem(ctx, inname3formod3, part3mem);
  calCtxSetMem(ctx, inname4formod3, extra0mem);
  calCtxSetMem(ctx, inname5formod3, extra1mem);
  calCtxSetMem(ctx, inname6formod3, extra2mem);
  calCtxSetMem(ctx, inname7formod3, extra3mem);

  calCtxSetMem(ctx, outname0formod0, part0mem);
  calCtxSetMem(ctx, outname1formod0, part1mem);
  calCtxSetMem(ctx, outname2formod0, part2mem);
  calCtxSetMem(ctx, outname3formod0, part3mem);
  
  calCtxSetMem(ctx, outname0formod1, part0mem);
  calCtxSetMem(ctx, outname1formod1, part1mem);
  calCtxSetMem(ctx, outname2formod1, part2mem);
  calCtxSetMem(ctx, outname3formod1, part3mem);
  
  calCtxSetMem(ctx, outname0formod2, extra0mem);
  calCtxSetMem(ctx, outname1formod2, extra1mem);
  calCtxSetMem(ctx, outname2formod2, extra2mem);
  calCtxSetMem(ctx, outname3formod2, extra3mem);
  
  calCtxSetMem(ctx, outname0formod3, part0mem);
  calCtxSetMem(ctx, outname1formod3, part1mem);
  calCtxSetMem(ctx, outname2formod3, part2mem);
  calCtxSetMem(ctx, outname3formod3, part3mem);
  
  calCtxSetMem(ctx, constname0formod2,const2mem);

  CALevent e = 0;

  cout << "Just before running, " << calGetErrorString() << "." << endl;

  volatile clock_t gputime;
  gputime=clock();

  int pos=0;

  //  cout << "just before while loop" << endl;
  CALcounter idleCounter;
  if (calCtxCreateCounterExt(&idleCounter, ctx, CAL_COUNTER_IDLE) != CAL_RESULT_OK)
    {
      return 1;
    }

  CALcounter cacheCounter;
  if (calCtxCreateCounterExt(&cacheCounter, ctx, CAL_COUNTER_INPUT_CACHE_HIT_RATE) != CAL_RESULT_OK)
    {
      return 1;
    }
    
  if (calCtxBeginCounterExt(ctx, idleCounter) != CAL_RESULT_OK)
    {
      return 1;
    }
    
  if (calCtxBeginCounterExt(ctx, cacheCounter) != CAL_RESULT_OK)
    {
      return 1;
    }

  float* constdata=NULL;
  CALuint constpitch=0;
 
  while (pos<(n4-1))
    {
      //  cout << "For pos=" << pos << endl;
      
      calResMap((void**)&constdata,&constpitch,const2,0);
      *constdata=(float) pos;
      calResUnmap(const2);

      CALdomain domain0 = {pos, pos, 1, 1};
      CALdomain domain1 = {pos+1, pos, n4-pos-1, 1};
      CALdomain domain2 = {pos, 0, n4-pos,1};
      CALdomain domain3 = {pos+1, pos+1, n4-pos-1, n4-pos-1};

      calCtxRunProgram(&e, ctx, func0, &domain0);
      //      while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);e=0;
      calCtxFlush(ctx);

      //cout<< "   after 0" << endl;

      calCtxRunProgram(&e, ctx, func1, &domain1);
      //while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);e=0;
      calCtxFlush(ctx);

      //cout << "   after 1" << endl;

      calCtxRunProgram(&e, ctx, func2, &domain2);
      //don't seem to need this wait...
      //while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);e=0;
      calCtxFlush(ctx);

      //cout << "   after 2" << endl;


      calCtxRunProgram(&e, ctx, func3, &domain3);
      //  while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);e=0;
      calCtxFlush(ctx);

      //cout << "   after 3" << endl;

      //cout << "   " << calGetErrorString()<<endl;
      pos++;
    }
 
  //  cout << "Out of loop" << endl;
  
 
  CALdomain domain0 = {pos,pos,1,1};
  calCtxRunProgram(&e, ctx, func0, &domain0);
  while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);

  //cout << calGetErrorString();
  //cout << ": done kernel0" << endl;

  if (calCtxEndCounterExt(ctx, idleCounter) != CAL_RESULT_OK)
    {
      return 1;
    }

  if (calCtxEndCounterExt(ctx, cacheCounter) != CAL_RESULT_OK)
    {
      return 1;
    }
  gputime=clock()-gputime;


  CALfloat idlePercentage = 0.0f;
  if (calCtxGetCounterExt(&idlePercentage, ctx, idleCounter) != CAL_RESULT_OK)
    {
      return 1;
    }

  CALfloat cachePercentage = 0.0f;
  if (calCtxGetCounterExt(&cachePercentage, ctx, cacheCounter) != CAL_RESULT_OK)
    {
      return 1;
    }

  cout << "gpu time=" << gputime/1.e6f <<" s." <<endl;
  printf("Idle percentage: %0.2f%% Cache hit rate: %0.2f%%\n", idlePercentage * 100.0f, cachePercentage * 100.0f);

  if (calCtxDestroyCounterExt(ctx, idleCounter) != CAL_RESULT_OK)
    {
      return 1;
    }

  if (calCtxDestroyCounterExt(ctx, cacheCounter) != CAL_RESULT_OK)
    {
      return 1;
    }

  cout << "After calculation, " << calGetErrorString() << "." << endl;

  /*
    float* cpuextramat=new float[n*4];
    float* cpuextra0= new float[n];
    float* cpuextra1= new float[n];
    float* cpuextra2= new float[n];
    float* cpuextra3= new float[n];

    copytocpu(n,1,cpuextra0,extra0);
    copytocpu(n,1,cpuextra1,extra1);
    copytocpu(n,1,cpuextra2,extra2);
    copytocpu(n,1,cpuextra3,extra3);

    unsplitmat(n,4,cpuextramat,cpuextra0,cpuextra1,cpuextra2,cpuextra3);
    dispmat(n,4,cpuextramat);

    delete[] cpuextramat;
    delete[] cpuextra0;
    delete[] cpuextra1;
    delete[] cpuextra2;
    delete[] cpuextra3;
  */

  copytocpu(n,n4,cpupart0,part0);
  copytocpu(n,n4,cpupart1,part1);
  copytocpu(n,n4,cpupart2,part2);
  copytocpu(n,n4,cpupart3,part3);

  unsplitmat(n,n,cpumat,cpupart0,cpupart1,cpupart2,cpupart3);

  //dispmat(n,n,cpumat);
  dispdiag(n,n,cpumat);

  delete[] cpumat;
  delete[] cpupart0;
  delete[] cpupart1;
  delete[] cpupart2;
  delete[] cpupart3;
  
  calModuleUnload(ctx, module0);
  calModuleUnload(ctx, module1);
  calModuleUnload(ctx, module2);
  calModuleUnload(ctx, module3);

  calclFreeImage(image0);
  calclFreeObject(obj0);
  calclFreeImage(image1);
  calclFreeObject(obj1);
  calclFreeImage(image2);
  calclFreeObject(obj2);
  calclFreeImage(image3);
  calclFreeObject(obj3);

  calCtxReleaseMem(ctx,part0mem);
  calCtxReleaseMem(ctx,part1mem);
  calCtxReleaseMem(ctx,part2mem);
  calCtxReleaseMem(ctx,part3mem);
  calCtxReleaseMem(ctx,extra0mem);
  calCtxReleaseMem(ctx,extra1mem);
  calCtxReleaseMem(ctx,extra2mem);
  calCtxReleaseMem(ctx,extra3mem);
  calCtxReleaseMem(ctx,const2mem);

  calResFree(part0);
  calResFree(part1);
  calResFree(part2);
  calResFree(part3);
  calResFree(extra0);
  calResFree(extra1);
  calResFree(extra2);
  calResFree(extra3);
  calResFree(const2);

  calCtxDestroy(ctx);

  calDeviceClose(device);

  calShutdown();

  return 0;
}