// This program computes c+=a^t b using an IL Compute Shader
// Copyright (C) 2008 Steven Gratton

#include <cstdio>
#include <cstring>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

#include "cal.h"
#include "calcl.h"



// IL compute-shader source for c += a^t b, i.e. resource 3 += (resource 1)^t * (resource 2).
//
// Work decomposition, as fixed by the address arithmetic below:
//   * a thread group (64 threads) does 256x16 = (16x256)^t (16x16) per loop pass;
//   * each thread owns a 4x16 block of c, held in sixteen float4 accumulators
//     r3RC (R = row 0..3 of the block, C = float4 column 0..3).
//
// Constant buffer layout (filled in by main):
//   cb0[0] = { m/256, k/16, n/16, 0 }
//   cb0[1] = { pitch1, pitch2, pitch3, 0 }
//
// Register roles:
//   r7        x,y location of this thread group's tile of c
//   r8.x      loop counter over the k/16 slabs
//   r100-r115 sixteen float4 of a for the current slab (one per k-row)
//   r200      the single float4 of b this thread fetches per slab
//   r400-r403 one 16-wide row of b, read back from the LDS
//   r600      the a-row handed to func 0
std::string ILcompute=
"il_cs_2_0\n"
"dcl_num_thread_per_group 64\n"
"dcl_lds_size_per_thread 16\n"
"dcl_lds_sharing_mode _wavefrontRel \n"
"dcl_cb cb0[2]\n" // 0: int m/256, k/16, n/16,0  1: int pitch1, pitch2, pitch3,0
"dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
"dcl_resource_id(2)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
"dcl_resource_id(3)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"

// r7 contains x,y block location in c:
//   r7.x = group id mod (n/16), r7.y = group id / (n/16)
"umod r7.x___,vTGroupid.x,cb0[0].z\n"
"udiv r7._y__,vTGroupid.x,cb0[0].z\n"

// c...
// Texel coordinates of this thread's block: x = 4*r7.x, y = 256*r7.y + 4*tid.
// 0.5 is then added to both (presumably to address texel centres).
"dcl_literal l3,4,256,0,0\n"
"imul r5.x,l3.x,r7.x\n"
"imul r5.y,l3.y,r7.y\n"
"imad r5.y,vTid.x,l3.x,r5.y\n"
"itof r1.xy,r5.xy\n"
"dcl_literal l31,0.0,0.5,1.0,2.0\n"
"add r1.xy,r1.xy,l31.yy\n"

// Preload the accumulators with the current contents of c, which is what
// makes the kernel a "+=" rather than a plain assignment.
"sample_resource(3)_sampler(0)_aoffimmi(0.0,0.0,0.0) r300, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(1.0,0.0,0.0) r301, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(2.0,0.0,0.0) r302, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(3.0,0.0,0.0) r303, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(0.0,1.0,0.0) r310, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(1.0,1.0,0.0) r311, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(2.0,1.0,0.0) r312, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(3.0,1.0,0.0) r313, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(0.0,2.0,0.0) r320, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(1.0,2.0,0.0) r321, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(2.0,2.0,0.0) r322, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(3.0,2.0,0.0) r323, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(0.0,3.0,0.0) r330, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(1.0,3.0,0.0) r331, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(2.0,3.0,0.0) r332, r1.xy \n"
"sample_resource(3)_sampler(0)_aoffimmi(3.0,3.0,0.0) r333, r1.xy \n"

// Zero the loop counter (the .0000 swizzle selects the constant 0 per component).
"mov r8,r8.0000\n"

// Loop over the k/16 slabs: leave when cb0[0].y - r8.x reaches zero.
// The newline after "whileloop" matters: without it the adjacent literals
// concatenate into the single token run "whileloopiadd r9,...".
"whileloop\n"
"iadd r9,cb0[0].y,r8_neg(xyzw)\n"
"break_logicalz r9.x\n"

// a...
// Coordinates: x = 64*r7.y + tid, y = 16*r8.x. Sixteen consecutive rows are
// sampled into r100..r115. The immediate offsets used only go up to 7, so y
// is advanced by 8 before the second batch.
"dcl_literal l4,4,64,16,0\n"
"imad r1.x,l4.y,r7.y,vTid.x\n"
"imul r1.y,l4.z,r8.x\n"
"itof r1.xy,r1.xy\n"
//"add r1.xy,r1.xy,l31.yy\n"

"sample_resource(1)_sampler(0)_aoffimmi(0.0,0.0,0.0) r100, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,1.0,0.0) r101, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,2.0,0.0) r102, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,3.0,0.0) r103, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,4.0,0.0) r104, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,5.0,0.0) r105, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,6.0,0.0) r106, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,7.0,0.0) r107, r1.xy \n"

"dcl_literal l7,0.0,2.0,4.0,8.0\n"
"add r1.y,l7.w,r1.y\n"

"sample_resource(1)_sampler(0)_aoffimmi(0.0,0.0,0.0) r108, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,1.0,0.0) r109, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,2.0,0.0) r110, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,3.0,0.0) r111, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,4.0,0.0) r112, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,5.0,0.0) r113, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,6.0,0.0) r114, r1.xy \n"
"sample_resource(1)_sampler(0)_aoffimmi(0.0,7.0,0.0) r115, r1.xy \n"

// b...
// Coordinates: x = 4*r7.x + tid%4, y = 16*r8.x + tid/4 (plus 0.5 each), so the
// 64 threads together fetch the 16x16 slab of b, one float4 per thread.
"dcl_literal l5,3,4,16,2\n"
//  "and r1.x,vTid.x,l5.x\n"
// "ishr r1._y,vTid.x,l5.w\n"
//  "udiv r1._y,vTid.xx,l5.y\n"
//"imad r1.x,r1.y_neg(xyzw),l5.x,vTid.x\n"
"umod r1.x,vTid.x,l5.y\n"
"udiv r1.y,vTid.x,l5.y\n"

"imad r1.x,l5.y,r7.x,r1.x\n"
"imad r1.y,l5.z,r8.x,r1.y\n"
"itof r1.xy,r1.xy\n"
"add r1.xy,r1.xy,l31.yy\n"

"sample_resource(2)_sampler(0)_aoffimmi(0.0,0.0,0.0) r200, r1.xy \n"

// Publish the fetched float4 through the LDS and fence before any thread
// reads the values written by the others.
"lds_write_vec _lOffset(0) mem, r200 \n"
"fence_threads_lds\n"

// Literals 0..63 serve as the first component of the LDS read addresses
// (presumably the slot of the thread that wrote the value - TODO confirm
// against the IL specification). The ".?0" swizzles supply 0 as the second.
"dcl_literal l10, 0,1,2,3\n"
"dcl_literal l11, 4,5,6,7\n"
"dcl_literal l12, 8,9,10,11\n"
"dcl_literal l13, 12,13,14,15\n"
"dcl_literal l14, 16,17,18,19\n"
"dcl_literal l15, 20,21,22,23\n"
"dcl_literal l16, 24,25,26,27\n"
"dcl_literal l17, 28,29,30,31\n"
"dcl_literal l18, 32,33,34,35\n"
"dcl_literal l19, 36,37,38,39\n"
"dcl_literal l20, 40,41,42,43\n"
"dcl_literal l21, 44,45,46,47\n"
"dcl_literal l22, 48,49,50,51\n"
"dcl_literal l23, 52,53,54,55\n"
"dcl_literal l24, 56,57,58,59\n"
"dcl_literal l25, 60,61,62,63\n"

// Sixteen identical stanzas, one per k-row j of the slab: r600 = a-row j,
// r400..r403 = LDS entries 4j..4j+3 (the four float4 fetched by threads with
// tid/4 == j, i.e. row j of the b slab), then func 0 accumulates the outer
// product into the r3RC registers.

// k-row 0
"mov r600,r100\n"
"lds_read_vec r400,l10.x0\n"
"lds_read_vec r401,l10.y0\n"
"lds_read_vec r402,l10.z0\n"
"lds_read_vec r403,l10.w0\n"
"call 0\n"

// k-row 1
"mov r600,r101\n"
"lds_read_vec r400,l11.x0\n"
"lds_read_vec r401,l11.y0\n"
"lds_read_vec r402,l11.z0\n"
"lds_read_vec r403,l11.w0\n"
"call 0\n"

// k-row 2
"mov r600,r102\n"
"lds_read_vec r400,l12.x0\n"
"lds_read_vec r401,l12.y0\n"
"lds_read_vec r402,l12.z0\n"
"lds_read_vec r403,l12.w0\n"
"call 0\n"

// k-row 3
"mov r600,r103\n"
"lds_read_vec r400,l13.x0\n"
"lds_read_vec r401,l13.y0\n"
"lds_read_vec r402,l13.z0\n"
"lds_read_vec r403,l13.w0\n"
"call 0\n"

// k-row 4
"mov r600,r104\n"
"lds_read_vec r400,l14.x0\n"
"lds_read_vec r401,l14.y0\n"
"lds_read_vec r402,l14.z0\n"
"lds_read_vec r403,l14.w0\n"
"call 0\n"

// k-row 5
"mov r600,r105\n"
"lds_read_vec r400,l15.x0\n"
"lds_read_vec r401,l15.y0\n"
"lds_read_vec r402,l15.z0\n"
"lds_read_vec r403,l15.w0\n"
"call 0\n"

// k-row 6
"mov r600,r106\n"
"lds_read_vec r400,l16.x0\n"
"lds_read_vec r401,l16.y0\n"
"lds_read_vec r402,l16.z0\n"
"lds_read_vec r403,l16.w0\n"
"call 0\n"

// k-row 7
"mov r600,r107\n"
"lds_read_vec r400,l17.x0\n"
"lds_read_vec r401,l17.y0\n"
"lds_read_vec r402,l17.z0\n"
"lds_read_vec r403,l17.w0\n"
"call 0\n"

// k-row 8
"mov r600,r108\n"
"lds_read_vec r400,l18.x0\n"
"lds_read_vec r401,l18.y0\n"
"lds_read_vec r402,l18.z0\n"
"lds_read_vec r403,l18.w0\n"
"call 0\n"

// k-row 9
"mov r600,r109\n"
"lds_read_vec r400,l19.x0\n"
"lds_read_vec r401,l19.y0\n"
"lds_read_vec r402,l19.z0\n"
"lds_read_vec r403,l19.w0\n"
"call 0\n"

// k-row 10
"mov r600,r110\n"
"lds_read_vec r400,l20.x0\n"
"lds_read_vec r401,l20.y0\n"
"lds_read_vec r402,l20.z0\n"
"lds_read_vec r403,l20.w0\n"
"call 0\n"

// k-row 11
"mov r600,r111\n"
"lds_read_vec r400,l21.x0\n"
"lds_read_vec r401,l21.y0\n"
"lds_read_vec r402,l21.z0\n"
"lds_read_vec r403,l21.w0\n"
"call 0\n"

// k-row 12
"mov r600,r112\n"
"lds_read_vec r400,l22.x0\n"
"lds_read_vec r401,l22.y0\n"
"lds_read_vec r402,l22.z0\n"
"lds_read_vec r403,l22.w0\n"
"call 0\n"

// k-row 13
"mov r600,r113\n"
"lds_read_vec r400,l23.x0\n"
"lds_read_vec r401,l23.y0\n"
"lds_read_vec r402,l23.z0\n"
"lds_read_vec r403,l23.w0\n"
"call 0\n"

// k-row 14
"mov r600,r114\n"
"lds_read_vec r400,l24.x0\n"
"lds_read_vec r401,l24.y0\n"
"lds_read_vec r402,l24.z0\n"
"lds_read_vec r403,l24.w0\n"
"call 0\n"

// k-row 15
"mov r600,r115\n"
"lds_read_vec r400,l25.x0\n"
"lds_read_vec r401,l25.y0\n"
"lds_read_vec r402,l25.z0\n"
"lds_read_vec r403,l25.w0\n"
"call 0\n"

// Advance to the next slab.
"dcl_literal l6,1,0,0,0\n"
"iadd r8.x,r8.x,l6.x\n"
"endloop\n"

//r500 will contain initial position of thread...
// (x = 4*r7.x, y = 256*r7.y + 4*tid, the same block origin used for the preload)
"dcl_literal l30, 4,256,0,0\n"
"imul r500.x,r7.x,l30.x\n"
"imul r500.y,r7.y,l30.y\n"
"imad r500.y,vTid.x,l30.x,r500.y\n"
//r1 will contain initial memory location of thread...
// (linear index into g[]: y * pitch3 + x)
"imad r1.x,r500.y,cb0[1].z,r500.x\n"

// Disabled debugging aids:
//  "itof r300,vaTid.x\n"
//  "itof r310,vTid.x\n"
//"lds_read_vec r311,l12.z0\n"
//"mov g[vaTid.x],r900\n"

// Store the 4x16 block: four float4 per row, then step one pitch (cb0[1].z)
// down to the next row.
"mov g[r1.x+0],r300\n"
"mov g[r1.x+1],r301\n"
"mov g[r1.x+2],r302\n"
"mov g[r1.x+3],r303\n"

"iadd r1.x,r1,cb0[1].z\n"

"mov g[r1.x+0],r310\n"
"mov g[r1.x+1],r311\n"
"mov g[r1.x+2],r312\n"
"mov g[r1.x+3],r313\n"

"iadd r1.x,r1,cb0[1].z\n"

"mov g[r1.x+0],r320\n"
"mov g[r1.x+1],r321\n"
"mov g[r1.x+2],r322\n"
"mov g[r1.x+3],r323\n"

"iadd r1.x,r1,cb0[1].z\n"

"mov g[r1.x+0],r330\n"
"mov g[r1.x+1],r331\n"
"mov g[r1.x+2],r332\n"
"mov g[r1.x+3],r333\n"

"ret_dyn\n"
"endmain\n"

// func 0: rank-1 update of the 4x16 accumulator block.
// Row R of the block gains (component R of r600) * (r400..r403).
"func 0\n"

"mad r300,r600.x,r400,r300\n"
"mad r301,r600.x,r401,r301\n"
"mad r302,r600.x,r402,r302\n"
"mad r303,r600.x,r403,r303\n"

"mad r310,r600.y,r400,r310\n"
"mad r311,r600.y,r401,r311\n"
"mad r312,r600.y,r402,r312\n"
"mad r313,r600.y,r403,r313\n"

"mad r320,r600.z,r400,r320\n"
"mad r321,r600.z,r401,r321\n"
"mad r322,r600.z,r402,r322\n"
"mad r323,r600.z,r403,r323\n"

"mad r330,r600.w,r400,r330\n"
"mad r331,r600.w,r401,r331\n"
"mad r332,r600.w,r402,r332\n"
"mad r333,r600.w,r403,r333\n"

"ret_dyn\n"
"endfunc\n"

"end\n"
  ;

using namespace std;


// Fills a row-major height x width matrix so that every entry holds its own
// column index (all rows are 0,1,2,...,width-1).
void maketestmatcol(int height,int width,float* mat)
{
  float* out=mat;
  for (int row=0; row<height; ++row) {
    for (int col=0; col<width; ++col) {
      *out++ = static_cast<float>(col);
    }
  }
}

// Fills a row-major height x width matrix so that every entry holds its own
// row index (row r is r,r,...,r).
void maketestmatrow(int height,int width,float* mat)
{
  float* out=mat;
  for (int row=0; row<height; ++row) {
    const float value=static_cast<float>(row);
    for (int col=0; col<width; ++col) {
      *out++ = value;
    }
  }
}

// Fills a row-major height x width matrix with zeros, except that diagonal
// entry (i,i) is set to i.
//
// The diagonal is only written where it exists (i < width). The previous
// version wrote mat[i*width+i] unconditionally, which for height > width
// landed in later rows and, for the last rows, past the end of the buffer.
void makediagtestmat(int height,int width,float* mat)
{
  float* out=mat;
  for (int row=0; row<height; ++row) {
    for (int col=0; col<width; ++col) {
      *out++ = (col==row) ? static_cast<float>(row) : 0.f;
    }
  }
}
// Fills a row-major height x width matrix with the pattern
// entry(row,col) = 10*row + col, which makes positions easy to recognise
// when the matrix is printed.
void maketestmat(int height,int width,float* mat)
{
  float* out=mat;
  for (int row=0; row<height; ++row) {
    for (int col=0; col<width; ++col) {
      *out++ = static_cast<float>(row*10+col);
    }
  }
}


// Fills a row-major height x width matrix with zeros and puts 1 on the
// diagonal entries (i,i).
//
// The diagonal is only written where it exists (i < width). The previous
// version wrote mat[i*width+i] unconditionally, which overran the buffer
// for non-square matrices with height > width.
void makeidentitymat(int height,int width,float* mat)
{
  float* out=mat;
  for (int row=0; row<height; ++row) {
    for (int col=0; col<width; ++col) {
      *out++ = (col==row) ? 1.f : 0.f;
    }
  }
}

// Sets every entry of a row-major height x width matrix to zero.
void makezeromat(int height,int width,float* mat)
{
  float* out=mat;
  for (int row=0; row<height; ++row) {
    for (int col=0; col<width; ++col) {
      *out++ = 0.f;
    }
  }
}

// Prints a complete row-major height x width matrix to standard output.
// Entries are grouped visually in 4x4 tiles: extra spaces after every fourth
// column and a blank line after every fourth row.
void dispmat(int height,int width,float* mat)
{
  std::cout << "In dispmat." << std::endl;
  for (int row=0; row<height; ++row) {
    for (int col=0; col<width; ++col) {
      printf("%10.2e ",mat[row*width+col]);
      const bool endofgroup = ((col+1)&0x3)==0;
      if (endofgroup) printf("  ");
    }
    printf("\n");
    if (((row+1)&0x3)==0) printf("\n");
  }
  printf("\n");
}


// Prints the top-left corner of a row-major height x width matrix, limited
// to at most `maximum` rows and `maximum` columns, using the same 4x4 visual
// grouping as dispmat.
void dispmatpart(int height,int width,int maximum,float* mat)
{
  const int rows = (maximum<height) ? maximum : height;
  const int cols = (maximum<width) ? maximum : width;

  std::cout << "In dispmat." << std::endl;
  for (int row=0; row<rows; ++row) {
    for (int col=0; col<cols; ++col) {
      // The stride is the full matrix width, not the clipped view width.
      printf("%12.4e ",mat[row*width+col]);
      if (((col+1)&0x3)==0) printf("  ");
    }
    printf("\n");
    if (((row+1)&0x3)==0) printf("\n");
  }
  printf("\n");
}

// Prints a viewheight x viewwidth window of a row-major matrix with row
// stride `width`, starting at row starti and column startj. The 4x4 visual
// grouping is aligned to absolute matrix indices, not to the window.
void dispmatsection(int width,int starti,int startj, int viewheight,int viewwidth,float* mat)
{
  const int rowend=starti+viewheight;
  const int colend=startj+viewwidth;
  for (int row=starti; row<rowend; ++row) {
    for (int col=startj; col<colend; ++col) {
      printf("%15.7e ",mat[row*width+col]);
      if (((col+1)&0x3)==0) printf("  ");
    }
    printf("\n");
    if (((row+1)&0x3)==0) printf("\n");
  }
  printf("\n");
}

// Prints the main diagonal of a row-major height x width matrix: entries
// (i,i) for i below min(height,width). Values are spaced in groups of four
// and wrapped after every 32.
void dispdiag(int height,int width,float* mat)
{
  const int count = (width<=height) ? width : height;
  for (int d=0; d<count; ++d) {
    printf("%10.6e ",mat[d*width+d]);
    const int printed=d+1;
    if ((printed&0x3)==0) printf("  ");
    if ((printed&0x1f)==0) printf("\n");
  }
  printf("\n");
}


// Prints the leading part of the main diagonal of a row-major
// height x width matrix: entries (i,i) for i below
// min(height,width,maximum). Values are spaced in groups of four and wrapped
// after every 32.
void dispdiagpart(int height,int width,int maximum, float* mat)
{
  int count = (width<height) ? width : height;
  if (maximum<count) count=maximum;

  for (int d=0; d<count; ++d) {
    printf("%10.6e ",mat[d*width+d]);
    const int printed=d+1;
    if ((printed&0x3)==0) printf("  ");
    if ((printed&0x1f)==0) printf("\n");
  }
  printf("\n");
}

int copytogpu(int width,int height,float* cpumat,CALresource gpumat)
{
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  cout << "pitch=" << gpupitch << endl;
  for (int i = 0; i < height; ++i)
    {
      float* tmp = &gpuptr[i * gpupitch*4];
      for(int j = 0; j < width; ++j)
        {
          tmp[j]=cpumat[i*width+j];
        }
    }
  calResUnmap(gpumat);
  return (int) gpupitch;
}

int zeroongpu(int width,int height,CALresource gpumat)
{
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  memset(gpuptr,0,height*gpupitch*4*sizeof(float));
  calResUnmap(gpumat);
  return (int) gpupitch;
}

void copytocpu(int width,int height,float* cpumat,CALresource gpumat)
{
  cout << "In copytocpu." << endl;
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  for (int i = 0; i < height; ++i)
    {
      float* tmp = &gpuptr[i * gpupitch*4];
      for(int j = 0; j < width; ++j)
        {
          cpumat[i*width+j]=tmp[j];
	  //	  if (tmp[j]!=0.f) cout <<i << "," << j << ": " << tmp[j] <<  endl;

        }
    }
  calResUnmap(gpumat);
}


// Writes msg to standard output verbatim (no newline is appended).
// main passes this to calclDisassembleObject as its output callback.
void disp(const char* msg)
{
    std::cout << msg;
}


// does 3 = 1^T * 2
int main(int argc, char** argv)
{
  int m=6144;
  int n=6144;
  int k=6144;
  int m256=m/256;
  int n16=n/16;
  int k16=k/16;
  int m4=m/4;
  int n4=n/4;
  int k4=k/4;


  if(m%256!=0||n%16!=0||k%16!=0||(m256*n16)%64!=0) {
    cout << "Sorry, a bad size." << endl;
    return 1;
  };

  if(n4<64){
    cout << "Sorry, according to release notes, for a global buffer\
 to work properly it has to be at least of size 64." << endl;
    return 1;
  };


  float* cpumat1=new float[k*m];
  float* cpumat2=new float[k*n];
  float* cpumat3=new float[m*n];

  maketestmat(k,m,cpumat1);
  maketestmat(k,n,cpumat2);
  maketestmat(m,n,cpumat3);

  dispmatpart(k,m,32,cpumat1);
  dispmatpart(k,n,32,cpumat2);
  dispmatpart(m,n,32,cpumat3);

  int viewstartx=0;
  int viewstarty=0;  
  int viewwidth= 16;
  int viewheight=16;


  std::string kernel0 = ILcompute;

  calInit();
  CALuint numDevices = 0;
  calDeviceGetCount(&numDevices);

  cout << "Num devices =" << numDevices << endl;

  CALdevice device = 0;
  calDeviceOpen(&device, 0);

  CALdeviceinfo info;
  calDeviceGetInfo(&info, 0);

  CALcontext ctx = 0;
  calCtxCreate(&ctx, device);

  CALobject obj0 = NULL;
  CALimage image0 = NULL;
  CALlanguage lang0 = CAL_LANGUAGE_IL;

  //  CALtarget testtarget0=CAL_TARGET_7XX;
  if (calclCompile(&obj0, lang0, kernel0.c_str(), info.target) != CAL_RESULT_OK)
    {
      cout << "Kernel0 compilation failed. Exiting." << endl;
      cout << calclGetErrorString() << endl;
      return 1;
    }
  else
    {
      cout << "Kernel0 compiled fine." << endl;
    };

  if (calclLink(&image0, &obj0, 1) != CAL_RESULT_OK)
    {
      cout <<  "Kernel0 linking failed. Exiting." << endl;
      return 1;
    }
  else
    {
      cout << "Kernel0 linked fine." << endl;
    };



calclDisassembleObject(&obj0,disp);



  CALresource mat1=0;
  if(calResAllocLocal2D(&mat1, device,m4,k, 
				CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      cout << "mat1 resource allocation failed." << endl;
    }
  else
    {
      cout << "mat1 fine." << endl;
    };

  CALuint mat1pitch=0;
  mat1pitch=copytogpu(m,k,cpumat1,mat1);

  CALresource mat2=0;
  if(calResAllocLocal2D(&mat2, device,n4,k, 
				CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      cout << "mat2 resource allocation failed." << endl;
    }
  else
    {
      cout << "mat2 fine." << endl;
    };


  CALuint mat2pitch=0;
  mat2pitch=copytogpu(n,k,cpumat2,mat2);

  CALresource mat3=0;
  if(calResAllocLocal2D(&mat3, device,n4,m, 
			CAL_FORMAT_FLOAT_4, CAL_RESALLOC_GLOBAL_BUFFER)
     !=CAL_RESULT_OK) 
    {
      cout << "mat3 resource allocation failed." << endl;
    }
  else
    {
      cout << "mat3 fine." << endl;
    };

  CALuint mat3pitch=0;
  mat3pitch=copytogpu(n,m,cpumat3,mat3);

  cout << "mat3pitch=" << mat3pitch << endl;

  CALresource const0=0;
  if(calResAllocLocal1D(&const0,device,2,CAL_FORMAT_INT_4,0)
     !=CAL_RESULT_OK)
    {
      cout << "const0 resource allocation failed." << endl;
    }
  else
    {
      cout << "const0 fine." << endl;
    };


  int* const0ptr=NULL;
  CALuint const0pitch=0;
  calResMap((CALvoid**)&const0ptr, &const0pitch, const0, 0);
  cout << "Const0 pitch =" << const0pitch << "." << endl;
  const0ptr[0]=m256;
  const0ptr[1]=k16;
  const0ptr[2]=n16;
  const0ptr[3]=0;
  const0ptr[4]=mat1pitch;
  const0ptr[5]=mat2pitch;
  const0ptr[6]=mat3pitch;
  const0ptr[7]=0;

  calResUnmap(const0);

  CALmem mat1mem=0;
  calCtxGetMem(&mat1mem, ctx, mat1);
  CALmem mat2mem=0;
  calCtxGetMem(&mat2mem, ctx, mat2);
  CALmem mat3mem=0;
  calCtxGetMem(&mat3mem, ctx, mat3);

  CALmem const0mem=0;
  calCtxGetMem(&const0mem, ctx, const0);

  CALmodule module0 = 0;
  calModuleLoad(&module0, ctx, image0);

  CALfunc func0 = 0;
  CALname globname0=0;
  CALname constname0=0;

  CALname inputname1=0;  
  CALname inputname2=0;  
  CALname inputname3=0;
  //  CALname outputname0=0;

 cout << "Here -2:" <<  calGetErrorString()<<endl;


  calModuleGetEntry(&func0, ctx, module0, "main");
  calModuleGetName(&constname0, ctx, module0, "cb0");
  calModuleGetName(&globname0, ctx, module0, "g[]");
  calModuleGetName(&inputname1,ctx,module0, "i1");
  calModuleGetName(&inputname2,ctx,module0, "i2");
  calModuleGetName(&inputname3,ctx,module0, "i3");
  //  calModuleGetName(&outputname0,ctx,module0, "o0");

 cout << "Here -1:" <<  calGetErrorString()<<endl;

   calCtxSetMem(ctx, globname0, mat3mem);
 cout << "Here 0:" <<  calGetErrorString()<<endl;
  calCtxSetMem(ctx, constname0, const0mem);
  cout << "Here 1:" <<  calGetErrorString()<<endl;

  calCtxSetMem(ctx,inputname1, mat1mem);
  calCtxSetMem(ctx,inputname2, mat2mem);
  calCtxSetMem(ctx,inputname3, mat3mem);
  cout << "Here 2:" <<  calGetErrorString()<<endl;
  //    calCtxSetMem(ctx,outputname0, mat3mem);
   cout << "Here 3:" <<  calGetErrorString()<<endl;
  
  CALevent e = 0;

    CALdomain3D gb={64,1,1};
            CALdomain3D gs={m256*n16,1,1};
	    //    CALdomain3D gs={1,1,1};

    cout << "Num threads per group=" << gb.width <<endl;
    cout << "Num thread groups=" << gs.width <<endl;
    CALuint flags=0;

    CALprogramGrid pg={func0,gb,gs,flags};


    volatile clock_t gputime=clock();

    if(calCtxRunProgramGrid(&e, ctx, &pg)!=CAL_RESULT_OK) 
      cout << "No run!!!!!" << endl;
  while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);e=0;

  gputime=clock()-gputime;
  cout << "gputime="<< (float)gputime/(float)CLOCKS_PER_SEC << "s." << endl;
  // cerr << "gputime="<< (float)gputime/(float)CLOCKS_PER_SEC << "s." << endl;
  cout << "Here 4:" <<  calGetErrorString()<<endl;

    copytocpu(n,m,cpumat3,mat3);

  //  dispmatsection(n,6100-128,6100-10,44+128,44+10,cpumat3);

  dispmatsection(n,0,0,260,36,cpumat3);

  delete [] cpumat1;
  delete [] cpumat2;
  delete [] cpumat3;


  calModuleUnload(ctx, module0);
  calclFreeImage(image0);
  calclFreeObject(obj0);
  calCtxReleaseMem(ctx,mat1mem);
  calCtxReleaseMem(ctx,mat2mem);
  calCtxReleaseMem(ctx,mat3mem);
  calCtxReleaseMem(ctx,const0mem);
  calResFree(mat1);
  calResFree(mat2);
  calResFree(mat3);
  calResFree(const0);
  calCtxDestroy(ctx);
  calDeviceClose(device);

  calShutdown();

  return 0;
}