// This program computes c+=a^t b using an IL Compute Shader // Copyright (C) 2008 Steven Gratton #include #include #include #include #include "cal.h" #include "calcl.h" // c=a^t b, or 3=1^t 2 // a thread group does // 256x16=(16x256)^t (16x16) // each thread does a 4x16 block std::string ILcompute= "il_cs_2_0\n" "dcl_num_thread_per_group 64\n" "dcl_lds_size_per_thread 16\n" "dcl_lds_sharing_mode _wavefrontRel \n" "dcl_cb cb0[2]\n" // 0: int m/256, k/16, n/16,0 1: int pitch1, pitch2, pitch3,0 "dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n" "dcl_resource_id(2)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n" "dcl_resource_id(3)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n" // r7 contains x,y block location in c "umod r7.x___,vTGroupid.x,cb0[0].z\n" "udiv r7._y__,vTGroupid.x,cb0[0].z\n" // c... "dcl_literal l3,4,256,0,0\n" "imul r5.x,l3.x,r7.x\n" "imul r5.y,l3.y,r7.y\n" "imad r5.y,vTid.x,l3.x,r5.y\n" "itof r1.xy,r5.xy\n" "dcl_literal l31,0.0,0.5,1.0,2.0\n" "add r1.xy,r1.xy,l31.yy\n" "sample_resource(3)_sampler(0)_aoffimmi(0.0,0.0,0.0) r300, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(1.0,0.0,0.0) r301, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(2.0,0.0,0.0) r302, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(3.0,0.0,0.0) r303, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(0.0,1.0,0.0) r310, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(1.0,1.0,0.0) r311, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(2.0,1.0,0.0) r312, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(3.0,1.0,0.0) r313, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(0.0,2.0,0.0) r320, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(1.0,2.0,0.0) r321, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(2.0,2.0,0.0) r322, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(3.0,2.0,0.0) r323, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(0.0,3.0,0.0) r330, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(1.0,3.0,0.0) r331, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(2.0,3.0,0.0) r332, r1.xy \n" "sample_resource(3)_sampler(0)_aoffimmi(3.0,3.0,0.0) r333, r1.xy \n" "mov r8,r8.0000\n" "whileloop" "iadd r9,cb0[0].y,r8_neg(xyzw)\n" "break_logicalz r9.x\n" // a... "dcl_literal l4,4,64,16,0\n" "imad r1.x,l4.y,r7.y,vTid.x\n" "imul r1.y,l4.z,r8.x\n" "itof r1.xy,r1.xy\n" //"add r1.xy,r1.xy,l31.yy\n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,0.0,0.0) r100, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,1.0,0.0) r101, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,2.0,0.0) r102, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,3.0,0.0) r103, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,4.0,0.0) r104, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,5.0,0.0) r105, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,6.0,0.0) r106, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,7.0,0.0) r107, r1.xy \n" "dcl_literal l7,0.0,2.0,4.0,8.0\n" "add r1.y,l7.w,r1.y\n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,0.0,0.0) r108, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,1.0,0.0) r109, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,2.0,0.0) r110, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,3.0,0.0) r111, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,4.0,0.0) r112, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,5.0,0.0) r113, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,6.0,0.0) r114, r1.xy \n" "sample_resource(1)_sampler(0)_aoffimmi(0.0,7.0,0.0) r115, r1.xy \n" // b... "dcl_literal l5,3,4,16,2\n" // "and r1.x,vTid.x,l5.x\n" // "ishr r1._y,vTid.x,l5.w\n" // "udiv r1._y,vTid.xx,l5.y\n" //"imad r1.x,r1.y_neg(xyzw),l5.x,vTid.x\n" "umod r1.x,vTid.x,l5.y\n" "udiv r1.y,vTid.x,l5.y\n" "imad r1.x,l5.y,r7.x,r1.x\n" "imad r1.y,l5.z,r8.x,r1.y\n" "itof r1.xy,r1.xy\n" "add r1.xy,r1.xy,l31.yy\n" "sample_resource(2)_sampler(0)_aoffimmi(0.0,0.0,0.0) r200, r1.xy \n" "lds_write_vec _lOffset(0) mem, r200 \n" "fence_threads_lds\n" "dcl_literal l10, 0,1,2,3\n" "dcl_literal l11, 4,5,6,7\n" "dcl_literal l12, 8,9,10,11\n" "dcl_literal l13, 12,13,14,15\n" "dcl_literal l14, 16,17,18,19\n" "dcl_literal l15, 20,21,22,23\n" "dcl_literal l16, 24,25,26,27\n" "dcl_literal l17, 28,29,30,31\n" "dcl_literal l18, 32,33,34,35\n" "dcl_literal l19, 36,37,38,39\n" "dcl_literal l20, 40,41,42,43\n" "dcl_literal l21, 44,45,46,47\n" "dcl_literal l22, 48,49,50,51\n" "dcl_literal l23, 52,53,54,55\n" "dcl_literal l24, 56,57,58,59\n" "dcl_literal l25, 60,61,62,63\n" "mov r600,r100\n" "lds_read_vec r400,l10.x0\n" "lds_read_vec r401,l10.y0\n" "lds_read_vec r402,l10.z0\n" "lds_read_vec r403,l10.w0\n" "call 0\n" "mov r600,r101\n" "lds_read_vec r400,l11.x0\n" "lds_read_vec r401,l11.y0\n" "lds_read_vec r402,l11.z0\n" "lds_read_vec r403,l11.w0\n" "call 0\n" "mov r600,r102\n" "lds_read_vec r400,l12.x0\n" "lds_read_vec r401,l12.y0\n" "lds_read_vec r402,l12.z0\n" "lds_read_vec r403,l12.w0\n" "call 0\n" "mov r600,r103\n" "lds_read_vec r400,l13.x0\n" "lds_read_vec r401,l13.y0\n" "lds_read_vec r402,l13.z0\n" "lds_read_vec r403,l13.w0\n" "call 0\n" "mov r600,r104\n" "lds_read_vec r400,l14.x0\n" "lds_read_vec r401,l14.y0\n" "lds_read_vec r402,l14.z0\n" "lds_read_vec r403,l14.w0\n" "call 0\n" "mov r600,r105\n" "lds_read_vec r400,l15.x0\n" "lds_read_vec r401,l15.y0\n" "lds_read_vec r402,l15.z0\n" "lds_read_vec r403,l15.w0\n" "call 0\n" "mov r600,r106\n" "lds_read_vec r400,l16.x0\n" "lds_read_vec r401,l16.y0\n" "lds_read_vec r402,l16.z0\n" "lds_read_vec r403,l16.w0\n" "call 0\n" "mov r600,r107\n" "lds_read_vec r400,l17.x0\n" "lds_read_vec r401,l17.y0\n" "lds_read_vec r402,l17.z0\n" "lds_read_vec r403,l17.w0\n" "call 0\n" "mov r600,r108\n" "lds_read_vec r400,l18.x0\n" "lds_read_vec r401,l18.y0\n" "lds_read_vec r402,l18.z0\n" "lds_read_vec r403,l18.w0\n" "call 0\n" "mov r600,r109\n" "lds_read_vec r400,l19.x0\n" "lds_read_vec r401,l19.y0\n" "lds_read_vec r402,l19.z0\n" "lds_read_vec r403,l19.w0\n" "call 0\n" "mov r600,r110\n" "lds_read_vec r400,l20.x0\n" "lds_read_vec r401,l20.y0\n" "lds_read_vec r402,l20.z0\n" "lds_read_vec r403,l20.w0\n" "call 0\n" "mov r600,r111\n" "lds_read_vec r400,l21.x0\n" "lds_read_vec r401,l21.y0\n" "lds_read_vec r402,l21.z0\n" "lds_read_vec r403,l21.w0\n" "call 0\n" "mov r600,r112\n" "lds_read_vec r400,l22.x0\n" "lds_read_vec r401,l22.y0\n" "lds_read_vec r402,l22.z0\n" "lds_read_vec r403,l22.w0\n" "call 0\n" "mov r600,r113\n" "lds_read_vec r400,l23.x0\n" "lds_read_vec r401,l23.y0\n" "lds_read_vec r402,l23.z0\n" "lds_read_vec r403,l23.w0\n" "call 0\n" "mov r600,r114\n" "lds_read_vec r400,l24.x0\n" "lds_read_vec r401,l24.y0\n" "lds_read_vec r402,l24.z0\n" "lds_read_vec r403,l24.w0\n" "call 0\n" "mov r600,r115\n" "lds_read_vec r400,l25.x0\n" "lds_read_vec r401,l25.y0\n" "lds_read_vec r402,l25.z0\n" "lds_read_vec r403,l25.w0\n" "call 0\n" "dcl_literal l6,1,0,0,0\n" "iadd r8.x,r8.x,l6.x\n" "endloop\n" //r500 will contain initial position of thread... "dcl_literal l30, 4,256,0,0\n" "imul r500.x,r7.x,l30.x\n" "imul r500.y,r7.y,l30.y\n" "imad r500.y,vTid.x,l30.x,r500.y\n" //r1 will contain initial memory location of thread... "imad r1.x,r500.y,cb0[1].z,r500.x\n" // "itof r300,vaTid.x\n" // "itof r310,vTid.x\n" //"lds_read_vec r311,l12.z0\n" //"mov g[vaTid.x],r900\n" "mov g[r1.x+0],r300\n" "mov g[r1.x+1],r301\n" "mov g[r1.x+2],r302\n" "mov g[r1.x+3],r303\n" "iadd r1.x,r1,cb0[1].z\n" "mov g[r1.x+0],r310\n" "mov g[r1.x+1],r311\n" "mov g[r1.x+2],r312\n" "mov g[r1.x+3],r313\n" "iadd r1.x,r1,cb0[1].z\n" "mov g[r1.x+0],r320\n" "mov g[r1.x+1],r321\n" "mov g[r1.x+2],r322\n" "mov g[r1.x+3],r323\n" "iadd r1.x,r1,cb0[1].z\n" "mov g[r1.x+0],r330\n" "mov g[r1.x+1],r331\n" "mov g[r1.x+2],r332\n" "mov g[r1.x+3],r333\n" "ret_dyn\n" "endmain\n" "func 0\n" "mad r300,r600.x,r400,r300\n" "mad r301,r600.x,r401,r301\n" "mad r302,r600.x,r402,r302\n" "mad r303,r600.x,r403,r303\n" "mad r310,r600.y,r400,r310\n" "mad r311,r600.y,r401,r311\n" "mad r312,r600.y,r402,r312\n" "mad r313,r600.y,r403,r313\n" "mad r320,r600.z,r400,r320\n" "mad r321,r600.z,r401,r321\n" "mad r322,r600.z,r402,r322\n" "mad r323,r600.z,r403,r323\n" "mad r330,r600.w,r400,r330\n" "mad r331,r600.w,r401,r331\n" "mad r332,r600.w,r402,r332\n" "mad r333,r600.w,r403,r333\n" "ret_dyn\n" "endfunc\n" "end\n" ; using namespace std; void maketestmatcol(int height,int width,float* mat) { for (int i=0;i