// (c) 2008 Steven Gratton
//
// AMD IL pixel-shader kernels (each begins with "il_ps_2_0"), kept as C++
// strings built from adjacent string literals, one IL line per literal.
//
// Conventions shared by every kernel below (taken from the in-string IL
// comments and the visible arithmetic):
//   * cb0[0] = (matpitchinblocks, matheightinblocks, counter, matwidthinblocks).
//   * l0     = integer literals (0, 1, 2, 4).
//   * A block occupies four consecutive g[] entries (g[a] .. g[a+3]); its
//     address is a = (cb0[0].x * r0.y + r0.x) * 4, i.e. the pitch multiplies
//     r0.y and r0.x is the offset along the pitch.
//   * The guard "inegate / iadd / imax / ret_logicalz" computes
//     max(0, cb0[0].w - r0.x) and presumably returns early when that is zero,
//     i.e. when r0.x >= matwidthinblocks (see the IL spec for ret_logicalz).
//   * "x_neg(xyzw)" on a source operand presumably negates it, so
//     "mad d,a,b_neg(xyzw),c" reads as d = c - a*b -- TODO confirm against the
//     IL spec.
//
// use "itof r9,r1\n" "mov g[vObjIndex0.x],r9\n" etc. for debugging

// maketestmat: writes one 4-entry block per thread at block position (i, i),
// where i = vObjIndex0.x. Entry k of the block holds float(4*i + k + 1) in
// component k and the l2.w literal (1.e-1f according to the dcl_literal
// comment) in the other three components. Swizzle digit '0' is assumed to
// select the constant 0.0.
std::string maketestmat =
    "il_ps_2_0\n"
    "dcl_input_generic vObjIndex0 \n"
    "dcl_input_position_interp(linear_noperspective) vWinCoord0.xy\n"
    "dcl_cb cb0[1] ; matpitchinblocks,matheightinblocks, counter,matwidthinblocks\n"
    "dcl_literal l0, 0x00000000, 0x00000001, 0x00000002,0x00000004 ; int 0,1,2,4\n"
    "dcl_literal l1, 0x00000001, 0x00000002, 0x00000003,0x00000004 ; int 1,2,3,4\n"
    "dcl_literal l2, 0x358637BD , 0x38D1B717, 0x3C23D70A,0x3DCCCCCD ; float 1.e-6f,1.e-4f, 1.e-2f, 1.e-1f\n"
    // r0.x = r0.y = vObjIndex0.x (adding l0.x, which is 0).
    "iadd r0.x,vObjIndex0.x,l0.x \n"
    "iadd r0.y,r0.x,l0.x \n"
    "; r0 stores block position\n"
    // Bounds guard on r0.x versus cb0[0].w (see file header).
    "inegate r8,r0\n"
    "iadd r4.x,cb0[0].w,r8.x\n"
    "imax r5.x,l0.x,r4.x\n"
    "ret_logicalz r5.x\n"
    "\n"
    // r1.x = (pitch * r0.y + r0.x) * 4.
    "imad r1.x,cb0[0].x,r0.y,r0.x \n"
    "imul r1.x,r1.x,l0.w \n"
    ";r1 stores memory position\n"
    // r2 = float(4 * vObjIndex0.x + (1,2,3,4)); r6 = l2.w in all components.
    "imul r2,vObjIndex0.xxxx,l0.w\n"
    "iadd r2,r2,l1\n"
    "itof r2,r2\n"
    "mov r6,l2.wwww\n"
    // Build and store the four entries: r2 component on the diagonal slot,
    // r6.y in the remaining slots.
    "add r3,r2.x000,r6.0yyy\n"
    "mov g[r1.x],r3\n"
    "add r3,r2.0y00,r6.y0yy\n"
    "mov g[r1.x+1],r3\n"
    "add r3,r2.00z0,r6.yy0y\n"
    "mov g[r1.x+2],r3\n"
    "add r3,r2.000w,r6.yyy0\n"
    "mov g[r1.x+3],r3\n"
    "ret_dyn\n"
    "end\n";

// choltopleft: operates in place on the block at position (p, p) with
// p = vObjIndex0.x + counter. The sqrt / rcp / mul / mad sequence looks like a
// fully unrolled 4x4 Cholesky factorisation of the lower triangle held in
// r10.x, r11.xy, r12.xyz, r13.xyzw -- NOTE(review): interpretation inferred
// from the name and instruction pattern. Components outside that lower
// triangle (r10.yzw, r11.zw, r12.w) are loaded and stored back unmodified.
// Unlike the other kernels, there is no bounds guard here.
std::string choltopleft =
    "il_ps_2_0\n"
    "dcl_input_generic vObjIndex0 \n"
    "dcl_input_position_interp(linear_noperspective) vWinCoord0.xy\n"
    "dcl_cb cb0[1] ; matpitchinblocks,matheightinblocks, counter,matwidthinblocks\n"
    "dcl_literal l0, 0x00000000, 0x00000001, 0x00000002,0x00000004 ; int 0,1,2,4\n"
    // r0.x = r0.y = vObjIndex0.x + counter.
    "iadd r0.x,vObjIndex0.x,cb0[0].z \n"
    "iadd r0.y,r0.x,l0.x \n"
    "; r0 stores block position\n"
    "imad r1.x,cb0[0].x,r0.y,r0.x \n"
    "imul r1.x,r1.x,l0.w \n"
    ";r1 stores memory position\n"
    "mov r10,g[r1.x]\n"
    "mov r11,g[r1.x+1]\n"
    "mov r12,g[r1.x+2]\n"
    "mov r13,g[r1.x+3]\n"
    // Column x: r10.x = sqrt(r10.x); the rcp reads r10.x after the sqrt has
    // overwritten it, so r30.x is the reciprocal of the square root.
    "sqrt r10.x,r10.xxxx\n"
    "rcp r30.x,r10.xxxx\n"
    "mul r11.x,r11.x,r30.x\n"
    "mul r12.x,r12.x,r30.x\n"
    "mul r13.x,r13.x,r30.x\n"
    // Subtract products of the scaled column-x entries from the remaining
    // lower-triangle entries.
    "mad r11.y,r11.x,r11.x_neg(xyzw),r11.y\n"
    "mad r12.z,r12.x,r12.x_neg(xyzw),r12.z\n"
    "mad r13.w,r13.x,r13.x_neg(xyzw),r13.w\n"
    "mad r12.y,r11.x,r12.x_neg(xyzw),r12.y\n"
    "mad r13.y,r11.x,r13.x_neg(xyzw),r13.y\n"
    "mad r13.z,r12.x,r13.x_neg(xyzw),r13.z\n"
    // Column y: same pattern with pivot r11.y.
    "sqrt r11.y,r11.yyyy\n"
    "rcp r30.x,r11.yyyy\n"
    "mul r12.y,r12.y,r30.x\n"
    "mul r13.y,r13.y,r30.x\n"
    "mad r12.z,r12.y,r12.y_neg(xyzw),r12.z\n"
    "mad r13.w,r13.y,r13.y_neg(xyzw),r13.w\n"
    "mad r13.z,r12.y,r13.y_neg(xyzw),r13.z\n"
    // Column z: pivot r12.z.
    "sqrt r12.z,r12.zzzz\n"
    "rcp r30.x,r12.zzzz\n"
    "mul r13.z,r13.z,r30.x\n"
    "mad r13.w,r13.z,r13.z_neg(xyzw),r13.w\n"
    // Column w: only the pivot remains.
    "sqrt r13.w,r13.wwww\n"
    "mov g[r1.x],r10\n"
    "mov g[r1.x+1],r11\n"
    "mov g[r1.x+2],r12\n"
    "mov g[r1.x+3],r13\n"
    "ret_dyn\n"
    "end\n";

// cholstrip: updates in place the block at r0 = (vObjIndex0.x + counter + 1,
// counter) using the block at (counter, counter), which the IL comment calls
// "the l^T matrix". Each of r10..r13 is processed independently with the same
// recurrence, component by component:
//   x = x / r20.x
//   y = (y - r21.x*x) / r21.y
//   z = (z - r22.x*x - r22.y*y) / r22.z
//   w = (w - r23.x*x - r23.y*y - r23.z*z) / r23.w
// i.e. a forward substitution that reads only the lower-triangle components of
// r20..r23.
std::string cholstrip =
    "il_ps_2_0\n"
    "dcl_input_generic vObjIndex0 \n"
    "dcl_input_position_interp(linear_noperspective) vWinCoord0.xy\n"
    "dcl_cb cb0[1] ; matpitchinblocks,matheightinblocks, counter,matwidthinblocks\n"
    "dcl_literal l0, 0x00000000, 0x00000001, 0x00000002,0x00000004 ; int 0,1,2,4\n"
    // r0.x = vObjIndex0.x + counter + 1, r0.y = counter.
    "iadd r0.x,vObjIndex0.x,cb0[0].z \n"
    "iadd r0.x,r0.x,l0.y\n"
    "mov r0.y,cb0[0].z \n"
    "; check we should proceed...\n"
    "; needed due to quad counting restrictions\n"
    "inegate r8,r0\n"
    "iadd r4.x,cb0[0].w,r8.x\n"
    "imax r5.x,l0.x,r4.x\n"
    "ret_logicalz r5.x\n"
    "imad r1.x,cb0[0].x,r0.y,r0.x \n"
    "imul r1.x,r1.x,l0.w \n"
    "; r1.x contains position of matrix to operate on\n"
    // r2.x = (pitch * counter + counter) * 4.
    "imad r2.x,cb0[0].x,cb0[0].z,cb0[0].z \n"
    "imul r2.x,r2.x,l0.w \n"
    "; r2.x contains position of the l^T matrix \n"
    "mov r10,g[r1.x]\n"
    "mov r11,g[r1.x+1]\n"
    "mov r12,g[r1.x+2]\n"
    "mov r13,g[r1.x+3]\n"
    "mov r20,g[r2.x]\n"
    "mov r21,g[r2.x+1]\n"
    "mov r22,g[r2.x+2]\n"
    "mov r23,g[r2.x+3]\n"
    // Recurrence applied to r10.
    "div r10.x,r10.x,r20.x\n"
    "mad r10.y,r21.x,r10.x_neg(xyzw),r10.y\n"
    "div r10.y,r10.y,r21.y\n"
    "mad r10.z,r22.x,r10.x_neg(xyzw),r10.z\n"
    "mad r10.z,r22.y,r10.y_neg(xyzw),r10.z\n"
    "div r10.z,r10.z,r22.z\n"
    "mad r10.w,r23.x,r10.x_neg(xyzw),r10.w\n"
    "mad r10.w,r23.y,r10.y_neg(xyzw),r10.w\n"
    "mad r10.w,r23.z,r10.z_neg(xyzw),r10.w\n"
    "div r10.w,r10.w,r23.w\n"
    // Recurrence applied to r11.
    "div r11.x,r11.x,r20.x\n"
    "mad r11.y,r21.x,r11.x_neg(xyzw),r11.y\n"
    "div r11.y,r11.y,r21.y\n"
    "mad r11.z,r22.x,r11.x_neg(xyzw),r11.z\n"
    "mad r11.z,r22.y,r11.y_neg(xyzw),r11.z\n"
    "div r11.z,r11.z,r22.z\n"
    "mad r11.w,r23.x,r11.x_neg(xyzw),r11.w\n"
    "mad r11.w,r23.y,r11.y_neg(xyzw),r11.w\n"
    "mad r11.w,r23.z,r11.z_neg(xyzw),r11.w\n"
    "div r11.w,r11.w,r23.w\n"
    // Recurrence applied to r12.
    "div r12.x,r12.x,r20.x\n"
    "mad r12.y,r21.x,r12.x_neg(xyzw),r12.y\n"
    "div r12.y,r12.y,r21.y\n"
    "mad r12.z,r22.x,r12.x_neg(xyzw),r12.z\n"
    "mad r12.z,r22.y,r12.y_neg(xyzw),r12.z\n"
    "div r12.z,r12.z,r22.z\n"
    "mad r12.w,r23.x,r12.x_neg(xyzw),r12.w\n"
    "mad r12.w,r23.y,r12.y_neg(xyzw),r12.w\n"
    "mad r12.w,r23.z,r12.z_neg(xyzw),r12.w\n"
    "div r12.w,r12.w,r23.w\n"
    // Recurrence applied to r13.
    "div r13.x,r13.x,r20.x\n"
    "mad r13.y,r21.x,r13.x_neg(xyzw),r13.y\n"
    "div r13.y,r13.y,r21.y\n"
    "mad r13.z,r22.x,r13.x_neg(xyzw),r13.z\n"
    "mad r13.z,r22.y,r13.y_neg(xyzw),r13.z\n"
    "div r13.z,r13.z,r22.z\n"
    "mad r13.w,r23.x,r13.x_neg(xyzw),r13.w\n"
    "mad r13.w,r23.y,r13.y_neg(xyzw),r13.w\n"
    "mad r13.w,r23.z,r13.z_neg(xyzw),r13.w\n"
    "div r13.w,r13.w,r23.w\n"
    "mov g[r1.x],r10\n"
    "mov g[r1.x+1],r11\n"
    "mov g[r1.x+2],r12\n"
    "mov g[r1.x+3],r13\n"
    "ret_dyn\n"
    "end\n";

// choldiag: updates in place the block at (p, p), p = vObjIndex0.x + counter + 1
// ("main matrix"), using the block at r0.x = p, r0.y = counter ("top matrix",
// loaded into r20..r23). It subtracts the pairwise 4-component dot products of
// r20..r23 from r10..r13; the destination masks (x___, xy__, xyz_, xyzw)
// appear to restrict the writes to the lower-triangle components, matching the
// components of r30..r33 that the dp4_ieee instructions actually fill.
std::string choldiag =
    "il_ps_2_0\n"
    "dcl_input_generic vObjIndex0 \n"
    "dcl_input_position_interp(linear_noperspective) vWinCoord0.xy\n"
    "dcl_cb cb0[1] ; matpitchinblocks,matheightinblocks, counter,matwidthinblocks\n"
    "dcl_literal l0, 0x00000000, 0x00000001, 0x00000002,0x00000004 ; int 0,1,2,4\n"
    // r0.x = r0.y = vObjIndex0.x + counter + 1 (r0.y is derived before r0.x is
    // incremented, so both end up equal).
    "iadd r0.x,vObjIndex0.x,cb0[0].z \n"
    "iadd r0.y,r0.x,l0.y \n"
    "iadd r0.x,r0.x,l0.y \n"
    "; check we should proceed...\n"
    "; needed due to quad counting restrictions\n"
    "inegate r8,r0\n"
    "iadd r4.x,cb0[0].w,r8.x\n"
    "imax r5.x,l0.x,r4.x\n"
    "ret_logicalz r5.x\n"
    "\n"
    // r1.x = (pitch * r0.y + r0.x) * 4; r2.x = (pitch * counter + r0.x) * 4.
    "imad r1.x,cb0[0].x,r0.y,r0.x \n"
    "imul r1.x,r1.x,l0.w \n"
    "imad r2.x,cb0[0].x,cb0[0].z,r0.x \n"
    "imul r2.x,r2.x,l0.w \n"
    "; r1.x contains location of main matrix, r2.x that of top matrix\n"
    "mov r10,g[r1.x]\n"
    "mov r11,g[r1.x+1]\n"
    "mov r12,g[r1.x+2]\n"
    "mov r13,g[r1.x+3]\n"
    "mov r20,g[r2.x]\n"
    "mov r21,g[r2.x+1]\n"
    "mov r22,g[r2.x+2]\n"
    "mov r23,g[r2.x+3]\n"
    // r3j.<c> = dot(r2c, r2j) for c <= j (lower triangle only).
    "dp4_ieee r30.x,r20,r20\n"
    "dp4_ieee r31.x,r20,r21\n"
    "dp4_ieee r31.y,r21,r21\n"
    "dp4_ieee r32.x,r20,r22\n"
    "dp4_ieee r32.y,r21,r22\n"
    "dp4_ieee r32.z,r22,r22\n"
    "dp4_ieee r33.x,r20,r23\n"
    "dp4_ieee r33.y,r21,r23\n"
    "dp4_ieee r33.z,r22,r23\n"
    "dp4_ieee r33.w,r23,r23\n"
    "sub r10.x___,r10,r30\n"
    "sub r11.xy__,r11,r31\n"
    "sub r12.xyz_,r12,r32\n"
    "sub r13.xyzw,r13,r33\n"
    "mov g[r1.x],r10\n"
    "mov g[r1.x+1],r11\n"
    "mov g[r1.x+2],r12\n"
    "mov g[r1.x+3],r13\n"
    // Disabled debugging variant: overwrite the block with float(counter).
    /*
    "itof r9,cb0[0].z\n"
    "mov g[r1.x],r9\n"
    "mov g[r1.x+1],r9\n"
    "mov g[r1.x+2],r9\n"
    "mov g[r1.x+3],r9\n"
    */
    "ret_dyn\n"
    "end\n";

// cholhiup: each thread fixes q = vObjIndex0.x + counter + 1 (kept in r0.y) and
// walks r0.x from q + 1 up to matwidthinblocks. Before the loop it loads the
// "common matrix" r10..r13 from block index pitch*counter + q. On every
// iteration it loads the "main" block (r20..r23, at pitch*q + r0.x) and the
// "top" block (r30..r33, at pitch*counter + r0.x), forms
// r4k.<c> = dot(r1c, r3k) for all 16 (c, k) pairs, and stores main - product
// back over the main block. Both addresses then advance by one block
// (4 entries).
std::string cholhiup=
    "il_ps_2_0\n"
    "dcl_input_generic vObjIndex0 \n"
    "dcl_input_position_interp(linear_noperspective) vWinCoord0.xy\n"
    "dcl_cb cb0[1] ; matpitchinblocks,matheightinblocks, counter,matwidthinblocks\n"
    "dcl_literal l0, 0x00000000, 0x00000001, 0x00000002,0x00000004 ; int 0,1,2,4\n"
    // r0.y = vObjIndex0.x + counter + 1; r0.x = vObjIndex0.x + counter + 2.
    "iadd r0.x,vObjIndex0.x,cb0[0].z \n"
    "iadd r0.y,r0.x,l0.y \n"
    "iadd r0.x,r0.x,l0.z \n"
    "\n"
    "; check we should proceed...\n"
    "; needed due to quad counting restrictions\n"
    // This guard also matters for the loop below: the loop exit test looks for
    // (cb0[0].w - r0.x) reaching zero, so the thread must start with
    // r0.x < cb0[0].w.
    "inegate r8,r0\n"
    "iadd r4.x,cb0[0].w,r8.x\n"
    "imax r5.x,l0.x,r4.x\n"
    "ret_logicalz r5.x\n"
    "\n"
    "; setting up the memory locations: r1.x is to be the start of the subblock \n"
    "; in question and r2.x is to be the start of the top matrix...\n"
    "imad r1.x,cb0[0].x,r0.y,r0.x \n"
    "imul r1.x,r1.x,l0.w\n"
    // First use of r2.x: address of the common matrix, (pitch*counter + r0.y)*4.
    "imad r2.x,cb0[0].x,cb0[0].z,r0.y \n"
    "imul r2.x,r2.x,l0.w \n"
    " ; Nb. you have to use the pitch, i.e. local mem is padded! \n"
    "mov r10, g[r2.x] ; reading in the common matrix\n"
    "mov r11, g[r2.x+1]\n"
    "mov r12, g[r2.x+2]\n"
    "mov r13, g[r2.x+3]\n"
    ";now make r2 point to the top matrix...\n"
    // Second use of r2.x: (pitch*counter + r0.x)*4.
    "imad r2.x,cb0[0].x,cb0[0].z,r0.x\n"
    "imul r2.x,r2.x,l0.w \n"
    "\n"
    "\n"
    "\n"
    " \n"
    "; beginning loop\n"
    "whileloop\n"
    "\n"
    " \n"
    "\n"
    // Exit test: r6.x = cb0[0].w - r0.x; break_logicalz presumably leaves the
    // loop when r6.x is zero.
    "inegate r7,r0\n"
    "iadd r6.x,cb0[0].w,r7.x ; 0 if we've gone too far\n"
    "break_logicalz r6.x; exit loop\n"
    // Disabled debugging output:
    //"itof r9,vObjIndex0.xxxx\n"
    //"mov g[vObjIndex0.x+4],r9\n"
    "\n"
    "; read in new main matrix and top matrix\n"
    "mov r20, g[r1.x+0] ;main\n"
    "mov r21, g[r1.x+1]\n"
    "mov r22, g[r1.x+2]\n"
    "mov r23, g[r1.x+3]\n"
    "\n"
    "mov r30, g[r2.x+0] ;top\n"
    "mov r31, g[r2.x+1]\n"
    "mov r32, g[r2.x+2]\n"
    "mov r33, g[r2.x+3]\n"
    "\n"
    "; multiply and store\n"
    "\n"
    " \n"
    // r4k.<c> = dot(r1c, r3k): common-matrix entries against top-block entries.
    "dp4_ieee r40.x,r10,r30\n"
    "dp4_ieee r40.y,r11,r30\n"
    "dp4_ieee r40.z,r12,r30\n"
    "dp4_ieee r40.w,r13,r30\n"
    "\n"
    "dp4_ieee r41.x,r10,r31\n"
    "dp4_ieee r41.y,r11,r31\n"
    "dp4_ieee r41.z,r12,r31\n"
    "dp4_ieee r41.w,r13,r31\n"
    "\n"
    "dp4_ieee r42.x,r10,r32\n"
    "dp4_ieee r42.y,r11,r32\n"
    "dp4_ieee r42.z,r12,r32\n"
    "dp4_ieee r42.w,r13,r32\n"
    "\n"
    "dp4_ieee r43.x,r10,r33\n"
    "dp4_ieee r43.y,r11,r33\n"
    "dp4_ieee r43.z,r12,r33\n"
    "dp4_ieee r43.w,r13,r33\n"
    "\n"
    // r5k = main - product, written back over the main block.
    "sub r50,r20,r40\n"
    "sub r51,r21,r41\n"
    "sub r52,r22,r42\n"
    "sub r53,r23,r43\n"
    "\n"
    " \n"
    "mov g[r1.x],r50\n"
    "mov g[r1.x+1],r51\n"
    "mov g[r1.x+2],r52\n"
    "mov g[r1.x+3],r53\n"
    "; increment counters and reading positions \n"
    // r0.x advances by 1 block; r1.x and r2.x advance by 4 entries (one block).
    "iadd r0.x, l0.y,r0.x \n"
    "iadd r1.x,l0.w,r1.x\n"
    "iadd r2.x,l0.w,r2.x\n"
    "\n"
    "\n"
    "\n"
    "endloop\n"
    "\n"
    "\n"
    "\n"
    "\n"
    "\n"
    "ret_dyn\n"
    "end\n"
    ;

// Disabled debugging snippet: overwrite the current block with
// float(vObjIndex0.x).
/*
"itof r9,vObjIndex0.xxxx\n"
"mov g[r1.x],r9\n"
"mov g[r1.x+1],r9\n"
"mov g[r1.x+2],r9\n"
"mov g[r1.x+3],r9\n"
*/

// Disabled debugging snippet built around vWinCoord0 and vObjIndex0.
/*
"mov r9,vWinCoord0.xy00\n"
"flr r9,r9\n"
"itof r9.w,vObjIndex0.xxxx\n"
"mov g[vObjIndex0.x],r9.0000\n"
*/

/* for testing doubles...
"mov r17,g[0] \n"
"mov r18,g[1] \n"
"dadd r19.xy,r17.xy,r18.xy_neg(xyzw) \n"
"mov g[0],r19.xy01 \n"
*/