// Outer loop header: the body runs only while %remainingIters is non-zero.
9696outer_loop:
9797 setp.gt.u32 %p0, %remainingIters, 0; // %p0 = (%remainingIters > 0), unsigned compare
9898 @!%p0 bra outer_loop_end; // counter exhausted: skip the body and leave the loop
99+ sub.u32 %remainingIters, %remainingIters, 1; // consume one iteration before the body executes
100+
99101 {
100102 .reg .u32 %i;
101103 mov.u32 %i, 0;
@@ -113,7 +115,7 @@ outer_loop:
113115 inner_loop_end:
114116 }
115117
116- sub.u32 %remainingIters, %remainingIters, 1 ;
118+ bra outer_loop ;
117119outer_loop_end:
118120
119121 {
@@ -129,11 +131,19 @@ outer_loop_end:
129131
// Build the byte offset into the output, add it to %ptrOut, then store the accumulator fragment there.
130132 shl.b64 %outOffset, %stride, 7; // stride << 7 = stride * 128 = 4 bytes * 32 rows (one 32-row band)
131133 mul.lo.u64 %outOffset, %outOffset, %ctaY; // advance by %ctaY such 32-row bands
134+ cvt.u64.u32 %tmp, %tidY; // widen %tidY to 64 bits for the address arithmetic below
135+
136+ // Offset for the bottom half (extra 16 rows when bit 1 of %tidY is set).
137+ and.b64 %tmp, %tmp, 2; // keep only bit 1 of %tidY: result is 0 or 2 (original note: 2 = second row of block, 0 = first)
138+ mul.lo.u64 %tmp, %tmp, %stride; // %tmp = 0 or 2 * stride
139+ shl.b64 %tmp, %tmp, 5; // << 5 gives 0 or 64 * stride = 16 rows * stride * 4 bytes (the factor 2 from the mask is already included)
140+ add.u64 %outOffset, %outOffset, %tmp; // fold the half-tile offset into the band offset
141+
132142 add.u64 %outOffset, %outOffset, %outColumn; // add the column term (%outColumn is computed outside this view; presumably already in bytes — confirm)
133143 add.u64 %ptrOut, %ptrOut, %outOffset; // %ptrOut now addresses this fragment's destination
134144
135145 // Copy the eight f32 accumulator registers %out0..%out7 to %ptrOut.
136- wmma.store.d.sync.aligned.m16n16k16.global.row.f32 [%ptrOut], {%out0, %out1, %out2, %out3, %out4, %out5, %out6, %out7};
146+ wmma.store.d.sync.aligned.m16n16k16.global.row.f32 [%ptrOut], {%out0, %out1, %out2, %out3, %out4, %out5, %out6, %out7}, %stride32 ; // trailing %stride32 is the explicit stride operand of wmma.store (NOTE(review): confirm its units against the PTX ISA)
137147 }
138148
139149 ret;
0 commit comments