Skip to content

Commit ad18a8f

Browse files
committed
kernel works
1 parent c2aa6fa commit ad18a8f

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

learn_ptx/kernels/matmul_wmma_v1.ptx

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@
9696
outer_loop:
9797
setp.gt.u32 %p0, %remainingIters, 0;
9898
@!%p0 bra outer_loop_end;
99+
sub.u32 %remainingIters, %remainingIters, 1;
100+
99101
{
100102
.reg .u32 %i;
101103
mov.u32 %i, 0;
@@ -113,7 +115,7 @@ outer_loop:
113115
inner_loop_end:
114116
}
115117

116-
sub.u32 %remainingIters, %remainingIters, 1;
118+
bra outer_loop;
117119
outer_loop_end:
118120

119121
{
@@ -129,11 +131,19 @@ outer_loop_end:
129131

130132
shl.b64 %outOffset, %stride, 7; // byte offset of a 32-row band: stride * 4 bytes * 32 rows = stride * 128
131133
mul.lo.u64 %outOffset, %outOffset, %ctaY;
134+
cvt.u64.u32 %tmp, %tidY;
135+
136+
// Offset for bottom half.
137+
and.b64 %tmp, %tmp, 2; // 2 if second row of block, 0 if first
138+
mul.lo.u64 %tmp, %tmp, %stride;
139+
shl.b64 %tmp, %tmp, 5; // bottom half: (2 * stride) << 5 = 16 rows * stride * 4 bytes (%tmp holds 2, not 1, so shift by 5 rather than 6)
140+
add.u64 %outOffset, %outOffset, %tmp;
141+
132142
add.u64 %outOffset, %outOffset, %outColumn;
133143
add.u64 %ptrOut, %ptrOut, %outOffset;
134144

135145
// Copy to %ptrOut.
136-
wmma.store.d.sync.aligned.m16n16k16.global.row.f32 [%ptrOut], {%out0, %out1, %out2, %out3, %out4, %out5, %out6, %out7};
146+
wmma.store.d.sync.aligned.m16n16k16.global.row.f32 [%ptrOut], {%out0, %out1, %out2, %out3, %out4, %out5, %out6, %out7}, %stride32;
137147
}
138148

139149
ret;

0 commit comments

Comments
 (0)