|
// |
|
// Generated by LLVM NVPTX Back-End |
|
// |
|
|
|
// PTX ISA version this module is written against. |
.version 8.2 |
|
// GPU architecture the module is compiled for. |
.target sm_89 |
|
// Addresses are 64 bits wide (the kernel's two address parameters are .u64). |
.address_size 64 |
|
|
|
// .globl triton__0d1d2de |
|
// Unsized byte array in shared memory, declared extern here. The kernel |
// triton__0d1d2de takes its address and uses it as a staging buffer between |
// its global load and its global stores. |
// NOTE(review): declared with 1-byte alignment although the kernel accesses |
// it with 16-bit loads/stores -- confirm the runtime places it at a suitably |
// aligned address. |
.extern .shared .align 1 .b8 global_smem[] |
|
|
|
// ----------------------------------------------------------------------- |
// Kernel triton__0d1d2de |
// |
// Reads 16-bit values from the buffer at param_0, converts each one from |
// bf16 to f32 (cvt.f32.bf16) and writes the 32-bit results to the buffer at |
// param_1, at the same element index the value was read from. |
// |
// Work split: .maxntid caps the CTA at 128 threads; each CTA covers 1024 |
// elements (ctaid.x << 10) and each thread loads 8 and stores 8 of them. |
// |
// Parameters: |
//   param_0 (.u64)  base address of the 16-bit input elements |
//   param_1 (.u64)  base address of the 32-bit output elements |
//   param_2 (.u32)  never read in this body; presumably the element |
//                   count -- TODO confirm against the launcher |
// |
// NOTE(review): every global access is predicated on %p1, which is set to |
// true, so no bounds masking takes place; presumably the buffers always hold |
// a multiple of 1024 elements -- confirm with the caller. |
// ----------------------------------------------------------------------- |
.visible .entry triton__0d1d2de( |
|
.param .u64 triton__0d1d2de_param_0, |
|
.param .u64 triton__0d1d2de_param_1, |
|
.param .u32 triton__0d1d2de_param_2 |
|
) |
|
.maxntid 128, 1, 1 |
|
{ |
|
// Virtual register pools: predicates, 16-bit, 32-bit and 64-bit registers. |
.reg .pred %p<4> |
|
.reg .b16 %rs<9> |
|
.reg .b32 %r<37> |
|
.reg .b64 %rd<13> |
|
// Each .loc gives "file line column" for the instructions that follow; |
// file 1 is the path named by the .file directive after this kernel. |
.loc 1 18 0 |
|
$L__func_begin0: |
|
.loc 1 18 0 |
|
|
|
|
|
// rd4 = input base address, rd5 = output base address. |
ld.param.u64 %rd4, [triton__0d1d2de_param_0] |
|
ld.param.u64 %rd5, [triton__0d1d2de_param_1] |
|
$L__tmp0: |
|
.loc 1 21 36 |
|
// r23 = tid.x & 127, the thread index within the CTA. |
mov.u32 %r22, %tid.x |
|
and.b32 %r23, %r22, 127 |
|
// r24 = tid * 8: this thread's element offset for the load, and later its |
// byte offset when reading the staging buffer back. |
shl.b32 %r24, %r23, 3 |
|
// r25 = tid * 4: this thread's element offset for the stores. |
shl.b32 %r25, %r23, 2 |
|
.loc 1 20 28 |
|
mov.u32 %r1, %ctaid.x |
|
.loc 1 20 33 |
|
// r26 = ctaid.x * 1024: index of the first element handled by this CTA. |
shl.b32 %r26, %r1, 10 |
|
.loc 1 21 23 |
|
// The low 10 bits of r26 are zero and r24/r25 are below 1024, so OR acts |
// as addition here: r27 = load element index, r28 = store element index. |
or.b32 %r27, %r26, %r24 |
|
or.b32 %r28, %r26, %r25 |
|
.loc 1 24 30 |
|
// rd1 = input base + r27 * 2 bytes (index sign-extended to 64 bits). |
mul.wide.s32 %rd6, %r27, 2 |
|
add.s64 %rd1, %rd4, %rd6 |
|
// p1 = true: the predicated accesses below always execute. |
mov.pred %p1, -1 |
|
.loc 1 24 35 |
|
// Zero the destination registers, then load 16 bytes (eight 16-bit |
// elements) from global memory as four 32-bit words. |
mov.u32 %r2, 0x0 |
|
mov.u32 %r3, 0x0 |
|
mov.u32 %r4, 0x0 |
|
mov.u32 %r5, 0x0 |
|
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ] |
|
// r29..r32 = upper 16 bits of each loaded word. |
shr.u32 %r29, %r2, 16 |
|
shr.u32 %r30, %r3, 16 |
|
shr.u32 %r31, %r4, 16 |
|
shr.u32 %r32, %r5, 16 |
|
.loc 1 24 44 |
|
// r35 = global_smem + tid * 16: this thread's 16-byte slot in shared memory. |
shl.b32 %r33, %r23, 4 |
|
mov.u32 %r34, global_smem |
|
add.s32 %r35, %r34, %r33 |
|
// Write the eight 16-bit elements to the slot: for each loaded word its |
// low half first, then its high half. After all threads have done this the |
// first 2048 bytes of global_smem hold the CTA's 1024 input elements in order. |
st.shared.u16 [%r35], %r2 |
|
st.shared.u16 [%r35+2], %r29 |
|
st.shared.u16 [%r35+4], %r3 |
|
st.shared.u16 [%r35+6], %r30 |
|
st.shared.u16 [%r35+8], %r4 |
|
st.shared.u16 [%r35+10], %r31 |
|
st.shared.u16 [%r35+12], %r5 |
|
st.shared.u16 [%r35+14], %r32 |
|
// Barrier: the reads below fetch elements written by other threads. |
bar.sync 0 |
|
// r36 = global_smem + tid * 8. Read back two runs of four elements: |
// element offsets 4*tid .. 4*tid+3 and, 1024 bytes (512 elements) further, |
// 4*tid+512 .. 4*tid+515. Presumably this regrouping exists so that each |
// thread can issue 16-byte f32 stores -- TODO confirm. |
add.s32 %r36, %r34, %r24 |
|
ld.shared.u16 %rs1, [%r36] |
|
ld.shared.u16 %rs2, [%r36+2] |
|
ld.shared.u16 %rs3, [%r36+4] |
|
ld.shared.u16 %rs4, [%r36+6] |
|
ld.shared.u16 %rs5, [%r36+1024] |
|
ld.shared.u16 %rs6, [%r36+1026] |
|
ld.shared.u16 %rs7, [%r36+1028] |
|
ld.shared.u16 %rs8, [%r36+1030] |
|
// Convert the eight bf16 values to f32; results live in 32-bit registers. |
cvt.f32.bf16 %r14, %rs1 |
|
cvt.f32.bf16 %r15, %rs2 |
|
cvt.f32.bf16 %r16, %rs3 |
|
cvt.f32.bf16 %r17, %rs4 |
|
cvt.f32.bf16 %r18, %rs5 |
|
cvt.f32.bf16 %r19, %rs6 |
|
cvt.f32.bf16 %r20, %rs7 |
|
cvt.f32.bf16 %r21, %rs8 |
|
.loc 1 26 25 |
|
// rd2 = output base + r28 * 4 bytes: address of the first run of four. |
mul.wide.s32 %rd7, %r28, 4 |
|
add.s64 %rd2, %rd5, %rd7 |
|
// rd3 = the same element index rebuilt in 64 bits (r26 sign-extended OR |
// r25 zero-extended), scaled by 4, plus 2048 bytes: 512 f32 elements past |
// rd2, matching the second run read from shared memory. |
cvt.s64.s32 %rd8, %r26 |
|
cvt.u64.u32 %rd9, %r25 |
|
or.b64 %rd10, %rd8, %rd9 |
|
shl.b64 %rd11, %rd10, 2 |
|
add.s64 %rd12, %rd5, %rd11 |
|
add.s64 %rd3, %rd12, 2048 |
|
.loc 1 26 36 |
|
// Two 16-byte vector stores of four f32 values each to global memory. |
@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 } |
|
@%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 } |
|
.loc 1 26 4 |
|
ret |
|
$L__tmp1: |
|
// End-of-function label; the .debug_info section uses it together with |
// $L__func_begin0 as the function's address range. |
$L__func_end0: |
|
|
|
} |
|
.file 1 "/tmp/torchinductor_root/k6/ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py" |
|
// DWARF abbreviation table, emitted as raw bytes. The attribute and form |
// names in the comments below are the standard DWARF meanings of these |
// byte values; the two abbreviations are the ones used by .debug_info. |
.section .debug_abbrev |
|
{ |
|
// Abbreviation 1: tag 17 (DW_TAG_compile_unit), has children (1). |
.b8 1 |
|
.b8 17 |
|
.b8 1 |
|
// Attribute 37 (DW_AT_producer), form 8 (DW_FORM_string). |
.b8 37 |
|
.b8 8 |
|
// Attribute 19 (DW_AT_language), form 5 (DW_FORM_data2). |
.b8 19 |
|
.b8 5 |
|
// Attribute 3 (DW_AT_name), form 8 (DW_FORM_string). |
.b8 3 |
|
.b8 8 |
|
// Attribute 16 (DW_AT_stmt_list), form 6 (DW_FORM_data4). |
.b8 16 |
|
.b8 6 |
|
// Attribute 27 (DW_AT_comp_dir), form 8 (DW_FORM_string). |
.b8 27 |
|
.b8 8 |
|
// Attribute 0x2134 as a two-byte ULEB128 (180, 66), DW_AT_GNU_pubnames; |
// form 12 (DW_FORM_flag). |
.b8 180 |
|
.b8 66 |
|
.b8 12 |
|
// Attribute 17 (DW_AT_low_pc), form 1 (DW_FORM_addr). |
.b8 17 |
|
.b8 1 |
|
// Attribute 18 (DW_AT_high_pc), form 1 (DW_FORM_addr). |
.b8 18 |
|
.b8 1 |
|
// End of abbreviation 1's attribute list. |
.b8 0 |
|
.b8 0 |
|
// Abbreviation 2: tag 46 (DW_TAG_subprogram), no children (0). |
.b8 2 |
|
.b8 46 |
|
.b8 0 |
|
// Attribute 17 (DW_AT_low_pc), form 1 (DW_FORM_addr). |
.b8 17 |
|
.b8 1 |
|
// Attribute 18 (DW_AT_high_pc), form 1 (DW_FORM_addr). |
.b8 18 |
|
.b8 1 |
|
// Attribute 64 (DW_AT_frame_base), form 10 (DW_FORM_block1). |
.b8 64 |
|
.b8 10 |
|
// Attribute 0x2007 as a two-byte ULEB128 (135, 64), |
// DW_AT_MIPS_linkage_name; form 8 (DW_FORM_string). |
.b8 135 |
|
.b8 64 |
|
.b8 8 |
|
// Attribute 3 (DW_AT_name), form 8 (DW_FORM_string). |
.b8 3 |
|
.b8 8 |
|
// Attribute 58 (DW_AT_decl_file), form 11 (DW_FORM_data1). |
.b8 58 |
|
.b8 11 |
|
// Attribute 59 (DW_AT_decl_line), form 11 (DW_FORM_data1). |
.b8 59 |
|
.b8 11 |
|
// Attribute 63 (DW_AT_external), form 12 (DW_FORM_flag). |
.b8 63 |
|
.b8 12 |
|
// End of abbreviation 2's attribute list. |
.b8 0 |
|
.b8 0 |
|
// End of the abbreviation table. |
.b8 0 |
|
} |
|
// DWARF debug information, emitted as raw bytes: one compile-unit entry |
// with one subprogram entry for the kernel triton__0d1d2de. Field meanings |
// follow the layout declared in .debug_abbrev. |
.section .debug_info |
|
{ |
|
// Unit length: 176 bytes follow this field. |
.b32 176 |
|
// DWARF version 2, as a 16-bit value (low byte first). |
.b8 2 |
|
.b8 0 |
|
// Offset of this unit's abbreviations in .debug_abbrev. |
.b32 .debug_abbrev |
|
// Address size: 8 bytes. |
.b8 8 |
|
// Compile-unit entry, using abbreviation 1. |
.b8 1 |
|
// Producer string: ASCII "triton", NUL-terminated. |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 0 |
|
// Language code 2 as a 16-bit value (DW_LANG_C in the DWARF standard). |
.b8 2 |
|
.b8 0 |
|
// Unit name: ASCII |
// "ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py", |
// NUL-terminated; the same file name as in the .file directive. |
.b8 99 |
|
.b8 107 |
|
.b8 54 |
|
.b8 50 |
|
.b8 107 |
|
.b8 50 |
|
.b8 120 |
|
.b8 122 |
|
.b8 98 |
|
.b8 98 |
|
.b8 54 |
|
.b8 53 |
|
.b8 55 |
|
.b8 115 |
|
.b8 110 |
|
.b8 102 |
|
.b8 100 |
|
.b8 111 |
|
.b8 119 |
|
.b8 119 |
|
.b8 97 |
|
.b8 110 |
|
.b8 122 |
|
.b8 115 |
|
.b8 122 |
|
.b8 97 |
|
.b8 105 |
|
.b8 106 |
|
.b8 54 |
|
.b8 113 |
|
.b8 122 |
|
.b8 119 |
|
.b8 54 |
|
.b8 118 |
|
.b8 117 |
|
.b8 99 |
|
.b8 55 |
|
.b8 99 |
|
.b8 102 |
|
.b8 105 |
|
.b8 100 |
|
.b8 111 |
|
.b8 109 |
|
.b8 106 |
|
.b8 112 |
|
.b8 107 |
|
.b8 107 |
|
.b8 54 |
|
.b8 105 |
|
.b8 103 |
|
.b8 99 |
|
.b8 109 |
|
.b8 46 |
|
.b8 112 |
|
.b8 121 |
|
.b8 0 |
|
// Offset of this unit's line-number program in .debug_line. |
.b32 .debug_line |
|
// Compilation directory: ASCII "/tmp/torchinductor_root/k6", |
// NUL-terminated. |
.b8 47 |
|
.b8 116 |
|
.b8 109 |
|
.b8 112 |
|
.b8 47 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 99 |
|
.b8 104 |
|
.b8 105 |
|
.b8 110 |
|
.b8 100 |
|
.b8 117 |
|
.b8 99 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 95 |
|
.b8 114 |
|
.b8 111 |
|
.b8 111 |
|
.b8 116 |
|
.b8 47 |
|
.b8 107 |
|
.b8 54 |
|
.b8 0 |
|
// GNU pubnames flag: true. |
.b8 1 |
|
// Address range of the unit: start and end labels of the kernel. |
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
// Subprogram entry, using abbreviation 2. |
.b8 2 |
|
// Address range of the function: the same two labels. |
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
// Frame base: a 1-byte location expression holding opcode 156 |
// (0x9c, DW_OP_call_frame_cfa). |
.b8 1 |
|
.b8 156 |
|
// Linkage name: ASCII "triton__0d1d2de", NUL-terminated. |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Name: ASCII "triton__0d1d2de", NUL-terminated. |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Declared in file 1, at line 18 (the line used by the kernel's first .loc). |
.b8 1 |
|
.b8 18 |
|
// External flag: true. |
.b8 1 |
|
// End of the compile unit's children. |
.b8 0 |
|
} |
|
// DWARF public-names table: a single entry naming the kernel and giving |
// the offset of its entry inside .debug_info. |
.section .debug_pubnames |
|
{ |
|
// Length of the table, computed from the two labels below. |
.b32 $L__pubNames_end0-$L__pubNames_start0 |
|
$L__pubNames_start0: |
|
// Table version 2, as a 16-bit value (low byte first). |
.b8 2 |
|
.b8 0 |
|
// Offset of the described unit in .debug_info. |
.b32 .debug_info |
|
// Size of that unit: 180 bytes (its 176-byte unit length plus the 4-byte |
// length field itself). |
.b32 180 |
|
// Offset 125 within the unit: where the subprogram entry starts. |
.b32 125 |
|
// Name: ASCII "triton__0d1d2de", NUL-terminated. |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Zero offset terminates the list of entries. |
.b32 0 |
|
$L__pubNames_end0: |
|
} |
|
// DWARF public-types table: header only, with no type entries. |
.section .debug_pubtypes |
|
{ |
|
// Length of the table, computed from the two labels below. |
.b32 $L__pubTypes_end0-$L__pubTypes_start0 |
|
$L__pubTypes_start0: |
|
// Table version 2, as a 16-bit value (low byte first). |
.b8 2 |
|
.b8 0 |
|
// Offset of the described unit in .debug_info. |
.b32 .debug_info |
|
// Size of that unit: 180 bytes. |
.b32 180 |
|
// Zero offset terminates the (empty) list of entries. |
.b32 0 |
|
$L__pubTypes_end0: |
|
} |
|
.section .debug_loc { } |
|
|