|
// |
|
// Generated by LLVM NVPTX Back-End |
|
// |
|
|
|
// PTX ISA version this module is written against.
.version 8.2 |
|
// Target architecture for which the module is compiled.
.target sm_89 |
|
// Addresses are 64 bits wide (the kernel below does its global address math in .b64 registers).
.address_size 64 |
|
|
|
// .globl triton__0d1d2d3de4e |
|
// Externally sized shared-memory byte array (1-byte alignment, no extent given here).
// The kernel takes its address and uses it as scratch space for its cross-thread reduction.
.extern .shared .align 1 .b8 global_smem[] |
|
|
|
// Kernel triton__0d1d2d3de4e
//
// What the instructions below do, in order:
//   1. Each thread sums 30 32-bit float elements read from param_0 at a fixed stride.
//   2. The per-thread sums are combined in groups of four through global_smem and two
//      butterfly shuffles.
//   3. A 64-bit index is loaded from param_1, adjusted by +512 when negative, and used to
//      form a destination address under param_2.
//   4. Threads whose tid.x bits 6..7 are zero atomically add the reduced sum to that address.
//
// Parameters:
//   param_0 (u64): base address of the 32-bit float elements that are summed.
//   param_1 (u64): base address of the 64-bit index values.
//   param_2 (u64): base address of the float destination of the atomic add.
//   param_3, param_4 (u32): declared but never loaded in this body.
//
// NOTE(review): every line ends in " |" and statements carry no ';' terminator. This looks
// like an extraction artifact rather than real PTX syntax - confirm before assembling.
.visible .entry triton__0d1d2d3de4e( |
|
.param .u64 triton__0d1d2d3de4e_param_0, |
|
.param .u64 triton__0d1d2d3de4e_param_1, |
|
.param .u64 triton__0d1d2d3de4e_param_2, |
|
.param .u32 triton__0d1d2d3de4e_param_3, |
|
.param .u32 triton__0d1d2d3de4e_param_4 |
|
) |
|
// Upper bound on threads per CTA: 256 x 1 x 1.
.maxntid 256, 1, 1 |
|
{ |
|
.reg .pred %p<10> |
|
.reg .b32 %r<44> |
|
.reg .f32 %f<11> |
|
.reg .b64 %rd<16> |
|
.loc 1 18 0 |
|
$L__func_begin0: |
|
.loc 1 18 0 |
|
|
|
// Load the three pointer parameters: %rd1 = param_0, %rd2 = param_1, %rd3 = param_2.
ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2] |
|
ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1] |
|
ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0] |
|
$L__tmp0: |
|
.loc 1 22 44 |
|
// --- Thread decomposition ---
// %r2 = tid.x & 63   (low six bits of the thread id)
// %r3 = tid.x[7:6]   (two-bit field starting at bit 6, i.e. 0..3)
mov.u32 %r1, %tid.x |
|
and.b32 %r2, %r1, 63 |
|
.loc 1 24 33 |
|
bfe.u32 %r3, %r1, 6, 2 |
|
.loc 1 21 28 |
|
mov.u32 %r10, %ctaid.x |
|
.loc 1 21 33 |
|
// %r12 = ctaid.x * 64; %r4 = %r12 | %r2 is the per-thread output index used after the loop.
shl.b32 %r12, %r10, 6 |
|
.loc 1 22 23 |
|
or.b32 %r4, %r12, %r2 |
|
.loc 1 27 36 |
|
// Initial element index for the summation loop: %r42 = ((%r3 * 131072) + %r12) | %r2.
shl.b32 %r13, %r3, 17 |
|
add.s32 %r14, %r13, %r12 |
|
or.b32 %r42, %r14, %r2 |
|
// %f10 is the running sum (starts at 0.0).
mov.f32 %f10, 0f00000000 |
|
// Loop counter starts at -4 because it is incremented by 4 before the exit test.
mov.b32 %r43, -4 |
|
// %p4 is set once to constant true and never rewritten, so every "@%p4" instruction below
// always executes and the "@!%p4" fallback never does.
mov.pred %p4, -1 |
|
// --- Summation loop ---
// Runs 30 times (%r43 = 0, 4, ..., 116 after the increment; exits once %r43 >= 116).
// Each pass loads one 32-bit value from param_0 + 4 * %r42 and adds it to %f10,
// then advances %r42 by 524288 elements (= 4 * 131072).
$L__BB0_1: |
|
.loc 1 31 34 |
|
mul.wide.s32 %rd5, %r42, 4 |
|
add.s64 %rd4, %rd1, %rd5 |
|
mov.b32 %r16, 0 |
|
.loc 1 31 53 |
|
mov.u32 %r15, 0x0 |
|
// Global load carrying an L1 evict_first cache hint.
@%p4 ld.global.L1::evict_first.b32 { %r15 }, [ %rd4 + 0 ] |
|
@!%p4 mov.u32 %r15, %r16 |
|
mov.b32 %f4, %r15 |
|
.loc 1 34 38 |
|
add.f32 %f10, %f10, %f4 |
|
.loc 1 27 36 |
|
add.s32 %r43, %r43, 4 |
|
add.s32 %r42, %r42, 524288 |
|
// Unsigned compare: continue while the already-incremented counter is below 116.
setp.lt.u32 %p3, %r43, 116 |
|
@%p3 bra $L__BB0_1 |
|
$L__tmp1: |
|
.loc 2 243 36 |
|
// --- Cross-thread reduction through shared memory ---
// Each thread stores its partial sum at byte offset (%r2 * 16) | (%r3 * 4) of global_smem,
// so the four partials that share one %r2 value occupy four consecutive 32-bit slots.
shl.b32 %r25, %r3, 2 |
|
shl.b32 %r26, %r2, 4 |
|
or.b32 %r27, %r26, %r25 |
|
mov.u32 %r28, global_smem |
|
add.s32 %r17, %r28, %r27 |
|
mov.b32 %r18, %f10 |
|
@%p4 st.shared.b32 [ %r17 + 0 ], %r18 |
|
bar.sync 0 |
|
// %p5 = tid.x < 256. With .maxntid 256 this is presumably always true; if it were false,
// %r19 would be read by the shuffle below without having been written.
setp.lt.s32 %p5, %r1, 256 |
|
// Each thread re-reads the slot at byte offset tid.x * 4, so four adjacent lanes hold the
// four partials belonging to one %r2 value.
shl.b32 %r29, %r1, 2 |
|
add.s32 %r20, %r28, %r29 |
|
@%p5 ld.shared.b32 %r19, [ %r20 + 0 ] |
|
mov.b32 %f5, %r19 |
|
// Butterfly exchange with lane-mask 2 (membermask -1), then add.
shfl.sync.bfly.b32 %r30, %r19, 2, 31, -1 |
|
mov.b32 %f6, %r30 |
|
$L__tmp2: |
|
.loc 2 233 15 |
|
add.f32 %f7, %f5, %f6 |
|
$L__tmp3: |
|
.loc 2 243 36 |
|
mov.b32 %r31, %f7 |
|
// Butterfly exchange with lane-mask 1, then add: %f9 is the sum over the group of four lanes.
shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1 |
|
mov.b32 %f8, %r32 |
|
$L__tmp4: |
|
.loc 2 233 15 |
|
add.f32 %f9, %f7, %f8 |
|
$L__tmp5: |
|
.loc 2 243 36 |
|
// Only the first lane of each group of four (tid.x & 3 == 0) writes the group total back,
// into its own slot at byte offset tid.x * 4.
and.b32 %r33, %r1, 3 |
|
setp.eq.s32 %p9, %r33, 0 |
|
and.pred %p6, %p5, %p9 |
|
mov.b32 %r22, %f9 |
|
@%p6 st.shared.b32 [ %r20 + 0 ], %r22 |
|
bar.sync 0 |
|
// %r34 = global_smem + %r2 * 16: the slot that now holds the total for this thread's %r2.
add.s32 %r34, %r28, %r26 |
|
$L__tmp6: |
|
.loc 1 36 20 |
|
// --- Split %r4 into quotient and remainder by 256 (signed, rounding toward zero) ---
// %r37 is a bias of 255 when %r4 is negative, otherwise 0.
// %r39 = %r4 / 256, %r41 = %r4 % 256.
shr.s32 %r36, %r4, 31 |
|
shr.u32 %r37, %r36, 24 |
|
add.s32 %r38, %r4, %r37 |
|
shr.s32 %r39, %r38, 8 |
|
and.b32 %r40, %r38, -256 |
|
sub.s32 %r41, %r4, %r40 |
|
.loc 1 38 30 |
|
// Address of the 64-bit index value: param_1 + 8 * %r39.
mul.wide.s32 %rd9, %r39, 8 |
|
add.s64 %rd7, %rd2, %rd9 |
|
.loc 1 45 55 |
|
// Fetch the reduced sum for this thread's %r2 from shared memory.
ld.shared.u32 %r24, [%r34] |
|
.loc 1 38 35 |
|
mov.u64 %rd6, 0x0 |
|
// Global load carrying an L1 evict_last cache hint.
@%p4 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ] |
|
.loc 1 41 32 |
|
// (%rd6 >> 54) & 512 equals 512 exactly when bit 63 of %rd6 is set, so this adds 512 to a
// negative index and leaves a non-negative one unchanged.
shr.u64 %rd10, %rd6, 54 |
|
and.b64 %rd11, %rd10, 512 |
|
add.s64 %rd12, %rd11, %rd6 |
|
.loc 1 45 30 |
|
// Destination address: param_2 + adjusted_index * 1024 bytes + %r41 * 4 bytes.
shl.b64 %rd13, %rd12, 10 |
|
add.s64 %rd14, %rd3, %rd13 |
|
mul.wide.s32 %rd15, %r41, 4 |
|
add.s64 %rd8, %rd14, %rd15 |
|
.loc 1 45 55 |
|
// Only threads with %r3 == 0 perform the atomic, so one thread per %r2 value contributes.
setp.eq.s32 %p8, %r3, 0 |
|
mov.u32 %r23, 0x0 |
|
// gpu-scope, acq_rel float atomic add; the returned old value in %r23 is not used afterwards.
@%p8 atom.global.gpu.acq_rel.add.f32 %r23, [ %rd8 + 0 ], %r24 |
|
.loc 1 45 4 |
|
ret |
|
$L__tmp7: |
|
$L__func_end0: |
|
|
|
} |
|
// Source-file table for the ".loc <file> <line> <col>" directives in the kernel body:
// index 1 is used by the ".loc 1 ..." entries, index 2 by the ".loc 2 ..." entries.
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" |
|
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" |
|
// DWARF abbreviation table, emitted as raw bytes.
// The decodings in the comments below follow the standard DWARF tag / attribute / form
// numbering; each attribute is an (attribute, form) byte pair and each abbreviation ends
// with a 0, 0 pair.
.section .debug_abbrev |
|
{ |
|
// Abbrev 1: tag 17 (DW_TAG_compile_unit), has children.
// Attributes: producer/string, language/data2, name/string, stmt_list/data4,
// comp_dir/string, 0x2134 (ULEB128 bytes 180, 66; DW_AT_GNU_pubnames)/flag,
// low_pc/addr, high_pc/addr.
.b8 1 |
|
.b8 17 |
|
.b8 1 |
|
.b8 37 |
|
.b8 8 |
|
.b8 19 |
|
.b8 5 |
|
.b8 3 |
|
.b8 8 |
|
.b8 16 |
|
.b8 6 |
|
.b8 27 |
|
.b8 8 |
|
.b8 180 |
|
.b8 66 |
|
.b8 12 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 0 |
|
.b8 0 |
|
// Abbrev 2: tag 46 (DW_TAG_subprogram), no children.
// Attributes: 0x2007 (ULEB128 bytes 135, 64; DW_AT_MIPS_linkage_name)/string, name/string,
// decl_file/data1, decl_line/data1, external/flag, inline/data1.
.b8 2 |
|
.b8 46 |
|
.b8 0 |
|
.b8 135 |
|
.b8 64 |
|
.b8 8 |
|
.b8 3 |
|
.b8 8 |
|
.b8 58 |
|
.b8 11 |
|
.b8 59 |
|
.b8 11 |
|
.b8 63 |
|
.b8 12 |
|
.b8 32 |
|
.b8 11 |
|
.b8 0 |
|
.b8 0 |
|
// Abbrev 3: tag 46 (DW_TAG_subprogram), has children.
// Attributes: low_pc/addr, high_pc/addr, frame_base/block1, abstract_origin/ref4.
.b8 3 |
|
.b8 46 |
|
.b8 1 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 64 |
|
.b8 10 |
|
.b8 49 |
|
.b8 19 |
|
.b8 0 |
|
.b8 0 |
|
// Abbrev 4: tag 29 (DW_TAG_inlined_subroutine), no children.
// Attributes: abstract_origin/ref4, low_pc/addr, high_pc/addr,
// call_file/data1, call_line/data1, call_column/data1.
.b8 4 |
|
.b8 29 |
|
.b8 0 |
|
.b8 49 |
|
.b8 19 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 88 |
|
.b8 11 |
|
.b8 89 |
|
.b8 11 |
|
.b8 87 |
|
.b8 11 |
|
.b8 0 |
|
.b8 0 |
|
// Abbrev 5: same as abbrev 4 but with the has-children byte set to 1.
.b8 5 |
|
.b8 29 |
|
.b8 1 |
|
.b8 49 |
|
.b8 19 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 88 |
|
.b8 11 |
|
.b8 89 |
|
.b8 11 |
|
.b8 87 |
|
.b8 11 |
|
.b8 0 |
|
.b8 0 |
|
// Terminating 0: end of the abbreviation table.
.b8 0 |
|
} |
|
// DWARF .debug_info contribution, emitted as raw bytes. Field meanings in the comments are
// read off the abbreviation codes declared in .debug_abbrev; byte strings are decoded as ASCII.
.section .debug_info |
|
{ |
|
// Unit header: length 264 (bytes following this field), version 2 (two bytes, low byte first),
// offset into .debug_abbrev, address size 8.
.b32 264 |
|
.b8 2 |
|
.b8 0 |
|
.b32 .debug_abbrev |
|
.b8 8 |
|
// DIE using abbrev 1 (compile unit).
.b8 1 |
|
// producer: "triton" + NUL
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 0 |
|
// language: 16-bit value 2
.b8 2 |
|
.b8 0 |
|
// name: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + NUL
// (same basename as the ".file 1" entry)
.b8 99 |
|
.b8 54 |
|
.b8 105 |
|
.b8 107 |
|
.b8 53 |
|
.b8 118 |
|
.b8 120 |
|
.b8 55 |
|
.b8 112 |
|
.b8 50 |
|
.b8 50 |
|
.b8 102 |
|
.b8 112 |
|
.b8 107 |
|
.b8 52 |
|
.b8 100 |
|
.b8 99 |
|
.b8 118 |
|
.b8 104 |
|
.b8 53 |
|
.b8 53 |
|
.b8 122 |
|
.b8 105 |
|
.b8 109 |
|
.b8 119 |
|
.b8 52 |
|
.b8 116 |
|
.b8 53 |
|
.b8 110 |
|
.b8 114 |
|
.b8 53 |
|
.b8 122 |
|
.b8 110 |
|
.b8 50 |
|
.b8 98 |
|
.b8 55 |
|
.b8 105 |
|
.b8 110 |
|
.b8 117 |
|
.b8 106 |
|
.b8 120 |
|
.b8 106 |
|
.b8 97 |
|
.b8 117 |
|
.b8 120 |
|
.b8 115 |
|
.b8 104 |
|
.b8 108 |
|
.b8 106 |
|
.b8 117 |
|
.b8 109 |
|
.b8 109 |
|
.b8 46 |
|
.b8 112 |
|
.b8 121 |
|
.b8 0 |
|
// stmt_list: offset into .debug_line
.b32 .debug_line |
|
// comp_dir: "/tmp/torchinductor_root/6i" + NUL
.b8 47 |
|
.b8 116 |
|
.b8 109 |
|
.b8 112 |
|
.b8 47 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 99 |
|
.b8 104 |
|
.b8 105 |
|
.b8 110 |
|
.b8 100 |
|
.b8 117 |
|
.b8 99 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 95 |
|
.b8 114 |
|
.b8 111 |
|
.b8 111 |
|
.b8 116 |
|
.b8 47 |
|
.b8 54 |
|
.b8 105 |
|
.b8 0 |
|
// pubnames flag = 1, then low_pc / high_pc spanning the kernel's begin and end labels.
.b8 1 |
|
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
// DIE using abbrev 2 (subprogram without children), located at byte offset 125 of this
// section; that is the value the ".b32 125" origin references below point to.
.b8 2 |
|
// linkage name: "triton__0d1d2d3de4e" + NUL
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 51 |
|
.b8 100 |
|
.b8 101 |
|
.b8 52 |
|
.b8 101 |
|
.b8 0 |
|
// name: "triton__0d1d2d3de4e" + NUL
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 51 |
|
.b8 100 |
|
.b8 101 |
|
.b8 52 |
|
.b8 101 |
|
.b8 0 |
|
// decl_file = 1, decl_line = 18, external = 1, inline = 1
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 1 |
|
// DIE using abbrev 3 (subprogram with children): low_pc, high_pc,
// frame_base (1-byte block containing 156), abstract_origin = 125.
.b8 3 |
|
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
.b8 1 |
|
.b8 156 |
|
.b32 125 |
|
// Child DIE using abbrev 4 (inlined subroutine): origin 125, range $L__tmp1..$L__tmp6,
// call_file 2, call_line 35, call_column 25.
.b8 4 |
|
.b32 125 |
|
.b64 $L__tmp1 |
|
.b64 $L__tmp6 |
|
.b8 2 |
|
.b8 35 |
|
.b8 25 |
|
// Child DIE using abbrev 5 (inlined subroutine with children): origin 125,
// range $L__tmp2..$L__tmp5, call_file 2, call_line 35, call_column 25.
.b8 5 |
|
.b32 125 |
|
.b64 $L__tmp2 |
|
.b64 $L__tmp5 |
|
.b8 2 |
|
.b8 35 |
|
.b8 25 |
|
// Nested DIE using abbrev 4: origin 125, range $L__tmp2..$L__tmp5,
// call_file 2, call_line 243, call_column 36.
.b8 4 |
|
.b32 125 |
|
.b64 $L__tmp2 |
|
.b64 $L__tmp5 |
|
.b8 2 |
|
.b8 243 |
|
.b8 36 |
|
// Three end-of-children markers, closing the abbrev-5 DIE, the abbrev-3 DIE and the compile unit.
.b8 0 |
|
.b8 0 |
|
.b8 0 |
|
} |
|
// DWARF public-names table: one entry mapping the kernel name to its DIE offset in .debug_info.
.section .debug_pubnames |
|
{ |
|
// Length of the table body, computed from the two labels that bracket it.
.b32 $L__pubNames_end0-$L__pubNames_start0 |
|
$L__pubNames_start0: |
|
// Version 2 (two bytes, low byte first), offset of the unit in .debug_info, and 268, which
// equals the .debug_info length field (264) plus the 4 bytes of that field itself.
.b8 2 |
|
.b8 0 |
|
.b32 .debug_info |
|
.b32 268 |
|
// Entry: DIE offset 125, followed by the name "triton__0d1d2d3de4e" + NUL.
.b32 125 |
|
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 51 |
|
.b8 100 |
|
.b8 101 |
|
.b8 52 |
|
.b8 101 |
|
.b8 0 |
|
// Zero offset terminates the entry list.
.b32 0 |
|
$L__pubNames_end0: |
|
} |
|
// DWARF public-types table: same header layout as .debug_pubnames, but it holds no entries;
// the zero terminator follows the header directly.
.section .debug_pubtypes |
|
{ |
|
// Length of the table body, computed from the two labels that bracket it.
.b32 $L__pubTypes_end0-$L__pubTypes_start0 |
|
$L__pubTypes_start0: |
|
.b8 2 |
|
.b8 0 |
|
.b32 .debug_info |
|
.b32 268 |
|
// Zero offset terminates the (empty) entry list.
.b32 0 |
|
$L__pubTypes_end0: |
|
} |
|
// Location-list section; emitted empty.
.section .debug_loc { } |
|
|