// 0-hero's picture
// Add files using upload-large-folder tool
// d742687 verified
// raw
// history blame
// 5.91 kB
//
// Generated by LLVM NVPTX Back-End
//
// PTX ISA version the module is written against.
.version 8.2
// Architecture the module is compiled for.
.target sm_89
// Addresses are 64 bits wide (the kernel's pointer parameters are .u64).
.address_size 64
// .globl triton__0d1d2d3de4e
// Byte array in shared memory, declared extern with no size. The kernel below
// uses it as scratch space for its cross-thread reduction.
// NOTE(review): the size is presumably supplied by the launcher as dynamic
// shared memory — confirm against the host-side launch code.
.extern .shared .align 1 .b8 global_smem[];
// ---------------------------------------------------------------------------
// Kernel triton__0d1d2d3de4e — reduce, then scatter-add. As read from the
// instructions below:
//   1. Each thread sums 30 f32 values loaded from param_0 (loop $L__BB0_1).
//   2. The four partial sums that share the same (tid.x & 63) are combined
//      through global_smem and two butterfly shuffles.
//   3. Threads with bits [7:6] of tid.x equal to zero atomically add the
//      combined sum into param_2, in a row chosen by a 64-bit value loaded
//      from param_1.
//
// Parameters:
//   param_0 (u64)  base address of the f32 values that are summed
//   param_1 (u64)  base address of the 64-bit row selectors
//   param_2 (u64)  base address of the f32 atomic-add destination
//   param_3 (u32)  declared but never loaded in this body
//   param_4 (u32)  declared but never loaded in this body
//
// NOTE(review): no bounds predicate is computed (%p4 is a constant "true"),
// so the launch grid presumably matches the problem size exactly — confirm
// against the launcher.
//
// Long-lived registers:
//   %r1   tid.x
//   %r2   tid.x & 63            (column inside this CTA's 64-wide tile)
//   %r3   (tid.x >> 6) & 3      (which of the 4 partial sums this thread owns)
//   %r4   ctaid.x * 64 + %r2    (global column index)
//   %r42  element index of the next f32 to load from param_0
//   %r43  loop counter
//   %f10  running f32 sum
// ---------------------------------------------------------------------------
.visible .entry triton__0d1d2d3de4e(
.param .u64 triton__0d1d2d3de4e_param_0,
.param .u64 triton__0d1d2d3de4e_param_1,
.param .u64 triton__0d1d2d3de4e_param_2,
.param .u32 triton__0d1d2d3de4e_param_3,
.param .u32 triton__0d1d2d3de4e_param_4
)
// At most 256 threads per CTA, in x only.
.maxntid 256, 1, 1
{
.reg .pred %p<10>;
.reg .b32 %r<44>;
.reg .f32 %f<11>;
.reg .b64 %rd<16>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// Fetch the three pointer parameters; param_3 and param_4 are not read.
ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2];
ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1];
ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0];
$L__tmp0:
.loc 1 22 44
// %r2 = low 6 bits of tid.x.
mov.u32 %r1, %tid.x;
and.b32 %r2, %r1, 63;
.loc 1 24 33
// %r3 = 2-bit field of tid.x starting at bit 6.
bfe.u32 %r3, %r1, 6, 2;
.loc 1 21 28
mov.u32 %r10, %ctaid.x;
.loc 1 21 33
// %r12 = ctaid.x * 64: first column handled by this CTA.
shl.b32 %r12, %r10, 6;
.loc 1 22 23
// %r4 = global column index. The low 6 bits of %r12 are zero, so OR acts as ADD.
or.b32 %r4, %r12, %r2;
.loc 1 27 36
// Starting element index: %r3 * 131072 + global column.
// Both addends of %r14 have zero low 6 bits, so OR-ing in %r2 acts as ADD.
shl.b32 %r13, %r3, 17;
add.s32 %r14, %r13, %r12;
or.b32 %r42, %r14, %r2;
// Accumulator starts at 0.0f.
mov.f32 %f10, 0f00000000;
// Counter starts at -4 because it is incremented before it is tested.
mov.b32 %r43, -4;
// %p4 is a constant "true" predicate; every instruction guarded by it executes.
mov.pred %p4, -1;
// Reduction loop. The counter takes the values 0, 4, ..., 116 after the
// increment and the branch is taken while it is below 116, so the body runs
// 30 times. Each pass advances the element index by 524288 (= 4 * 131072).
$L__BB0_1:
.loc 1 31 34
// Byte address = param_0 + sign-extended(%r42) * 4.
mul.wide.s32 %rd5, %r42, 4;
add.s64 %rd4, %rd1, %rd5;
// Fallback value 0 for the masked-off case (never taken here, see %p4).
mov.b32 %r16, 0;
.loc 1 31 53
mov.u32 %r15, 0x0;
// 32-bit global load carrying the L1::evict_first cache hint.
@%p4 ld.global.L1::evict_first.b32 { %r15 }, [ %rd4 + 0 ];
@!%p4 mov.u32 %r15, %r16;
mov.b32 %f4, %r15;
.loc 1 34 38
add.f32 %f10, %f10, %f4;
.loc 1 27 36
add.s32 %r43, %r43, 4;
add.s32 %r42, %r42, 524288;
setp.lt.u32 %p3, %r43, 116;
@%p3 bra $L__BB0_1;
$L__tmp1:
.loc 2 243 36
// Stage 1 of the cross-thread combine: each thread writes its partial sum to
// global_smem at byte offset %r2 * 16 + %r3 * 4, so the four partials of one
// column sit in four consecutive 32-bit slots.
shl.b32 %r25, %r3, 2;
shl.b32 %r26, %r2, 4;
or.b32 %r27, %r26, %r25;
mov.u32 %r28, global_smem;
add.s32 %r17, %r28, %r27;
mov.b32 %r18, %f10;
@%p4 st.shared.b32 [ %r17 + 0 ], %r18;
// Make all partial sums visible before they are re-read.
bar.sync 0;
// %p5 = tid.x < 256.
// NOTE(review): with .maxntid 256 this is presumably always true — confirm.
setp.lt.s32 %p5, %r1, 256;
// Stage 2: thread t re-reads slot t (byte offset t * 4), so four adjacent
// lanes now hold the four partials of one column.
shl.b32 %r29, %r1, 2;
add.s32 %r20, %r28, %r29;
@%p5 ld.shared.b32 %r19, [ %r20 + 0 ];
mov.b32 %f5, %r19;
// Butterfly exchange with the lane whose id differs in bit 1 (xor 2), all
// lanes participating (member mask -1).
shfl.sync.bfly.b32 %r30, %r19, 2, 31, -1;
mov.b32 %f6, %r30;
$L__tmp2:
.loc 2 233 15
add.f32 %f7, %f5, %f6;
$L__tmp3:
.loc 2 243 36
mov.b32 %r31, %f7;
// Second butterfly step (xor 1); afterwards each lane of a 4-lane group
// holds the sum of all four partials.
shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1;
mov.b32 %f8, %r32;
$L__tmp4:
.loc 2 233 15
add.f32 %f9, %f7, %f8;
$L__tmp5:
.loc 2 243 36
// Only the first lane of each 4-lane group (tid.x & 3 == 0) writes the total
// back, to its own slot — byte offset (tid.x / 4) * 16.
and.b32 %r33, %r1, 3;
setp.eq.s32 %p9, %r33, 0;
and.pred %p6, %p5, %p9;
mov.b32 %r22, %f9;
@%p6 st.shared.b32 [ %r20 + 0 ], %r22;
// Make the totals visible before the final read.
bar.sync 0;
// Address of this thread's column total: global_smem + %r2 * 16.
add.s32 %r34, %r28, %r26;
$L__tmp6:
.loc 1 36 20
// Signed division and remainder of the global column index by 256 without a
// divide: add a bias of 255 when %r4 is negative, then shift / mask.
//   %r39 = %r4 / 256 (rounded toward zero), %r41 = %r4 % 256.
shr.s32 %r36, %r4, 31;
shr.u32 %r37, %r36, 24;
add.s32 %r38, %r4, %r37;
shr.s32 %r39, %r38, 8;
and.b32 %r40, %r38, -256;
sub.s32 %r41, %r4, %r40;
.loc 1 38 30
// Address of the 64-bit row selector: param_1 + %r39 * 8.
mul.wide.s32 %rd9, %r39, 8;
add.s64 %rd7, %rd2, %rd9;
.loc 1 45 55
// Column total produced by the shared-memory reduction above.
ld.shared.u32 %r24, [%r34];
.loc 1 38 35
mov.u64 %rd6, 0x0;
// 64-bit global load carrying the L1::evict_last cache hint.
@%p4 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ];
.loc 1 41 32
// Shifting right by 54 and masking with 512 isolates the sign bit (bit 63)
// as the value 512, so a negative selector has 512 added to it.
shr.u64 %rd10, %rd6, 54;
and.b64 %rd11, %rd10, 512;
add.s64 %rd12, %rd11, %rd6;
.loc 1 45 30
// Destination byte address = param_2 + selector * 1024 + %r41 * 4.
shl.b64 %rd13, %rd12, 10;
add.s64 %rd14, %rd3, %rd13;
mul.wide.s32 %rd15, %r41, 4;
add.s64 %rd8, %rd14, %rd15;
.loc 1 45 55
// All four threads of a column hold the same total, so only the one with
// %r3 == 0 performs the add. The value returned in %r23 is not used.
setp.eq.s32 %p8, %r3, 0;
mov.u32 %r23, 0x0;
@%p8 atom.global.gpu.acq_rel.add.f32 %r23, [ %rd8 + 0 ], %r24;
.loc 1 45 4
ret;
$L__tmp7:
$L__func_end0:
}
// Source-file table: the first operand of each `.loc` directive in the kernel
// is an index into these entries.
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
// DWARF abbreviation table: five abbreviation declarations (codes 1-5), each
// a tag, a has-children flag and a list of (attribute, form) pairs closed by
// a 0,0 pair. The table itself is closed by a single 0.
// NOTE(review): the DW_* names below were decoded by hand from the numeric
// values — confirm against the DWARF specification before relying on them.
.section .debug_abbrev
{
.b8 1 // abbreviation code 1
.b8 17 // DW_TAG_compile_unit
.b8 1 // has children
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 180 // two-byte ULEB128 attribute code (0x2134), first byte
.b8 66 // second byte; 0x2134 looks like DW_AT_GNU_pubnames
.b8 12 // DW_FORM_flag
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 0 // end of attribute list
.b8 0
.b8 2 // abbreviation code 2
.b8 46 // DW_TAG_subprogram
.b8 0 // no children
.b8 135 // two-byte ULEB128 attribute code (0x2007), first byte
.b8 64 // second byte; 0x2007 looks like DW_AT_MIPS_linkage_name
.b8 8 // DW_FORM_string
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 58 // DW_AT_decl_file
.b8 11 // DW_FORM_data1
.b8 59 // DW_AT_decl_line
.b8 11 // DW_FORM_data1
.b8 63 // DW_AT_external
.b8 12 // DW_FORM_flag
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // end of attribute list
.b8 0
.b8 3 // abbreviation code 3
.b8 46 // DW_TAG_subprogram
.b8 1 // has children
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 64 // DW_AT_frame_base
.b8 10 // DW_FORM_block1
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // end of attribute list
.b8 0
.b8 4 // abbreviation code 4
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // no children
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // end of attribute list
.b8 0
.b8 5 // abbreviation code 5 (same attributes as code 4)
.b8 29 // DW_TAG_inlined_subroutine
.b8 1 // has children
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // end of attribute list
.b8 0
.b8 0 // end of abbreviation table
}
// DWARF debug-info unit describing the kernel. Each entry starts with an
// abbreviation code that selects a layout from .debug_abbrev.
// NOTE(review): field names follow the hand-decoded abbreviation table —
// confirm against the DWARF specification.
.section .debug_info
{
// Unit header: length of the rest of the unit in bytes (the section is
// 264 + 4 = 268 bytes in total), 16-bit version 2, abbreviation-table
// reference, address size 8.
.b32 264
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// --- Entry using abbreviation 1 (compile unit) ---
.b8 1
// Producer string: "triton"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// Language code, 16-bit little-endian value 2.
.b8 2
.b8 0
// Name string: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
.b8 99
.b8 54
.b8 105
.b8 107
.b8 53
.b8 118
.b8 120
.b8 55
.b8 112
.b8 50
.b8 50
.b8 102
.b8 112
.b8 107
.b8 52
.b8 100
.b8 99
.b8 118
.b8 104
.b8 53
.b8 53
.b8 122
.b8 105
.b8 109
.b8 119
.b8 52
.b8 116
.b8 53
.b8 110
.b8 114
.b8 53
.b8 122
.b8 110
.b8 50
.b8 98
.b8 55
.b8 105
.b8 110
.b8 117
.b8 106
.b8 120
.b8 106
.b8 97
.b8 117
.b8 120
.b8 115
.b8 104
.b8 108
.b8 106
.b8 117
.b8 109
.b8 109
.b8 46
.b8 112
.b8 121
.b8 0
// Reference to the line-number table.
.b32 .debug_line
// Compilation directory string: "/tmp/torchinductor_root/6i"
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 54
.b8 105
.b8 0
// Flag attribute (value 1), then the unit's low/high addresses: the whole
// kernel body.
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
// --- Entry using abbreviation 2 (subprogram, no address range) ---
// This entry begins 125 bytes into the section; the `.b32 125` values further
// down refer back to it.
.b8 2
// Linkage name string: "triton__0d1d2d3de4e"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 101
.b8 52
.b8 101
.b8 0
// Name string: "triton__0d1d2d3de4e"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 101
.b8 52
.b8 101
.b8 0
// Declaration file 1, declaration line 18, external flag 1, inline code 1.
.b8 1
.b8 18
.b8 1
.b8 1
// --- Entry using abbreviation 3 (subprogram with address range) ---
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
// Frame-base block: length 1, single opcode byte 156.
.b8 1
.b8 156
// Abstract origin: the entry at section offset 125.
.b32 125
// --- Entry using abbreviation 4 (inlined subroutine) ---
// Covers $L__tmp1..$L__tmp6; call file 2, line 35, column 25.
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp6
.b8 2
.b8 35
.b8 25
// --- Entry using abbreviation 5 (inlined subroutine with children) ---
// Covers $L__tmp2..$L__tmp5; call file 2, line 35, column 25.
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp5
.b8 2
.b8 35
.b8 25
// --- Nested entry using abbreviation 4 ---
// Covers $L__tmp2..$L__tmp5; call file 2, line 243, column 36.
.b8 4
.b32 125
.b64 $L__tmp2
.b64 $L__tmp5
.b8 2
.b8 243
.b8 36
// Three terminators, one for each open child list.
.b8 0
.b8 0
.b8 0
}
// Public-names lookup table: maps the kernel's name to the offset of its
// entry inside .debug_info.
.section .debug_pubnames
{
// Length of the table, computed from the two labels that bracket it.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// 16-bit version 2.
.b8 2
.b8 0
// The .debug_info unit this table describes, and that unit's total size
// in bytes (its 4-byte length field plus the 264 bytes it announces).
.b32 .debug_info
.b32 268
// Offset of the named entry inside .debug_info.
.b32 125
// Name string: "triton__0d1d2d3de4e"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 101
.b8 52
.b8 101
.b8 0
// Zero offset terminates the list.
.b32 0
$L__pubNames_end0:
}
// Public-types lookup table. It has the same header layout as
// .debug_pubnames but holds no entries: the terminating zero follows the
// header directly.
.section .debug_pubtypes
{
// Length of the table, computed from the two labels that bracket it.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// 16-bit version 2.
.b8 2
.b8 0
// The .debug_info unit this table describes and that unit's size in bytes.
.b32 .debug_info
.b32 268
// Zero offset terminates the (empty) list.
.b32 0
$L__pubTypes_end0:
}
// Location-list section, emitted with an empty body.
.section .debug_loc { }