0-hero's picture
Add files using upload-large-folder tool
00602c7 verified
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4de
//
// Kernel entry triton__0d1d2d3d4de.
//
// Parameters:
//   param_0, param_1, param_2 : 64-bit base addresses of the three inputs
//                               (read with ld.global, 16-bit elements
//                               converted from bf16).
//   param_3                   : 64-bit base address of the output
//                               (written with st.global as bf16).
//   param_4                   : 32-bit value; declared but never loaded in
//                               this body.
//
// Per-thread work (8 consecutive 16-bit elements per thread):
//   idx = (ctaid.x << 10) | ((tid.x << 3) & 1016)
//   col = idx % 256            (signed remainder)
//   sel = (idx / 256) % 3      (signed, truncating)
//   grp = idx / 768            (signed, truncating)
//   src = grp * 256 + col      (element offset used for all three inputs)
//   out[idx + k] = bf16( (sel == 2 ? in0[src + k] : 0)
//                      + (sel == 1 ? in1[src + k] : 0)
//                      + (sel == 0 ? in2[src + k] : 0) ),  k = 0..7
//   where in0/in1/in2/out stand for param_0/param_1/param_2/param_3.
//
// The guard predicate %p1 is the constant "true", so every load and the
// store execute unconditionally.
//
// The ".loc 1 <line> <col>" directives are debug line markers referring to
// file 1 (declared by the .file directive after the kernel body).
.visible .entry triton__0d1d2d3d4de(
.param .u64 triton__0d1d2d3d4de_param_0,
.param .u64 triton__0d1d2d3d4de_param_1,
.param .u64 triton__0d1d2d3d4de_param_2,
.param .u64 triton__0d1d2d3d4de_param_3,
.param .u32 triton__0d1d2d3d4de_param_4
)
// At most 128 threads per CTA along x (1 along y and z).
.maxntid 128, 1, 1
{
// Virtual register banks: predicates, 16-bit, 32-bit, f32 and 64-bit.
.reg .pred %p<8>;
.reg .b16 %rs<33>;
.reg .b32 %r<77>;
.reg .f32 %f<65>;
.reg .b64 %rd<11>;
.loc 1 18 0
// Start-of-function label; referenced as the low_pc address in .debug_info.
$L__func_begin0:
.loc 1 18 0
// %rd5 = param_0, %rd6 = param_1 (input base addresses).
ld.param.u64 %rd5, [triton__0d1d2d3d4de_param_0];
ld.param.u64 %rd6, [triton__0d1d2d3d4de_param_1];
$L__tmp0:
.loc 1 21 36
// %r52 = (tid.x * 8) & 1016: a multiple of 8 in [0, 1016].
mov.u32 %r50, %tid.x;
shl.b32 %r51, %r50, 3;
// %rd7 = param_2 (third input base address).
ld.param.u64 %rd7, [triton__0d1d2d3d4de_param_2];
and.b32 %r52, %r51, 1016;
// %rd8 = param_3 (output base address).
ld.param.u64 %rd8, [triton__0d1d2d3d4de_param_3];
.loc 1 20 28
mov.u32 %r1, %ctaid.x;
.loc 1 20 33
// %r53 = ctaid.x * 1024 (low 10 bits are zero).
shl.b32 %r53, %r1, 10;
.loc 1 21 23
// %r54 = idx. OR acts as an add here because %r53 has its low 10 bits clear
// and %r52 fits in 10 bits.
or.b32 %r54, %r53, %r52;
.loc 1 23 20
// Signed idx / 256 rounding toward zero:
//   %r56 = 0 or -1 (sign of idx), %r57 = 0 or 255 (bias for negative idx),
//   %r58 = idx + bias, %r59 = %r58 >> 8 (arithmetic).
shr.s32 %r56, %r54, 31;
shr.u32 %r57, %r56, 24;
add.s32 %r58, %r54, %r57;
shr.s32 %r59, %r58, 8;
.loc 1 23 27
// Signed %r59 % 3 via multiply-high by 1431655766 (0x55555556):
//   %r62 = %r59 / 3 (high word plus its sign bit), %r64 = %r59 - 3 * %r62.
// %r64 is the selector "sel" tested by the setp instructions further down.
mul.hi.s32 %r60, %r59, 1431655766;
shr.u32 %r61, %r60, 31;
add.s32 %r62, %r60, %r61;
mul.lo.s32 %r63, %r62, 3;
sub.s32 %r64, %r59, %r63;
// %r66 = idx - (%r58 & -256) = idx % 256 (signed remainder), i.e. "col".
and.b32 %r65, %r58, -256;
sub.s32 %r66, %r54, %r65;
.loc 1 25 20
// idx / 768 via multiply-high by 715827883 (0x2AAAAAAB) followed by a shift
// by 7, plus the sign bit of the high word as rounding correction.
// The shift by 7 is logical rather than arithmetic: the two differ only in
// the top 7 bits of %r69, and the shl by 8 below discards the top 8 bits of
// %r70, so the value actually used is the same.
mul.hi.s32 %r67, %r54, 715827883;
shr.u32 %r68, %r67, 31;
shr.u32 %r69, %r67, 7;
add.s32 %r70, %r69, %r68;
.loc 1 27 40
// %r71 = grp * 256.
shl.b32 %r71, %r70, 8;
.loc 1 27 36
// %r72 = grp * 256 + col: source element offset shared by all three inputs.
add.s32 %r72, %r71, %r66;
.loc 1 27 30
// %rd9 = sign-extended byte offset (2 bytes per element); %rd1 = param_0 + %rd9.
mul.wide.s32 %rd9, %r72, 2;
add.s64 %rd1, %rd5, %rd9;
// %p1 = true: the guard on the loads/store below never masks anything off.
mov.pred %p1, -1;
.loc 1 27 46
// Destination registers are zeroed first, then overwritten by the guarded
// 16-byte vector load (4 x 32 bits = 8 packed 16-bit elements) from param_0,
// issued with the L1::evict_last cache hint.
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
mov.u32 %r4, 0x0;
mov.u32 %r5, 0x0;
@%p1 ld.global.L1::evict_last.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
// Split each 32-bit word into two 16-bit halves: cvt.u16.u32 keeps the low
// half, the {tmp, %rsN} unpacking move keeps the high half.
cvt.u16.u32 %rs1, %r2;
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
cvt.u16.u32 %rs3, %r3;
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
cvt.u16.u32 %rs5, %r4;
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
cvt.u16.u32 %rs7, %r5;
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
.loc 1 27 85
// Widen the eight bf16 values from param_0 to f32: %f1..%f8.
cvt.f32.bf16 %r6, %rs1;
mov.b32 %f1, %r6;
cvt.f32.bf16 %r7, %rs2;
mov.b32 %f2, %r7;
cvt.f32.bf16 %r8, %rs3;
mov.b32 %f3, %r8;
cvt.f32.bf16 %r9, %rs4;
mov.b32 %f4, %r9;
cvt.f32.bf16 %r10, %rs5;
mov.b32 %f5, %r10;
cvt.f32.bf16 %r11, %rs6;
mov.b32 %f6, %r11;
cvt.f32.bf16 %r12, %rs7;
mov.b32 %f7, %r12;
cvt.f32.bf16 %r13, %rs8;
mov.b32 %f8, %r13;
.loc 1 28 30
// %rd2 = param_1 + same byte offset.
add.s64 %rd2, %rd6, %rd9;
.loc 1 28 46
// Same zero-init + guarded 16-byte load, this time from param_1.
mov.u32 %r14, 0x0;
mov.u32 %r15, 0x0;
mov.u32 %r16, 0x0;
mov.u32 %r17, 0x0;
@%p1 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
// Unpack low/high 16-bit halves of each loaded word.
cvt.u16.u32 %rs9, %r14;
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
cvt.u16.u32 %rs11, %r15;
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
cvt.u16.u32 %rs13, %r16;
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
cvt.u16.u32 %rs15, %r17;
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
.loc 1 28 85
// Widen the eight bf16 values from param_1 to f32: %f9..%f16.
cvt.f32.bf16 %r18, %rs9;
mov.b32 %f9, %r18;
cvt.f32.bf16 %r19, %rs10;
mov.b32 %f10, %r19;
cvt.f32.bf16 %r20, %rs11;
mov.b32 %f11, %r20;
cvt.f32.bf16 %r21, %rs12;
mov.b32 %f12, %r21;
cvt.f32.bf16 %r22, %rs13;
mov.b32 %f13, %r22;
cvt.f32.bf16 %r23, %rs14;
mov.b32 %f14, %r23;
cvt.f32.bf16 %r24, %rs15;
mov.b32 %f15, %r24;
cvt.f32.bf16 %r25, %rs16;
mov.b32 %f16, %r25;
.loc 1 29 31
// %rd3 = param_2 + same byte offset.
add.s64 %rd3, %rd7, %rd9;
.loc 1 29 47
// Same zero-init + guarded 16-byte load, this time from param_2.
mov.u32 %r26, 0x0;
mov.u32 %r27, 0x0;
mov.u32 %r28, 0x0;
mov.u32 %r29, 0x0;
@%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd3 + 0 ];
// Unpack low/high 16-bit halves of each loaded word.
cvt.u16.u32 %rs17, %r26;
{ .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r26; }
cvt.u16.u32 %rs19, %r27;
{ .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r27; }
cvt.u16.u32 %rs21, %r28;
{ .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r28; }
cvt.u16.u32 %rs23, %r29;
{ .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r29; }
.loc 1 29 86
// Widen the eight bf16 values from param_2 to f32: %f17..%f24.
cvt.f32.bf16 %r30, %rs17;
mov.b32 %f17, %r30;
cvt.f32.bf16 %r31, %rs18;
mov.b32 %f18, %r31;
cvt.f32.bf16 %r32, %rs19;
mov.b32 %f19, %r32;
cvt.f32.bf16 %r33, %rs20;
mov.b32 %f20, %r33;
cvt.f32.bf16 %r34, %rs21;
mov.b32 %f21, %r34;
cvt.f32.bf16 %r35, %rs22;
mov.b32 %f22, %r35;
cvt.f32.bf16 %r36, %rs23;
mov.b32 %f23, %r36;
cvt.f32.bf16 %r37, %rs24;
mov.b32 %f24, %r37;
.loc 1 32 19
// %p5 = (sel == 2).
setp.eq.s32 %p5, %r64, 2;
.loc 1 34 32
// Keep the param_0 values where sel == 2, otherwise 0.0.
selp.f32 %f25, %f1, 0f00000000, %p5;
selp.f32 %f26, %f2, 0f00000000, %p5;
selp.f32 %f27, %f3, 0f00000000, %p5;
selp.f32 %f28, %f4, 0f00000000, %p5;
selp.f32 %f29, %f5, 0f00000000, %p5;
selp.f32 %f30, %f6, 0f00000000, %p5;
selp.f32 %f31, %f7, 0f00000000, %p5;
selp.f32 %f32, %f8, 0f00000000, %p5;
.loc 1 36 19
// %p6 = (sel == 1).
setp.eq.s32 %p6, %r64, 1;
.loc 1 37 32
// Keep the param_1 values where sel == 1, otherwise 0.0.
selp.f32 %f33, %f9, 0f00000000, %p6;
selp.f32 %f34, %f10, 0f00000000, %p6;
selp.f32 %f35, %f11, 0f00000000, %p6;
selp.f32 %f36, %f12, 0f00000000, %p6;
selp.f32 %f37, %f13, 0f00000000, %p6;
selp.f32 %f38, %f14, 0f00000000, %p6;
selp.f32 %f39, %f15, 0f00000000, %p6;
selp.f32 %f40, %f16, 0f00000000, %p6;
.loc 1 38 19
// Partial sum: (selected param_0 value) + (selected param_1 value).
add.f32 %f41, %f25, %f33;
add.f32 %f42, %f26, %f34;
add.f32 %f43, %f27, %f35;
add.f32 %f44, %f28, %f36;
add.f32 %f45, %f29, %f37;
add.f32 %f46, %f30, %f38;
add.f32 %f47, %f31, %f39;
add.f32 %f48, %f32, %f40;
.loc 1 40 20
// %p7 = (sel == 0).
setp.eq.s32 %p7, %r64, 0;
.loc 1 41 35
// Keep the param_2 values where sel == 0, otherwise 0.0.
selp.f32 %f49, %f17, 0f00000000, %p7;
selp.f32 %f50, %f18, 0f00000000, %p7;
selp.f32 %f51, %f19, 0f00000000, %p7;
selp.f32 %f52, %f20, 0f00000000, %p7;
selp.f32 %f53, %f21, 0f00000000, %p7;
selp.f32 %f54, %f22, 0f00000000, %p7;
selp.f32 %f55, %f23, 0f00000000, %p7;
selp.f32 %f56, %f24, 0f00000000, %p7;
.loc 1 42 20
// Final sum: partial sum + (selected param_2 value).
add.f32 %f57, %f41, %f49;
add.f32 %f58, %f42, %f50;
add.f32 %f59, %f43, %f51;
add.f32 %f60, %f44, %f52;
add.f32 %f61, %f45, %f53;
add.f32 %f62, %f46, %f54;
add.f32 %f63, %f47, %f55;
add.f32 %f64, %f48, %f56;
.loc 1 43 25
// Output address: param_3 + idx * 2 bytes (indexed by idx, not by the
// source offset used for the loads).
mul.wide.s32 %rd10, %r54, 2;
add.s64 %rd4, %rd8, %rd10;
.loc 1 43 37
// Narrow the eight f32 results to bf16 with round-to-nearest (cvt.rn).
mov.b32 %r38, %f57;
cvt.rn.bf16.f32 %rs25, %r38;
mov.b32 %r39, %f58;
cvt.rn.bf16.f32 %rs26, %r39;
mov.b32 %r40, %f59;
cvt.rn.bf16.f32 %rs27, %r40;
mov.b32 %r41, %f60;
cvt.rn.bf16.f32 %rs28, %r41;
mov.b32 %r42, %f61;
cvt.rn.bf16.f32 %rs29, %r42;
mov.b32 %r43, %f62;
cvt.rn.bf16.f32 %rs30, %r43;
mov.b32 %r44, %f63;
cvt.rn.bf16.f32 %rs31, %r44;
mov.b32 %r45, %f64;
cvt.rn.bf16.f32 %rs32, %r45;
// Re-pack pairs of bf16 values into 32-bit words (first operand = low half),
// mirroring the unpack order used after the loads.
mov.b32 %r73, {%rs25, %rs26};
mov.b32 %r74, {%rs27, %rs28};
mov.b32 %r75, {%rs29, %rs30};
mov.b32 %r76, {%rs31, %rs32};
// Guarded 16-byte vector store of the 8 results (guard is always true).
@%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r73, %r74, %r75, %r76 };
.loc 1 43 4
ret;
$L__tmp1:
// End-of-function label; referenced as the high_pc address in .debug_info.
$L__func_end0:
}
.file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py"
// DWARF abbreviation table: declares the layout (tag, has-children flag and
// a list of attribute/form pairs) of the two DIE kinds used in .debug_info.
.section .debug_abbrev
{
// Abbreviation code 1
.b8 1
// DW_TAG_compile_unit (0x11)
.b8 17
// DW_CHILDREN_yes
.b8 1
// DW_AT_producer (0x25), DW_FORM_string (0x08)
.b8 37
.b8 8
// DW_AT_language (0x13), DW_FORM_data2 (0x05)
.b8 19
.b8 5
// DW_AT_name (0x03), DW_FORM_string
.b8 3
.b8 8
// DW_AT_stmt_list (0x10), DW_FORM_data4 (0x06)
.b8 16
.b8 6
// DW_AT_comp_dir (0x1b), DW_FORM_string
.b8 27
.b8 8
// ULEB128 bytes 180, 66 = 0x2134 (DW_AT_GNU_pubnames), DW_FORM_flag (0x0c)
.b8 180
.b8 66
.b8 12
// DW_AT_low_pc (0x11), DW_FORM_addr (0x01)
.b8 17
.b8 1
// DW_AT_high_pc (0x12), DW_FORM_addr
.b8 18
.b8 1
// End of attribute list for abbreviation 1
.b8 0
.b8 0
// Abbreviation code 2
.b8 2
// DW_TAG_subprogram (0x2e)
.b8 46
// DW_CHILDREN_no
.b8 0
// DW_AT_low_pc, DW_FORM_addr
.b8 17
.b8 1
// DW_AT_high_pc, DW_FORM_addr
.b8 18
.b8 1
// DW_AT_frame_base (0x40), DW_FORM_block1 (0x0a)
.b8 64
.b8 10
// ULEB128 bytes 135, 64 = 0x2007 (DW_AT_MIPS_linkage_name), DW_FORM_string
.b8 135
.b8 64
.b8 8
// DW_AT_name, DW_FORM_string
.b8 3
.b8 8
// DW_AT_decl_file (0x3a), DW_FORM_data1 (0x0b)
.b8 58
.b8 11
// DW_AT_decl_line (0x3b), DW_FORM_data1
.b8 59
.b8 11
// DW_AT_external (0x3f), DW_FORM_flag
.b8 63
.b8 12
// End of attribute list for abbreviation 2
.b8 0
.b8 0
// End of abbreviation table
.b8 0
}
// DWARF debug information: one compile unit containing one subprogram DIE
// for the kernel. Field layout follows the two abbreviations declared in
// .debug_abbrev. Byte offsets quoted below are relative to the start of
// this section's contents.
.section .debug_info
{
// unit_length: 184 bytes follow this 4-byte field (188 bytes in total)
.b32 184
// DWARF version = 2 (16-bit, little-endian)
.b8 2
.b8 0
// Offset of this unit's abbreviations in .debug_abbrev
.b32 .debug_abbrev
// Address size in bytes
.b8 8
// --- DIE at offset 11: abbreviation 1 (DW_TAG_compile_unit) ---
.b8 1
// DW_AT_producer: "triton" (NUL-terminated)
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// DW_AT_language: 2 (16-bit, little-endian)
.b8 2
.b8 0
// DW_AT_name: "c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py"
.b8 99
.b8 54
.b8 51
.b8 114
.b8 55
.b8 105
.b8 117
.b8 114
.b8 119
.b8 107
.b8 53
.b8 121
.b8 100
.b8 108
.b8 115
.b8 119
.b8 104
.b8 55
.b8 114
.b8 118
.b8 104
.b8 99
.b8 109
.b8 108
.b8 120
.b8 50
.b8 99
.b8 102
.b8 114
.b8 101
.b8 116
.b8 108
.b8 114
.b8 101
.b8 119
.b8 103
.b8 119
.b8 54
.b8 116
.b8 108
.b8 106
.b8 108
.b8 117
.b8 114
.b8 115
.b8 115
.b8 104
.b8 103
.b8 116
.b8 102
.b8 112
.b8 112
.b8 46
.b8 112
.b8 121
.b8 0
// DW_AT_stmt_list: offset of this unit's line table in .debug_line
.b32 .debug_line
// DW_AT_comp_dir: "/tmp/torchinductor_root/63"
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 54
.b8 51
.b8 0
// DW_AT_GNU_pubnames flag = 1
.b8 1
// DW_AT_low_pc / DW_AT_high_pc of the compile unit
.b64 $L__func_begin0
.b64 $L__func_end0
// --- DIE at offset 125: abbreviation 2 (DW_TAG_subprogram) ---
.b8 2
// DW_AT_low_pc / DW_AT_high_pc of the kernel
.b64 $L__func_begin0
.b64 $L__func_end0
// DW_AT_frame_base: block of length 1 containing opcode 156 (0x9c,
// DW_OP_call_frame_cfa)
.b8 1
.b8 156
// DW_AT_MIPS_linkage_name: "triton__0d1d2d3d4de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 101
.b8 0
// DW_AT_name: "triton__0d1d2d3d4de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 101
.b8 0
// DW_AT_decl_file = 1
.b8 1
// DW_AT_decl_line = 18
.b8 18
// DW_AT_external = 1
.b8 1
// End of the compile unit's children
.b8 0
}
// DWARF public-names lookup table: maps a global name to the offset of its
// DIE inside the compile unit in .debug_info.
.section .debug_pubnames
{
// Length of the table contents (everything between the two labels)
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// Table version = 2 (16-bit, little-endian)
.b8 2
.b8 0
// Offset of the described compile unit in .debug_info
.b32 .debug_info
// Size of that compile unit in bytes (184-byte unit_length + 4-byte length field)
.b32 188
// Entry: DIE offset 125 within the compile unit (the subprogram DIE) ...
.b32 125
// ... followed by its name, "triton__0d1d2d3d4de" (NUL-terminated)
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 101
.b8 0
// Zero offset terminates the entry list
.b32 0
$L__pubNames_end0:
}
// DWARF public-types lookup table. It has the same header layout as
// .debug_pubnames but contains no entries, only the terminator.
.section .debug_pubtypes
{
// Length of the table contents (everything between the two labels)
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// Table version = 2 (16-bit, little-endian)
.b8 2
.b8 0
// Offset of the described compile unit in .debug_info
.b32 .debug_info
// Size of that compile unit in bytes
.b32 188
// Zero offset terminates the (empty) entry list
.b32 0
$L__pubTypes_end0:
}
// Location-list section; emitted empty (no entries).
.section .debug_loc { }