// NOTE: the next three lines are hosting-page residue, not part of the PTX module.
// 0-hero's picture
// Add files using upload-large-folder tool
// 6f0bac9 verified
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4d5d6d7de8de
.extern .shared .align 1 .b8 global_smem[];
// ---------------------------------------------------------------------------
// Kernel entry: triton__0d1d2d3d4d5d6d7de8de
//
// Work split, as established by the index math below:
//   off  = (%tid.x << 2) & 252      4 consecutive elements per thread
//   idx  = (%ctaid.x << 8) | off    256 elements per CTA
//
// Parameters, described by how this body uses them:
//   param_0  pointer; 4 x 32-bit values at idx are loaded, updated and stored
//            back to the same address (in-place update).
//   param_1  pointer; 4 x bf16 values at idx are loaded and widened to f32.
//   param_2  pointer; 4 x 32-bit values at off (no CTA term in the index).
//   param_3  pointer; 4 x 32-bit values at idx.
//   param_4  pointer; one 32-bit value per CTA, indexed by %ctaid.x.
//   param_5  pointer; one 32-bit value per CTA, indexed by %ctaid.x.
//   param_6  pointer; 4 x bf16 results are stored at idx.
//   param_7, param_8  u32; never read in this body.
//
// Arithmetic per element i (a = param_1, w = param_2, x = param_3,
// m = param_4 scalar, r = param_5 scalar, acc = param_0):
//   p_i   = f32(a_i) * w_i
//   S1    = sum of p_i over the CTA
//   xh_i  = (x_i - m) * r
//   S2    = sum of p_i * xh_i over the CTA
//   res_i = acc_i + (r / 256) * (256 * p_i - S1 - xh_i * S2)
//   acc_i <- res_i ;  param_6[idx + i] <- bf16(res_i)
//
// NOTE(review): this has the shape of a fused layer-norm backward step with
// an accumulate into param_0 -- confirm against the generating Python source.
// NOTE(review): the cross-warp reduction reads two shared-memory slots, so it
// presumably expects exactly 64 threads (2 warps) per CTA -- confirm at the
// launch site; .maxntid only sets the upper bound.
// ---------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5d6d7de8de(
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_0,
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_1,
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_2,
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_3,
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_4,
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_5,
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_6,
.param .u32 triton__0d1d2d3d4d5d6d7de8de_param_7,
.param .u32 triton__0d1d2d3d4d5d6d7de8de_param_8
)
.maxntid 64, 1, 1
{
.reg .pred %p<37>;
.reg .b16 %rs<9>;
.reg .b32 %r<110>;
.reg .f32 %f<86>;
.reg .b64 %rd<26>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// Fetch the seven pointer arguments; %r76 = thread id, %r77 = lane id (tid & 31).
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8de_param_0];
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8de_param_1];
$L__tmp0:
.loc 1 26 26
mov.u32 %r76, %tid.x;
and.b32 %r77, %r76, 31;
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8de_param_2];
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8de_param_3];
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8de_param_4];
// %r78 = tid * 4; also reused later as a byte offset into shared memory.
shl.b32 %r78, %r76, 2;
ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8de_param_5];
// %r79 = off: element offset inside the CTA's 256-element span.
and.b32 %r79, %r78, 252;
ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7de8de_param_6];
.loc 1 23 28
mov.u32 %r1, %ctaid.x;
.loc 1 30 40
shl.b32 %r80, %r1, 8;
.loc 1 30 36
// %r81 = idx = ctaid.x * 256 + off.
or.b32 %r81, %r80, %r79;
.loc 1 30 30
// %rd22 = idx * 2 bytes (bf16 element size); reused for the param_6 store.
mul.wide.s32 %rd22, %r81, 2;
add.s64 %rd1, %rd16, %rd22;
mov.b32 %r4, 0;
// %p1 is constant true and is never rewritten, so every "@%p1" access below
// always executes and every "@!%p1" fallback move never does.
mov.pred %p1, -1;
.loc 1 30 46
// Load 4 bf16 values from param_1 as two 32-bit words.
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
@%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
@!%p1 mov.u32 %r2, %r4;
@!%p1 mov.u32 %r3, %r4;
// Split each word into its low (%rs1/%rs3) and high (%rs2/%rs4) 16-bit halves.
cvt.u16.u32 %rs1, %r2;
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
cvt.u16.u32 %rs3, %r3;
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
.loc 1 30 67
// Widen bf16 -> f32: %f1..%f4 = a_0..a_3.
cvt.f32.bf16 %r6, %rs1;
mov.b32 %f1, %r6;
cvt.f32.bf16 %r7, %rs2;
mov.b32 %f2, %r7;
cvt.f32.bf16 %r8, %rs3;
mov.b32 %f3, %r8;
cvt.f32.bf16 %r9, %rs4;
mov.b32 %f4, %r9;
.loc 1 31 30
// param_2 is indexed by off only (4 bytes per element): %f5..%f8 = w_0..w_3.
mul.wide.u32 %rd23, %r79, 4;
add.s64 %rd2, %rd17, %rd23;
.loc 1 31 35
mov.u32 %r10, 0x0;
mov.u32 %r11, 0x0;
mov.u32 %r12, 0x0;
mov.u32 %r13, 0x0;
@%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
@!%p1 mov.u32 %r10, %r4;
@!%p1 mov.u32 %r11, %r4;
@!%p1 mov.u32 %r12, %r4;
@!%p1 mov.u32 %r13, %r4;
mov.b32 %f5, %r10;
mov.b32 %f6, %r11;
mov.b32 %f7, %r12;
mov.b32 %f8, %r13;
.loc 1 32 30
// %rd24 = idx * 4 bytes; shared by the param_3 load and the param_0 load/store.
mul.wide.s32 %rd24, %r81, 4;
add.s64 %rd3, %rd18, %rd24;
.loc 1 32 46
// %f9..%f12 = x_0..x_3 from param_3.
mov.u32 %r18, 0x0;
mov.u32 %r19, 0x0;
mov.u32 %r20, 0x0;
mov.u32 %r21, 0x0;
@%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
@!%p1 mov.u32 %r18, %r4;
@!%p1 mov.u32 %r19, %r4;
@!%p1 mov.u32 %r20, %r4;
@!%p1 mov.u32 %r21, %r4;
mov.b32 %f9, %r18;
mov.b32 %f10, %r19;
mov.b32 %f11, %r20;
mov.b32 %f12, %r21;
.loc 1 33 30
// %rd25 = ctaid.x * 4 bytes: per-CTA scalar index for param_4 and param_5.
mul.wide.s32 %rd25, %r1, 4;
add.s64 %rd4, %rd19, %rd25;
.loc 1 33 35
// %f13 = m (param_4 scalar). The same address is loaded four times; only the
// first result (%r26) is read afterwards, %r27..%r29 are not used again.
mov.u32 %r26, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
mov.b32 %f13, %r26;
mov.u32 %r27, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
mov.u32 %r28, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
mov.u32 %r29, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
.loc 1 34 31
add.s64 %rd8, %rd20, %rd25;
.loc 1 34 36
// %f14 = r (param_5 scalar). %r55 is also the dividend of the div.full below;
// %r31..%r33 are not used again.
mov.u32 %r55, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
mov.b32 %f14, %r55;
mov.u32 %r31, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
mov.u32 %r32, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
mov.u32 %r33, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
.loc 1 35 35
// %rd12 = &param_0[idx]; read here, written back near the end.
add.s64 %rd12, %rd15, %rd24;
.loc 1 35 51
// %f15..%f18 = acc_0..acc_3.
mov.u32 %r34, 0x0;
mov.u32 %r35, 0x0;
mov.u32 %r36, 0x0;
mov.u32 %r37, 0x0;
@%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd12 + 0 ];
@!%p1 mov.u32 %r34, %r4;
@!%p1 mov.u32 %r35, %r4;
@!%p1 mov.u32 %r36, %r4;
@!%p1 mov.u32 %r37, %r4;
mov.b32 %f15, %r34;
mov.b32 %f16, %r35;
mov.b32 %f17, %r36;
mov.b32 %f18, %r37;
.loc 1 37 18
// %f19..%f22 = p_0..p_3 = a_i * w_i.
mul.f32 %f19, %f1, %f5;
mul.f32 %f20, %f2, %f6;
mul.f32 %f21, %f3, %f7;
mul.f32 %f22, %f4, %f8;
$L__tmp1:
.loc 2 233 15
// ---- First reduction: S1 = sum of p_i over the CTA ------------------------
// Per-thread partial sum of the four products (%f25), accumulated with FMAs.
fma.rn.f32 %f23, %f1, %f5, %f20;
fma.rn.f32 %f24, %f3, %f7, %f23;
fma.rn.f32 %f25, %f4, %f8, %f24;
$L__tmp2:
.loc 2 243 36
// Butterfly exchange across the warp with lane offsets 16, 8, 4, 2, 1; after
// the last add every lane holds the warp-wide sum (%f35).
mov.b32 %r82, %f25;
shfl.sync.bfly.b32 %r83, %r82, 16, 31, -1;
mov.b32 %f26, %r83;
$L__tmp3:
.loc 2 233 15
add.f32 %f27, %f25, %f26;
$L__tmp4:
.loc 2 243 36
mov.b32 %r84, %f27;
shfl.sync.bfly.b32 %r85, %r84, 8, 31, -1;
mov.b32 %f28, %r85;
$L__tmp5:
.loc 2 233 15
add.f32 %f29, %f27, %f28;
$L__tmp6:
.loc 2 243 36
mov.b32 %r86, %f29;
shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1;
mov.b32 %f30, %r87;
$L__tmp7:
.loc 2 233 15
add.f32 %f31, %f29, %f30;
$L__tmp8:
.loc 2 243 36
mov.b32 %r88, %f31;
shfl.sync.bfly.b32 %r89, %r88, 2, 31, -1;
mov.b32 %f32, %r89;
$L__tmp9:
.loc 2 233 15
add.f32 %f33, %f31, %f32;
$L__tmp10:
.loc 2 243 36
mov.b32 %r90, %f33;
shfl.sync.bfly.b32 %r91, %r90, 1, 31, -1;
mov.b32 %f34, %r91;
$L__tmp11:
.loc 2 233 15
add.f32 %f35, %f33, %f34;
$L__tmp12:
.loc 2 243 36
// Cross-warp step through shared memory.
// %p27: lane 0 of each warp. %r42 = global_smem + ((tid >> 3) & 4), i.e. byte
// offset 0 for tid 0..31 and 4 for tid 32..63 -- one 4-byte slot per warp.
setp.eq.s32 %p27, %r77, 0;
shr.u32 %r92, %r76, 3;
and.b32 %r93, %r92, 4;
mov.u32 %r94, global_smem;
add.s32 %r42, %r94, %r93;
mov.b32 %r43, %f35;
@%p27 st.shared.b32 [ %r42 + 0 ], %r43;
bar.sync 0;
// %p28: tid < 2. Threads 0 and 1 each read one warp slot (global_smem + tid*4).
// %r44 is left unwritten for the other threads; their shuffle/add results are
// never stored because the store below is predicated on %p29.
setp.lt.s32 %p28, %r76, 2;
add.s32 %r45, %r94, %r78;
@%p28 ld.shared.b32 %r44, [ %r45 + 0 ];
mov.b32 %f36, %r44;
shfl.sync.bfly.b32 %r95, %r44, 1, 31, -1;
mov.b32 %f37, %r95;
$L__tmp13:
.loc 2 233 15
add.f32 %f38, %f36, %f37;
$L__tmp14:
.loc 2 243 36
// %p29 = (tid < 2) && ((tid & 1) == 0), i.e. only thread 0 writes the total
// back to slot 0; after the barrier every thread reads it.
and.b32 %r96, %r76, 1;
setp.eq.b32 %p35, %r96, 1;
not.pred %p36, %p35;
and.pred %p29, %p28, %p36;
mov.b32 %r47, %f38;
@%p29 st.shared.b32 [ %r45 + 0 ], %r47;
bar.sync 0;
ld.shared.f32 %f39, [global_smem];
$L__tmp15:
.loc 3 8 15
// %f40 = S1 (the reduced sum plus +0.0).
add.f32 %f40, %f39, 0f00000000;
$L__tmp16:
.loc 1 41 19
// %f41..%f44 = x_i - m.
sub.f32 %f41, %f9, %f13;
sub.f32 %f42, %f10, %f13;
sub.f32 %f43, %f11, %f13;
sub.f32 %f44, %f12, %f13;
.loc 1 42 20
// %f45..%f48 = xh_i = (x_i - m) * r.
mul.f32 %f45, %f41, %f14;
mul.f32 %f46, %f42, %f14;
mul.f32 %f47, %f43, %f14;
mul.f32 %f48, %f44, %f14;
.loc 1 43 19
mul.f32 %f49, %f20, %f46;
$L__tmp17:
.loc 2 243 36
// Barrier before the shared-memory scratch is reused by the second reduction.
bar.sync 0;
$L__tmp18:
.loc 2 233 15
// ---- Second reduction: S2 = sum of p_i * xh_i over the CTA ----------------
// Same structure as the first reduction: per-thread FMA sum (%f52), warp
// butterfly, then the two-slot shared-memory combine.
fma.rn.f32 %f50, %f19, %f45, %f49;
fma.rn.f32 %f51, %f21, %f47, %f50;
fma.rn.f32 %f52, %f22, %f48, %f51;
$L__tmp19:
.loc 2 243 36
mov.b32 %r97, %f52;
shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
mov.b32 %f53, %r98;
$L__tmp20:
.loc 2 233 15
add.f32 %f54, %f52, %f53;
$L__tmp21:
.loc 2 243 36
mov.b32 %r99, %f54;
shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
mov.b32 %f55, %r100;
$L__tmp22:
.loc 2 233 15
add.f32 %f56, %f54, %f55;
$L__tmp23:
.loc 2 243 36
mov.b32 %r101, %f56;
shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
mov.b32 %f57, %r102;
$L__tmp24:
.loc 2 233 15
add.f32 %f58, %f56, %f57;
$L__tmp25:
.loc 2 243 36
mov.b32 %r103, %f58;
shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
mov.b32 %f59, %r104;
$L__tmp26:
.loc 2 233 15
add.f32 %f60, %f58, %f59;
$L__tmp27:
.loc 2 243 36
mov.b32 %r105, %f60;
shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
mov.b32 %f61, %r106;
$L__tmp28:
.loc 2 233 15
add.f32 %f62, %f60, %f61;
$L__tmp29:
.loc 2 243 36
// Reuses the predicates and shared-memory addresses set up for the first
// reduction (%p27/%r42 for the per-warp store, %p28/%p29/%r45 for the combine).
mov.b32 %r49, %f62;
@%p27 st.shared.b32 [ %r42 + 0 ], %r49;
bar.sync 0;
@%p28 ld.shared.b32 %r50, [ %r45 + 0 ];
mov.b32 %f63, %r50;
shfl.sync.bfly.b32 %r107, %r50, 1, 31, -1;
mov.b32 %f64, %r107;
$L__tmp30:
.loc 2 233 15
add.f32 %f65, %f63, %f64;
$L__tmp31:
.loc 2 243 36
mov.b32 %r53, %f65;
@%p29 st.shared.b32 [ %r45 + 0 ], %r53;
bar.sync 0;
ld.shared.f32 %f66, [global_smem];
$L__tmp32:
.loc 3 8 15
// %f67 = S2 (the reduced sum plus +0.0).
add.f32 %f67, %f66, 0f00000000;
// 1132462080 == 0x43800000, the f32 bit pattern of 256.0 (the same constant
// appears as 0f43800000 in the FMAs below).
mov.b32 %r56, 1132462080;
$L__tmp33:
.loc 1 48 20
// %f68 = r / 256, computed with PTX's div.full (approximate, full-range) divide.
div.full.f32 %r54, %r55, %r56;
mov.b32 %f68, %r54;
.loc 1 50 20
// %f70..%f73 = 256 * p_i - S1.
neg.f32 %f69, %f40;
fma.rn.f32 %f70, %f19, 0f43800000, %f69;
fma.rn.f32 %f71, %f20, 0f43800000, %f69;
fma.rn.f32 %f72, %f21, 0f43800000, %f69;
fma.rn.f32 %f73, %f22, 0f43800000, %f69;
.loc 1 52 20
// %f75/%f77/%f79/%f81 = (256 * p_i - S1) - xh_i * S2.
neg.f32 %f74, %f45;
fma.rn.f32 %f75, %f74, %f67, %f70;
neg.f32 %f76, %f46;
fma.rn.f32 %f77, %f76, %f67, %f71;
neg.f32 %f78, %f47;
fma.rn.f32 %f79, %f78, %f67, %f72;
neg.f32 %f80, %f48;
fma.rn.f32 %f81, %f80, %f67, %f73;
.loc 1 54 20
// %f82..%f85 = res_i = acc_i + (r / 256) * (...).
fma.rn.f32 %f82, %f68, %f75, %f15;
fma.rn.f32 %f83, %f68, %f77, %f16;
fma.rn.f32 %f84, %f68, %f79, %f17;
fma.rn.f32 %f85, %f68, %f81, %f18;
.loc 1 56 51
// Write the four f32 results back to param_0 at the address they were read from.
mov.b32 %r66, %f82;
mov.b32 %r67, %f83;
mov.b32 %r68, %f84;
mov.b32 %r69, %f85;
@%p1 st.global.v4.b32 [ %rd12 + 0 ], { %r66, %r67, %r68, %r69 };
.loc 1 57 25
// %rd14 = &param_6[idx] (2-byte elements, same byte offset as the param_1 load).
add.s64 %rd14, %rd21, %rd22;
.loc 1 57 48
// Round the same four results to bf16 and pack them pairwise into two 32-bit
// words (first operand of each pair in the low half) for a single vector store.
cvt.rn.bf16.f32 %rs5, %r66;
cvt.rn.bf16.f32 %rs6, %r67;
cvt.rn.bf16.f32 %rs7, %r68;
cvt.rn.bf16.f32 %rs8, %r69;
mov.b32 %r108, {%rs5, %rs6};
mov.b32 %r109, {%rs7, %rs8};
@%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r108, %r109 };
.loc 1 57 4
ret;
$L__tmp34:
$L__func_end0:
}
.file 1 "/tmp/torchinductor_root/sn/csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
// DWARF abbreviation table. Each entry is: abbreviation code, tag,
// has-children flag, then (attribute, form) pairs terminated by 0, 0.
// Tag/attribute/form names in the comments are the DWARF-specification names
// for the numeric values shown.
.section .debug_abbrev
{
// Abbrev 1: tag 17 (DW_TAG_compile_unit), has children.
.b8 1
.b8 17
.b8 1
// producer (37) as string (8)
.b8 37
.b8 8
// language (19) as data2 (5)
.b8 19
.b8 5
// name (3) as string (8)
.b8 3
.b8 8
// stmt_list (16) as data4 (6)
.b8 16
.b8 6
// comp_dir (27) as string (8)
.b8 27
.b8 8
// ULEB128 bytes 180, 66 = 0x2134 (vendor attribute, DW_AT_GNU_pubnames) as flag (12)
.b8 180
.b8 66
.b8 12
// low_pc (17) / high_pc (18) as addr (1)
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
// Abbrev 2: tag 46 (DW_TAG_subprogram), no children.
.b8 2
.b8 46
.b8 0
// ULEB128 bytes 135, 64 = 0x2007 (vendor attribute, DW_AT_MIPS_linkage_name) as string (8)
.b8 135
.b8 64
.b8 8
// name (3) as string (8)
.b8 3
.b8 8
// decl_file (58) / decl_line (59) as data1 (11)
.b8 58
.b8 11
.b8 59
.b8 11
// external (63) as flag (12)
.b8 63
.b8 12
// inline (32) as data1 (11)
.b8 32
.b8 11
.b8 0
.b8 0
// Abbrev 3: tag 46 (DW_TAG_subprogram), has children.
.b8 3
.b8 46
.b8 1
// low_pc (17) / high_pc (18) as addr (1)
.b8 17
.b8 1
.b8 18
.b8 1
// frame_base (64) as block1 (10)
.b8 64
.b8 10
// abstract_origin (49) as ref4 (19)
.b8 49
.b8 19
.b8 0
.b8 0
// Abbrev 4: tag 29 (DW_TAG_inlined_subroutine), has children.
.b8 4
.b8 29
.b8 1
// abstract_origin (49) as ref4 (19)
.b8 49
.b8 19
// low_pc (17) / high_pc (18) as addr (1)
.b8 17
.b8 1
.b8 18
.b8 1
// call_file (88) / call_line (89) / call_column (87) as data1 (11)
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
// Abbrev 5: tag 29 (DW_TAG_inlined_subroutine), no children; same attributes as abbrev 4.
.b8 5
.b8 29
.b8 0
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
// End of abbreviation table.
.b8 0
}
// DWARF debug information entries for this module. Byte runs that encode
// NUL-terminated ASCII strings are decoded in the comments preceding them.
.section .debug_info
{
// Unit header: length 403 bytes (excluding this field), version 2,
// abbreviation-table reference, address size 8.
.b32 403
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// DIE using abbrev 1 (compile unit).
.b8 1
// producer: "triton"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// language: 2-byte value 2
.b8 2
.b8 0
// name: "csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py"
.b8 99
.b8 115
.b8 110
.b8 101
.b8 100
.b8 52
.b8 104
.b8 121
.b8 120
.b8 112
.b8 103
.b8 119
.b8 117
.b8 53
.b8 116
.b8 116
.b8 117
.b8 98
.b8 115
.b8 51
.b8 114
.b8 55
.b8 117
.b8 120
.b8 107
.b8 106
.b8 113
.b8 53
.b8 121
.b8 102
.b8 108
.b8 51
.b8 122
.b8 104
.b8 54
.b8 99
.b8 50
.b8 115
.b8 111
.b8 122
.b8 111
.b8 98
.b8 116
.b8 107
.b8 101
.b8 107
.b8 50
.b8 117
.b8 122
.b8 102
.b8 99
.b8 118
.b8 46
.b8 112
.b8 121
.b8 0
// stmt_list: reference to the line table section
.b32 .debug_line
// comp_dir: "/tmp/torchinductor_root/sn"
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 115
.b8 110
.b8 0
// flag attribute = 1, then low_pc / high_pc spanning the whole kernel
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
// DIE using abbrev 2 (subprogram, no children). It starts at byte offset 125
// of this section, which is the value the ".b32 125" references below carry.
.b8 2
// linkage name: "triton__0d1d2d3d4d5d6d7de8de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 100
.b8 101
.b8 0
// name: "triton__0d1d2d3d4d5d6d7de8de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 100
.b8 101
.b8 0
// decl_file 1, decl_line 18, external flag 1, inline 1
.b8 1
.b8 18
.b8 1
.b8 1
// DIE using abbrev 3 (subprogram with children): address range of the
// kernel, a 1-byte frame-base block (156), abstract origin = offset 125.
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b32 125
// Inlined-subroutine DIEs follow. Each carries: origin offset, low/high
// address labels, then three 1-byte fields (call file, call line, call column).
// Abbrev 4 (has children): $L__tmp1..$L__tmp14, fields 2 / 40 / 57.
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp14
.b8 2
.b8 40
.b8 57
// Child, abbrev 5: $L__tmp1..$L__tmp14, fields 2 / 243 / 36; then end-of-children.
.b8 5
.b32 125
.b64 $L__tmp1
.b64 $L__tmp14
.b8 2
.b8 243
.b8 36
.b8 0
// Abbrev 5: $L__tmp2..$L__tmp15, fields 2 / 40 / 57.
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp15
.b8 2
.b8 40
.b8 57
// Abbrev 5: $L__tmp15..$L__tmp16, fields 3 / 40 / 44.
.b8 5
.b32 125
.b64 $L__tmp15
.b64 $L__tmp16
.b8 3
.b8 40
.b8 44
// Abbrev 5: $L__tmp17..$L__tmp32, fields 2 / 46 / 59.
.b8 5
.b32 125
.b64 $L__tmp17
.b64 $L__tmp32
.b8 2
.b8 46
.b8 59
// Abbrev 4 (has children): $L__tmp18..$L__tmp31, fields 2 / 46 / 59.
.b8 4
.b32 125
.b64 $L__tmp18
.b64 $L__tmp31
.b8 2
.b8 46
.b8 59
// Child, abbrev 5: $L__tmp18..$L__tmp31, fields 2 / 243 / 36; then end-of-children.
.b8 5
.b32 125
.b64 $L__tmp18
.b64 $L__tmp31
.b8 2
.b8 243
.b8 36
.b8 0
// Abbrev 5: $L__tmp32..$L__tmp33, fields 3 / 46 / 45.
.b8 5
.b32 125
.b64 $L__tmp32
.b64 $L__tmp33
.b8 3
.b8 46
.b8 45
// End-of-children markers for the abbrev-3 DIE and for the compile unit.
.b8 0
.b8 0
}
// Public-names lookup table: one entry mapping the kernel name to the
// .debug_info offset 125.
.section .debug_pubnames
{
// Set length, computed from the two labels that bracket the table body.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// Version 2, reference to .debug_info, and 407 (= the 403-byte unit length
// declared in .debug_info plus its 4-byte length field).
.b8 2
.b8 0
.b32 .debug_info
.b32 407
// Entry: offset 125, followed by the NUL-terminated name
// "triton__0d1d2d3d4d5d6d7de8de".
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 100
.b8 101
.b8 0
// Zero offset terminates the entry list.
.b32 0
$L__pubNames_end0:
}
// Public-types lookup table: header only, no entries (the list is terminated
// immediately by the zero offset).
.section .debug_pubtypes
{
// Set length, computed from the two labels that bracket the table body.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// Version 2, reference to .debug_info, and 407 (same size value as in
// .debug_pubnames).
.b8 2
.b8 0
.b32 .debug_info
.b32 407
.b32 0
$L__pubTypes_end0:
}
// Location-list section; emitted empty for this module.
.section .debug_loc { }