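// NOTE(review): the following three lines are not PTX; they look like
// residue from a repository web page (avatar caption, commit message,
// commit hash). Kept as comments so the file can be assembled.
// 0-hero's picture
// Add files using upload-large-folder tool
// 8c1fe04 verified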
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4d5d6de7de
.extern .shared .align 1 .b8 global_smem[];
// ---------------------------------------------------------------------
// Kernel: triton__0d1d2d3d4d5d6de7de   (.maxntid 64 => at most 64 threads/CTA)
//
// Parameters, as they are used by the body below:
//   param_0 (u64)  f32 buffer: 4 values/thread are read, then overwritten in place
//   param_1 (u64)  bf16 buffer: 4 values/thread are read
//   param_2 (u64)  f32 buffer indexed by the in-CTA column only (same for every CTA)
//   param_3 (u64)  f32 buffer: 4 values/thread are read
//   param_4 (u64)  f32 buffer: one scalar per CTA, index = %ctaid.x
//   param_5 (u64)  bf16 output buffer: 4 values/thread are written
//   param_6/7 (u32) declared but never loaded in this body
//
// Computation per element i = ctaid.x*256 + column, column = (tid.x*4) & 252:
//   p[i]   = f32(param_1[i]) * param_2[column]
//   S1     = sum of p[i] over the CTA
//   S2     = sum of p[i] * param_3[i] over the CTA
//   out[i] = param_0[i] + (param_4[ctaid.x] / 256) * (256*p[i] - S1 - S2*param_3[i])
//   param_0[i] = out[i];  param_5[i] = bf16(out[i])
// The CTA-wide sums combine exactly two per-warp partials, i.e. the code
// assumes a 64-thread (2-warp) launch covering 256 elements per CTA.
// NOTE(review): the formula has the shape of a normalization backward pass
// fused with an accumulate — confirm against the generating Inductor source.
// ---------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5d6de7de(
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
)
.maxntid 64, 1, 1
{
// Virtual register pools (predicates, 16-bit, 32-bit, f32, 64-bit).
.reg .pred %p<33>;
.reg .b16 %rs<9>;
.reg .b32 %r<106>;
.reg .f32 %f<73>;
.reg .b64 %rd<21>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// Load the six pointer arguments (interleaved with index arithmetic below).
ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_0];
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_1];
$L__tmp0:
.loc 1 26 26
// %r72 = tid.x, %r73 = lane id within the warp (tid.x & 31).
mov.u32 %r72, %tid.x;
and.b32 %r73, %r72, 31;
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_2];
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_3];
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_4];
// %r74 = tid.x * 4; %r75 = column of this thread's first element (0..252, step 4).
shl.b32 %r74, %r72, 2;
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_5];
and.b32 %r75, %r74, 252;
.loc 1 23 28
mov.u32 %r1, %ctaid.x;
.loc 1 30 40
// %r77 = ctaid.x * 256 + column: flat element index of the first of 4 elements.
shl.b32 %r76, %r1, 8;
.loc 1 30 36
or.b32 %r77, %r76, %r75;
.loc 1 30 30
// %rd17 = byte offset for 2-byte elements; %rd1 = &param_1[index].
mul.wide.s32 %rd17, %r77, 2;
add.s64 %rd1, %rd12, %rd17;
// %r4 = 0 is the fallback value for masked-off loads; %p1 is the load/store
// mask and is constant true here, so the @!%p1 fallback moves never execute.
mov.b32 %r4, 0;
mov.pred %p1, -1;
.loc 1 30 46
// Load 4 bf16 values from param_1 as two 32-bit words.
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
@%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
@!%p1 mov.u32 %r2, %r4;
@!%p1 mov.u32 %r3, %r4;
// Split each 32-bit word into its low (%rs1/%rs3) and high (%rs2/%rs4) 16-bit halves.
cvt.u16.u32 %rs1, %r2;
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
cvt.u16.u32 %rs3, %r3;
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
.loc 1 30 67
// Widen the four bf16 values to f32: %f1..%f4.
cvt.f32.bf16 %r6, %rs1;
mov.b32 %f1, %r6;
cvt.f32.bf16 %r7, %rs2;
mov.b32 %f2, %r7;
cvt.f32.bf16 %r8, %rs3;
mov.b32 %f3, %r8;
cvt.f32.bf16 %r9, %rs4;
mov.b32 %f4, %r9;
.loc 1 31 30
// %rd2 = &param_2[column] (4-byte elements; depends on the column only).
mul.wide.u32 %rd18, %r75, 4;
add.s64 %rd2, %rd13, %rd18;
.loc 1 31 35
// Load 4 f32 values from param_2 with an L1 evict_last cache hint: %f5..%f8.
mov.u32 %r10, 0x0;
mov.u32 %r11, 0x0;
mov.u32 %r12, 0x0;
mov.u32 %r13, 0x0;
@%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
@!%p1 mov.u32 %r10, %r4;
@!%p1 mov.u32 %r11, %r4;
@!%p1 mov.u32 %r12, %r4;
@!%p1 mov.u32 %r13, %r4;
mov.b32 %f5, %r10;
mov.b32 %f6, %r11;
mov.b32 %f7, %r12;
mov.b32 %f8, %r13;
.loc 1 32 30
// %rd19 = byte offset for 4-byte elements; %rd3 = &param_3[index].
mul.wide.s32 %rd19, %r77, 4;
add.s64 %rd3, %rd14, %rd19;
.loc 1 32 46
// Load 4 f32 values from param_3: %f9..%f12.
mov.u32 %r18, 0x0;
mov.u32 %r19, 0x0;
mov.u32 %r20, 0x0;
mov.u32 %r21, 0x0;
@%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
@!%p1 mov.u32 %r18, %r4;
@!%p1 mov.u32 %r19, %r4;
@!%p1 mov.u32 %r20, %r4;
@!%p1 mov.u32 %r21, %r4;
mov.b32 %f9, %r18;
mov.b32 %f10, %r19;
mov.b32 %f11, %r20;
mov.b32 %f12, %r21;
.loc 1 33 35
// %rd4 = &param_0[index]; this address is both read here and stored to at the end.
add.s64 %rd4, %rd11, %rd19;
.loc 1 33 51
// Load 4 f32 values from param_0: %f13..%f16.
mov.u32 %r26, 0x0;
mov.u32 %r27, 0x0;
mov.u32 %r28, 0x0;
mov.u32 %r29, 0x0;
@%p1 ld.global.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
@!%p1 mov.u32 %r26, %r4;
@!%p1 mov.u32 %r27, %r4;
@!%p1 mov.u32 %r28, %r4;
@!%p1 mov.u32 %r29, %r4;
mov.b32 %f13, %r26;
mov.b32 %f14, %r27;
mov.b32 %f15, %r28;
mov.b32 %f16, %r29;
.loc 1 34 31
// %rd5 = &param_4[ctaid.x] (one f32 scalar per CTA).
mul.wide.s32 %rd20, %r1, 4;
add.s64 %rd5, %rd15, %rd20;
.loc 1 34 36
// The same scalar is loaded four times (once per element slot). Only %r51
// is consumed later (by the division); %r35..%r37 are not read again in
// this kernel.
mov.u32 %r51, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r51 }, [ %rd5 + 0 ];
mov.u32 %r35, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r35 }, [ %rd5 + 0 ];
mov.u32 %r36, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r36 }, [ %rd5 + 0 ];
mov.u32 %r37, 0x0;
@%p1 ld.global.L1::evict_last.b32 { %r37 }, [ %rd5 + 0 ];
.loc 1 36 18
// p[k] = f32(param_1) * param_2  ->  %f17..%f20.
mul.f32 %f17, %f1, %f5;
mul.f32 %f18, %f2, %f6;
mul.f32 %f19, %f3, %f7;
mul.f32 %f20, %f4, %f8;
$L__tmp1:
.loc 2 233 15
// Thread-local partial of S1: %f23 = p0 + p1 + p2 + p3 (products re-fused into FMAs).
fma.rn.f32 %f21, %f1, %f5, %f18;
fma.rn.f32 %f22, %f3, %f7, %f21;
fma.rn.f32 %f23, %f4, %f8, %f22;
$L__tmp2:
.loc 2 243 36
// Intra-warp butterfly reduction of the partial sum: XOR-shuffle with lane
// offsets 16, 8, 4, 2, 1 over the full warp (member mask -1).
mov.b32 %r78, %f23;
shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1;
mov.b32 %f24, %r79;
$L__tmp3:
.loc 2 233 15
add.f32 %f25, %f23, %f24;
$L__tmp4:
.loc 2 243 36
mov.b32 %r80, %f25;
shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1;
mov.b32 %f26, %r81;
$L__tmp5:
.loc 2 233 15
add.f32 %f27, %f25, %f26;
$L__tmp6:
.loc 2 243 36
mov.b32 %r82, %f27;
shfl.sync.bfly.b32 %r83, %r82, 4, 31, -1;
mov.b32 %f28, %r83;
$L__tmp7:
.loc 2 233 15
add.f32 %f29, %f27, %f28;
$L__tmp8:
.loc 2 243 36
mov.b32 %r84, %f29;
shfl.sync.bfly.b32 %r85, %r84, 2, 31, -1;
mov.b32 %f30, %r85;
$L__tmp9:
.loc 2 233 15
add.f32 %f31, %f29, %f30;
$L__tmp10:
.loc 2 243 36
mov.b32 %r86, %f31;
shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
mov.b32 %f32, %r87;
$L__tmp11:
.loc 2 233 15
// %f33 = sum over the whole warp.
add.f32 %f33, %f31, %f32;
$L__tmp12:
.loc 2 243 36
// Cross-warp step: lane 0 of each warp (%p23) writes its warp sum to
// global_smem at byte offset (tid.x >> 3) & 4, i.e. 0 for warp 0, 4 for warp 1.
setp.eq.s32 %p23, %r73, 0;
shr.u32 %r88, %r72, 3;
and.b32 %r89, %r88, 4;
mov.u32 %r90, global_smem;
add.s32 %r38, %r90, %r89;
mov.b32 %r39, %f33;
@%p23 st.shared.b32 [ %r38 + 0 ], %r39;
bar.sync 0;
// Threads 0 and 1 (%p24) reload one warp partial each from global_smem + tid.x*4.
// For other threads %r40 is left unwritten; their result is never stored.
setp.lt.s32 %p24, %r72, 2;
add.s32 %r41, %r90, %r74;
@%p24 ld.shared.b32 %r40, [ %r41 + 0 ];
mov.b32 %f34, %r40;
// Combine the two partials with one more XOR-shuffle (offset 1).
shfl.sync.bfly.b32 %r91, %r40, 1, 31, -1;
mov.b32 %f35, %r91;
$L__tmp13:
.loc 2 233 15
add.f32 %f36, %f34, %f35;
$L__tmp14:
.loc 2 243 36
// %p25 = (tid.x < 2) && (tid.x is even), i.e. only thread 0 writes the CTA total
// back to global_smem[0].
and.b32 %r92, %r72, 1;
setp.eq.b32 %p31, %r92, 1;
not.pred %p32, %p31;
and.pred %p25, %p24, %p32;
mov.b32 %r43, %f36;
@%p25 st.shared.b32 [ %r41 + 0 ], %r43;
bar.sync 0;
// Every thread reads the CTA-wide total: %f37 = S1.
ld.shared.f32 %f37, [global_smem];
$L__tmp15:
.loc 3 8 15
// %f38 = S1 + 0.0
add.f32 %f38, %f37, 0f00000000;
$L__tmp16:
.loc 1 40 18
// Start of the second reduction: %f39 = p1 * param_3[1].
mul.f32 %f39, %f18, %f10;
$L__tmp17:
.loc 2 243 36
// Barrier before the shared-memory scratch slots are reused for the second sum.
bar.sync 0;
$L__tmp18:
.loc 2 233 15
// Thread-local partial of S2: %f42 = sum_k p[k] * param_3[k].
fma.rn.f32 %f40, %f17, %f9, %f39;
fma.rn.f32 %f41, %f19, %f11, %f40;
fma.rn.f32 %f42, %f20, %f12, %f41;
$L__tmp19:
.loc 2 243 36
// Same intra-warp butterfly reduction as above (offsets 16, 8, 4, 2, 1).
mov.b32 %r93, %f42;
shfl.sync.bfly.b32 %r94, %r93, 16, 31, -1;
mov.b32 %f43, %r94;
$L__tmp20:
.loc 2 233 15
add.f32 %f44, %f42, %f43;
$L__tmp21:
.loc 2 243 36
mov.b32 %r95, %f44;
shfl.sync.bfly.b32 %r96, %r95, 8, 31, -1;
mov.b32 %f45, %r96;
$L__tmp22:
.loc 2 233 15
add.f32 %f46, %f44, %f45;
$L__tmp23:
.loc 2 243 36
mov.b32 %r97, %f46;
shfl.sync.bfly.b32 %r98, %r97, 4, 31, -1;
mov.b32 %f47, %r98;
$L__tmp24:
.loc 2 233 15
add.f32 %f48, %f46, %f47;
$L__tmp25:
.loc 2 243 36
mov.b32 %r99, %f48;
shfl.sync.bfly.b32 %r100, %r99, 2, 31, -1;
mov.b32 %f49, %r100;
$L__tmp26:
.loc 2 233 15
add.f32 %f50, %f48, %f49;
$L__tmp27:
.loc 2 243 36
mov.b32 %r101, %f50;
shfl.sync.bfly.b32 %r102, %r101, 1, 31, -1;
mov.b32 %f51, %r102;
$L__tmp28:
.loc 2 233 15
add.f32 %f52, %f50, %f51;
$L__tmp29:
.loc 2 243 36
// Same cross-warp combine through global_smem as for S1
// (reuses %p23/%p24/%p25 and the addresses %r38/%r41).
mov.b32 %r45, %f52;
@%p23 st.shared.b32 [ %r38 + 0 ], %r45;
bar.sync 0;
@%p24 ld.shared.b32 %r46, [ %r41 + 0 ];
mov.b32 %f53, %r46;
shfl.sync.bfly.b32 %r103, %r46, 1, 31, -1;
mov.b32 %f54, %r103;
$L__tmp30:
.loc 2 233 15
add.f32 %f55, %f53, %f54;
$L__tmp31:
.loc 2 243 36
mov.b32 %r49, %f55;
@%p25 st.shared.b32 [ %r41 + 0 ], %r49;
bar.sync 0;
// %f56 = S2 (CTA-wide total of p * param_3).
ld.shared.f32 %f56, [global_smem];
$L__tmp32:
.loc 3 8 15
// %f57 = S2 + 0.0
add.f32 %f57, %f56, 0f00000000;
// 1132462080 == 0x43800000, the f32 bit pattern of 256.0.
mov.b32 %r52, 1132462080;
$L__tmp33:
.loc 1 45 20
// %f58 = param_4[ctaid.x] / 256.0, using the approximate full-range divide.
div.full.f32 %r50, %r51, %r52;
mov.b32 %f58, %r50;
.loc 1 47 20
// %f60..%f63 = 256*p[k] - S1   (0f43800000 is 256.0).
neg.f32 %f59, %f38;
fma.rn.f32 %f60, %f17, 0f43800000, %f59;
fma.rn.f32 %f61, %f18, 0f43800000, %f59;
fma.rn.f32 %f62, %f19, 0f43800000, %f59;
fma.rn.f32 %f63, %f20, 0f43800000, %f59;
.loc 1 49 20
// %f65..%f68 = (256*p[k] - S1) - S2 * param_3[k].
neg.f32 %f64, %f57;
fma.rn.f32 %f65, %f64, %f9, %f60;
fma.rn.f32 %f66, %f64, %f10, %f61;
fma.rn.f32 %f67, %f64, %f11, %f62;
fma.rn.f32 %f68, %f64, %f12, %f63;
.loc 1 51 20
// out[k] = param_0[k] + %f58 * (the term above)  ->  %f69..%f72.
fma.rn.f32 %f69, %f58, %f65, %f13;
fma.rn.f32 %f70, %f58, %f66, %f14;
fma.rn.f32 %f71, %f58, %f67, %f15;
fma.rn.f32 %f72, %f58, %f68, %f16;
.loc 1 53 51
// Write the four f32 results back to param_0 at the address they were read from.
mov.b32 %r62, %f69;
mov.b32 %r63, %f70;
mov.b32 %r64, %f71;
mov.b32 %r65, %f72;
@%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r62, %r63, %r64, %r65 };
.loc 1 54 25
// %rd10 = &param_5[index] (2-byte elements, same byte offset as the param_1 load).
add.s64 %rd10, %rd16, %rd17;
.loc 1 54 48
// Round the results to bf16 (round-to-nearest), pack pairs into 32-bit words
// and store all four to param_5.
cvt.rn.bf16.f32 %rs5, %r62;
cvt.rn.bf16.f32 %rs6, %r63;
cvt.rn.bf16.f32 %rs7, %r64;
cvt.rn.bf16.f32 %rs8, %r65;
mov.b32 %r104, {%rs5, %rs6};
mov.b32 %r105, {%rs7, %rs8};
@%p1 st.global.v2.b32 [ %rd10 + 0 ], { %r104, %r105 };
.loc 1 54 4
ret;
$L__tmp34:
$L__func_end0:
}
.file 1 "/tmp/torchinductor_root/rn/crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
// DWARF abbreviation table. Each entry is: abbreviation code, tag,
// has-children flag, then (attribute, form) pairs terminated by 0, 0.
// Tag/attribute/form names in the comments are the standard DWARF meanings
// of the numeric codes.
.section .debug_abbrev
{
// Abbreviation 1: tag 17 (DW_TAG_compile_unit), has children.
.b8 1
.b8 17
.b8 1
// 37 = DW_AT_producer, form 8 = DW_FORM_string
.b8 37
.b8 8
// 19 = DW_AT_language, form 5 = DW_FORM_data2
.b8 19
.b8 5
// 3 = DW_AT_name, form 8 = DW_FORM_string
.b8 3
.b8 8
// 16 = DW_AT_stmt_list, form 6 = DW_FORM_data4
.b8 16
.b8 6
// 27 = DW_AT_comp_dir, form 8 = DW_FORM_string
.b8 27
.b8 8
// 180, 66 = ULEB128 for 0x2134 (DW_AT_GNU_pubnames), form 12 = DW_FORM_flag
.b8 180
.b8 66
.b8 12
// 17 = DW_AT_low_pc, form 1 = DW_FORM_addr
.b8 17
.b8 1
// 18 = DW_AT_high_pc, form 1 = DW_FORM_addr
.b8 18
.b8 1
// end of abbreviation 1
.b8 0
.b8 0
// Abbreviation 2: tag 46 (DW_TAG_subprogram), no children.
.b8 2
.b8 46
.b8 0
// 135, 64 = ULEB128 for 0x2007 (DW_AT_MIPS_linkage_name), form 8 = DW_FORM_string
.b8 135
.b8 64
.b8 8
// 3 = DW_AT_name, form 8 = DW_FORM_string
.b8 3
.b8 8
// 58 = DW_AT_decl_file, form 11 = DW_FORM_data1
.b8 58
.b8 11
// 59 = DW_AT_decl_line, form 11 = DW_FORM_data1
.b8 59
.b8 11
// 63 = DW_AT_external, form 12 = DW_FORM_flag
.b8 63
.b8 12
// 32 = DW_AT_inline, form 11 = DW_FORM_data1
.b8 32
.b8 11
// end of abbreviation 2
.b8 0
.b8 0
// Abbreviation 3: tag 46 (DW_TAG_subprogram), has children.
.b8 3
.b8 46
.b8 1
// 17 = DW_AT_low_pc, form 1 = DW_FORM_addr
.b8 17
.b8 1
// 18 = DW_AT_high_pc, form 1 = DW_FORM_addr
.b8 18
.b8 1
// 64 = DW_AT_frame_base, form 10 = DW_FORM_block1
.b8 64
.b8 10
// 49 = DW_AT_abstract_origin, form 19 = DW_FORM_ref4
.b8 49
.b8 19
// end of abbreviation 3
.b8 0
.b8 0
// Abbreviation 4: tag 29 (DW_TAG_inlined_subroutine), has children.
.b8 4
.b8 29
.b8 1
// 49 = DW_AT_abstract_origin, form 19 = DW_FORM_ref4
.b8 49
.b8 19
// 17 = DW_AT_low_pc, form 1 = DW_FORM_addr
.b8 17
.b8 1
// 18 = DW_AT_high_pc, form 1 = DW_FORM_addr
.b8 18
.b8 1
// 88 = DW_AT_call_file, form 11 = DW_FORM_data1
.b8 88
.b8 11
// 89 = DW_AT_call_line, form 11 = DW_FORM_data1
.b8 89
.b8 11
// 87 = DW_AT_call_column, form 11 = DW_FORM_data1
.b8 87
.b8 11
// end of abbreviation 4
.b8 0
.b8 0
// Abbreviation 5: tag 29 (DW_TAG_inlined_subroutine), no children;
// same attribute list as abbreviation 4.
.b8 5
.b8 29
.b8 0
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
// end of abbreviation 5
.b8 0
.b8 0
// end of abbreviation table
.b8 0
}
// DWARF .debug_info: one compilation unit describing the kernel and the
// inlined helper ranges inside it. The leading byte of each entry selects an
// abbreviation from .debug_abbrev; the values that follow use that
// abbreviation's attribute order.
.section .debug_info
{
// Unit header: length of the rest of the unit (399 bytes), DWARF version 2
// (16-bit, little-endian), abbreviation table reference, address size 8.
.b32 399
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// Compile-unit entry (abbreviation 1).
.b8 1
// producer = "triton"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// language = 2 (16-bit value)
.b8 2
.b8 0
// name = "crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py"
.b8 99
.b8 114
.b8 110
.b8 121
.b8 110
.b8 98
.b8 109
.b8 115
.b8 100
.b8 50
.b8 121
.b8 101
.b8 108
.b8 108
.b8 50
.b8 108
.b8 112
.b8 106
.b8 121
.b8 109
.b8 98
.b8 52
.b8 54
.b8 114
.b8 116
.b8 116
.b8 102
.b8 97
.b8 101
.b8 97
.b8 50
.b8 120
.b8 106
.b8 119
.b8 115
.b8 98
.b8 120
.b8 114
.b8 55
.b8 53
.b8 106
.b8 53
.b8 52
.b8 103
.b8 99
.b8 116
.b8 102
.b8 103
.b8 105
.b8 52
.b8 53
.b8 55
.b8 46
.b8 112
.b8 121
.b8 0
// statement list: reference to the line table
.b32 .debug_line
// compilation directory = "/tmp/torchinductor_root/rn"
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 114
.b8 110
.b8 0
// pubnames flag = 1
.b8 1
// low_pc / high_pc of the unit: the whole kernel body
.b64 $L__func_begin0
.b64 $L__func_end0
// Abstract subprogram entry (abbreviation 2), at unit offset 125; the
// ".b32 125" values further down refer back to this entry.
.b8 2
// linkage name = "triton__0d1d2d3d4d5d6de7de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
// name = "triton__0d1d2d3d4d5d6de7de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
// decl_file = 1, decl_line = 18, external = 1, inline = 1
.b8 1
.b8 18
.b8 1
.b8 1
// Concrete subprogram entry (abbreviation 3) covering the kernel body.
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
// frame base: 1-byte expression block containing opcode 156
.b8 1
.b8 156
// abstract origin -> entry at offset 125
.b32 125
// Inlined range (abbreviation 4, has children): $L__tmp1..$L__tmp14,
// call_file 2, call_line 39, call_column 57.
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp14
.b8 2
.b8 39
.b8 57
// Child inlined range (abbreviation 5): $L__tmp1..$L__tmp14,
// call_file 2, call_line 243, call_column 36.
.b8 5
.b32 125
.b64 $L__tmp1
.b64 $L__tmp14
.b8 2
.b8 243
.b8 36
// end of children of the range above
.b8 0
// Inlined range (abbreviation 5): $L__tmp2..$L__tmp15,
// call_file 2, call_line 39, call_column 57.
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp15
.b8 2
.b8 39
.b8 57
// Inlined range (abbreviation 5): $L__tmp15..$L__tmp16,
// call_file 3, call_line 39, call_column 44.
.b8 5
.b32 125
.b64 $L__tmp15
.b64 $L__tmp16
.b8 3
.b8 39
.b8 44
// Inlined range (abbreviation 5): $L__tmp17..$L__tmp32,
// call_file 2, call_line 43, call_column 59.
.b8 5
.b32 125
.b64 $L__tmp17
.b64 $L__tmp32
.b8 2
.b8 43
.b8 59
// Inlined range (abbreviation 4, has children): $L__tmp18..$L__tmp31,
// call_file 2, call_line 43, call_column 59.
.b8 4
.b32 125
.b64 $L__tmp18
.b64 $L__tmp31
.b8 2
.b8 43
.b8 59
// Child inlined range (abbreviation 5): $L__tmp18..$L__tmp31,
// call_file 2, call_line 243, call_column 36.
.b8 5
.b32 125
.b64 $L__tmp18
.b64 $L__tmp31
.b8 2
.b8 243
.b8 36
// end of children of the range above
.b8 0
// Inlined range (abbreviation 5): $L__tmp32..$L__tmp33,
// call_file 3, call_line 43, call_column 45.
.b8 5
.b32 125
.b64 $L__tmp32
.b64 $L__tmp33
.b8 3
.b8 43
.b8 45
// end of children of the concrete subprogram, then of the compile unit
.b8 0
.b8 0
}
// DWARF public-names table: maps the kernel name to its entry in .debug_info.
.section .debug_pubnames
{
// Table length, computed from the two labels below.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// Version 2 (16-bit, little-endian).
.b8 2
.b8 0
// Referenced .debug_info unit and its total size (403 = 399 + 4-byte length field).
.b32 .debug_info
.b32 403
// Offset 125 within the unit, followed by the name "triton__0d1d2d3d4d5d6de7de".
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
// Zero offset terminates the list.
.b32 0
$L__pubNames_end0:
}
// DWARF public-types table: header only, with no type entries before the
// terminating zero offset.
.section .debug_pubtypes
{
// Table length, computed from the two labels below.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// Version 2 (16-bit, little-endian).
.b8 2
.b8 0
// Referenced .debug_info unit and its total size.
.b32 .debug_info
.b32 403
// Zero offset terminates the (empty) list.
.b32 0
$L__pubTypes_end0:
}
// Location-list section; emitted empty.
.section .debug_loc { }