// 0-hero's picture
// Add files using upload-large-folder tool
// 00602c7 verified
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4de5de
// Shared-memory scratch buffer. It is declared extern and unsized, so its
// size is not fixed anywhere in this file. The kernel below uses it to
// exchange per-warp partial sums and to broadcast the reduced result.
.extern .shared .align 1 .b8 global_smem[];
// 11 bytes that decode as the NUL-terminated ASCII string "__CUDA_FTZ".
// No instruction visible in this file references it.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
// ---------------------------------------------------------------------------
// Kernel triton__0d1d2d3d4de5de
//
// Launch shape: at most 64 threads per CTA (.maxntid 64, 1, 1). Each thread
// owns 4 consecutive elements at index  i = (%ctaid.x << 8) | ((%tid.x * 4) & 252),
// so with 64 threads one CTA covers a run of 256 elements.
//
// Per element, the visible code computes:
//   x    = param_0[i] (f32)  +  f32(param_1[i]) (bf16 widened to f32)
//   mean = sum(x over the CTA's 256 elements) / 256
//   var  = sum((x - mean)^2 over the same elements) / 256
//   out  = bf16( (x - mean) * rsqrt(var + 0f3727C5AC) * param_2[col] )
// where col = (%tid.x * 4) & 252 (no CTA offset), and `out` is stored as bf16
// at param_3[i].
//
// Parameters:
//   param_0 (u64)  global address, read as f32 (4 per thread, one v4.b32 load)
//   param_1 (u64)  global address, read as bf16 (4 per thread, one v2.b32 load)
//   param_2 (u64)  global address, read as f32, indexed by column only
//   param_3 (u64)  global address, written as bf16 (one v2.b32 store)
//   param_4 (u32)  declared but never loaded in this body
//   param_5 (u32)  declared but never loaded in this body
//
// NOTE(review): the arithmetic resembles a layer-norm-style normalization
// with a per-column scale and no bias; confirm against the generating Python
// source named by `.file 1`.
// ---------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4de5de(
.param .u64 triton__0d1d2d3d4de5de_param_0,
.param .u64 triton__0d1d2d3d4de5de_param_1,
.param .u64 triton__0d1d2d3d4de5de_param_2,
.param .u64 triton__0d1d2d3d4de5de_param_3,
.param .u32 triton__0d1d2d3d4de5de_param_4,
.param .u32 triton__0d1d2d3d4de5de_param_5
)
.maxntid 64, 1, 1
{
.reg .pred %p<23>;
.reg .b16 %rs<9>;
.reg .b32 %r<84>;
.reg .f32 %f<70>;
.reg .b64 %rd<12>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// ---- Load pointer parameters and derive per-thread indices ----------------
ld.param.u64 %rd5, [triton__0d1d2d3d4de5de_param_0];
ld.param.u64 %rd6, [triton__0d1d2d3d4de5de_param_1];
$L__tmp0:
.loc 1 26 26
mov.u32 %r50, %tid.x;
// %r51 = lane index within the warp (tid & 31).
and.b32 %r51, %r50, 31;
ld.param.u64 %rd7, [triton__0d1d2d3d4de5de_param_2];
ld.param.u64 %rd8, [triton__0d1d2d3d4de5de_param_3];
// %r52 = tid * 4; %r53 = that value masked to 0..252 (column of the first
// of this thread's 4 elements).
shl.b32 %r52, %r50, 2;
and.b32 %r53, %r52, 252;
.loc 1 23 28
mov.u32 %r1, %ctaid.x;
.loc 1 30 40
// %r54 = ctaid.x * 256; %r55 = flat element index of this thread's first element.
shl.b32 %r54, %r1, 8;
.loc 1 30 36
or.b32 %r55, %r54, %r53;
.loc 1 30 30
// Byte address into param_0: 4 bytes per f32 element.
mul.wide.s32 %rd9, %r55, 4;
add.s64 %rd1, %rd5, %rd9;
mov.b32 %r6, 0;
// %p1 is constant true, so every "@%p1" access below always executes and
// every "@!%p1" fallback move is never taken.
mov.pred %p1, -1;
.loc 1 30 46
// ---- Load 4 x f32 from param_0 into %f1..%f4 ------------------------------
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
mov.u32 %r4, 0x0;
mov.u32 %r5, 0x0;
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
@!%p1 mov.u32 %r2, %r6;
@!%p1 mov.u32 %r3, %r6;
@!%p1 mov.u32 %r4, %r6;
@!%p1 mov.u32 %r5, %r6;
mov.b32 %f1, %r2;
mov.b32 %f2, %r3;
mov.b32 %f3, %r4;
mov.b32 %f4, %r5;
.loc 1 31 30
// Byte address into param_1: 2 bytes per bf16 element. %rd10 is reused
// later for the output address into param_3.
mul.wide.s32 %rd10, %r55, 2;
add.s64 %rd2, %rd6, %rd10;
.loc 1 31 46
// ---- Load 4 x bf16 from param_1 (packed as two 32-bit words) --------------
mov.u32 %r10, 0x0;
mov.u32 %r11, 0x0;
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
@!%p1 mov.u32 %r10, %r6;
@!%p1 mov.u32 %r11, %r6;
// Unpack each 32-bit word: low 16 bits -> %rs1/%rs3, high 16 bits -> %rs2/%rs4.
cvt.u16.u32 %rs1, %r10;
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
cvt.u16.u32 %rs3, %r11;
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
.loc 1 31 67
// Widen the four bf16 values to f32 (%f5..%f8).
cvt.f32.bf16 %r14, %rs1;
mov.b32 %f5, %r14;
cvt.f32.bf16 %r15, %rs2;
mov.b32 %f6, %r15;
cvt.f32.bf16 %r16, %rs3;
mov.b32 %f7, %r16;
cvt.f32.bf16 %r17, %rs4;
mov.b32 %f8, %r17;
.loc 1 32 31
// Byte address into param_2 uses the column (%r53) only, so every CTA
// reads the same 256 values from param_2.
mul.wide.u32 %rd11, %r53, 4;
add.s64 %rd3, %rd7, %rd11;
.loc 1 32 36
// ---- Load 4 x f32 scale values from param_2 (L1 evict_last hint) ----------
mov.u32 %r18, 0x0;
mov.u32 %r19, 0x0;
mov.u32 %r20, 0x0;
mov.u32 %r21, 0x0;
@%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
@!%p1 mov.u32 %r18, %r6;
@!%p1 mov.u32 %r19, %r6;
@!%p1 mov.u32 %r20, %r6;
@!%p1 mov.u32 %r21, %r6;
.loc 1 34 18
// x[k] = f32(param_1 element) + param_0 element, for this thread's 4 elements.
add.f32 %f9, %f5, %f1;
add.f32 %f10, %f6, %f2;
add.f32 %f11, %f7, %f3;
add.f32 %f12, %f8, %f4;
$L__tmp1:
.loc 2 233 15
// ---- First reduction: sum of x ---------------------------------------------
// Per-thread partial sum of the 4 elements.
add.f32 %f13, %f9, %f10;
add.f32 %f14, %f13, %f11;
add.f32 %f15, %f14, %f12;
$L__tmp2:
.loc 2 243 36
// Warp-level sum via butterfly shuffles with lane offsets 16, 8, 4, 2, 1;
// afterwards every lane holds its warp's total in %f25.
mov.b32 %r56, %f15;
shfl.sync.bfly.b32 %r57, %r56, 16, 31, -1;
mov.b32 %f16, %r57;
$L__tmp3:
.loc 2 233 15
add.f32 %f17, %f15, %f16;
$L__tmp4:
.loc 2 243 36
mov.b32 %r58, %f17;
shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1;
mov.b32 %f18, %r59;
$L__tmp5:
.loc 2 233 15
add.f32 %f19, %f17, %f18;
$L__tmp6:
.loc 2 243 36
mov.b32 %r60, %f19;
shfl.sync.bfly.b32 %r61, %r60, 4, 31, -1;
mov.b32 %f20, %r61;
$L__tmp7:
.loc 2 233 15
add.f32 %f21, %f19, %f20;
$L__tmp8:
.loc 2 243 36
mov.b32 %r62, %f21;
shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1;
mov.b32 %f22, %r63;
$L__tmp9:
.loc 2 233 15
add.f32 %f23, %f21, %f22;
$L__tmp10:
.loc 2 243 36
mov.b32 %r64, %f23;
shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1;
mov.b32 %f24, %r65;
$L__tmp11:
.loc 2 233 15
add.f32 %f25, %f23, %f24;
$L__tmp12:
.loc 2 243 36
// Cross-warp step. %p14 = (lane == 0). %r67 = (tid >> 3) & 4, i.e. 0 for
// tid 0..31 and 4 for tid 32..63, giving each warp its own 4-byte slot in
// global_smem. Lane 0 of each warp writes the warp total there.
setp.eq.s32 %p14, %r51, 0;
shr.u32 %r66, %r50, 3;
and.b32 %r67, %r66, 4;
mov.u32 %r68, global_smem;
add.s32 %r26, %r68, %r67;
mov.b32 %r27, %f25;
@%p14 st.shared.b32 [ %r26 + 0 ], %r27;
bar.sync 0;
// %p15 = (tid < 2): threads 0 and 1 each read one warp total from slot tid.
// %r28 is only written for those threads; other lanes still take part in
// the shuffle, but their results are never stored (see %p16 below).
setp.lt.s32 %p15, %r50, 2;
add.s32 %r29, %r68, %r52;
@%p15 ld.shared.b32 %r28, [ %r29 + 0 ];
mov.b32 %f26, %r28;
shfl.sync.bfly.b32 %r69, %r28, 1, 31, -1;
mov.b32 %f27, %r69;
$L__tmp13:
.loc 2 233 15
add.f32 %f28, %f26, %f27;
$L__tmp14:
.loc 2 243 36
// %p16 = (tid < 2) && (tid is even): that thread writes the combined total
// to its own slot (%r29); the load below reads global_smem offset 0.
and.b32 %r70, %r50, 1;
setp.eq.b32 %p21, %r70, 1;
not.pred %p22, %p21;
and.pred %p16, %p15, %p22;
mov.b32 %r31, %f28;
@%p16 st.shared.b32 [ %r29 + 0 ], %r31;
bar.sync 0;
// Every thread reads the CTA-wide sum back from global_smem[0].
ld.shared.f32 %f29, [global_smem];
$L__tmp15:
.loc 3 8 15
// Adds +0.0 to the total (attributed by .loc to file 3, line 8).
add.f32 %f30, %f29, 0f00000000;
$L__tmp16:
.loc 1 42 20
// mean = sum / 256.0 (1132462080 == 0x43800000 == 256.0f). %r34 is reused
// as the divisor for the variance below.
mov.b32 %r33, %f30;
mov.b32 %r34, 1132462080;
div.full.f32 %r32, %r33, %r34;
mov.b32 %f31, %r32;
.loc 1 43 19
// Centered values d[k] = x[k] - mean (%f32..%f35).
sub.f32 %f32, %f9, %f31;
sub.f32 %f33, %f10, %f31;
sub.f32 %f34, %f11, %f31;
sub.f32 %f35, %f12, %f31;
.loc 1 44 20
mul.f32 %f36, %f33, %f33;
$L__tmp17:
.loc 2 243 36
// Barrier placed between the read of global_smem[0] above and the stores
// of the second reduction below, which reuse the same shared slots.
bar.sync 0;
$L__tmp18:
.loc 2 233 15
// ---- Second reduction: sum of squared deviations ---------------------------
// Per-thread partial: d0^2 + d1^2 + d2^2 + d3^2 accumulated with FMAs.
fma.rn.f32 %f37, %f32, %f32, %f36;
fma.rn.f32 %f38, %f34, %f34, %f37;
fma.rn.f32 %f39, %f35, %f35, %f38;
$L__tmp19:
.loc 2 243 36
// Same butterfly-shuffle warp sum as the first reduction (offsets 16..1).
mov.b32 %r71, %f39;
shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1;
mov.b32 %f40, %r72;
$L__tmp20:
.loc 2 233 15
add.f32 %f41, %f39, %f40;
$L__tmp21:
.loc 2 243 36
mov.b32 %r73, %f41;
shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1;
mov.b32 %f42, %r74;
$L__tmp22:
.loc 2 233 15
add.f32 %f43, %f41, %f42;
$L__tmp23:
.loc 2 243 36
mov.b32 %r75, %f43;
shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1;
mov.b32 %f44, %r76;
$L__tmp24:
.loc 2 233 15
add.f32 %f45, %f43, %f44;
$L__tmp25:
.loc 2 243 36
mov.b32 %r77, %f45;
shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1;
mov.b32 %f46, %r78;
$L__tmp26:
.loc 2 233 15
add.f32 %f47, %f45, %f46;
$L__tmp27:
.loc 2 243 36
mov.b32 %r79, %f47;
shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1;
mov.b32 %f48, %r80;
$L__tmp28:
.loc 2 233 15
add.f32 %f49, %f47, %f48;
$L__tmp29:
.loc 2 243 36
// Cross-warp step, reusing the predicates and shared addresses computed
// for the first reduction (%p14/%r26, %p15/%r29, %p16).
mov.b32 %r36, %f49;
@%p14 st.shared.b32 [ %r26 + 0 ], %r36;
bar.sync 0;
@%p15 ld.shared.b32 %r37, [ %r29 + 0 ];
mov.b32 %f50, %r37;
shfl.sync.bfly.b32 %r81, %r37, 1, 31, -1;
mov.b32 %f51, %r81;
$L__tmp30:
.loc 2 233 15
add.f32 %f52, %f50, %f51;
$L__tmp31:
.loc 2 243 36
mov.b32 %r40, %f52;
@%p16 st.shared.b32 [ %r29 + 0 ], %r40;
bar.sync 0;
ld.shared.f32 %f53, [global_smem];
$L__tmp32:
.loc 3 8 15
add.f32 %f54, %f53, 0f00000000;
$L__tmp33:
.loc 1 50 20
// var = sum_sq / 256.0 (same divisor register %r34 as the mean).
mov.b32 %r42, %f54;
div.full.f32 %r41, %r42, %r34;
mov.b32 %f55, %r41;
.loc 1 52 20
// Add the constant 0f3727C5AC (the f32 nearest to 1e-5) before the rsqrt.
add.f32 %f56, %f55, 0f3727C5AC;
.loc 1 53 26
// %f57 = approximate 1/sqrt(var + eps), flush-to-zero variant.
rsqrt.approx.ftz.f32 %f57, %f56;
.loc 1 32 36
// Reinterpret the param_2 words loaded earlier as f32:
// %f61 <- %r18, %f60 <- %r19, %f59 <- %r20, %f58 <- %r21.
mov.b32 %f58, %r21;
mov.b32 %f59, %r20;
mov.b32 %f60, %r19;
mov.b32 %f61, %r18;
.loc 1 54 20
// Normalize: d[k] * rsqrt(var + eps).
mul.f32 %f62, %f32, %f57;
mul.f32 %f63, %f33, %f57;
mul.f32 %f64, %f34, %f57;
mul.f32 %f65, %f35, %f57;
.loc 1 55 20
// Scale element k by the k-th value loaded from param_2.
mul.f32 %f66, %f62, %f61;
mul.f32 %f67, %f63, %f60;
mul.f32 %f68, %f64, %f59;
mul.f32 %f69, %f65, %f58;
.loc 1 57 25
// Output address: param_3 + 2 bytes * element index (reuses %rd10).
add.s64 %rd4, %rd8, %rd10;
.loc 1 57 48
// Round each result to bf16 (round-to-nearest-even), pack pairs into
// 32-bit words (first element of each pair in the low half), and store
// all four with one v2.b32 write.
mov.b32 %r44, %f66;
cvt.rn.bf16.f32 %rs5, %r44;
mov.b32 %r45, %f67;
cvt.rn.bf16.f32 %rs6, %r45;
mov.b32 %r46, %f68;
cvt.rn.bf16.f32 %rs7, %r46;
mov.b32 %r47, %f69;
cvt.rn.bf16.f32 %rs8, %r47;
mov.b32 %r82, {%rs5, %rs6};
mov.b32 %r83, {%rs7, %rs8};
@%p1 st.global.v2.b32 [ %rd4 + 0 ], { %r82, %r83 };
.loc 1 57 4
ret;
$L__tmp34:
$L__func_end0:
}
// .globl __nv_rsqrtf
// ---------------------------------------------------------------------------
// __nv_rsqrtf: device function returning the approximate reciprocal square
// root of its single f32 argument.
//   In:  __nv_rsqrtf_param_0  (.b32, read as f32)
//   Out: func_retval0         (.b32, written as f32)
// The work is a single rsqrt.approx.ftz.f32 instruction (approximate result,
// flush-to-zero variant).
// ---------------------------------------------------------------------------
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
    .param .b32 __nv_rsqrtf_param_0
)
{
    .reg .f32 %arg, %inv_sqrt;
$L__func_begin1:
    ld.param.f32        %arg, [__nv_rsqrtf_param_0];    // fetch the input value
    rsqrt.approx.ftz.f32 %inv_sqrt, %arg;               // 1 / sqrt(arg), approximate
    st.param.f32        [func_retval0], %inv_sqrt;      // publish the return value
    ret;
$L__func_end1:
}
.file 1 "/tmp/torchinductor_root/qh/cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
.section .debug_abbrev
{
.b8 1
.b8 17
.b8 1
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
.b8 180
.b8 66
.b8 12
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
.b8 2
.b8 46
.b8 0
.b8 135
.b8 64
.b8 8
.b8 3
.b8 8
.b8 58
.b8 11
.b8 59
.b8 11
.b8 63
.b8 12
.b8 32
.b8 11
.b8 0
.b8 0
.b8 3
.b8 46
.b8 1
.b8 17
.b8 1
.b8 18
.b8 1
.b8 64
.b8 10
.b8 49
.b8 19
.b8 0
.b8 0
.b8 4
.b8 29
.b8 1
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
.b8 5
.b8 29
.b8 0
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
.b8 0
}
.section .debug_info
{
.b32 391
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2
.b8 0
.b8 99
.b8 113
.b8 104
.b8 50
.b8 100
.b8 106
.b8 51
.b8 53
.b8 53
.b8 105
.b8 97
.b8 116
.b8 106
.b8 122
.b8 118
.b8 105
.b8 53
.b8 99
.b8 109
.b8 122
.b8 52
.b8 116
.b8 120
.b8 118
.b8 106
.b8 100
.b8 51
.b8 97
.b8 112
.b8 53
.b8 50
.b8 115
.b8 104
.b8 103
.b8 97
.b8 115
.b8 104
.b8 52
.b8 99
.b8 122
.b8 105
.b8 102
.b8 100
.b8 99
.b8 110
.b8 97
.b8 102
.b8 110
.b8 107
.b8 107
.b8 97
.b8 109
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 113
.b8 104
.b8 0
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 2
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 101
.b8 53
.b8 100
.b8 101
.b8 0
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 101
.b8 53
.b8 100
.b8 101
.b8 0
.b8 1
.b8 18
.b8 1
.b8 1
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b32 125
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp14
.b8 2
.b8 39
.b8 58
.b8 5
.b32 125
.b64 $L__tmp1
.b64 $L__tmp14
.b8 2
.b8 243
.b8 36
.b8 0
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp15
.b8 2
.b8 39
.b8 58
.b8 5
.b32 125
.b64 $L__tmp15
.b64 $L__tmp16
.b8 3
.b8 39
.b8 45
.b8 5
.b32 125
.b64 $L__tmp17
.b64 $L__tmp32
.b8 2
.b8 47
.b8 59
.b8 4
.b32 125
.b64 $L__tmp18
.b64 $L__tmp31
.b8 2
.b8 47
.b8 59
.b8 5
.b32 125
.b64 $L__tmp18
.b64 $L__tmp31
.b8 2
.b8 243
.b8 36
.b8 0
.b8 5
.b32 125
.b64 $L__tmp32
.b64 $L__tmp33
.b8 3
.b8 47
.b8 45
.b8 0
.b8 0
}
.section .debug_pubnames
{
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 395
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 101
.b8 53
.b8 100
.b8 101
.b8 0
.b32 0
$L__pubNames_end0:
}
.section .debug_pubtypes
{
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 395
.b32 0
$L__pubTypes_end0:
}
.section .debug_loc { }