// 0-hero's picture
// Add files using upload-large-folder tool
// 934a9ba verified
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4d5d6d7d8de9de
// Shared-memory byte array declared extern and unsized; its size is not fixed
// in this module. The kernel below only touches the first two 32-bit words
// (byte offsets 0 and 4) as scratch for its cross-warp reductions.
.extern .shared .align 1 .b8 global_smem[];
// NUL-terminated byte string; the 11 bytes decode as ASCII "__CUDA_FTZ".
// No instruction in the visible part of this file references it.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
// ---------------------------------------------------------------------------
// triton__0d1d2d3d4d5d6d7d8de9de -- machine-generated kernel entry point.
//
// Summary of what the instructions below do (one CTA of at most 64 threads,
// see .maxntid):
//   * Each thread owns 4 consecutive elements; one CTA covers the 256 elements
//     starting at element index ctaid.x * 256.
//   * x = f32 value loaded via param_0 + the four bf16 values (widened to f32)
//     loaded via param_1..param_4 at the same element index.
//   * The 256 x values of the CTA are summed (warp butterfly shuffles plus a
//     two-word shared-memory exchange) and divided by 256.0 -> mean.
//   * d = x - mean; sum(d*d) is reduced the same way, divided by 256.0, has
//     0f3727C5AC (about 1e-5) added, and goes through rsqrt.approx.ftz.f32.
//   * x is stored as f32 via param_6; d * rsqrt * w is rounded to bf16 and
//     stored via param_7, where w is the f32 vector loaded via param_5 at the
//     in-CTA element offset (so every CTA reads the same 256 w values).
//   NOTE(review): this resembles a bias-free layer normalization over rows of
//   256 elements fused with a 5-way add -- confirm against the generating
//   Python source named by ".file 1".
//
// Parameters:
//   param_0          u64 global address, read as f32  (one v4.b32 per thread)
//   param_1..param_4 u64 global addresses, read as bf16 (one v2.b32 per thread)
//   param_5          u64 global address, read as f32, indexed without ctaid.x
//   param_6          u64 global address, written as f32 (one v4.b32 per thread)
//   param_7          u64 global address, written as bf16 (one v2.b32 per thread)
//   param_8, param_9 u32, declared but never loaded in this body
//
// There are no bounds checks: every global access is guarded by %p1, which is
// set to constant true, so the "@!%p1 mov" fallback lines never take effect.
// ---------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
)
.maxntid 64, 1, 1
{
.reg .pred %p<33>;
.reg .b16 %rs<21>;
.reg .b32 %r<112>;
.reg .f32 %f<94>;
.reg .b64 %rd<20>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// ---- Load the pointer parameters and derive per-thread indices ------------
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
$L__tmp0:
.loc 1 26 26
// %r78 = tid.x, %r79 = lane id within the warp (tid & 31).
mov.u32 %r78, %tid.x;
and.b32 %r79, %r78, 31;
ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
// %r80 = tid * 4; %r81 = (tid * 4) & 252 = first of this thread's 4 element
// offsets inside the CTA's 256-element span.
shl.b32 %r80, %r78, 2;
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
and.b32 %r81, %r80, 252;
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
.loc 1 23 28
mov.u32 %r1, %ctaid.x;
.loc 1 30 40
// %r83 = ctaid.x * 256 + in-CTA offset = global element index of element 0.
shl.b32 %r82, %r1, 8;
.loc 1 30 36
or.b32 %r83, %r82, %r81;
.loc 1 30 30
// %rd17 = byte offset for 4-byte (f32) elements; reused for the param_6 store.
mul.wide.s32 %rd17, %r83, 4;
add.s64 %rd1, %rd9, %rd17;
// %r6 = 0 is the fallback value for masked-off loads; %p1 = true (always on).
mov.b32 %r6, 0;
mov.pred %p1, -1;
.loc 1 30 46
// ---- param_0: four f32 values -> %f1..%f4 ---------------------------------
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
mov.u32 %r4, 0x0;
mov.u32 %r5, 0x0;
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
@!%p1 mov.u32 %r2, %r6;
@!%p1 mov.u32 %r3, %r6;
@!%p1 mov.u32 %r4, %r6;
@!%p1 mov.u32 %r5, %r6;
mov.b32 %f1, %r2;
mov.b32 %f2, %r3;
mov.b32 %f3, %r4;
mov.b32 %f4, %r5;
.loc 1 31 30
// %rd18 = byte offset for 2-byte (bf16) elements; shared by param_1..param_4
// loads and the param_7 store.
mul.wide.s32 %rd18, %r83, 2;
add.s64 %rd2, %rd10, %rd18;
.loc 1 31 46
// ---- param_1: four bf16 values packed in two 32-bit words -> %f5..%f8 -----
mov.u32 %r10, 0x0;
mov.u32 %r11, 0x0;
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
@!%p1 mov.u32 %r10, %r6;
@!%p1 mov.u32 %r11, %r6;
// Unpack: cvt.u16.u32 keeps the low 16 bits, the {tmp, %rsN} move keeps the
// high 16 bits of each word.
cvt.u16.u32 %rs1, %r10;
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
cvt.u16.u32 %rs3, %r11;
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
.loc 1 31 67
cvt.f32.bf16 %r14, %rs1;
mov.b32 %f5, %r14;
cvt.f32.bf16 %r15, %rs2;
mov.b32 %f6, %r15;
cvt.f32.bf16 %r16, %rs3;
mov.b32 %f7, %r16;
cvt.f32.bf16 %r17, %rs4;
mov.b32 %f8, %r17;
.loc 1 32 30
add.s64 %rd3, %rd11, %rd18;
.loc 1 32 46
// ---- param_2: four bf16 values -> %f9..%f12 -------------------------------
mov.u32 %r18, 0x0;
mov.u32 %r19, 0x0;
@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
@!%p1 mov.u32 %r18, %r6;
@!%p1 mov.u32 %r19, %r6;
cvt.u16.u32 %rs5, %r18;
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
cvt.u16.u32 %rs7, %r19;
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
.loc 1 32 67
cvt.f32.bf16 %r22, %rs5;
mov.b32 %f9, %r22;
cvt.f32.bf16 %r23, %rs6;
mov.b32 %f10, %r23;
cvt.f32.bf16 %r24, %rs7;
mov.b32 %f11, %r24;
cvt.f32.bf16 %r25, %rs8;
mov.b32 %f12, %r25;
.loc 1 33 30
add.s64 %rd4, %rd12, %rd18;
.loc 1 33 46
// ---- param_3: four bf16 values -> %f13..%f16 ------------------------------
mov.u32 %r26, 0x0;
mov.u32 %r27, 0x0;
@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
@!%p1 mov.u32 %r26, %r6;
@!%p1 mov.u32 %r27, %r6;
cvt.u16.u32 %rs9, %r26;
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
cvt.u16.u32 %rs11, %r27;
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
.loc 1 33 67
cvt.f32.bf16 %r30, %rs9;
mov.b32 %f13, %r30;
cvt.f32.bf16 %r31, %rs10;
mov.b32 %f14, %r31;
cvt.f32.bf16 %r32, %rs11;
mov.b32 %f15, %r32;
cvt.f32.bf16 %r33, %rs12;
mov.b32 %f16, %r33;
.loc 1 34 31
add.s64 %rd5, %rd13, %rd18;
.loc 1 34 47
// ---- param_4: four bf16 values -> %f17..%f20 ------------------------------
mov.u32 %r34, 0x0;
mov.u32 %r35, 0x0;
@%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ];
@!%p1 mov.u32 %r34, %r6;
@!%p1 mov.u32 %r35, %r6;
cvt.u16.u32 %rs13, %r34;
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; }
cvt.u16.u32 %rs15, %r35;
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; }
.loc 1 34 68
cvt.f32.bf16 %r38, %rs13;
mov.b32 %f17, %r38;
cvt.f32.bf16 %r39, %rs14;
mov.b32 %f18, %r39;
cvt.f32.bf16 %r40, %rs15;
mov.b32 %f19, %r40;
cvt.f32.bf16 %r41, %rs16;
mov.b32 %f20, %r41;
.loc 1 35 31
// ---- param_5: four f32 scale values, indexed by the in-CTA offset %r81 only
// (ctaid.x does not enter the address). Kept in %r42..%r45 until the end.
mul.wide.u32 %rd19, %r81, 4;
add.s64 %rd6, %rd14, %rd19;
.loc 1 35 36
mov.u32 %r42, 0x0;
mov.u32 %r43, 0x0;
mov.u32 %r44, 0x0;
mov.u32 %r45, 0x0;
@%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ];
@!%p1 mov.u32 %r42, %r6;
@!%p1 mov.u32 %r43, %r6;
@!%p1 mov.u32 %r44, %r6;
@!%p1 mov.u32 %r45, %r6;
.loc 1 37 18
// ---- Element-wise 5-way sum. The four chains are interleaved; the final
// per-element sums are x0 = %f34, x1 = %f29, x2 = %f30, x3 = %f38.
// Step 1: param_1 + param_0 (elements 0..2; element 3 is %f32 below).
add.f32 %f21, %f5, %f1;
add.f32 %f22, %f6, %f2;
add.f32 %f23, %f7, %f3;
.loc 1 39 18
// Step 2: + param_2 (elements 0..2).
add.f32 %f24, %f21, %f9;
add.f32 %f25, %f22, %f10;
add.f32 %f26, %f23, %f11;
.loc 1 41 18
// Step 3: + param_3 (elements 1, 2).
add.f32 %f27, %f25, %f14;
add.f32 %f28, %f26, %f15;
.loc 1 43 19
// Step 4: + param_4 (elements 1, 2) -> x1, x2.
add.f32 %f29, %f27, %f18;
add.f32 %f30, %f28, %f19;
.loc 1 41 18
// Element 0 step 3, and element 3 step 1.
add.f32 %f31, %f24, %f13;
add.f32 %f32, %f8, %f4;
.loc 1 43 19
// Element 3 step 2, and element 0 step 4 -> x0.
add.f32 %f33, %f32, %f12;
add.f32 %f34, %f31, %f17;
$L__tmp1:
.loc 2 233 15
// ---- First reduction: sum of x over the CTA -------------------------------
// Thread-local partial sum %f39 = ((x0 + x1) + x2) + x3; element 3 finishes
// its remaining two adds here (-> x3 = %f38, also staged in %r71 for the
// param_6 store).
add.f32 %f35, %f34, %f29;
add.f32 %f36, %f33, %f16;
add.f32 %f37, %f35, %f30;
add.f32 %f38, %f36, %f20;
mov.b32 %r71, %f38;
add.f32 %f39, %f37, %f38;
$L__tmp2:
.loc 2 243 36
// Warp-level butterfly: exchange with lane^16, ^8, ^4, ^2, ^1 and add, so
// every lane ends with its warp's total in %f49.
mov.b32 %r84, %f39;
shfl.sync.bfly.b32 %r85, %r84, 16, 31, -1;
mov.b32 %f40, %r85;
$L__tmp3:
.loc 2 233 15
add.f32 %f41, %f39, %f40;
$L__tmp4:
.loc 2 243 36
mov.b32 %r86, %f41;
shfl.sync.bfly.b32 %r87, %r86, 8, 31, -1;
mov.b32 %f42, %r87;
$L__tmp5:
.loc 2 233 15
add.f32 %f43, %f41, %f42;
$L__tmp6:
.loc 2 243 36
mov.b32 %r88, %f43;
shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1;
mov.b32 %f44, %r89;
$L__tmp7:
.loc 2 233 15
add.f32 %f45, %f43, %f44;
$L__tmp8:
.loc 2 243 36
mov.b32 %r90, %f45;
shfl.sync.bfly.b32 %r91, %r90, 2, 31, -1;
mov.b32 %f46, %r91;
$L__tmp9:
.loc 2 233 15
add.f32 %f47, %f45, %f46;
$L__tmp10:
.loc 2 243 36
mov.b32 %r92, %f47;
shfl.sync.bfly.b32 %r93, %r92, 1, 31, -1;
mov.b32 %f48, %r93;
$L__tmp11:
.loc 2 233 15
add.f32 %f49, %f47, %f48;
$L__tmp12:
.loc 2 243 36
// Cross-warp step. %p23 = (lane == 0). %r95 = (tid >> 3) & 4 is byte offset 0
// for tid 0..31 and 4 for tid 32..63, so lane 0 of each warp writes its warp
// total to its own 32-bit slot at %r50.
setp.eq.s32 %p23, %r79, 0;
shr.u32 %r94, %r78, 3;
and.b32 %r95, %r94, 4;
mov.u32 %r96, global_smem;
add.s32 %r50, %r96, %r95;
mov.b32 %r51, %f49;
@%p23 st.shared.b32 [ %r50 + 0 ], %r51;
bar.sync 0;
// %p24 = (tid < 2): only threads 0 and 1 read back a slot (address
// global_smem + tid*4). %r52 is left unwritten in every other thread; those
// threads' sums are never stored because the store below is predicated.
setp.lt.s32 %p24, %r78, 2;
add.s32 %r53, %r96, %r80;
@%p24 ld.shared.b32 %r52, [ %r53 + 0 ];
mov.b32 %f50, %r52;
shfl.sync.bfly.b32 %r97, %r52, 1, 31, -1;
mov.b32 %f51, %r97;
$L__tmp13:
.loc 2 233 15
add.f32 %f52, %f50, %f51;
$L__tmp14:
.loc 2 243 36
// %p25 = (tid < 2) && (tid is even), i.e. only thread 0 writes the CTA total
// back to global_smem[0].
and.b32 %r98, %r78, 1;
setp.eq.b32 %p31, %r98, 1;
not.pred %p32, %p31;
and.pred %p25, %p24, %p32;
mov.b32 %r55, %f52;
@%p25 st.shared.b32 [ %r53 + 0 ], %r55;
bar.sync 0;
// Every thread reads the CTA-wide sum.
ld.shared.f32 %f53, [global_smem];
$L__tmp15:
.loc 3 8 15
// Adds +0.0 to the sum (attributed to source location file 3, line 8).
add.f32 %f54, %f53, 0f00000000;
$L__tmp16:
.loc 1 51 20
// mean = sum / 256.0 (1132462080 == 0x43800000 == 256.0f). %r58 is reused as
// the divisor for the variance below.
mov.b32 %r57, %f54;
mov.b32 %r58, 1132462080;
div.full.f32 %r56, %r57, %r58;
mov.b32 %f55, %r56;
.loc 1 52 20
// Centered values d0..d3 = x - mean.
sub.f32 %f56, %f34, %f55;
sub.f32 %f57, %f29, %f55;
sub.f32 %f58, %f30, %f55;
sub.f32 %f59, %f38, %f55;
.loc 1 53 20
mul.f32 %f60, %f57, %f57;
$L__tmp17:
.loc 2 243 36
// Barrier before the second reduction; presumably this ensures all threads
// have read global_smem[0] before the scratch words are overwritten.
bar.sync 0;
$L__tmp18:
.loc 2 233 15
// ---- Second reduction: sum of d*d over the CTA ----------------------------
// Thread-local partial %f63 = d3*d3 + (d2*d2 + (d0*d0 + d1*d1)).
fma.rn.f32 %f61, %f56, %f56, %f60;
fma.rn.f32 %f62, %f58, %f58, %f61;
fma.rn.f32 %f63, %f59, %f59, %f62;
$L__tmp19:
.loc 2 243 36
// Same warp butterfly as above (offsets 16, 8, 4, 2, 1) -> %f73.
mov.b32 %r99, %f63;
shfl.sync.bfly.b32 %r100, %r99, 16, 31, -1;
mov.b32 %f64, %r100;
$L__tmp20:
.loc 2 233 15
add.f32 %f65, %f63, %f64;
$L__tmp21:
.loc 2 243 36
mov.b32 %r101, %f65;
shfl.sync.bfly.b32 %r102, %r101, 8, 31, -1;
mov.b32 %f66, %r102;
$L__tmp22:
.loc 2 233 15
add.f32 %f67, %f65, %f66;
$L__tmp23:
.loc 2 243 36
mov.b32 %r103, %f67;
shfl.sync.bfly.b32 %r104, %r103, 4, 31, -1;
mov.b32 %f68, %r104;
$L__tmp24:
.loc 2 233 15
add.f32 %f69, %f67, %f68;
$L__tmp25:
.loc 2 243 36
mov.b32 %r105, %f69;
shfl.sync.bfly.b32 %r106, %r105, 2, 31, -1;
mov.b32 %f70, %r106;
$L__tmp26:
.loc 2 233 15
add.f32 %f71, %f69, %f70;
$L__tmp27:
.loc 2 243 36
mov.b32 %r107, %f71;
shfl.sync.bfly.b32 %r108, %r107, 1, 31, -1;
mov.b32 %f72, %r108;
$L__tmp28:
.loc 2 233 15
add.f32 %f73, %f71, %f72;
$L__tmp29:
.loc 2 243 36
// Same cross-warp exchange as above, reusing %p23/%p24/%p25 and the slot
// addresses %r50/%r53.
mov.b32 %r60, %f73;
@%p23 st.shared.b32 [ %r50 + 0 ], %r60;
bar.sync 0;
@%p24 ld.shared.b32 %r61, [ %r53 + 0 ];
mov.b32 %f74, %r61;
shfl.sync.bfly.b32 %r109, %r61, 1, 31, -1;
mov.b32 %f75, %r109;
$L__tmp30:
.loc 2 233 15
add.f32 %f76, %f74, %f75;
$L__tmp31:
.loc 2 243 36
mov.b32 %r64, %f76;
@%p25 st.shared.b32 [ %r53 + 0 ], %r64;
bar.sync 0;
ld.shared.f32 %f77, [global_smem];
$L__tmp32:
.loc 3 8 15
add.f32 %f78, %f77, 0f00000000;
$L__tmp33:
.loc 1 59 20
// variance-like term = sum(d*d) / 256.0
mov.b32 %r66, %f78;
div.full.f32 %r65, %r66, %r58;
mov.b32 %f79, %r65;
.loc 1 61 20
// + 0f3727C5AC (approximately 1e-5)
add.f32 %f80, %f79, 0f3727C5AC;
.loc 1 62 26
// %f81 = approximate 1/sqrt(...), flush-to-zero variant.
rsqrt.approx.ftz.f32 %f81, %f80;
.loc 1 35 36
// Reinterpret the param_5 words as f32: %f85 = w0, %f84 = w1, %f83 = w2,
// %f82 = w3.
mov.b32 %f82, %r45;
mov.b32 %f83, %r44;
mov.b32 %f84, %r43;
mov.b32 %f85, %r42;
.loc 1 63 20
// d * rsqrt
mul.f32 %f86, %f56, %f81;
mul.f32 %f87, %f57, %f81;
mul.f32 %f88, %f58, %f81;
mul.f32 %f89, %f59, %f81;
.loc 1 64 20
// ... * w (element k is paired with w_k)
mul.f32 %f90, %f86, %f85;
mul.f32 %f91, %f87, %f84;
mul.f32 %f92, %f88, %f83;
mul.f32 %f93, %f89, %f82;
.loc 1 66 25
// ---- Store x0..x3 as f32 via param_6 (same byte offset as the param_0 load)
add.s64 %rd7, %rd15, %rd17;
.loc 1 66 48
mov.b32 %r68, %f34;
mov.b32 %r69, %f29;
mov.b32 %r70, %f30;
@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 };
.loc 1 67 25
// ---- Round the scaled results to bf16, pack pairs (first element in the low
// half) and store via param_7.
add.s64 %rd8, %rd16, %rd18;
.loc 1 67 48
mov.b32 %r72, %f90;
cvt.rn.bf16.f32 %rs17, %r72;
mov.b32 %r73, %f91;
cvt.rn.bf16.f32 %rs18, %r73;
mov.b32 %r74, %f92;
cvt.rn.bf16.f32 %rs19, %r74;
mov.b32 %r75, %f93;
cvt.rn.bf16.f32 %rs20, %r75;
mov.b32 %r110, {%rs17, %rs18};
mov.b32 %r111, {%rs19, %rs20};
@%p1 st.global.v2.b32 [ %rd8 + 0 ], { %r110, %r111 };
.loc 1 67 4
ret;
$L__tmp34:
$L__func_end0:
}
// .globl __nv_rsqrtf
// __nv_rsqrtf: device function wrapping a single rsqrt.approx.ftz.f32.
//   in : __nv_rsqrtf_param_0  (.b32 parameter, loaded as f32)
//   out: func_retval0         (.b32 return slot, stored as f32)
// The result is the hardware approximation of 1/sqrt(input) with
// flush-to-zero, exactly as produced by the one instruction below.
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
.param .b32 __nv_rsqrtf_param_0
)
{
.reg .f32 %val, %rsq;
$L__func_begin1:
// Fetch the argument, take the approximate reciprocal square root, return it.
ld.param.f32 %val, [__nv_rsqrtf_param_0];
rsqrt.approx.ftz.f32 %rsq, %val;
st.param.f32 [func_retval0+0], %rsq;
ret;
$L__func_end1:
}
.file 1 "/tmp/torchinductor_root/jb/cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
// DWARF abbreviation table used by .debug_info. Each entry is:
//   abbrev code, tag, has-children flag, then (attribute, form) pairs,
//   terminated by a 0,0 pair. Names below follow the DWARF v2 numeric
//   encodings (plus two vendor extensions) -- verify against the DWARF
//   specification if editing.
.section .debug_abbrev
{
// Abbrev 1: tag 17 (DW_TAG_compile_unit), has children.
.b8 1
.b8 17
.b8 1
// 37/8: DW_AT_producer, inline string
.b8 37
.b8 8
// 19/5: DW_AT_language, 2-byte constant
.b8 19
.b8 5
// 3/8: DW_AT_name, inline string
.b8 3
.b8 8
// 16/6: DW_AT_stmt_list, 4-byte constant
.b8 16
.b8 6
// 27/8: DW_AT_comp_dir, inline string
.b8 27
.b8 8
// 180,66 is the ULEB128 value 0x2134 (GNU extension DW_AT_GNU_pubnames); form 12 = flag
.b8 180
.b8 66
.b8 12
// 17/1: DW_AT_low_pc, address
.b8 17
.b8 1
// 18/1: DW_AT_high_pc, address
.b8 18
.b8 1
.b8 0
.b8 0
// Abbrev 2: tag 46 (DW_TAG_subprogram), no children.
.b8 2
.b8 46
.b8 0
// 135,64 is the ULEB128 value 0x2007 (DW_AT_MIPS_linkage_name); form 8 = inline string
.b8 135
.b8 64
.b8 8
// 3/8: DW_AT_name, inline string
.b8 3
.b8 8
// 58/11: DW_AT_decl_file, 1-byte constant
.b8 58
.b8 11
// 59/11: DW_AT_decl_line, 1-byte constant
.b8 59
.b8 11
// 63/12: DW_AT_external, flag
.b8 63
.b8 12
// 32/11: DW_AT_inline, 1-byte constant
.b8 32
.b8 11
.b8 0
.b8 0
// Abbrev 3: tag 46 (DW_TAG_subprogram), has children.
.b8 3
.b8 46
.b8 1
// 17/1, 18/1: DW_AT_low_pc / DW_AT_high_pc, addresses
.b8 17
.b8 1
.b8 18
.b8 1
// 64/10: DW_AT_frame_base, 1-byte-length block
.b8 64
.b8 10
// 49/19: DW_AT_abstract_origin, 4-byte unit-relative reference
.b8 49
.b8 19
.b8 0
.b8 0
// Abbrev 4: tag 29 (DW_TAG_inlined_subroutine), has children.
.b8 4
.b8 29
.b8 1
// 49/19: DW_AT_abstract_origin, 4-byte reference
.b8 49
.b8 19
// 17/1, 18/1: DW_AT_low_pc / DW_AT_high_pc, addresses
.b8 17
.b8 1
.b8 18
.b8 1
// 88/11, 89/11, 87/11: DW_AT_call_file / call_line / call_column, 1 byte each
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
// Abbrev 5: same as abbrev 4 but without children.
.b8 5
.b8 29
.b8 0
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
// End of abbreviation table.
.b8 0
}
// DWARF compilation unit describing the kernel. Entries are encoded with the
// abbreviation codes from .debug_abbrev; field meanings below follow that
// table and DWARF v2 layout -- verify against the DWARF specification if
// editing. Any byte-count change must keep the length (407), the pubnames /
// pubtypes size (411 = 407 + 4) and the DIE reference 125 consistent.
.section .debug_info
{
// Unit header: length of the unit after this field, version 2, abbrev table
// reference, address size 8.
.b32 407
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// --- Abbrev 1: compile unit ---
.b8 1
// Producer string: "triton"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// Language code (2-byte little-endian value 2)
.b8 2
.b8 0
// Name string: "cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py"
.b8 99
.b8 106
.b8 98
.b8 110
.b8 113
.b8 103
.b8 53
.b8 117
.b8 52
.b8 115
.b8 106
.b8 55
.b8 97
.b8 52
.b8 120
.b8 115
.b8 116
.b8 106
.b8 101
.b8 114
.b8 51
.b8 97
.b8 54
.b8 116
.b8 100
.b8 103
.b8 110
.b8 110
.b8 105
.b8 103
.b8 98
.b8 50
.b8 105
.b8 121
.b8 109
.b8 100
.b8 50
.b8 55
.b8 103
.b8 99
.b8 115
.b8 54
.b8 111
.b8 55
.b8 111
.b8 100
.b8 117
.b8 104
.b8 120
.b8 121
.b8 50
.b8 118
.b8 46
.b8 112
.b8 121
.b8 0
// Line-table reference
.b32 .debug_line
// Compilation directory string: "/tmp/torchinductor_root/jb"
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 106
.b8 98
.b8 0
// Flag attribute = 1, then low/high PC covering the whole kernel body.
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
// --- Abbrev 2: subprogram description; this entry starts at unit offset 125,
// which is the value used by every ".b32 125" reference below. ---
.b8 2
// Linkage name string: "triton__0d1d2d3d4d5d6d7d8de9de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 56
.b8 100
.b8 101
.b8 57
.b8 100
.b8 101
.b8 0
// Name string: "triton__0d1d2d3d4d5d6d7d8de9de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 56
.b8 100
.b8 101
.b8 57
.b8 100
.b8 101
.b8 0
// decl_file = 1, decl_line = 18, external = 1, inline = 1
.b8 1
.b8 18
.b8 1
.b8 1
// --- Abbrev 3: concrete subprogram spanning $L__func_begin0..$L__func_end0,
// 1-byte frame-base expression (156), origin = entry at offset 125 ---
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b32 125
// --- Abbrev 4: inlined range $L__tmp1..$L__tmp14, call site file 2, line 48,
// column 59 (has one child) ---
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp14
.b8 2
.b8 48
.b8 59
// Child (abbrev 5): $L__tmp1..$L__tmp14, call site file 2, line 243, column 36
.b8 5
.b32 125
.b64 $L__tmp1
.b64 $L__tmp14
.b8 2
.b8 243
.b8 36
// End of abbrev-4 children
.b8 0
// Abbrev 5: $L__tmp2..$L__tmp15, call site file 2, line 48, column 59
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp15
.b8 2
.b8 48
.b8 59
// Abbrev 5: $L__tmp15..$L__tmp16, call site file 3, line 48, column 45
.b8 5
.b32 125
.b64 $L__tmp15
.b64 $L__tmp16
.b8 3
.b8 48
.b8 45
// Abbrev 5: $L__tmp17..$L__tmp32, call site file 2, line 56, column 59
.b8 5
.b32 125
.b64 $L__tmp17
.b64 $L__tmp32
.b8 2
.b8 56
.b8 59
// Abbrev 4: $L__tmp18..$L__tmp31, call site file 2, line 56, column 59
// (has one child)
.b8 4
.b32 125
.b64 $L__tmp18
.b64 $L__tmp31
.b8 2
.b8 56
.b8 59
// Child (abbrev 5): $L__tmp18..$L__tmp31, call site file 2, line 243, column 36
.b8 5
.b32 125
.b64 $L__tmp18
.b64 $L__tmp31
.b8 2
.b8 243
.b8 36
// End of abbrev-4 children
.b8 0
// Abbrev 5: $L__tmp32..$L__tmp33, call site file 3, line 56, column 45
.b8 5
.b32 125
.b64 $L__tmp32
.b64 $L__tmp33
.b8 3
.b8 56
.b8 45
// End of abbrev-3 children, then end of compile-unit children.
.b8 0
.b8 0
}
// Public-names lookup table: one entry mapping the kernel name to its
// description entry at offset 125 inside .debug_info.
.section .debug_pubnames
{
// Table length, computed from the two labels below.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// Version 2 (2-byte little-endian).
.b8 2
.b8 0
// Reference to the described .debug_info unit and that unit's total size
// (411 = the unit's length field 407 plus the 4-byte length field itself).
.b32 .debug_info
.b32 411
// Entry: offset 125 within the unit, followed by the NUL-terminated name
// "triton__0d1d2d3d4d5d6d7d8de9de".
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 56
.b8 100
.b8 101
.b8 57
.b8 100
.b8 101
.b8 0
// Zero offset terminates the entry list.
.b32 0
$L__pubNames_end0:
}
// Public-types lookup table: header only, with no entries before the
// terminating zero.
.section .debug_pubtypes
{
// Table length, computed from the two labels below.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// Version 2 (2-byte little-endian).
.b8 2
.b8 0
// Reference to the described .debug_info unit and that unit's total size.
.b32 .debug_info
.b32 411
// Zero offset terminates the (empty) entry list.
.b32 0
$L__pubTypes_end0:
}
.section .debug_loc { }