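// NOTE: the following six lines are residue from the file-hosting web page, not PTX.
// They are kept verbatim as comments so that the file remains assemblable.
// 0-hero's picture
// Add files using upload-large-folder tool
// d742687 verified
// raw
// history blame
// 13.8 kB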
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4d5de6de
// External device function called by the kernel below when an index check fails.
// Arguments, as the two call sites in this file pass them:
//   param_0 (.b64): generic address of an assertMessage_* byte array
//   param_1 (.b64): generic address of an assertFile_* byte array
//   param_2 (.b32): an integer; both call sites pass 883 (presumably a source line number -- confirm)
//   param_3 (.b64): generic address of an assertFunc_* byte array
//   param_4 (.b64): an integer; both call sites pass 1 (presumably the character size -- confirm
//                   against the CUDA device runtime declaration)
// No return value is declared.
.extern .func __assertfail
(
.param .b64 __assertfail_param_0,
.param .b64 __assertfail_param_1,
.param .b32 __assertfail_param_2,
.param .b64 __assertfail_param_3,
.param .b64 __assertfail_param_4
)
;
// String operands for the two __assertfail call sites in the kernel below.
// Each array holds ASCII bytes followed by a terminating 0 byte. The addresses
// are handed to __assertfail as string pointers, so a terminator is needed for
// the callee to find the end of each string; the array lengths therefore equal
// the character count plus one.
//
// assertFunc_1 = "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_1[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// assertFile_1 = "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_1[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// assertMessage_1 = "index out of bounds: 0 <= tmp13 < 50257"
.global .align 1 .b8 assertMessage_1[40] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55, 0};
// assertFunc_0 = "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_0[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// assertFile_0 = "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_0[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// assertMessage_0 = "index out of bounds: 0 <= tmp3 < 50257"
.global .align 1 .b8 assertMessage_0[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55, 0};
// _$_str = "__CUDA_FTZ" (already 0-terminated); not referenced by the code in this file.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
// Kernel entry point triton__0d1d2d3d4d5de6de (at most 256 threads per CTA, see .maxntid).
//
// Parameters, as used by the body:
//   param_0 (.u64): base of an array of 64-bit indices, read with an 8-byte stride, one per row.
//   param_1 (.u64): base of an f32 array read at element (index * 256 + column); a negative
//                   index is first offset by 50257 rows.
//   param_2 (.u64): base of an f32 array read at element ((row rem 512) * 256 + column).
//   param_3 (.u64): base of an f32 array read at element (column).
//   param_4 (.u64): base of a 16-bit array written (bf16) at element (row * 256 + column).
//   param_5, param_6 (.u32): declared but never loaded in this body.
//
// Thread mapping: row = ctaid.x * 64 + tid.x[7:2]; the low two bits of tid.x select a column
// lane, and each thread visits columns lane, lane + 4, ... for 64 iterations (256 columns).
//
// Outline:
//   1. Pass 1 accumulates a running mean / M2 / count of (param_2 value + param_1 value)
//      over the thread's columns.
//   2. Two butterfly shuffles (lane offsets 2 and 1) merge the partial statistics.
//   3. rstd = rsqrt(M2 / 256 + 0f3727C5AC).
//   4. Pass 2 re-reads the inputs and stores ((a + b) - mean) * rstd * param_3[column] as bf16.
// If the wrapped index is outside [0, 50257) the code calls __assertfail.
// NOTE(review): this looks like a fused embedding-lookup + layer-norm; confirm against the
// generating Python file named in the .file directives.
.visible .entry triton__0d1d2d3d4d5de6de(
.param .u64 triton__0d1d2d3d4d5de6de_param_0,
.param .u64 triton__0d1d2d3d4d5de6de_param_1,
.param .u64 triton__0d1d2d3d4d5de6de_param_2,
.param .u64 triton__0d1d2d3d4d5de6de_param_3,
.param .u64 triton__0d1d2d3d4d5de6de_param_4,
.param .u32 triton__0d1d2d3d4d5de6de_param_5,
.param .u32 triton__0d1d2d3d4d5de6de_param_6
)
.maxntid 256, 1, 1
{
// Virtual register banks used by the body.
.reg .pred %p<27>;
.reg .b16 %rs<3>;
.reg .b32 %r<81>;
.reg .f32 %f<73>;
.reg .b64 %rd<84>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// Load the base pointers for param_3, param_2, param_1 and param_0 (param_4 is loaded later).
ld.param.u64 %rd35, [triton__0d1d2d3d4d5de6de_param_3];
ld.param.u64 %rd34, [triton__0d1d2d3d4d5de6de_param_2];
ld.param.u64 %rd33, [triton__0d1d2d3d4d5de6de_param_1];
ld.param.u64 %rd41, [triton__0d1d2d3d4d5de6de_param_0];
$L__tmp0:
.loc 1 22 44
// %r2 = tid.x bits [7:2]; %r14 = tid.x & 63. Both are row offsets within the CTA,
// used for two different thread-to-row assignments.
mov.u32 %r1, %tid.x;
bfe.u32 %r2, %r1, 2, 6;
and.b32 %r14, %r1, 63;
.loc 1 24 33
// %r3 = tid.x & 3: the column lane of this thread.
and.b32 %r3, %r1, 3;
.loc 1 21 28
mov.u32 %r13, %ctaid.x;
.loc 1 21 33
// %r15 = ctaid.x * 64: first row handled by this CTA.
shl.b32 %r15, %r13, 6;
.loc 1 22 23
// %r16 / %r17 = global row number under each of the two row assignments.
or.b32 %r16, %r15, %r2;
or.b32 %r17, %r15, %r14;
.loc 1 26 30
// Addresses of the 64-bit index for row %r16 (%rd38) and row %r17 (%rd40).
mul.wide.s32 %rd42, %r16, 8;
add.s64 %rd38, %rd41, %rd42;
mul.wide.s32 %rd43, %r17, 8;
add.s64 %rd40, %rd41, %rd43;
// %p11 is constant true: every "@%p11" load/store below always executes and every
// "@!%p11" fallback move never does.
mov.pred %p11, -1;
.loc 1 26 35
// %rd37 = index for row %r16; %rd39 = index for row %r17.
mov.u64 %rd37, 0x0;
@%p11 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd38 + 0 ];
mov.u64 %rd39, 0x0;
@%p11 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ];
.loc 1 27 18
// %r22 = %r16 minus (%r16, biased by 511 when ctaid.x bit 25 is set, masked with
// 16776704 = 0xFFFE00), i.e. a remainder of the row number by 512.
bfe.s32 %r18, %r13, 25, 1;
shr.u32 %r19, %r18, 23;
add.s32 %r20, %r16, %r19;
and.b32 %r21, %r20, 16776704;
sub.s32 %r22, %r16, %r21;
.loc 1 35 44
// %r5 = %r22 * 256: element offset of the row inside the param_2 array.
shl.b32 %r5, %r22, 8;
.loc 1 36 22
// Wrap negative indices by adding 50257.
add.s64 %rd44, %rd39, 50257;
.loc 1 37 22
setp.lt.s64 %p3, %rd37, 0;
setp.lt.s64 %p4, %rd39, 0;
.loc 1 38 36
// %rd45 = wrapped copy of %rd39, used only for the range checks.
selp.b64 %rd45, %rd44, %rd39, %p4;
.loc 1 39 40
// %p5 = wrapped index is out of range (unsigned compare also catches still-negative values).
setp.gt.u64 %p5, %rd45, 50256;
.loc 1 40 44
// %rd2 = wrapped copy of %rd37 times 256 (12865792 = 50257 * 256): element offset of the
// gathered row inside the param_1 array.
// Note: the range check is based on %rd39 while the gather address is based on %rd37.
shl.b64 %rd46, %rd37, 8;
add.s64 %rd47, %rd46, 12865792;
selp.b64 %rd2, %rd47, %rd46, %p3;
// %r67 = 0: fallback value for the (never taken) predicated-off load paths.
mov.b32 %r67, 0;
// %r77 = 883 and %rd73 = 1: third and fifth arguments of both __assertfail calls.
mov.b32 %r77, 883;
mov.u64 %rd73, 1;
.loc 1 39 55
// Out-of-range index -> pass-1 variant that calls __assertfail; otherwise the plain variant.
@%p5 bra $L__BB0_3;
bra.uni $L__BB0_1;
$L__BB0_3:
.loc 1 31 36
// Pass 1 (assert variant) set-up.
// %rd80 = column-lane byte offset; %rd79 = byte offset into param_1; %rd75 = param_1 cursor.
shl.b64 %rd51, %rd2, 2;
mul.wide.u32 %rd80, %r3, 4;
add.s64 %rd79, %rd51, %rd80;
add.s64 %rd75, %rd33, %rd79;
// %rd78 = byte offset into param_2; %rd74 = param_2 cursor.
add.s32 %r35, %r5, %r3;
mul.wide.s32 %rd78, %r35, 4;
add.s64 %rd74, %rd34, %rd78;
// Running statistics: %f72 = count, %f71 = M2 (sum of squared deviations), %f70 = mean.
mov.f32 %f72, 0f00000000;
// %r78 = loop counter, starts at -4 so that the first increment yields 0.
mov.b32 %r78, -4;
mov.f32 %f71, %f72;
mov.f32 %f70, %f72;
$L__BB0_4:
.loc 1 35 50
// %f28 = param_2 value for the current column.
mov.u32 %r36, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r36 }, [ %rd74 + 0 ];
@!%p11 mov.u32 %r36, %r67;
mov.b32 %f28, %r36;
.loc 1 39 55
// Call __assertfail(assertMessage_0, assertFile_0, 883, assertFunc_0, 1).
// The call sits inside the loop body, so it is issued on every iteration of this path.
mov.u64 %rd54, assertMessage_0;
cvta.global.u64 %rd55, %rd54;
mov.u64 %rd56, assertFile_0;
cvta.global.u64 %rd57, %rd56;
mov.u64 %rd58, assertFunc_0;
cvta.global.u64 %rd59, %rd58;
{ // callseq 10, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd55;
.param .b64 param1;
st.param.b64 [param1+0], %rd57;
.param .b32 param2;
st.param.b32 [param2+0], %r77;
.param .b64 param3;
st.param.b64 [param3+0], %rd59;
.param .b64 param4;
st.param.b64 [param4+0], %rd73;
call.uni
__assertfail,
(
param0,
param1,
param2,
param3,
param4
);
} // callseq 10
.loc 1 40 52
// %f29 = param_1 value for the current column of the gathered row.
mov.u32 %r38, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r38 }, [ %rd75 + 0 ];
@!%p11 mov.u32 %r38, %r67;
mov.b32 %f29, %r38;
.loc 1 41 22
// x = param_2 value + param_1 value.
add.f32 %f30, %f28, %f29;
$L__tmp1:
.loc 2 96 20
// delta = x - mean
sub.f32 %f31, %f30, %f70;
.loc 2 97 26
// count += 1.0
add.f32 %f72, %f72, 0f3F800000;
.loc 2 98 30
// mean += delta / count
mov.b32 %r41, %f31;
mov.b32 %r42, %f72;
div.full.f32 %r40, %r41, %r42;
mov.b32 %f32, %r40;
.loc 2 98 22
add.f32 %f70, %f70, %f32;
.loc 2 101 30
// delta2 = x - updated mean
sub.f32 %f33, %f30, %f70;
$L__tmp2:
.loc 1 47 48
// M2 += delta * delta2
fma.rn.f32 %f71, %f31, %f33, %f71;
.loc 1 31 36
// Advance by 4 columns (16 bytes); loop while the counter is below 252, i.e. 64 iterations.
add.s32 %r78, %r78, 4;
add.s64 %rd75, %rd75, 16;
add.s64 %rd74, %rd74, 16;
setp.lt.u32 %p15, %r78, 252;
@%p15 bra $L__BB0_4;
bra.uni $L__BB0_5;
$L__BB0_1:
.loc 1 0 36
// Pass 1 (plain variant) set-up: same addressing and statistics as above, without the
// __assertfail call. %r79 = loop counter, %rd77 = param_1 cursor, %rd76 = param_2 cursor.
mov.b32 %r79, -4;
.loc 1 31 36
shl.b64 %rd48, %rd2, 2;
mul.wide.u32 %rd80, %r3, 4;
add.s64 %rd79, %rd48, %rd80;
add.s64 %rd77, %rd33, %rd79;
add.s32 %r25, %r5, %r3;
mul.wide.s32 %rd78, %r25, 4;
add.s64 %rd76, %rd34, %rd78;
// %f72 = count, %f71 = M2, %f70 = mean, all starting at 0.
mov.f32 %f72, 0f00000000;
mov.f32 %f71, %f72;
mov.f32 %f70, %f72;
$L__BB0_2:
.loc 1 35 50
// %f21 = param_2 value for the current column.
mov.u32 %r26, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r26 }, [ %rd76 + 0 ];
@!%p11 mov.u32 %r26, %r67;
mov.b32 %f21, %r26;
.loc 1 40 52
// %f22 = param_1 value for the current column of the gathered row.
mov.u32 %r28, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r28 }, [ %rd77 + 0 ];
@!%p11 mov.u32 %r28, %r67;
mov.b32 %f22, %r28;
.loc 1 41 22
// x = param_2 value + param_1 value.
add.f32 %f23, %f21, %f22;
$L__tmp3:
.loc 2 96 20
// delta = x - mean
sub.f32 %f24, %f23, %f70;
.loc 2 97 26
// count += 1.0
add.f32 %f72, %f72, 0f3F800000;
.loc 2 98 30
// mean += delta / count
mov.b32 %r31, %f24;
mov.b32 %r32, %f72;
div.full.f32 %r30, %r31, %r32;
mov.b32 %f25, %r30;
.loc 2 98 22
add.f32 %f70, %f70, %f25;
.loc 2 101 30
// delta2 = x - updated mean
sub.f32 %f26, %f23, %f70;
$L__tmp4:
.loc 1 47 48
// M2 += delta * delta2
fma.rn.f32 %f71, %f24, %f26, %f71;
.loc 1 31 36
// Advance by 4 columns (16 bytes); 64 iterations in total.
add.s32 %r79, %r79, 4;
add.s64 %rd77, %rd77, 16;
add.s64 %rd76, %rd76, 16;
setp.lt.u32 %p10, %r79, 252;
@%p10 bra $L__BB0_2;
$L__BB0_5:
.loc 1 0 36
// Both pass-1 variants join here. %rd36 = output base pointer (param_4).
ld.param.u64 %rd36, [triton__0d1d2d3d4d5de6de_param_4];
$L__tmp5:
.loc 2 120 46
// Combine stage 1: fetch mean (%f34), M2 (%f35) and count (%f36) from the lane whose
// id differs in bit 1 (butterfly offset 2), with a full member mask.
mov.b32 %r54, %f70;
shfl.sync.bfly.b32 %r55, %r54, 2, 31, -1;
mov.b32 %f34, %r55;
mov.b32 %r56, %f71;
shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1;
mov.b32 %f35, %r57;
mov.b32 %r58, %f72;
shfl.sync.bfly.b32 %r45, %r58, 2, 31, -1;
mov.b32 %f36, %r45;
$L__tmp6:
.loc 2 108 21
// delta = partner mean - own mean
sub.f32 %f37, %f34, %f70;
.loc 2 109 28
// total count = own count + partner count
add.f32 %f38, %f72, %f36;
.loc 2 110 39
setp.eq.f32 %p16, %f38, 0f00000000;
.loc 2 110 60
// w = partner count / total count ...
mov.b32 %r46, %f38;
div.full.f32 %r44, %r45, %r46;
mov.b32 %f39, %r44;
.loc 2 110 49
// ... forced to 0 when the total count is 0.
selp.f32 %f40, 0f00000000, %f39, %p16;
.loc 2 112 17
// merged mean = own mean + delta * w
fma.rn.f32 %f41, %f37, %f40, %f70;
.loc 2 113 15
add.f32 %f42, %f71, %f35;
.loc 2 113 30
mul.f32 %f43, %f37, %f37;
.loc 2 113 38
mul.f32 %f44, %f72, %f43;
.loc 2 113 22
// merged M2 = own M2 + partner M2 + delta^2 * own count * w
fma.rn.f32 %f45, %f44, %f40, %f42;
$L__tmp7:
.loc 2 120 46
// Combine stage 2: same merge with the lane whose id differs in bit 0 (butterfly offset 1),
// exchanging the stage-1 mean (%f41), M2 (%f45) and count (%r46 / %f38).
mov.b32 %r59, %f41;
shfl.sync.bfly.b32 %r60, %r59, 1, 31, -1;
mov.b32 %f46, %r60;
mov.b32 %r61, %f45;
shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1;
mov.b32 %f47, %r62;
shfl.sync.bfly.b32 %r48, %r46, 1, 31, -1;
mov.b32 %f48, %r48;
$L__tmp8:
.loc 2 108 21
sub.f32 %f49, %f46, %f41;
.loc 2 109 28
add.f32 %f50, %f38, %f48;
.loc 2 110 39
setp.eq.f32 %p17, %f50, 0f00000000;
.loc 2 110 60
mov.b32 %r49, %f50;
div.full.f32 %r47, %r48, %r49;
mov.b32 %f51, %r47;
.loc 2 110 49
selp.f32 %f52, 0f00000000, %f51, %p17;
.loc 2 112 17
// %f16 = final mean used by pass 2.
fma.rn.f32 %f16, %f49, %f52, %f41;
.loc 2 113 15
add.f32 %f53, %f45, %f47;
.loc 2 113 30
mul.f32 %f54, %f49, %f49;
.loc 2 113 38
mul.f32 %f55, %f38, %f54;
.loc 2 113 22
// %f56 = final M2.
fma.rn.f32 %f56, %f52, %f55, %f53;
$L__tmp9:
.loc 1 69 23
// %f57 = M2 / 256.0 (1132462080 = 0x43800000 is the f32 bit pattern of 256.0).
mov.b32 %r51, %f56;
mov.b32 %r52, 1132462080;
div.full.f32 %r50, %r51, %r52;
mov.b32 %f57, %r50;
.loc 1 71 24
// Add the constant 0f3727C5AC (approximately 1e-5) before the reciprocal square root.
add.f32 %f17, %f57, 0f3727C5AC;
.loc 1 55 36
// %r10 = (ctaid.x << 14) | (%r2 << 8) | %r3 = row * 256 + column lane: output element index.
shl.b32 %r63, %r13, 14;
shl.b32 %r64, %r2, 8;
or.b32 %r65, %r63, %r64;
or.b32 %r10, %r65, %r3;
// Pass-2 cursors: %rd83 -> param_1, %rd82 -> param_3, %rd81 -> param_2.
add.s64 %rd83, %rd33, %rd79;
add.s64 %rd82, %rd35, %rd80;
add.s64 %rd81, %rd34, %rd78;
// %r80 = pass-2 loop counter, starting at -4.
mov.b32 %r80, -4;
// %p22 = wrapped index is in range [0, 50257).
setp.lt.u64 %p22, %rd45, 50257;
// %f61 = rsqrt(%f17), hoisted out of the loop.
rsqrt.approx.ftz.f32 %f61, %f17;
// The loop is rotated: enter at the head ($L__BB0_6); $L__BB0_8 is the tail.
bra.uni $L__BB0_6;
$L__BB0_8:
.loc 1 0 0
// Pass-2 loop tail. %f18 = param_2 value, %f19 = param_3 value (both loaded in $L__BB0_6).
mov.b32 %f18, %r66;
mov.b32 %f19, %r68;
.loc 1 65 54
// %f58 = param_1 value for the current column of the gathered row.
mov.u32 %r71, 0x0;
@%p11 ld.global.L1::evict_first.b32 { %r71 }, [ %rd83 + 0 ];
@!%p11 mov.u32 %r71, %r67;
mov.b32 %f58, %r71;
.loc 1 66 24
// x = param_2 value + param_1 value
add.f32 %f59, %f18, %f58;
.loc 1 67 24
// x - mean
sub.f32 %f60, %f59, %f16;
.loc 1 73 24
// (x - mean) * rstd
mul.f32 %f62, %f60, %f61;
.loc 1 74 24
// ... * param_3 value
mul.f32 %f63, %f62, %f19;
.loc 1 55 36
add.s32 %r80, %r80, 4;
.loc 1 76 29
// Output address = param_4 + 2 * (%r10 + current column offset).
add.s32 %r74, %r80, %r10;
mul.wide.s32 %rd72, %r74, 2;
add.s64 %rd71, %rd36, %rd72;
.loc 1 76 52
// Round to bf16 and store 16 bits.
mov.b32 %r73, %f63;
cvt.rn.bf16.f32 %rs1, %r73;
@%p11 st.global.b16 [ %rd71 + 0 ], { %rs1 };
.loc 1 55 36
// Advance all three input cursors by 4 columns; 64 iterations in total.
add.s64 %rd83, %rd83, 16;
add.s64 %rd82, %rd82, 16;
add.s64 %rd81, %rd81, 16;
setp.lt.u32 %p26, %r80, 252;
@%p26 bra $L__BB0_6;
bra.uni $L__BB0_9;
$L__BB0_6:
.loc 1 59 51
// Pass-2 loop head: %r66 = param_2 value for the current column.
mov.u32 %r66, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r66 }, [ %rd81 + 0 ];
@!%p11 mov.u32 %r66, %r67;
.loc 1 60 40
// %r68 = param_3 value for the current column.
mov.u32 %r68, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r68 }, [ %rd82 + 0 ];
@!%p11 mov.u32 %r68, %r67;
.loc 1 64 57
// In-range index: skip the assertion and go straight to the tail.
@%p22 bra $L__BB0_8;
// Otherwise call __assertfail(assertMessage_1, assertFile_1, 883, assertFunc_1, 1),
// then continue to the tail.
mov.u64 %rd63, assertMessage_1;
cvta.global.u64 %rd64, %rd63;
mov.u64 %rd65, assertFile_1;
cvta.global.u64 %rd66, %rd65;
mov.u64 %rd67, assertFunc_1;
cvta.global.u64 %rd68, %rd67;
{ // callseq 11, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd64;
.param .b64 param1;
st.param.b64 [param1+0], %rd66;
.param .b32 param2;
st.param.b32 [param2+0], %r77;
.param .b64 param3;
st.param.b64 [param3+0], %rd68;
.param .b64 param4;
st.param.b64 [param4+0], %rd73;
call.uni
__assertfail,
(
param0,
param1,
param2,
param3,
param4
);
} // callseq 11
bra.uni $L__BB0_8;
$L__BB0_9:
.loc 1 55 4
ret;
$L__tmp10:
$L__func_end0:
}
// .globl __nv_rsqrtf
// Device function __nv_rsqrtf: takes one 32-bit float argument and returns its
// approximate reciprocal square root (rsqrt.approx with the .ftz modifier).
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
.param .b32 __nv_rsqrtf_param_0
)
{
// Named scratch registers: the incoming value and the computed result.
.reg .f32 %in_val, %out_val;
$L__func_begin1:
// Fetch the single argument from parameter space.
ld.param.f32 %in_val, [__nv_rsqrtf_param_0];
// out_val = rsqrt(in_val)
rsqrt.approx.ftz.f32 %out_val, %in_val;
// Publish the result through the return parameter.
st.param.f32 [func_retval0+0], %out_val;
ret;
$L__func_end1:
}
.file 1 "/tmp/torchinductor_root/lh/clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
// DWARF abbreviation table, emitted as raw bytes. Each entry is:
//   abbreviation code, tag, has-children flag, (attribute, form) pairs, then a 0, 0 terminator.
// Tag / attribute / form names in the comments follow the DWARF constant tables.
.section .debug_abbrev
{
// Abbrev 1: DW_TAG_compile_unit (17), has children.
.b8 1
.b8 17
.b8 1
// DW_AT_producer (37), DW_FORM_string (8)
.b8 37
.b8 8
// DW_AT_language (19), DW_FORM_data2 (5)
.b8 19
.b8 5
// DW_AT_name (3), DW_FORM_string (8)
.b8 3
.b8 8
// DW_AT_stmt_list (16), DW_FORM_data4 (6)
.b8 16
.b8 6
// DW_AT_comp_dir (27), DW_FORM_string (8)
.b8 27
.b8 8
// ULEB128 bytes 180, 66 = 0x2134 (DW_AT_GNU_pubnames), DW_FORM_flag (12)
.b8 180
.b8 66
.b8 12
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// End of abbrev 1.
.b8 0
.b8 0
// Abbrev 2: DW_TAG_subprogram (46), no children.
.b8 2
.b8 46
.b8 0
// ULEB128 bytes 135, 64 = 0x2007 (DW_AT_MIPS_linkage_name), DW_FORM_string (8)
.b8 135
.b8 64
.b8 8
// DW_AT_name (3), DW_FORM_string (8)
.b8 3
.b8 8
// DW_AT_decl_file (58), DW_FORM_data1 (11)
.b8 58
.b8 11
// DW_AT_decl_line (59), DW_FORM_data1 (11)
.b8 59
.b8 11
// DW_AT_external (63), DW_FORM_flag (12)
.b8 63
.b8 12
// DW_AT_inline (32), DW_FORM_data1 (11)
.b8 32
.b8 11
// End of abbrev 2.
.b8 0
.b8 0
// Abbrev 3: DW_TAG_subprogram (46), has children.
.b8 3
.b8 46
.b8 1
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_frame_base (64), DW_FORM_block1 (10)
.b8 64
.b8 10
// DW_AT_abstract_origin (49), DW_FORM_ref4 (19)
.b8 49
.b8 19
// End of abbrev 3.
.b8 0
.b8 0
// Abbrev 4: DW_TAG_inlined_subroutine (29), no children.
.b8 4
.b8 29
.b8 0
// DW_AT_abstract_origin (49), DW_FORM_ref4 (19)
.b8 49
.b8 19
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_call_file (88), DW_FORM_data1 (11)
.b8 88
.b8 11
// DW_AT_call_line (89), DW_FORM_data1 (11)
.b8 89
.b8 11
// DW_AT_call_column (87), DW_FORM_data1 (11)
.b8 87
.b8 11
// End of abbrev 4.
.b8 0
.b8 0
// Abbrev 5: DW_TAG_inlined_subroutine (29), has children; same attributes as abbrev 4.
.b8 5
.b8 29
.b8 1
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
// End of abbrev 5.
.b8 0
.b8 0
// End of the abbreviation table.
.b8 0
}
// DWARF .debug_info contents, emitted as raw bytes. The layout comments follow the
// attribute order of the abbreviations defined in .debug_abbrev in this file.
.section .debug_info
{
// Unit header: length (298 bytes following this field), version 2 (two bytes),
// offset into .debug_abbrev, address size 8.
.b32 298
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// DIE using abbrev 1 (compile unit).
.b8 1
// Producer string: "triton" + NUL.
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// Language code, 2 bytes: value 2.
.b8 2
.b8 0
// Name string: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py" + NUL
// (same file name as in the ".file 1" directive).
.b8 99
.b8 108
.b8 104
.b8 101
.b8 52
.b8 97
.b8 51
.b8 115
.b8 116
.b8 118
.b8 117
.b8 102
.b8 120
.b8 97
.b8 102
.b8 109
.b8 113
.b8 51
.b8 107
.b8 107
.b8 53
.b8 104
.b8 111
.b8 100
.b8 97
.b8 122
.b8 122
.b8 50
.b8 101
.b8 102
.b8 99
.b8 116
.b8 102
.b8 102
.b8 116
.b8 101
.b8 54
.b8 52
.b8 54
.b8 122
.b8 110
.b8 106
.b8 100
.b8 110
.b8 118
.b8 51
.b8 108
.b8 113
.b8 105
.b8 53
.b8 111
.b8 97
.b8 46
.b8 112
.b8 121
.b8 0
// Statement-list reference into .debug_line.
.b32 .debug_line
// Compilation directory string: "/tmp/torchinductor_root/lh" + NUL.
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 108
.b8 104
.b8 0
// Flag attribute (value 1), then low/high PC of the unit: the kernel's begin/end labels.
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
// DIE using abbrev 2 (subprogram, no children), at byte offset 125 of this section.
.b8 2
// Linkage-name string: "triton__0d1d2d3d4d5de6de" + NUL.
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 101
.b8 54
.b8 100
.b8 101
.b8 0
// Name string: "triton__0d1d2d3d4d5de6de" + NUL.
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 101
.b8 54
.b8 100
.b8 101
.b8 0
// decl_file = 1, decl_line = 18, external = 1, inline = 1.
.b8 1
.b8 18
.b8 1
.b8 1
// DIE using abbrev 3 (subprogram with children): PC range of the kernel,
// a 1-byte frame-base expression (opcode 156), and a reference to the DIE at offset 125.
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b32 125
// Child DIE using abbrev 4 (inlined subroutine): origin 125, range $L__tmp1..$L__tmp4,
// call file 2, call line 44, call column 38.
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp4
.b8 2
.b8 44
.b8 38
// Child DIE using abbrev 4: origin 125, range $L__tmp5..$L__tmp8,
// call file 2, call line 50, call column 41.
.b8 4
.b32 125
.b64 $L__tmp5
.b64 $L__tmp8
.b8 2
.b8 50
.b8 41
// Child DIE using abbrev 5 (inlined subroutine with children): origin 125,
// range $L__tmp6..$L__tmp9, call file 2, call line 50, call column 41.
.b8 5
.b32 125
.b64 $L__tmp6
.b64 $L__tmp9
.b8 2
.b8 50
.b8 41
// Nested DIE using abbrev 4: origin 125, range $L__tmp6..$L__tmp9,
// call file 2, call line 120, call column 46.
.b8 4
.b32 125
.b64 $L__tmp6
.b64 $L__tmp9
.b8 2
.b8 120
.b8 46
// Three terminators closing the child lists of the abbrev-5 DIE, the abbrev-3 DIE
// and the compile unit.
.b8 0
.b8 0
.b8 0
}
// Public-names lookup table for the unit in .debug_info.
.section .debug_pubnames
{
// Table length, computed from the start/end labels below.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// Version 2 (two bytes).
.b8 2
.b8 0
// Reference to .debug_info, followed by 302 (the 298-byte unit length declared in
// .debug_info plus its 4-byte length field).
.b32 .debug_info
.b32 302
// Single entry: DIE offset 125, then the name "triton__0d1d2d3d4d5de6de" + NUL.
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 101
.b8 54
.b8 100
.b8 101
.b8 0
// Zero offset terminates the table.
.b32 0
$L__pubNames_end0:
}
// Public-types lookup table for the unit in .debug_info; it contains no entries.
.section .debug_pubtypes
{
// Table length, computed from the start/end labels below.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// Version 2 (two bytes).
.b8 2
.b8 0
// Reference to .debug_info, followed by 302 (same value as in .debug_pubnames).
.b32 .debug_info
.b32 302
// Zero offset terminates the (empty) table.
.b32 0
$L__pubTypes_end0:
}
// Empty .debug_loc section: the braces contain no data.
.section .debug_loc { }