// 0-hero's picture
// Add files using upload-large-folder tool
// 41c016f verified
// raw
// history blame
// 21.6 kB
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4d5d6de7de
// External assertion-failure handler, defined outside this file.
// The kernel below calls it with, in parameter order:
//   param_0 (.b64) : generic address of the assertMessage_* bytes
//   param_1 (.b64) : generic address of the assertFile_* bytes
//   param_2 (.b32) : the constant 883
//   param_3 (.b64) : generic address of the assertFunc_* bytes
//   param_4 (.b64) : the constant 1
// NOTE(review): presumably the CUDA device-runtime
// __assertfail(message, file, line, function, charSize) — confirm.
.extern .func __assertfail
(
.param .b64 __assertfail_param_0,
.param .b64 __assertfail_param_1,
.param .b32 __assertfail_param_2,
.param .b64 __assertfail_param_3,
.param .b64 __assertfail_param_4
)
;
// Text blobs whose addresses are passed to __assertfail by the kernel below.
// Only the start address of each array is passed (no length), so every array
// ends with an explicit 0 byte to delimit the text; the array sizes include
// that terminator.
//
// assertFunc_1    = "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_1[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// assertFile_1    = "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_1[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// assertMessage_1 = "index out of bounds: 0 <= tmp16 < 50257"
.global .align 1 .b8 assertMessage_1[40] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55, 0};
// assertFunc_0    = "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_0[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// assertFile_0    = "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_0[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// assertMessage_0 = "index out of bounds: 0 <= tmp3 < 50257"
.global .align 1 .b8 assertMessage_0[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55, 0};
// Shared-memory scratch area. Declared extern and unsized, so its size is not
// fixed in this file. The kernel below uses byte offsets 0, 8 and 16 (plus
// 4 * warp index) to exchange per-warp partial results.
.extern .shared .align 1 .b8 global_smem[];
// Bytes decode to "__CUDA_FTZ" followed by a terminating 0.
// Not referenced by any instruction in this file.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
// -----------------------------------------------------------------------------
// Kernel entry: triton__0d1d2d3d4d5d6de7de
// Compiler-generated PTX; at most 64 threads per CTA (.maxntid 64, 1, 1).
//
// What the instructions below do for one CTA (ctaid.x selects a 256-wide row):
//   1. Load a 64-bit index from param_0[ctaid.x]. A negative index is wrapped by
//      adding 50257; the wrapped value is asserted to be < 50257 (unsigned).
//   2. Each thread covers 4 consecutive columns, col = (tid.x * 4) & 252, and forms
//        x = param_2[(ctaid.x % 512) * 256 + col]   (f32)
//          + param_1[index * 256 + col]             (f32)
//          + param_3[ctaid.x * 256 + col]           (bf16 widened to f32)
//   3. A (mean, sum of squared deviations "M2", count) triple is built over the
//      thread's 4 values, merged across the warp with butterfly shuffles, and
//      merged across the two warps through global_smem.
//   4. The inputs are loaded again and
//        ((x - mean) * rsqrt(M2 / 256 + 0f3727C5AC)) * param_4[col]
//      is rounded to bf16 and stored to param_5[ctaid.x * 256 + col].
//
// Parameters:
//   param_0 .. param_5 : 64-bit global addresses, used as listed above.
//   param_6, param_7   : 32-bit values; never read in this body.
//
// NOTE(review): this looks like an embedding-row gather + add + layer-norm-style
// normalisation — confirm against the originating Python source (.file 1).
// -----------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5d6de7de(
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
)
.maxntid 64, 1, 1
{
.reg .pred %p<59>;
.reg .b16 %rs<13>;
.reg .b32 %r<176>;
.reg .f32 %f<169>;
.reg .b64 %rd<58>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// Fetch the pointer arguments (param_5 is fetched later, at $L__BB0_2).
ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
$L__tmp0:
.loc 1 24 33
// Thread indexing: %r2 = lane (tid & 31), %r3 = bit 5 of tid (warp 0 or 1),
// %r30 = tid * 4, %r4 = (tid * 4) & 252 = first of this thread's 4 columns.
mov.u32 %r1, %tid.x;
and.b32 %r2, %r1, 31;
ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
bfe.u32 %r3, %r1, 5, 1;
shl.b32 %r30, %r1, 2;
and.b32 %r4, %r30, 252;
.loc 1 21 28
mov.u32 %r13, %ctaid.x;
.loc 1 26 30
// %rd11 = param_0 + ctaid.x * 8 : address of this CTA's 64-bit index.
mul.wide.s32 %rd25, %r13, 8;
add.s64 %rd11, %rd22, %rd25;
// %p53 is constant true, so every "@%p53" access below always executes and
// every "@!%p53" fallback move never does.
mov.pred %p53, -1;
.loc 1 26 35
// The same address is loaded five times; only %rd10 and %rd18 are read later.
mov.u64 %rd10, 0x0;
@%p53 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
mov.u64 %rd12, 0x0;
@%p53 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
mov.u64 %rd14, 0x0;
@%p53 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
mov.u64 %rd16, 0x0;
@%p53 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
mov.u64 %rd18, 0x0;
@%p53 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd11 + 0 ];
.loc 1 27 18
// Signed-remainder sequence: %r35 = ctaid.x % 512
// (16776704 = 0xFFFE00 clears the low 9 bits of the bias-adjusted value).
shr.s32 %r31, %r13, 31;
shr.u32 %r32, %r31, 23;
add.s32 %r33, %r13, %r32;
and.b32 %r34, %r33, 16776704;
sub.s32 %r35, %r13, %r34;
.loc 1 35 44
shl.b32 %r36, %r35, 8;
.loc 1 35 40
or.b32 %r37, %r36, %r4;
.loc 1 35 34
// %rd37 = param_2 + ((ctaid.x % 512) * 256 + col) * 4 : four f32 values.
mul.wide.s32 %rd26, %r37, 4;
add.s64 %rd37, %rd23, %rd26;
mov.b32 %r151, 0;
.loc 1 35 50
mov.u32 %r14, 0x0;
mov.u32 %r15, 0x0;
mov.u32 %r16, 0x0;
mov.u32 %r17, 0x0;
@%p53 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd37 + 0 ];
@!%p53 mov.u32 %r14, %r151;
@!%p53 mov.u32 %r15, %r151;
@!%p53 mov.u32 %r16, %r151;
@!%p53 mov.u32 %r17, %r151;
// Note the numbering: %f2 is element 0 and %f1 is element 1.
mov.b32 %f2, %r14;
mov.b32 %f1, %r15;
mov.b32 %f3, %r16;
mov.b32 %f4, %r17;
.loc 1 36 44
shl.b32 %r38, %r13, 8;
.loc 1 36 40
or.b32 %r39, %r38, %r4;
.loc 1 36 34
// %rd38 = param_3 + (ctaid.x * 256 + col) * 2 : four 16-bit values in two words.
mul.wide.s32 %rd27, %r39, 2;
add.s64 %rd38, %rd24, %rd27;
.loc 1 36 50
mov.u32 %r22, 0x0;
mov.u32 %r23, 0x0;
@%p53 ld.global.L1::evict_last.v2.b32 { %r22, %r23 }, [ %rd38 + 0 ];
@!%p53 mov.u32 %r22, %r151;
@!%p53 mov.u32 %r23, %r151;
// Split each 32-bit word into its low (%rs1, %rs3) and high (%rs2, %rs4) halves.
cvt.u16.u32 %rs1, %r22;
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r22; }
cvt.u16.u32 %rs3, %r23;
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r23; }
.loc 1 36 101
// Widen the four bf16 values to f32 (%f5 .. %f8 = elements 0 .. 3).
cvt.f32.bf16 %r26, %rs1;
mov.b32 %f5, %r26;
cvt.f32.bf16 %r27, %rs2;
mov.b32 %f6, %r27;
cvt.f32.bf16 %r28, %rs3;
mov.b32 %f7, %r28;
cvt.f32.bf16 %r29, %rs4;
mov.b32 %f8, %r29;
.loc 1 37 22
// Bounds check on the index: %rd5 = (idx < 0) ? idx + 50257 : idx,
// %p15 = (%rd5 < 50257) as an unsigned compare.
add.s64 %rd28, %rd18, 50257;
.loc 1 38 22
setp.lt.s64 %p14, %rd18, 0;
.loc 1 39 36
selp.b64 %rd5, %rd28, %rd18, %p14;
.loc 1 40 40
setp.lt.u64 %p15, %rd5, 50257;
// %r175 (883) and %rd57 (1) are the 3rd and 5th arguments of both
// __assertfail calls in this kernel.
mov.b32 %r175, 883;
mov.u64 %rd57, 1;
.loc 1 40 55
// In range: skip the assertion call.
@%p15 bra $L__BB0_2;
mov.u64 %rd29, assertMessage_0;
cvta.global.u64 %rd30, %rd29;
mov.u64 %rd31, assertFile_0;
cvta.global.u64 %rd32, %rd31;
mov.u64 %rd33, assertFunc_0;
cvta.global.u64 %rd34, %rd33;
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd30;
.param .b64 param1;
st.param.b64 [param1+0], %rd32;
.param .b32 param2;
st.param.b32 [param2+0], %r175;
.param .b64 param3;
st.param.b64 [param3+0], %rd34;
.param .b64 param4;
st.param.b64 [param4+0], %rd57;
call.uni
__assertfail,
(
param0,
param1,
param2,
param3,
param4
);
} // callseq 0
$L__BB0_2:
.loc 1 0 55
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
// %rd3 = sign-extended (ctaid.x * 256 + col); reused for the final store address.
cvt.s64.s32 %rd3, %r39;
.loc 1 38 22
setp.lt.s64 %p44, %rd10, 0;
.loc 1 41 44
// Row offset = idx * 256, plus 12865792 (= 50257 * 256) when idx is negative.
shl.b64 %rd40, %rd10, 8;
add.s64 %rd41, %rd40, 12865792;
selp.b64 %rd42, %rd41, %rd40, %p44;
cvt.u64.u32 %rd43, %r4;
.loc 1 41 40
or.b64 %rd44, %rd42, %rd43;
.loc 1 41 34
// %rd54 = param_1 + (row offset | col) * 4 : four f32 values of the indexed row.
shl.b64 %rd45, %rd44, 2;
add.s64 %rd54, %rd7, %rd45;
.loc 1 41 52
mov.u32 %r41, 0x0;
mov.u32 %r42, 0x0;
mov.u32 %r43, 0x0;
mov.u32 %r44, 0x0;
@%p53 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd54 + 0 ];
@!%p53 mov.u32 %r41, %r151;
@!%p53 mov.u32 %r42, %r151;
@!%p53 mov.u32 %r43, %r151;
@!%p53 mov.u32 %r44, %r151;
// x = (param_2 value + param_1 value) + widened param_3 value, per element.
// Results: x0 = %f25, x1 = %f26, x2 = %f19, x3 = %f20.
mov.b32 %f15, %r43;
mov.b32 %f16, %r44;
.loc 1 42 22
add.f32 %f17, %f3, %f15;
add.f32 %f18, %f4, %f16;
.loc 1 44 22
add.f32 %f19, %f7, %f17;
add.f32 %f20, %f8, %f18;
.loc 1 41 52
mov.b32 %f21, %r41;
mov.b32 %f22, %r42;
.loc 1 42 22
add.f32 %f23, %f1, %f22;
add.f32 %f24, %f2, %f21;
.loc 1 44 22
add.f32 %f25, %f5, %f24;
add.f32 %f26, %f6, %f23;
$L__tmp1:
.loc 2 98 22
// Per-element starting state: mean_i = x_i + 0.0 and M2_i = x_i * (x_i - mean_i) + 0.0.
add.f32 %f27, %f26, 0f00000000;
add.f32 %f28, %f25, 0f00000000;
add.f32 %f29, %f19, 0f00000000;
add.f32 %f30, %f20, 0f00000000;
.loc 2 101 30
sub.f32 %f31, %f25, %f28;
sub.f32 %f32, %f26, %f27;
sub.f32 %f33, %f19, %f29;
sub.f32 %f34, %f20, %f30;
.loc 2 101 13
fma.rn.f32 %f35, %f25, %f31, 0f00000000;
fma.rn.f32 %f36, %f26, %f32, 0f00000000;
fma.rn.f32 %f37, %f19, %f33, 0f00000000;
fma.rn.f32 %f38, %f20, %f34, 0f00000000;
$L__tmp2:
.loc 2 108 21
// Fold the thread's 4 elements together one at a time.
// Step 1 (elements 0 and 1): delta = mean_1 - mean_0, ratio = 1.0 / 2.0,
// mean = mean_0 + ratio * delta, M2 = M2_0 + M2_1 + ratio * delta^2.
sub.f32 %f39, %f27, %f28;
mov.b32 %r50, 1065353216;
mov.b32 %r51, 1073741824;
.loc 2 110 60
div.full.f32 %r49, %r50, %r51;
mov.b32 %f40, %r49;
.loc 2 112 17
fma.rn.f32 %f41, %f40, %f39, %f28;
.loc 2 113 15
add.f32 %f42, %f35, %f36;
.loc 2 113 30
mul.f32 %f43, %f39, %f39;
.loc 2 113 22
fma.rn.f32 %f44, %f40, %f43, %f42;
.loc 2 108 21
// Step 2 (add element 2): ratio = 1.0 / 3.0; the delta^2 term is doubled
// (delta * delta + delta^2) before being scaled by the ratio.
sub.f32 %f45, %f29, %f41;
mov.b32 %r54, 1077936128;
.loc 2 110 60
div.full.f32 %r52, %r50, %r54;
mov.b32 %f46, %r52;
.loc 2 112 17
fma.rn.f32 %f47, %f46, %f45, %f41;
.loc 2 113 15
add.f32 %f48, %f37, %f44;
.loc 2 113 30
mul.f32 %f49, %f45, %f45;
.loc 2 113 38
fma.rn.f32 %f50, %f45, %f45, %f49;
.loc 2 113 22
fma.rn.f32 %f51, %f46, %f50, %f48;
.loc 2 108 21
// Step 3 (add element 3): ratio = 1.0 / 4.0; delta^2 is scaled by 3.0 first.
// Afterwards: mean = %f54, M2 = %f58, count = 4.0 (bit pattern in %r57).
sub.f32 %f52, %f30, %f47;
mov.b32 %r57, 1082130432;
.loc 2 110 60
div.full.f32 %r55, %r50, %r57;
mov.b32 %f53, %r55;
.loc 2 112 17
fma.rn.f32 %f54, %f53, %f52, %f47;
.loc 2 113 15
add.f32 %f55, %f38, %f51;
.loc 2 113 30
mul.f32 %f56, %f52, %f52;
.loc 2 113 38
mul.f32 %f57, %f56, 0f40400000;
.loc 2 113 22
fma.rn.f32 %f58, %f53, %f57, %f55;
$L__tmp3:
.loc 2 120 46
// Warp-level merge, five butterfly stages (lane XOR 16, 8, 4, 2, 1).
// Each stage fetches the partner's (mean, M2, count) and computes:
//   delta = mean_partner - mean_own
//   total = count_own + count_partner
//   ratio = (total == 0) ? 0 : count_partner / total
//   mean  = mean_own + ratio * delta
//   M2    = M2_own + M2_partner + ratio * (count_own * delta^2)
// Stage 1: partner = lane XOR 16 (own count is the constant 4.0).
mov.b32 %r118, %f54;
shfl.sync.bfly.b32 %r119, %r118, 16, 31, -1;
mov.b32 %f59, %r119;
mov.b32 %r120, %f58;
shfl.sync.bfly.b32 %r121, %r120, 16, 31, -1;
mov.b32 %f60, %r121;
shfl.sync.bfly.b32 %r59, %r57, 16, 31, -1;
mov.b32 %f61, %r59;
$L__tmp4:
.loc 2 108 21
sub.f32 %f62, %f59, %f54;
.loc 2 109 28
add.f32 %f63, %f61, 0f40800000;
.loc 2 110 39
setp.eq.f32 %p45, %f63, 0f00000000;
.loc 2 110 60
mov.b32 %r60, %f63;
div.full.f32 %r58, %r59, %r60;
mov.b32 %f64, %r58;
.loc 2 110 49
selp.f32 %f65, 0f00000000, %f64, %p45;
.loc 2 112 17
fma.rn.f32 %f66, %f65, %f62, %f54;
.loc 2 113 15
add.f32 %f67, %f58, %f60;
.loc 2 113 30
mul.f32 %f68, %f62, %f62;
.loc 2 113 38
mul.f32 %f69, %f68, 0f40800000;
.loc 2 113 22
fma.rn.f32 %f70, %f65, %f69, %f67;
$L__tmp5:
.loc 2 120 46
// Stage 2: partner = lane XOR 8.
mov.b32 %r122, %f66;
shfl.sync.bfly.b32 %r123, %r122, 8, 31, -1;
mov.b32 %f71, %r123;
mov.b32 %r124, %f70;
shfl.sync.bfly.b32 %r125, %r124, 8, 31, -1;
mov.b32 %f72, %r125;
shfl.sync.bfly.b32 %r62, %r60, 8, 31, -1;
mov.b32 %f73, %r62;
$L__tmp6:
.loc 2 108 21
sub.f32 %f74, %f71, %f66;
.loc 2 109 28
add.f32 %f75, %f63, %f73;
.loc 2 110 39
setp.eq.f32 %p46, %f75, 0f00000000;
.loc 2 110 60
mov.b32 %r63, %f75;
div.full.f32 %r61, %r62, %r63;
mov.b32 %f76, %r61;
.loc 2 110 49
selp.f32 %f77, 0f00000000, %f76, %p46;
.loc 2 112 17
fma.rn.f32 %f78, %f77, %f74, %f66;
.loc 2 113 15
add.f32 %f79, %f70, %f72;
.loc 2 113 30
mul.f32 %f80, %f74, %f74;
.loc 2 113 38
mul.f32 %f81, %f63, %f80;
.loc 2 113 22
fma.rn.f32 %f82, %f77, %f81, %f79;
$L__tmp7:
.loc 2 120 46
// Stage 3: partner = lane XOR 4.
mov.b32 %r126, %f78;
shfl.sync.bfly.b32 %r127, %r126, 4, 31, -1;
mov.b32 %f83, %r127;
mov.b32 %r128, %f82;
shfl.sync.bfly.b32 %r129, %r128, 4, 31, -1;
mov.b32 %f84, %r129;
shfl.sync.bfly.b32 %r65, %r63, 4, 31, -1;
mov.b32 %f85, %r65;
$L__tmp8:
.loc 2 108 21
sub.f32 %f86, %f83, %f78;
.loc 2 109 28
add.f32 %f87, %f75, %f85;
.loc 2 110 39
setp.eq.f32 %p47, %f87, 0f00000000;
.loc 2 110 60
mov.b32 %r66, %f87;
div.full.f32 %r64, %r65, %r66;
mov.b32 %f88, %r64;
.loc 2 110 49
selp.f32 %f89, 0f00000000, %f88, %p47;
.loc 2 112 17
fma.rn.f32 %f90, %f89, %f86, %f78;
.loc 2 113 15
add.f32 %f91, %f82, %f84;
.loc 2 113 30
mul.f32 %f92, %f86, %f86;
.loc 2 113 38
mul.f32 %f93, %f75, %f92;
.loc 2 113 22
fma.rn.f32 %f94, %f89, %f93, %f91;
$L__tmp9:
.loc 2 120 46
// Stage 4: partner = lane XOR 2.
mov.b32 %r130, %f90;
shfl.sync.bfly.b32 %r131, %r130, 2, 31, -1;
mov.b32 %f95, %r131;
mov.b32 %r132, %f94;
shfl.sync.bfly.b32 %r133, %r132, 2, 31, -1;
mov.b32 %f96, %r133;
shfl.sync.bfly.b32 %r68, %r66, 2, 31, -1;
mov.b32 %f97, %r68;
$L__tmp10:
.loc 2 108 21
sub.f32 %f98, %f95, %f90;
.loc 2 109 28
add.f32 %f99, %f87, %f97;
.loc 2 110 39
setp.eq.f32 %p48, %f99, 0f00000000;
.loc 2 110 60
mov.b32 %r69, %f99;
div.full.f32 %r67, %r68, %r69;
mov.b32 %f100, %r67;
.loc 2 110 49
selp.f32 %f101, 0f00000000, %f100, %p48;
.loc 2 112 17
fma.rn.f32 %f102, %f101, %f98, %f90;
.loc 2 113 15
add.f32 %f103, %f94, %f96;
.loc 2 113 30
mul.f32 %f104, %f98, %f98;
.loc 2 113 38
mul.f32 %f105, %f87, %f104;
.loc 2 113 22
fma.rn.f32 %f106, %f101, %f105, %f103;
$L__tmp11:
.loc 2 120 46
// Stage 5: partner = lane XOR 1. Result: mean %f114, M2 %f118, count %r72.
mov.b32 %r134, %f102;
shfl.sync.bfly.b32 %r135, %r134, 1, 31, -1;
mov.b32 %f107, %r135;
mov.b32 %r136, %f106;
shfl.sync.bfly.b32 %r137, %r136, 1, 31, -1;
mov.b32 %f108, %r137;
shfl.sync.bfly.b32 %r71, %r69, 1, 31, -1;
mov.b32 %f109, %r71;
$L__tmp12:
.loc 2 108 21
sub.f32 %f110, %f107, %f102;
.loc 2 109 28
add.f32 %f111, %f99, %f109;
.loc 2 110 39
setp.eq.f32 %p49, %f111, 0f00000000;
.loc 2 110 60
mov.b32 %r72, %f111;
div.full.f32 %r70, %r71, %r72;
mov.b32 %f112, %r70;
.loc 2 110 49
selp.f32 %f113, 0f00000000, %f112, %p49;
.loc 2 112 17
fma.rn.f32 %f114, %f113, %f110, %f102;
.loc 2 113 15
add.f32 %f115, %f106, %f108;
.loc 2 113 30
mul.f32 %f116, %f110, %f110;
.loc 2 113 38
mul.f32 %f117, %f99, %f116;
.loc 2 113 22
fma.rn.f32 %f118, %f113, %f117, %f115;
$L__tmp13:
.loc 2 120 46
// Cross-warp exchange through shared memory. Lane 0 of each warp (%p21)
// writes its triple at global_smem + warp*4 (mean), +8 (M2) and +16 (count).
setp.eq.s32 %p21, %r2, 0;
shl.b32 %r138, %r3, 2;
mov.u32 %r139, global_smem;
add.s32 %r73, %r139, %r138;
mov.b32 %r74, %f114;
@%p21 st.shared.b32 [ %r73 + 0 ], %r74;
add.s32 %r140, %r139, 8;
add.s32 %r75, %r140, %r138;
mov.b32 %r76, %f118;
@%p21 st.shared.b32 [ %r75 + 0 ], %r76;
add.s32 %r141, %r139, 16;
add.s32 %r77, %r141, %r138;
@%p21 st.shared.b32 [ %r77 + 0 ], %r72;
bar.sync 0;
// Threads with tid.x < 2 (%p24) each read one warp's triple back
// (slot address = base + tid.x * 4). For all other threads the predicated
// loads do not write %r79 / %r81 / %r83.
setp.lt.s32 %p24, %r1, 2;
add.s32 %r80, %r139, %r30;
@%p24 ld.shared.b32 %r79, [ %r80 + 0 ];
mov.b32 %f119, %r79;
add.s32 %r82, %r140, %r30;
@%p24 ld.shared.b32 %r81, [ %r82 + 0 ];
mov.b32 %f120, %r81;
add.s32 %r84, %r141, %r30;
@%p24 ld.shared.b32 %r83, [ %r84 + 0 ];
mov.b32 %f121, %r83;
// Every thread runs the shuffles; partner = lane XOR 1.
shfl.sync.bfly.b32 %r143, %r79, 1, 31, -1;
mov.b32 %f122, %r143;
shfl.sync.bfly.b32 %r144, %r81, 1, 31, -1;
mov.b32 %f123, %r144;
shfl.sync.bfly.b32 %r86, %r83, 1, 31, -1;
mov.b32 %f124, %r86;
$L__tmp14:
.loc 2 108 21
// Same merge formula as the warp stages, applied to the two per-warp triples.
sub.f32 %f125, %f122, %f119;
.loc 2 109 28
add.f32 %f126, %f121, %f124;
.loc 2 110 39
setp.eq.f32 %p50, %f126, 0f00000000;
.loc 2 110 60
mov.b32 %r87, %f126;
div.full.f32 %r85, %r86, %r87;
mov.b32 %f127, %r85;
.loc 2 110 49
selp.f32 %f128, 0f00000000, %f127, %p50;
.loc 2 112 17
fma.rn.f32 %f129, %f125, %f128, %f119;
.loc 2 113 15
add.f32 %f130, %f120, %f123;
.loc 2 113 30
mul.f32 %f131, %f125, %f125;
.loc 2 113 38
mul.f32 %f132, %f121, %f131;
.loc 2 113 22
fma.rn.f32 %f133, %f132, %f128, %f130;
$L__tmp15:
.loc 2 120 46
// %p27 = (tid.x < 2) and (tid.x is even). That thread writes the merged
// triple back to its own slots (base + tid.x * 4, +8, +16).
and.b32 %r145, %r1, 1;
setp.eq.b32 %p51, %r145, 1;
not.pred %p52, %p51;
and.pred %p27, %p24, %p52;
mov.b32 %r89, %f129;
@%p27 st.shared.b32 [ %r80 + 0 ], %r89;
mov.b32 %r91, %f133;
@%p27 st.shared.b32 [ %r82 + 0 ], %r91;
@%p27 st.shared.b32 [ %r84 + 0 ], %r87;
bar.sync 0;
// All threads pick up the merged mean (%f9) and M2 (%f10).
ld.shared.f32 %f9, [global_smem];
ld.shared.f32 %f10, [global_smem+8];
$L__tmp16:
.loc 1 62 51
// Second pass: load the same inputs again for the normalisation.
// param_2 values (same address %rd37 as the first pass).
mov.u32 %r94, 0x0;
mov.u32 %r95, 0x0;
mov.u32 %r96, 0x0;
mov.u32 %r97, 0x0;
@%p53 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd37 + 0 ];
@!%p53 mov.u32 %r94, %r151;
@!%p53 mov.u32 %r95, %r151;
@!%p53 mov.u32 %r96, %r151;
@!%p53 mov.u32 %r97, %r151;
.loc 1 63 51
// param_3 values (same address %rd38), this time with the evict_first hint.
mov.u32 %r102, 0x0;
mov.u32 %r103, 0x0;
@%p53 ld.global.L1::evict_first.v2.b32 { %r102, %r103 }, [ %rd38 + 0 ];
@!%p53 mov.u32 %r102, %r151;
@!%p53 mov.u32 %r103, %r151;
cvt.u16.u32 %rs5, %r102;
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r102; }
cvt.u16.u32 %rs7, %r103;
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r103; }
.loc 1 63 103
cvt.f32.bf16 %r106, %rs5;
mov.b32 %f11, %r106;
cvt.f32.bf16 %r107, %rs6;
mov.b32 %f12, %r107;
cvt.f32.bf16 %r108, %rs7;
mov.b32 %f13, %r108;
cvt.f32.bf16 %r109, %rs8;
mov.b32 %f14, %r109;
.loc 1 64 35
// %rd39 = param_4 + col * 4 : four f32 per-column multipliers.
mul.wide.u32 %rd46, %r4, 4;
add.s64 %rd39, %rd8, %rd46;
.loc 1 64 40
mov.u32 %r110, 0x0;
mov.u32 %r111, 0x0;
mov.u32 %r112, 0x0;
mov.u32 %r113, 0x0;
@%p53 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd39 + 0 ];
@!%p53 mov.u32 %r110, %r151;
@!%p53 mov.u32 %r111, %r151;
@!%p53 mov.u32 %r112, %r151;
@!%p53 mov.u32 %r113, %r151;
.loc 1 68 57
// Second bounds assertion; reuses predicate %p15 from the first check.
@%p15 bra $L__BB0_4;
mov.u64 %rd47, assertMessage_1;
cvta.global.u64 %rd48, %rd47;
mov.u64 %rd49, assertFile_1;
cvta.global.u64 %rd50, %rd49;
mov.u64 %rd51, assertFunc_1;
cvta.global.u64 %rd52, %rd51;
{ // callseq 1, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd48;
.param .b64 param1;
st.param.b64 [param1+0], %rd50;
.param .b32 param2;
st.param.b32 [param2+0], %r175;
.param .b64 param3;
st.param.b64 [param3+0], %rd52;
.param .b64 param4;
st.param.b64 [param4+0], %rd57;
call.uni
__assertfail,
(
param0,
param1,
param2,
param3,
param4
);
} // callseq 1
$L__BB0_4:
.loc 1 69 54
// param_1 row values (same address %rd54), with the evict_first hint.
mov.u32 %r147, 0x0;
mov.u32 %r148, 0x0;
mov.u32 %r149, 0x0;
mov.u32 %r150, 0x0;
@%p53 ld.global.L1::evict_first.v4.b32 { %r147, %r148, %r149, %r150 }, [ %rd54 + 0 ];
@!%p53 mov.u32 %r147, %r151;
@!%p53 mov.u32 %r148, %r151;
@!%p53 mov.u32 %r149, %r151;
@!%p53 mov.u32 %r150, %r151;
.loc 1 75 24
// %f136 = rsqrt(M2 / 256.0 + 0f3727C5AC)
// (1132462080 is the bit pattern of 256.0; 0f3727C5AC is approximately 1e-5).
mov.b32 %r156, %f10;
mov.b32 %r157, 1132462080;
div.full.f32 %r155, %r156, %r157;
mov.b32 %f134, %r155;
.loc 1 77 24
add.f32 %f135, %f134, 0f3727C5AC;
.loc 1 78 30
rsqrt.approx.ftz.f32 %f136, %f135;
// Rebuild x for each element and subtract the mean (%f9).
// Element 3 -> %f141.
.loc 1 69 54
mov.b32 %f137, %r150;
.loc 1 62 51
mov.b32 %f138, %r97;
.loc 1 70 24
add.f32 %f139, %f138, %f137;
.loc 1 72 24
add.f32 %f140, %f14, %f139;
.loc 1 73 24
sub.f32 %f141, %f140, %f9;
// Element 2 -> %f146.
.loc 1 69 54
mov.b32 %f142, %r149;
.loc 1 62 51
mov.b32 %f143, %r96;
.loc 1 70 24
add.f32 %f144, %f143, %f142;
.loc 1 72 24
add.f32 %f145, %f13, %f144;
.loc 1 73 24
sub.f32 %f146, %f145, %f9;
// Element 1 -> %f151.
.loc 1 69 54
mov.b32 %f147, %r148;
.loc 1 62 51
mov.b32 %f148, %r95;
.loc 1 70 24
add.f32 %f149, %f148, %f147;
.loc 1 72 24
add.f32 %f150, %f12, %f149;
.loc 1 73 24
sub.f32 %f151, %f150, %f9;
// Element 0 -> %f156.
.loc 1 69 54
mov.b32 %f152, %r147;
.loc 1 62 51
mov.b32 %f153, %r94;
.loc 1 70 24
add.f32 %f154, %f153, %f152;
.loc 1 72 24
add.f32 %f155, %f11, %f154;
.loc 1 73 24
sub.f32 %f156, %f155, %f9;
.loc 1 64 40
mov.b32 %f157, %r110;
mov.b32 %f158, %r111;
mov.b32 %f159, %r112;
mov.b32 %f160, %r113;
.loc 1 79 24
// Scale the centred values by the reciprocal square root ...
mul.f32 %f161, %f156, %f136;
mul.f32 %f162, %f151, %f136;
mul.f32 %f163, %f146, %f136;
mul.f32 %f164, %f141, %f136;
.loc 1 80 24
// ... then by the per-column multipliers loaded from param_4.
mul.f32 %f165, %f161, %f157;
mul.f32 %f166, %f162, %f158;
mul.f32 %f167, %f163, %f159;
mul.f32 %f168, %f164, %f160;
.loc 1 82 29
// %rd55 = param_5 + (ctaid.x * 256 + col) * 2 : output address (16-bit elements).
shl.b64 %rd56, %rd3, 1;
add.s64 %rd55, %rd9, %rd56;
.loc 1 82 52
// Round each result to bf16, pack two per 32-bit word, store all four at once.
mov.b32 %r167, %f165;
cvt.rn.bf16.f32 %rs9, %r167;
mov.b32 %r168, %f166;
cvt.rn.bf16.f32 %rs10, %r168;
mov.b32 %r169, %f167;
cvt.rn.bf16.f32 %rs11, %r169;
mov.b32 %r170, %f168;
cvt.rn.bf16.f32 %rs12, %r170;
mov.b32 %r173, {%rs9, %rs10};
mov.b32 %r174, {%rs11, %rs12};
@%p53 st.global.v2.b32 [ %rd55 + 0 ], { %r173, %r174 };
.loc 1 58 4
ret;
$L__tmp17:
$L__func_end0:
}
// .globl __nv_rsqrtf
// -----------------------------------------------------------------------------
// __nv_rsqrtf: device function returning the approximate, flush-to-zero
// reciprocal square root of its single f32 argument.
//   In : __nv_rsqrtf_param_0 (.b32, read as f32)
//   Out: func_retval0        (.b32, written as f32)
// -----------------------------------------------------------------------------
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
    .param .b32 __nv_rsqrtf_param_0
)
{
    .reg .f32 %arg;
    .reg .f32 %res;
$L__func_begin1:
    ld.param.f32            %arg, [__nv_rsqrtf_param_0];    // fetch the operand
    rsqrt.approx.ftz.f32    %res, %arg;                     // res = 1 / sqrt(arg)
    st.param.f32            [func_retval0+0], %res;         // hand the result back
    ret;
$L__func_end1:
}
.file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
.section .debug_abbrev
{
.b8 1
.b8 17
.b8 1
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
.b8 180
.b8 66
.b8 12
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
.b8 2
.b8 46
.b8 0
.b8 135
.b8 64
.b8 8
.b8 3
.b8 8
.b8 58
.b8 11
.b8 59
.b8 11
.b8 63
.b8 12
.b8 32
.b8 11
.b8 0
.b8 0
.b8 3
.b8 46
.b8 1
.b8 17
.b8 1
.b8 18
.b8 1
.b8 64
.b8 10
.b8 49
.b8 19
.b8 0
.b8 0
.b8 4
.b8 29
.b8 0
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
.b8 5
.b8 29
.b8 1
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
.b8 0
}
.section .debug_info
{
.b32 302
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2
.b8 0
.b8 99
.b8 99
.b8 105
.b8 103
.b8 54
.b8 102
.b8 107
.b8 105
.b8 54
.b8 112
.b8 52
.b8 108
.b8 120
.b8 114
.b8 100
.b8 109
.b8 103
.b8 103
.b8 54
.b8 101
.b8 117
.b8 100
.b8 97
.b8 104
.b8 105
.b8 101
.b8 120
.b8 99
.b8 118
.b8 117
.b8 101
.b8 101
.b8 111
.b8 108
.b8 50
.b8 112
.b8 52
.b8 113
.b8 112
.b8 53
.b8 51
.b8 50
.b8 112
.b8 118
.b8 118
.b8 101
.b8 50
.b8 121
.b8 52
.b8 54
.b8 51
.b8 121
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 99
.b8 105
.b8 0
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 2
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
.b8 1
.b8 18
.b8 1
.b8 1
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b32 125
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp2
.b8 2
.b8 47
.b8 41
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp15
.b8 2
.b8 53
.b8 44
.b8 4
.b32 125
.b64 $L__tmp2
.b64 $L__tmp15
.b8 2
.b8 120
.b8 46
.b8 0
.b8 4
.b32 125
.b64 $L__tmp3
.b64 $L__tmp16
.b8 2
.b8 53
.b8 44
.b8 0
.b8 0
}
.section .debug_pubnames
{
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 306
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
.b32 0
$L__pubNames_end0:
}
.section .debug_pubtypes
{
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 306
.b32 0
$L__pubTypes_end0:
}
.section .debug_loc { }