// NOTE: the next three lines are repository-page residue, not PTX; kept as comments.
// 0-hero's picture
// Add files using upload-large-folder tool
// d742687 verified
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4d5d6de7de
// External declaration of __assertfail. No body is defined in this module, so the
// symbol must be resolved elsewhere (presumably by the CUDA device runtime -- confirm).
// It declares no return value. The kernel below calls it from two sites, each skipped
// by a branch when the index bounds check (setp.lt.u64 ..., 50257) succeeds.
// Arguments as passed at those call sites:
//   param_0 (.b64): generic address (via cvta.global) of an assertMessage_N string
//   param_1 (.b64): generic address of an assertFile_N string
//   param_2 (.b32): the constant 883 -- presumably the reported source line; confirm
//   param_3 (.b64): generic address of an assertFunc_N string
//   param_4 (.b64): the constant 1 -- presumably the character size in bytes; confirm
.extern .func __assertfail
(
.param .b64 __assertfail_param_0,
.param .b64 __assertfail_param_1,
.param .b32 __assertfail_param_2,
.param .b64 __assertfail_param_3,
.param .b64 __assertfail_param_4
)
;
// ASCII operand strings for the two __assertfail call sites in the kernel below.
// The callee receives only the start address of each string (no length operand),
// so every array carries a trailing 0 byte to delimit its text. Without it the
// reader would run on into whatever bytes happen to follow the array in memory.
// The symbols are referenced only by address, so the one-byte larger sizes do not
// affect any user of these names.

// "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_1[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_1[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// "index out of bounds: 0 <= tmp16 < 50257"
.global .align 1 .b8 assertMessage_1[40] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55, 0};
// "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_0[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_0[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// "index out of bounds: 0 <= tmp3 < 50257"
.global .align 1 .b8 assertMessage_0[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55, 0};
// Shared-state-space byte array declared .extern with an empty bound ([]), so this
// declaration itself allocates no storage (presumably sized at kernel launch -- confirm).
// Not referenced in the part of the kernel visible in this excerpt.
.extern .shared .align 1 .b8 global_smem[];
// NUL-terminated ASCII string "__CUDA_FTZ". Not referenced in the part of the kernel
// visible in this excerpt; presumably a lookup key left over from linked device
// library code -- confirm.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
.visible .entry triton__0d1d2d3d4d5d6de7de(
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
)
.maxntid 256, 1, 1
{
.reg .pred %p<137>;
.reg .b16 %rs<49>;
.reg .b32 %r<439>;
.reg .f32 %f<487>;
.reg .b64 %rd<124>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_4];
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_1];
ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6de7de_param_0];
$L__tmp0:
.loc 1 22 44
mov.u32 %r89, %tid.x;
ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6de7de_param_2];
bfe.u32 %r90, %r89, 5, 3;
ld.param.u64 %rd61, [triton__0d1d2d3d4d5d6de7de_param_3];
and.b32 %r91, %r89, 15;
.loc 1 24 33
shl.b32 %r92, %r89, 3;
and.b32 %r1, %r92, 248;
and.b32 %r2, %r89, 255;
.loc 1 21 28
mov.u32 %r24, %ctaid.x;
.loc 1 21 33
shl.b32 %r93, %r24, 4;
.loc 1 22 23
or.b32 %r94, %r93, %r90;
or.b32 %r95, %r94, 8;
or.b32 %r96, %r93, %r91;
.loc 1 26 30
mul.wide.s32 %rd62, %r94, 8;
add.s64 %rd20, %rd59, %rd62;
add.s64 %rd36, %rd20, 64;
mul.wide.s32 %rd63, %r96, 8;
add.s64 %rd52, %rd59, %rd63;
mov.pred %p113, -1;
.loc 1 26 35
mov.u64 %rd19, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd20 + 0 ];
mov.u64 %rd21, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd20 + 0 ];
mov.u64 %rd23, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd20 + 0 ];
mov.u64 %rd25, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd20 + 0 ];
mov.u64 %rd27, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd20 + 0 ];
mov.u64 %rd29, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd20 + 0 ];
mov.u64 %rd31, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd20 + 0 ];
mov.u64 %rd33, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd20 + 0 ];
mov.u64 %rd35, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd36 + 0 ];
mov.u64 %rd37, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd36 + 0 ];
mov.u64 %rd39, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd36 + 0 ];
mov.u64 %rd41, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd36 + 0 ];
mov.u64 %rd43, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd36 + 0 ];
mov.u64 %rd45, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd36 + 0 ];
mov.u64 %rd47, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd36 + 0 ];
mov.u64 %rd49, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd36 + 0 ];
mov.u64 %rd51, 0x0;
@%p113 ld.global.L1::evict_last.b64 { %rd51 }, [ %rd52 + 0 ];
.loc 1 27 18
bfe.s32 %r97, %r24, 27, 1;
shr.u32 %r98, %r97, 23;
add.s32 %r99, %r94, %r98;
and.b32 %r100, %r99, 16776704;
sub.s32 %r101, %r94, %r100;
add.s32 %r102, %r95, %r98;
and.b32 %r103, %r102, 16776704;
sub.s32 %r104, %r95, %r103;
.loc 1 35 44
shl.b32 %r105, %r101, 8;
shl.b32 %r106, %r104, 8;
.loc 1 35 40
or.b32 %r107, %r105, %r1;
or.b32 %r108, %r106, %r1;
.loc 1 35 34
mul.wide.s32 %rd64, %r107, 4;
add.s64 %rd89, %rd60, %rd64;
cvt.s64.s32 %rd65, %r105;
cvt.u64.u32 %rd66, %r1;
or.b64 %rd67, %rd65, %rd66;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd69, %rd60, %rd68;
add.s64 %rd90, %rd69, 16;
mul.wide.s32 %rd70, %r108, 4;
add.s64 %rd91, %rd60, %rd70;
cvt.s64.s32 %rd71, %r106;
or.b64 %rd72, %rd71, %rd66;
shl.b64 %rd73, %rd72, 2;
add.s64 %rd74, %rd60, %rd73;
add.s64 %rd92, %rd74, 16;
mov.b32 %r325, 0;
.loc 1 35 50
mov.u32 %r25, 0x0;
mov.u32 %r26, 0x0;
mov.u32 %r27, 0x0;
mov.u32 %r28, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd89 + 0 ];
@!%p113 mov.u32 %r25, %r325;
@!%p113 mov.u32 %r26, %r325;
@!%p113 mov.u32 %r27, %r325;
@!%p113 mov.u32 %r28, %r325;
mov.b32 %f1, %r25;
mov.b32 %f2, %r26;
mov.b32 %f3, %r27;
mov.b32 %f4, %r28;
mov.u32 %r33, 0x0;
mov.u32 %r34, 0x0;
mov.u32 %r35, 0x0;
mov.u32 %r36, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd90 + 0 ];
@!%p113 mov.u32 %r33, %r325;
@!%p113 mov.u32 %r34, %r325;
@!%p113 mov.u32 %r35, %r325;
@!%p113 mov.u32 %r36, %r325;
mov.b32 %f5, %r33;
mov.b32 %f6, %r34;
mov.b32 %f7, %r35;
mov.b32 %f8, %r36;
mov.u32 %r41, 0x0;
mov.u32 %r42, 0x0;
mov.u32 %r43, 0x0;
mov.u32 %r44, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd91 + 0 ];
@!%p113 mov.u32 %r41, %r325;
@!%p113 mov.u32 %r42, %r325;
@!%p113 mov.u32 %r43, %r325;
@!%p113 mov.u32 %r44, %r325;
mov.b32 %f9, %r41;
mov.b32 %f10, %r42;
mov.b32 %f11, %r43;
mov.b32 %f12, %r44;
mov.u32 %r49, 0x0;
mov.u32 %r50, 0x0;
mov.u32 %r51, 0x0;
mov.u32 %r52, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r49, %r50, %r51, %r52 }, [ %rd92 + 0 ];
@!%p113 mov.u32 %r49, %r325;
@!%p113 mov.u32 %r50, %r325;
@!%p113 mov.u32 %r51, %r325;
@!%p113 mov.u32 %r52, %r325;
mov.b32 %f13, %r49;
mov.b32 %f14, %r50;
mov.b32 %f15, %r51;
mov.b32 %f16, %r52;
.loc 1 36 44
shl.b32 %r109, %r94, 8;
shl.b32 %r110, %r95, 8;
.loc 1 36 40
or.b32 %r111, %r109, %r1;
or.b32 %r112, %r110, %r1;
.loc 1 36 34
mul.wide.s32 %rd75, %r111, 2;
add.s64 %rd93, %rd61, %rd75;
mul.wide.s32 %rd76, %r112, 2;
add.s64 %rd94, %rd61, %rd76;
.loc 1 36 50
mov.u32 %r57, 0x0;
mov.u32 %r58, 0x0;
mov.u32 %r59, 0x0;
mov.u32 %r60, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r57, %r58, %r59, %r60 }, [ %rd93 + 0 ];
@!%p113 mov.u32 %r57, %r325;
@!%p113 mov.u32 %r58, %r325;
@!%p113 mov.u32 %r59, %r325;
@!%p113 mov.u32 %r60, %r325;
cvt.u16.u32 %rs1, %r57;
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r57; }
cvt.u16.u32 %rs3, %r58;
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r58; }
cvt.u16.u32 %rs5, %r59;
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r59; }
cvt.u16.u32 %rs7, %r60;
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r60; }
mov.u32 %r65, 0x0;
mov.u32 %r66, 0x0;
mov.u32 %r67, 0x0;
mov.u32 %r68, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd94 + 0 ];
@!%p113 mov.u32 %r65, %r325;
@!%p113 mov.u32 %r66, %r325;
@!%p113 mov.u32 %r67, %r325;
@!%p113 mov.u32 %r68, %r325;
cvt.u16.u32 %rs9, %r65;
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r65; }
cvt.u16.u32 %rs11, %r66;
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r66; }
cvt.u16.u32 %rs13, %r67;
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r67; }
cvt.u16.u32 %rs15, %r68;
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r68; }
.loc 1 36 101
cvt.f32.bf16 %r73, %rs1;
mov.b32 %f17, %r73;
cvt.f32.bf16 %r74, %rs2;
mov.b32 %f18, %r74;
cvt.f32.bf16 %r75, %rs3;
mov.b32 %f19, %r75;
cvt.f32.bf16 %r76, %rs4;
mov.b32 %f20, %r76;
cvt.f32.bf16 %r77, %rs5;
mov.b32 %f21, %r77;
cvt.f32.bf16 %r78, %rs6;
mov.b32 %f22, %r78;
cvt.f32.bf16 %r79, %rs7;
mov.b32 %f23, %r79;
cvt.f32.bf16 %r80, %rs8;
mov.b32 %f24, %r80;
cvt.f32.bf16 %r81, %rs9;
mov.b32 %f25, %r81;
cvt.f32.bf16 %r82, %rs10;
mov.b32 %f26, %r82;
cvt.f32.bf16 %r83, %rs11;
mov.b32 %f27, %r83;
cvt.f32.bf16 %r84, %rs12;
mov.b32 %f28, %r84;
cvt.f32.bf16 %r85, %rs13;
mov.b32 %f29, %r85;
cvt.f32.bf16 %r86, %rs14;
mov.b32 %f30, %r86;
cvt.f32.bf16 %r87, %rs15;
mov.b32 %f31, %r87;
cvt.f32.bf16 %r88, %rs16;
mov.b32 %f32, %r88;
.loc 1 37 22
add.s64 %rd77, %rd51, 50257;
.loc 1 38 22
setp.lt.s64 %p48, %rd51, 0;
.loc 1 39 36
selp.b64 %rd11, %rd77, %rd51, %p48;
.loc 1 40 40
setp.lt.u64 %p49, %rd11, 50257;
mov.b32 %r438, 883;
mov.u64 %rd123, 1;
.loc 1 40 55
@%p49 bra $L__BB0_2;
mov.u64 %rd78, assertMessage_0;
cvta.global.u64 %rd79, %rd78;
mov.u64 %rd80, assertFile_0;
cvta.global.u64 %rd81, %rd80;
mov.u64 %rd82, assertFunc_0;
cvta.global.u64 %rd83, %rd82;
{ // callseq 8, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd79;
.param .b64 param1;
st.param.b64 [param1+0], %rd81;
.param .b32 param2;
st.param.b32 [param2+0], %r438;
.param .b64 param3;
st.param.b64 [param3+0], %rd83;
.param .b64 param4;
st.param.b64 [param4+0], %rd123;
call.uni
__assertfail,
(
param0,
param1,
param2,
param3,
param4
);
} // callseq 8
$L__BB0_2:
.loc 1 0 55
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_5];
cvt.s64.s32 %rd7, %r111;
cvt.s64.s32 %rd9, %r112;
.loc 1 38 22
setp.lt.s64 %p103, %rd35, 0;
setp.lt.s64 %p104, %rd19, 0;
.loc 1 41 44
shl.b64 %rd96, %rd19, 8;
add.s64 %rd97, %rd96, 12865792;
selp.b64 %rd98, %rd97, %rd96, %p104;
shl.b64 %rd99, %rd35, 8;
add.s64 %rd100, %rd99, 12865792;
selp.b64 %rd101, %rd100, %rd99, %p103;
.loc 1 41 40
or.b64 %rd103, %rd98, %rd66;
or.b64 %rd104, %rd101, %rd66;
.loc 1 41 34
shl.b64 %rd105, %rd103, 2;
add.s64 %rd115, %rd16, %rd105;
add.s64 %rd116, %rd115, 16;
shl.b64 %rd106, %rd104, 2;
add.s64 %rd117, %rd16, %rd106;
add.s64 %rd118, %rd117, 16;
.loc 1 41 52
mov.u32 %r114, 0x0;
mov.u32 %r115, 0x0;
mov.u32 %r116, 0x0;
mov.u32 %r117, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd115 + 0 ];
@!%p113 mov.u32 %r114, %r325;
@!%p113 mov.u32 %r115, %r325;
@!%p113 mov.u32 %r116, %r325;
@!%p113 mov.u32 %r117, %r325;
mov.b32 %f59, %r114;
mov.b32 %f60, %r115;
mov.b32 %f61, %r116;
mov.b32 %f62, %r117;
mov.u32 %r122, 0x0;
mov.u32 %r123, 0x0;
mov.u32 %r124, 0x0;
mov.u32 %r125, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd116 + 0 ];
@!%p113 mov.u32 %r122, %r325;
@!%p113 mov.u32 %r123, %r325;
@!%p113 mov.u32 %r124, %r325;
@!%p113 mov.u32 %r125, %r325;
mov.b32 %f63, %r122;
mov.b32 %f64, %r123;
mov.b32 %f65, %r124;
mov.b32 %f66, %r125;
mov.u32 %r130, 0x0;
mov.u32 %r131, 0x0;
mov.u32 %r132, 0x0;
mov.u32 %r133, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r130, %r131, %r132, %r133 }, [ %rd117 + 0 ];
@!%p113 mov.u32 %r130, %r325;
@!%p113 mov.u32 %r131, %r325;
@!%p113 mov.u32 %r132, %r325;
@!%p113 mov.u32 %r133, %r325;
mov.b32 %f67, %r130;
mov.b32 %f68, %r131;
mov.b32 %f69, %r132;
mov.b32 %f70, %r133;
mov.u32 %r138, 0x0;
mov.u32 %r139, 0x0;
mov.u32 %r140, 0x0;
mov.u32 %r141, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r138, %r139, %r140, %r141 }, [ %rd118 + 0 ];
@!%p113 mov.u32 %r138, %r325;
@!%p113 mov.u32 %r139, %r325;
@!%p113 mov.u32 %r140, %r325;
@!%p113 mov.u32 %r141, %r325;
mov.b32 %f71, %r138;
mov.b32 %f72, %r139;
mov.b32 %f73, %r140;
mov.b32 %f74, %r141;
.loc 1 42 22
add.f32 %f75, %f1, %f59;
add.f32 %f76, %f2, %f60;
add.f32 %f77, %f3, %f61;
add.f32 %f78, %f4, %f62;
add.f32 %f79, %f5, %f63;
add.f32 %f80, %f6, %f64;
add.f32 %f81, %f7, %f65;
add.f32 %f82, %f8, %f66;
add.f32 %f83, %f9, %f67;
add.f32 %f84, %f10, %f68;
add.f32 %f85, %f11, %f69;
add.f32 %f86, %f12, %f70;
add.f32 %f87, %f13, %f71;
add.f32 %f88, %f14, %f72;
add.f32 %f89, %f15, %f73;
add.f32 %f90, %f16, %f74;
.loc 1 44 22
add.f32 %f91, %f17, %f75;
add.f32 %f92, %f18, %f76;
add.f32 %f93, %f19, %f77;
add.f32 %f94, %f20, %f78;
add.f32 %f95, %f21, %f79;
add.f32 %f96, %f22, %f80;
add.f32 %f97, %f23, %f81;
add.f32 %f98, %f24, %f82;
add.f32 %f99, %f25, %f83;
add.f32 %f100, %f26, %f84;
add.f32 %f101, %f27, %f85;
add.f32 %f102, %f28, %f86;
add.f32 %f103, %f29, %f87;
add.f32 %f104, %f30, %f88;
add.f32 %f105, %f31, %f89;
add.f32 %f106, %f32, %f90;
$L__tmp1:
.loc 2 98 22
add.f32 %f107, %f91, 0f00000000;
add.f32 %f108, %f92, 0f00000000;
add.f32 %f109, %f93, 0f00000000;
add.f32 %f110, %f94, 0f00000000;
add.f32 %f111, %f95, 0f00000000;
add.f32 %f112, %f96, 0f00000000;
add.f32 %f113, %f97, 0f00000000;
add.f32 %f114, %f98, 0f00000000;
add.f32 %f115, %f99, 0f00000000;
add.f32 %f116, %f100, 0f00000000;
add.f32 %f117, %f101, 0f00000000;
add.f32 %f118, %f102, 0f00000000;
add.f32 %f119, %f103, 0f00000000;
add.f32 %f120, %f104, 0f00000000;
add.f32 %f121, %f105, 0f00000000;
add.f32 %f122, %f106, 0f00000000;
.loc 2 101 30
sub.f32 %f123, %f91, %f107;
sub.f32 %f124, %f92, %f108;
sub.f32 %f125, %f93, %f109;
sub.f32 %f126, %f94, %f110;
sub.f32 %f127, %f95, %f111;
sub.f32 %f128, %f96, %f112;
sub.f32 %f129, %f97, %f113;
sub.f32 %f130, %f98, %f114;
sub.f32 %f131, %f99, %f115;
sub.f32 %f132, %f100, %f116;
sub.f32 %f133, %f101, %f117;
sub.f32 %f134, %f102, %f118;
sub.f32 %f135, %f103, %f119;
sub.f32 %f136, %f104, %f120;
sub.f32 %f137, %f105, %f121;
sub.f32 %f138, %f106, %f122;
.loc 2 101 13
fma.rn.f32 %f139, %f91, %f123, 0f00000000;
fma.rn.f32 %f140, %f92, %f124, 0f00000000;
fma.rn.f32 %f141, %f93, %f125, 0f00000000;
fma.rn.f32 %f142, %f94, %f126, 0f00000000;
fma.rn.f32 %f143, %f95, %f127, 0f00000000;
fma.rn.f32 %f144, %f96, %f128, 0f00000000;
fma.rn.f32 %f145, %f97, %f129, 0f00000000;
fma.rn.f32 %f146, %f98, %f130, 0f00000000;
fma.rn.f32 %f147, %f99, %f131, 0f00000000;
fma.rn.f32 %f148, %f100, %f132, 0f00000000;
fma.rn.f32 %f149, %f101, %f133, 0f00000000;
fma.rn.f32 %f150, %f102, %f134, 0f00000000;
fma.rn.f32 %f151, %f103, %f135, 0f00000000;
fma.rn.f32 %f152, %f104, %f136, 0f00000000;
fma.rn.f32 %f153, %f105, %f137, 0f00000000;
fma.rn.f32 %f154, %f106, %f138, 0f00000000;
$L__tmp2:
.loc 2 108 21
sub.f32 %f155, %f108, %f107;
mov.b32 %r147, 1065353216;
mov.b32 %r148, 1073741824;
.loc 2 110 60
div.full.f32 %r146, %r147, %r148;
mov.b32 %f156, %r146;
.loc 2 112 17
fma.rn.f32 %f157, %f156, %f155, %f107;
.loc 2 113 15
add.f32 %f158, %f139, %f140;
.loc 2 113 30
mul.f32 %f159, %f155, %f155;
.loc 2 113 22
fma.rn.f32 %f160, %f156, %f159, %f158;
.loc 2 108 21
sub.f32 %f161, %f109, %f157;
mov.b32 %r151, 1077936128;
.loc 2 110 60
div.full.f32 %r149, %r147, %r151;
mov.b32 %f162, %r149;
.loc 2 112 17
fma.rn.f32 %f163, %f162, %f161, %f157;
.loc 2 113 15
add.f32 %f164, %f141, %f160;
.loc 2 113 30
mul.f32 %f165, %f161, %f161;
.loc 2 113 38
fma.rn.f32 %f166, %f161, %f161, %f165;
.loc 2 113 22
fma.rn.f32 %f167, %f162, %f166, %f164;
.loc 2 108 21
sub.f32 %f168, %f110, %f163;
mov.b32 %r154, 1082130432;
.loc 2 110 60
div.full.f32 %r152, %r147, %r154;
mov.b32 %f169, %r152;
.loc 2 112 17
fma.rn.f32 %f170, %f169, %f168, %f163;
.loc 2 113 15
add.f32 %f171, %f142, %f167;
.loc 2 113 30
mul.f32 %f172, %f168, %f168;
.loc 2 113 38
mul.f32 %f173, %f172, 0f40400000;
.loc 2 113 22
fma.rn.f32 %f174, %f169, %f173, %f171;
.loc 2 108 21
sub.f32 %f175, %f111, %f170;
mov.b32 %r157, 1084227584;
.loc 2 110 60
div.full.f32 %r155, %r147, %r157;
mov.b32 %f176, %r155;
.loc 2 112 17
fma.rn.f32 %f177, %f176, %f175, %f170;
.loc 2 113 15
add.f32 %f178, %f143, %f174;
.loc 2 113 30
mul.f32 %f179, %f175, %f175;
.loc 2 113 38
mul.f32 %f180, %f179, 0f40800000;
.loc 2 113 22
fma.rn.f32 %f181, %f176, %f180, %f178;
.loc 2 108 21
sub.f32 %f182, %f112, %f177;
mov.b32 %r160, 1086324736;
.loc 2 110 60
div.full.f32 %r158, %r147, %r160;
mov.b32 %f183, %r158;
.loc 2 112 17
fma.rn.f32 %f184, %f183, %f182, %f177;
.loc 2 113 15
add.f32 %f185, %f144, %f181;
.loc 2 113 30
mul.f32 %f186, %f182, %f182;
.loc 2 113 38
mul.f32 %f187, %f186, 0f40A00000;
.loc 2 113 22
fma.rn.f32 %f188, %f183, %f187, %f185;
.loc 2 108 21
sub.f32 %f189, %f113, %f184;
mov.b32 %r163, 1088421888;
.loc 2 110 60
div.full.f32 %r161, %r147, %r163;
mov.b32 %f190, %r161;
.loc 2 112 17
fma.rn.f32 %f191, %f190, %f189, %f184;
.loc 2 113 15
add.f32 %f192, %f145, %f188;
.loc 2 113 30
mul.f32 %f193, %f189, %f189;
.loc 2 113 38
mul.f32 %f194, %f193, 0f40C00000;
.loc 2 113 22
fma.rn.f32 %f195, %f190, %f194, %f192;
.loc 2 108 21
sub.f32 %f196, %f114, %f191;
mov.b32 %r166, 1090519040;
.loc 2 110 60
div.full.f32 %r164, %r147, %r166;
mov.b32 %f197, %r164;
.loc 2 112 17
fma.rn.f32 %f198, %f197, %f196, %f191;
.loc 2 113 15
add.f32 %f199, %f146, %f195;
.loc 2 113 30
mul.f32 %f200, %f196, %f196;
.loc 2 113 38
mul.f32 %f201, %f200, 0f40E00000;
.loc 2 113 22
fma.rn.f32 %f202, %f197, %f201, %f199;
.loc 2 108 21
sub.f32 %f203, %f116, %f115;
.loc 2 110 60
div.full.f32 %r167, %r147, %r148;
mov.b32 %f204, %r167;
.loc 2 112 17
fma.rn.f32 %f205, %f203, %f204, %f115;
.loc 2 113 15
add.f32 %f206, %f147, %f148;
.loc 2 113 30
mul.f32 %f207, %f203, %f203;
.loc 2 113 22
fma.rn.f32 %f208, %f207, %f204, %f206;
.loc 2 108 21
sub.f32 %f209, %f117, %f205;
.loc 2 110 60
div.full.f32 %r170, %r147, %r151;
mov.b32 %f210, %r170;
.loc 2 112 17
fma.rn.f32 %f211, %f210, %f209, %f205;
.loc 2 113 15
add.f32 %f212, %f149, %f208;
.loc 2 113 30
mul.f32 %f213, %f209, %f209;
.loc 2 113 38
fma.rn.f32 %f214, %f209, %f209, %f213;
.loc 2 113 22
fma.rn.f32 %f215, %f210, %f214, %f212;
.loc 2 108 21
sub.f32 %f216, %f118, %f211;
.loc 2 110 60
div.full.f32 %r173, %r147, %r154;
mov.b32 %f217, %r173;
.loc 2 112 17
fma.rn.f32 %f218, %f217, %f216, %f211;
.loc 2 113 15
add.f32 %f219, %f150, %f215;
.loc 2 113 30
mul.f32 %f220, %f216, %f216;
.loc 2 113 38
mul.f32 %f221, %f220, 0f40400000;
.loc 2 113 22
fma.rn.f32 %f222, %f217, %f221, %f219;
.loc 2 108 21
sub.f32 %f223, %f119, %f218;
.loc 2 110 60
div.full.f32 %r176, %r147, %r157;
mov.b32 %f224, %r176;
.loc 2 112 17
fma.rn.f32 %f225, %f224, %f223, %f218;
.loc 2 113 15
add.f32 %f226, %f151, %f222;
.loc 2 113 30
mul.f32 %f227, %f223, %f223;
.loc 2 113 38
mul.f32 %f228, %f227, 0f40800000;
.loc 2 113 22
fma.rn.f32 %f229, %f224, %f228, %f226;
.loc 2 108 21
sub.f32 %f230, %f120, %f225;
.loc 2 110 60
div.full.f32 %r179, %r147, %r160;
mov.b32 %f231, %r179;
.loc 2 112 17
fma.rn.f32 %f232, %f231, %f230, %f225;
.loc 2 113 15
add.f32 %f233, %f152, %f229;
.loc 2 113 30
mul.f32 %f234, %f230, %f230;
.loc 2 113 38
mul.f32 %f235, %f234, 0f40A00000;
.loc 2 113 22
fma.rn.f32 %f236, %f231, %f235, %f233;
.loc 2 108 21
sub.f32 %f237, %f121, %f232;
.loc 2 110 60
div.full.f32 %r182, %r147, %r163;
mov.b32 %f238, %r182;
.loc 2 112 17
fma.rn.f32 %f239, %f238, %f237, %f232;
.loc 2 113 15
add.f32 %f240, %f153, %f236;
.loc 2 113 30
mul.f32 %f241, %f237, %f237;
.loc 2 113 38
mul.f32 %f242, %f241, 0f40C00000;
.loc 2 113 22
fma.rn.f32 %f243, %f238, %f242, %f240;
.loc 2 108 21
sub.f32 %f244, %f122, %f239;
.loc 2 110 60
div.full.f32 %r185, %r147, %r166;
mov.b32 %f245, %r185;
.loc 2 112 17
fma.rn.f32 %f246, %f245, %f244, %f239;
.loc 2 113 15
add.f32 %f247, %f154, %f243;
.loc 2 113 30
mul.f32 %f248, %f244, %f244;
.loc 2 113 38
mul.f32 %f249, %f248, 0f40E00000;
.loc 2 113 22
fma.rn.f32 %f250, %f245, %f249, %f247;
$L__tmp3:
.loc 2 120 46
mov.b32 %r284, %f198;
shfl.sync.bfly.b32 %r285, %r284, 16, 31, -1;
mov.b32 %f251, %r285;
mov.b32 %r286, %f202;
shfl.sync.bfly.b32 %r287, %r286, 16, 31, -1;
mov.b32 %f252, %r287;
shfl.sync.bfly.b32 %r189, %r166, 16, 31, -1;
mov.b32 %f253, %r189;
$L__tmp4:
.loc 2 108 21
sub.f32 %f254, %f251, %f198;
.loc 2 109 28
add.f32 %f255, %f253, 0f41000000;
.loc 2 110 39
setp.eq.f32 %p105, %f255, 0f00000000;
.loc 2 110 60
mov.b32 %r190, %f255;
div.full.f32 %r188, %r189, %r190;
mov.b32 %f256, %r188;
.loc 2 110 49
selp.f32 %f257, 0f00000000, %f256, %p105;
.loc 2 112 17
fma.rn.f32 %f258, %f257, %f254, %f198;
.loc 2 113 15
add.f32 %f259, %f202, %f252;
.loc 2 113 30
mul.f32 %f260, %f254, %f254;
.loc 2 113 38
mul.f32 %f261, %f260, 0f41000000;
.loc 2 113 22
fma.rn.f32 %f262, %f257, %f261, %f259;
$L__tmp5:
.loc 2 120 46
mov.b32 %r288, %f258;
shfl.sync.bfly.b32 %r289, %r288, 8, 31, -1;
mov.b32 %f263, %r289;
mov.b32 %r290, %f262;
shfl.sync.bfly.b32 %r291, %r290, 8, 31, -1;
mov.b32 %f264, %r291;
shfl.sync.bfly.b32 %r192, %r190, 8, 31, -1;
mov.b32 %f265, %r192;
$L__tmp6:
.loc 2 108 21
sub.f32 %f266, %f263, %f258;
.loc 2 109 28
add.f32 %f267, %f255, %f265;
.loc 2 110 39
setp.eq.f32 %p106, %f267, 0f00000000;
.loc 2 110 60
mov.b32 %r193, %f267;
div.full.f32 %r191, %r192, %r193;
mov.b32 %f268, %r191;
.loc 2 110 49
selp.f32 %f269, 0f00000000, %f268, %p106;
.loc 2 112 17
fma.rn.f32 %f270, %f269, %f266, %f258;
.loc 2 113 15
add.f32 %f271, %f262, %f264;
.loc 2 113 30
mul.f32 %f272, %f266, %f266;
.loc 2 113 38
mul.f32 %f273, %f255, %f272;
.loc 2 113 22
fma.rn.f32 %f274, %f269, %f273, %f271;
$L__tmp7:
.loc 2 120 46
mov.b32 %r292, %f270;
shfl.sync.bfly.b32 %r293, %r292, 4, 31, -1;
mov.b32 %f275, %r293;
mov.b32 %r294, %f274;
shfl.sync.bfly.b32 %r295, %r294, 4, 31, -1;
mov.b32 %f276, %r295;
shfl.sync.bfly.b32 %r195, %r193, 4, 31, -1;
mov.b32 %f277, %r195;
$L__tmp8:
.loc 2 108 21
sub.f32 %f278, %f275, %f270;
.loc 2 109 28
add.f32 %f279, %f267, %f277;
.loc 2 110 39
setp.eq.f32 %p107, %f279, 0f00000000;
.loc 2 110 60
mov.b32 %r196, %f279;
div.full.f32 %r194, %r195, %r196;
mov.b32 %f280, %r194;
.loc 2 110 49
selp.f32 %f281, 0f00000000, %f280, %p107;
.loc 2 112 17
fma.rn.f32 %f282, %f281, %f278, %f270;
.loc 2 113 15
add.f32 %f283, %f274, %f276;
.loc 2 113 30
mul.f32 %f284, %f278, %f278;
.loc 2 113 38
mul.f32 %f285, %f267, %f284;
.loc 2 113 22
fma.rn.f32 %f286, %f281, %f285, %f283;
$L__tmp9:
.loc 2 120 46
mov.b32 %r296, %f282;
shfl.sync.bfly.b32 %r297, %r296, 2, 31, -1;
mov.b32 %f287, %r297;
mov.b32 %r298, %f286;
shfl.sync.bfly.b32 %r299, %r298, 2, 31, -1;
mov.b32 %f288, %r299;
shfl.sync.bfly.b32 %r198, %r196, 2, 31, -1;
mov.b32 %f289, %r198;
$L__tmp10:
.loc 2 108 21
sub.f32 %f290, %f287, %f282;
.loc 2 109 28
add.f32 %f33, %f279, %f289;
.loc 2 110 39
setp.eq.f32 %p108, %f33, 0f00000000;
.loc 2 110 60
mov.b32 %r199, %f33;
div.full.f32 %r197, %r198, %r199;
mov.b32 %f291, %r197;
.loc 2 110 49
selp.f32 %f292, 0f00000000, %f291, %p108;
.loc 2 112 17
fma.rn.f32 %f34, %f290, %f292, %f282;
.loc 2 113 15
add.f32 %f293, %f286, %f288;
.loc 2 113 30
mul.f32 %f294, %f290, %f290;
.loc 2 113 38
mul.f32 %f295, %f279, %f294;
.loc 2 113 22
fma.rn.f32 %f35, %f292, %f295, %f293;
$L__tmp11:
.loc 2 120 46
mov.b32 %r300, %f34;
shfl.sync.bfly.b32 %r3, %r300, 1, 31, -1;
mov.b32 %r301, %f35;
shfl.sync.bfly.b32 %r4, %r301, 1, 31, -1;
shfl.sync.bfly.b32 %r201, %r199, 1, 31, -1;
mov.b32 %f296, %r201;
$L__tmp12:
.loc 2 109 28
add.f32 %f36, %f33, %f296;
.loc 2 110 60
mov.b32 %r202, %f36;
div.full.f32 %r200, %r201, %r202;
mov.b32 %f37, %r200;
$L__tmp13:
.loc 2 120 46
mov.b32 %r302, %f246;
shfl.sync.bfly.b32 %r303, %r302, 16, 31, -1;
mov.b32 %f297, %r303;
mov.b32 %r304, %f250;
shfl.sync.bfly.b32 %r305, %r304, 16, 31, -1;
mov.b32 %f298, %r305;
shfl.sync.bfly.b32 %r204, %r166, 16, 31, -1;
mov.b32 %f299, %r204;
$L__tmp14:
.loc 2 108 21
sub.f32 %f300, %f297, %f246;
.loc 2 109 28
add.f32 %f301, %f299, 0f41000000;
.loc 2 110 39
setp.eq.f32 %p109, %f301, 0f00000000;
.loc 2 110 60
mov.b32 %r205, %f301;
div.full.f32 %r203, %r204, %r205;
mov.b32 %f302, %r203;
.loc 2 110 49
selp.f32 %f303, 0f00000000, %f302, %p109;
.loc 2 112 17
fma.rn.f32 %f304, %f300, %f303, %f246;
.loc 2 113 15
add.f32 %f305, %f250, %f298;
.loc 2 113 30
mul.f32 %f306, %f300, %f300;
.loc 2 113 38
mul.f32 %f307, %f306, 0f41000000;
.loc 2 113 22
fma.rn.f32 %f308, %f307, %f303, %f305;
$L__tmp15:
.loc 2 120 46
mov.b32 %r306, %f304;
shfl.sync.bfly.b32 %r307, %r306, 8, 31, -1;
mov.b32 %f309, %r307;
mov.b32 %r308, %f308;
shfl.sync.bfly.b32 %r309, %r308, 8, 31, -1;
mov.b32 %f310, %r309;
shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1;
mov.b32 %f311, %r207;
$L__tmp16:
.loc 2 108 21
sub.f32 %f312, %f309, %f304;
.loc 2 109 28
add.f32 %f313, %f301, %f311;
.loc 2 110 39
setp.eq.f32 %p110, %f313, 0f00000000;
.loc 2 110 60
mov.b32 %r208, %f313;
div.full.f32 %r206, %r207, %r208;
mov.b32 %f314, %r206;
.loc 2 110 49
selp.f32 %f315, 0f00000000, %f314, %p110;
.loc 2 112 17
fma.rn.f32 %f316, %f312, %f315, %f304;
.loc 2 113 15
add.f32 %f317, %f308, %f310;
.loc 2 113 30
mul.f32 %f318, %f312, %f312;
.loc 2 113 38
mul.f32 %f319, %f301, %f318;
.loc 2 113 22
fma.rn.f32 %f320, %f315, %f319, %f317;
$L__tmp17:
.loc 2 120 46
mov.b32 %r310, %f316;
shfl.sync.bfly.b32 %r311, %r310, 4, 31, -1;
mov.b32 %f321, %r311;
mov.b32 %r312, %f320;
shfl.sync.bfly.b32 %r313, %r312, 4, 31, -1;
mov.b32 %f322, %r313;
shfl.sync.bfly.b32 %r210, %r208, 4, 31, -1;
mov.b32 %f323, %r210;
$L__tmp18:
.loc 2 108 21
sub.f32 %f324, %f321, %f316;
.loc 2 109 28
add.f32 %f325, %f313, %f323;
.loc 2 110 39
setp.eq.f32 %p111, %f325, 0f00000000;
.loc 2 110 60
mov.b32 %r211, %f325;
div.full.f32 %r209, %r210, %r211;
mov.b32 %f326, %r209;
.loc 2 110 49
selp.f32 %f327, 0f00000000, %f326, %p111;
.loc 2 112 17
fma.rn.f32 %f328, %f324, %f327, %f316;
.loc 2 113 15
add.f32 %f329, %f320, %f322;
.loc 2 113 30
mul.f32 %f330, %f324, %f324;
.loc 2 113 38
mul.f32 %f331, %f313, %f330;
.loc 2 113 22
fma.rn.f32 %f332, %f327, %f331, %f329;
$L__tmp19:
.loc 2 120 46
mov.b32 %r314, %f328;
shfl.sync.bfly.b32 %r315, %r314, 2, 31, -1;
mov.b32 %f333, %r315;
mov.b32 %r316, %f332;
shfl.sync.bfly.b32 %r317, %r316, 2, 31, -1;
mov.b32 %f334, %r317;
shfl.sync.bfly.b32 %r213, %r211, 2, 31, -1;
mov.b32 %f335, %r213;
$L__tmp20:
.loc 2 108 21
sub.f32 %f336, %f333, %f328;
.loc 2 109 28
add.f32 %f38, %f325, %f335;
.loc 2 110 39
setp.eq.f32 %p112, %f38, 0f00000000;
.loc 2 110 60
mov.b32 %r214, %f38;
div.full.f32 %r212, %r213, %r214;
mov.b32 %f337, %r212;
.loc 2 110 49
selp.f32 %f338, 0f00000000, %f337, %p112;
.loc 2 112 17
fma.rn.f32 %f39, %f336, %f338, %f328;
.loc 2 113 15
add.f32 %f339, %f332, %f334;
.loc 2 113 30
mul.f32 %f340, %f336, %f336;
.loc 2 113 38
mul.f32 %f341, %f325, %f340;
.loc 2 113 22
fma.rn.f32 %f40, %f338, %f341, %f339;
$L__tmp21:
.loc 2 120 46
mov.b32 %r318, %f39;
shfl.sync.bfly.b32 %r5, %r318, 1, 31, -1;
mov.b32 %r319, %f40;
shfl.sync.bfly.b32 %r6, %r319, 1, 31, -1;
shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1;
mov.b32 %f342, %r216;
$L__tmp22:
.loc 2 109 28
add.f32 %f41, %f38, %f342;
.loc 2 110 60
mov.b32 %r217, %f41;
div.full.f32 %r215, %r216, %r217;
mov.b32 %f42, %r215;
$L__tmp23:
.loc 1 62 51
mov.u32 %r218, 0x0;
mov.u32 %r219, 0x0;
mov.u32 %r220, 0x0;
mov.u32 %r221, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r218, %r219, %r220, %r221 }, [ %rd89 + 0 ];
@!%p113 mov.u32 %r218, %r325;
@!%p113 mov.u32 %r219, %r325;
@!%p113 mov.u32 %r220, %r325;
@!%p113 mov.u32 %r221, %r325;
mov.u32 %r226, 0x0;
mov.u32 %r227, 0x0;
mov.u32 %r228, 0x0;
mov.u32 %r229, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r226, %r227, %r228, %r229 }, [ %rd90 + 0 ];
@!%p113 mov.u32 %r226, %r325;
@!%p113 mov.u32 %r227, %r325;
@!%p113 mov.u32 %r228, %r325;
@!%p113 mov.u32 %r229, %r325;
mov.u32 %r234, 0x0;
mov.u32 %r235, 0x0;
mov.u32 %r236, 0x0;
mov.u32 %r237, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r234, %r235, %r236, %r237 }, [ %rd91 + 0 ];
@!%p113 mov.u32 %r234, %r325;
@!%p113 mov.u32 %r235, %r325;
@!%p113 mov.u32 %r236, %r325;
@!%p113 mov.u32 %r237, %r325;
mov.u32 %r242, 0x0;
mov.u32 %r243, 0x0;
mov.u32 %r244, 0x0;
mov.u32 %r245, 0x0;
@%p113 ld.global.L1::evict_last.v4.b32 { %r242, %r243, %r244, %r245 }, [ %rd92 + 0 ];
@!%p113 mov.u32 %r242, %r325;
@!%p113 mov.u32 %r243, %r325;
@!%p113 mov.u32 %r244, %r325;
@!%p113 mov.u32 %r245, %r325;
.loc 1 63 51
mov.u32 %r250, 0x0;
mov.u32 %r251, 0x0;
mov.u32 %r252, 0x0;
mov.u32 %r253, 0x0;
@%p113 ld.global.L1::evict_first.v4.b32 { %r250, %r251, %r252, %r253 }, [ %rd93 + 0 ];
@!%p113 mov.u32 %r250, %r325;
@!%p113 mov.u32 %r251, %r325;
@!%p113 mov.u32 %r252, %r325;
@!%p113 mov.u32 %r253, %r325;
cvt.u16.u32 %rs17, %r250;
{ .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r250; }
cvt.u16.u32 %rs19, %r251;
{ .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r251; }
cvt.u16.u32 %rs21, %r252;
{ .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r252; }
cvt.u16.u32 %rs23, %r253;
{ .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r253; }
mov.u32 %r258, 0x0;
mov.u32 %r259, 0x0;
mov.u32 %r260, 0x0;
mov.u32 %r261, 0x0;
@%p113 ld.global.L1::evict_first.v4.b32 { %r258, %r259, %r260, %r261 }, [ %rd94 + 0 ];
@!%p113 mov.u32 %r258, %r325;
@!%p113 mov.u32 %r259, %r325;
@!%p113 mov.u32 %r260, %r325;
@!%p113 mov.u32 %r261, %r325;
cvt.u16.u32 %rs25, %r258;
{ .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r258; }
cvt.u16.u32 %rs27, %r259;
{ .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r259; }
cvt.u16.u32 %rs29, %r260;
{ .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r260; }
cvt.u16.u32 %rs31, %r261;
{ .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r261; }
.loc 1 63 103
cvt.f32.bf16 %r266, %rs17;
mov.b32 %f43, %r266;
cvt.f32.bf16 %r267, %rs18;
mov.b32 %f44, %r267;
cvt.f32.bf16 %r268, %rs19;
mov.b32 %f45, %r268;
cvt.f32.bf16 %r269, %rs20;
mov.b32 %f46, %r269;
cvt.f32.bf16 %r270, %rs21;
mov.b32 %f47, %r270;
cvt.f32.bf16 %r271, %rs22;
mov.b32 %f48, %r271;
cvt.f32.bf16 %r272, %rs23;
mov.b32 %f49, %r272;
cvt.f32.bf16 %r273, %rs24;
mov.b32 %f50, %r273;
cvt.f32.bf16 %r274, %rs25;
mov.b32 %f51, %r274;
cvt.f32.bf16 %r275, %rs26;
mov.b32 %f52, %r275;
cvt.f32.bf16 %r276, %rs27;
mov.b32 %f53, %r276;
cvt.f32.bf16 %r277, %rs28;
mov.b32 %f54, %r277;
cvt.f32.bf16 %r278, %rs29;
mov.b32 %f55, %r278;
cvt.f32.bf16 %r279, %rs30;
mov.b32 %f56, %r279;
cvt.f32.bf16 %r280, %rs31;
mov.b32 %f57, %r280;
cvt.f32.bf16 %r281, %rs32;
mov.b32 %f58, %r281;
.loc 1 64 35
mul.wide.u32 %rd107, %r2, 4;
add.s64 %rd95, %rd17, %rd107;
.loc 1 64 40
mov.u32 %r282, 0x0;
@%p113 ld.global.L1::evict_last.b32 { %r282 }, [ %rd95 + 0 ];
@!%p113 mov.u32 %r282, %r325;
.loc 1 68 57
@%p49 bra $L__BB0_4;
mov.u64 %rd108, assertMessage_1;
cvta.global.u64 %rd109, %rd108;
mov.u64 %rd110, assertFile_1;
cvta.global.u64 %rd111, %rd110;
mov.u64 %rd112, assertFunc_1;
cvta.global.u64 %rd113, %rd112;
{ // callseq 9, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd109;
.param .b64 param1;
st.param.b64 [param1+0], %rd111;
.param .b32 param2;
st.param.b32 [param2+0], %r438;
.param .b64 param3;
st.param.b64 [param3+0], %rd113;
.param .b64 param4;
st.param.b64 [param4+0], %rd123;
call.uni
__assertfail,
(
param0,
param1,
param2,
param3,
param4
);
} // callseq 9
$L__BB0_4:
$L__tmp24:
.loc 2 120 46
mov.b32 %f343, %r6;
$L__tmp25:
.loc 2 113 15
add.f32 %f344, %f40, %f343;
$L__tmp26:
.loc 2 120 46
mov.b32 %f345, %r5;
$L__tmp27:
.loc 2 108 21
sub.f32 %f346, %f345, %f39;
.loc 2 113 30
mul.f32 %f347, %f346, %f346;
.loc 2 113 38
mul.f32 %f348, %f38, %f347;
.loc 2 110 39
setp.eq.f32 %p135, %f41, 0f00000000;
.loc 2 110 49
selp.f32 %f349, 0f00000000, %f42, %p135;
.loc 2 113 22
fma.rn.f32 %f350, %f349, %f348, %f344;
$L__tmp28:
.loc 2 120 46
mov.b32 %f351, %r4;
$L__tmp29:
.loc 2 113 15
add.f32 %f352, %f35, %f351;
$L__tmp30:
.loc 2 120 46
mov.b32 %f353, %r3;
$L__tmp31:
.loc 2 108 21
sub.f32 %f354, %f353, %f34;
.loc 2 113 30
mul.f32 %f355, %f354, %f354;
.loc 2 113 38
mul.f32 %f356, %f33, %f355;
.loc 2 110 39
setp.eq.f32 %p136, %f36, 0f00000000;
.loc 2 110 49
selp.f32 %f357, 0f00000000, %f37, %p136;
.loc 2 113 22
fma.rn.f32 %f358, %f357, %f356, %f352;
$L__tmp32:
.loc 1 69 54
mov.u32 %r321, 0x0;
mov.u32 %r322, 0x0;
mov.u32 %r323, 0x0;
mov.u32 %r324, 0x0;
@%p113 ld.global.L1::evict_first.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd115 + 0 ];
@!%p113 mov.u32 %r321, %r325;
@!%p113 mov.u32 %r322, %r325;
@!%p113 mov.u32 %r323, %r325;
@!%p113 mov.u32 %r324, %r325;
mov.u32 %r329, 0x0;
mov.u32 %r330, 0x0;
mov.u32 %r331, 0x0;
mov.u32 %r332, 0x0;
@%p113 ld.global.L1::evict_first.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd116 + 0 ];
@!%p113 mov.u32 %r329, %r325;
@!%p113 mov.u32 %r330, %r325;
@!%p113 mov.u32 %r331, %r325;
@!%p113 mov.u32 %r332, %r325;
mov.u32 %r337, 0x0;
mov.u32 %r338, 0x0;
mov.u32 %r339, 0x0;
mov.u32 %r340, 0x0;
@%p113 ld.global.L1::evict_first.v4.b32 { %r337, %r338, %r339, %r340 }, [ %rd117 + 0 ];
@!%p113 mov.u32 %r337, %r325;
@!%p113 mov.u32 %r338, %r325;
@!%p113 mov.u32 %r339, %r325;
@!%p113 mov.u32 %r340, %r325;
mov.u32 %r345, 0x0;
mov.u32 %r346, 0x0;
mov.u32 %r347, 0x0;
mov.u32 %r348, 0x0;
@%p113 ld.global.L1::evict_first.v4.b32 { %r345, %r346, %r347, %r348 }, [ %rd118 + 0 ];
@!%p113 mov.u32 %r345, %r325;
@!%p113 mov.u32 %r346, %r325;
@!%p113 mov.u32 %r347, %r325;
@!%p113 mov.u32 %r348, %r325;
.loc 1 75 24
mov.b32 %r354, %f358;
mov.b32 %r355, 1132462080;
div.full.f32 %r353, %r354, %r355;
mov.b32 %f359, %r353;
mov.b32 %r378, %f350;
div.full.f32 %r377, %r378, %r355;
mov.b32 %f360, %r377;
.loc 1 77 24
add.f32 %f361, %f359, 0f3727C5AC;
add.f32 %f362, %f360, 0f3727C5AC;
.loc 1 78 30
rsqrt.approx.ftz.f32 %f363, %f361;
rsqrt.approx.ftz.f32 %f364, %f362;
.loc 1 69 54
mov.b32 %f365, %r348;
.loc 1 62 51
mov.b32 %f366, %r245;
.loc 1 70 24
add.f32 %f367, %f366, %f365;
.loc 1 72 24
add.f32 %f368, %f58, %f367;
$L__tmp33:
.loc 2 112 17
fma.rn.f32 %f369, %f346, %f349, %f39;
$L__tmp34:
.loc 1 73 24
sub.f32 %f370, %f368, %f369;
.loc 1 69 54
mov.b32 %f371, %r347;
.loc 1 62 51
mov.b32 %f372, %r244;
.loc 1 70 24
add.f32 %f373, %f372, %f371;
.loc 1 72 24
add.f32 %f374, %f57, %f373;
.loc 1 73 24
sub.f32 %f375, %f374, %f369;
.loc 1 69 54
mov.b32 %f376, %r346;
.loc 1 62 51
mov.b32 %f377, %r243;
.loc 1 70 24
add.f32 %f378, %f377, %f376;
.loc 1 72 24
add.f32 %f379, %f56, %f378;
.loc 1 73 24
sub.f32 %f380, %f379, %f369;
.loc 1 69 54
mov.b32 %f381, %r345;
.loc 1 62 51
mov.b32 %f382, %r242;
.loc 1 70 24
add.f32 %f383, %f382, %f381;
.loc 1 72 24
add.f32 %f384, %f55, %f383;
.loc 1 73 24
sub.f32 %f385, %f384, %f369;
.loc 1 69 54
mov.b32 %f386, %r340;
.loc 1 62 51
mov.b32 %f387, %r237;
.loc 1 70 24
add.f32 %f388, %f387, %f386;
.loc 1 72 24
add.f32 %f389, %f54, %f388;
.loc 1 73 24
sub.f32 %f390, %f389, %f369;
.loc 1 69 54
mov.b32 %f391, %r339;
.loc 1 62 51
mov.b32 %f392, %r236;
.loc 1 70 24
add.f32 %f393, %f392, %f391;
.loc 1 72 24
add.f32 %f394, %f53, %f393;
.loc 1 73 24
sub.f32 %f395, %f394, %f369;
.loc 1 69 54
mov.b32 %f396, %r338;
.loc 1 62 51
mov.b32 %f397, %r235;
.loc 1 70 24
add.f32 %f398, %f397, %f396;
.loc 1 72 24
add.f32 %f399, %f52, %f398;
.loc 1 73 24
sub.f32 %f400, %f399, %f369;
.loc 1 69 54
mov.b32 %f401, %r337;
.loc 1 62 51
mov.b32 %f402, %r234;
.loc 1 70 24
add.f32 %f403, %f402, %f401;
.loc 1 72 24
add.f32 %f404, %f51, %f403;
.loc 1 73 24
sub.f32 %f405, %f404, %f369;
.loc 1 69 54
mov.b32 %f406, %r332;
.loc 1 62 51
mov.b32 %f407, %r229;
.loc 1 70 24
add.f32 %f408, %f407, %f406;
.loc 1 72 24
add.f32 %f409, %f50, %f408;
$L__tmp35:
.loc 2 112 17
fma.rn.f32 %f410, %f354, %f357, %f34;
$L__tmp36:
.loc 1 73 24
sub.f32 %f411, %f409, %f410;
.loc 1 69 54
mov.b32 %f412, %r331;
.loc 1 62 51
mov.b32 %f413, %r228;
.loc 1 70 24
add.f32 %f414, %f413, %f412;
.loc 1 72 24
add.f32 %f415, %f49, %f414;
.loc 1 73 24
sub.f32 %f416, %f415, %f410;
.loc 1 69 54
mov.b32 %f417, %r330;
.loc 1 62 51
mov.b32 %f418, %r227;
.loc 1 70 24
add.f32 %f419, %f418, %f417;
.loc 1 72 24
add.f32 %f420, %f48, %f419;
.loc 1 73 24
sub.f32 %f421, %f420, %f410;
.loc 1 69 54
mov.b32 %f422, %r329;
.loc 1 62 51
mov.b32 %f423, %r226;
.loc 1 70 24
add.f32 %f424, %f423, %f422;
.loc 1 72 24
add.f32 %f425, %f47, %f424;
.loc 1 73 24
sub.f32 %f426, %f425, %f410;
.loc 1 69 54
mov.b32 %f427, %r324;
.loc 1 62 51
mov.b32 %f428, %r221;
.loc 1 70 24
add.f32 %f429, %f428, %f427;
.loc 1 72 24
add.f32 %f430, %f46, %f429;
.loc 1 73 24
sub.f32 %f431, %f430, %f410;
.loc 1 69 54
mov.b32 %f432, %r323;
.loc 1 62 51
mov.b32 %f433, %r220;
.loc 1 70 24
add.f32 %f434, %f433, %f432;
.loc 1 72 24
add.f32 %f435, %f45, %f434;
.loc 1 73 24
sub.f32 %f436, %f435, %f410;
.loc 1 69 54
mov.b32 %f437, %r322;
.loc 1 62 51
mov.b32 %f438, %r219;
.loc 1 70 24
add.f32 %f439, %f438, %f437;
.loc 1 72 24
add.f32 %f440, %f44, %f439;
.loc 1 73 24
sub.f32 %f441, %f440, %f410;
.loc 1 69 54
mov.b32 %f442, %r321;
.loc 1 62 51
mov.b32 %f443, %r218;
.loc 1 70 24
add.f32 %f444, %f443, %f442;
.loc 1 72 24
add.f32 %f445, %f43, %f444;
.loc 1 73 24
sub.f32 %f446, %f445, %f410;
.loc 1 79 24
mul.f32 %f447, %f446, %f363;
mul.f32 %f448, %f441, %f363;
mul.f32 %f449, %f436, %f363;
mul.f32 %f450, %f431, %f363;
mul.f32 %f451, %f426, %f363;
mul.f32 %f452, %f421, %f363;
mul.f32 %f453, %f416, %f363;
mul.f32 %f454, %f411, %f363;
mul.f32 %f455, %f405, %f364;
mul.f32 %f456, %f400, %f364;
mul.f32 %f457, %f395, %f364;
mul.f32 %f458, %f390, %f364;
mul.f32 %f459, %f385, %f364;
mul.f32 %f460, %f380, %f364;
mul.f32 %f461, %f375, %f364;
mul.f32 %f462, %f370, %f364;
.loc 1 80 24
shl.b32 %r425, %r2, 2;
mov.u32 %r426, global_smem;
add.s32 %r427, %r426, %r425;
st.shared.u32 [%r427], %r282;
bar.sync 0;
shl.b32 %r428, %r1, 2;
add.s32 %r429, %r426, %r428;
ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r429];
ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r429+16];
mul.f32 %f471, %f447, %f463;
mul.f32 %f472, %f448, %f464;
mul.f32 %f473, %f449, %f465;
mul.f32 %f474, %f450, %f466;
mul.f32 %f475, %f451, %f467;
mul.f32 %f476, %f452, %f468;
mul.f32 %f477, %f453, %f469;
mul.f32 %f478, %f454, %f470;
mul.f32 %f479, %f455, %f463;
mul.f32 %f480, %f456, %f464;
mul.f32 %f481, %f457, %f465;
mul.f32 %f482, %f458, %f466;
mul.f32 %f483, %f459, %f467;
mul.f32 %f484, %f460, %f468;
mul.f32 %f485, %f461, %f469;
mul.f32 %f486, %f462, %f470;
.loc 1 82 29
shl.b64 %rd121, %rd7, 1;
add.s64 %rd119, %rd18, %rd121;
shl.b64 %rd122, %rd9, 1;
add.s64 %rd120, %rd18, %rd122;
.loc 1 82 52
mov.b32 %r401, %f471;
cvt.rn.bf16.f32 %rs33, %r401;
mov.b32 %r402, %f472;
cvt.rn.bf16.f32 %rs34, %r402;
mov.b32 %r403, %f473;
cvt.rn.bf16.f32 %rs35, %r403;
mov.b32 %r404, %f474;
cvt.rn.bf16.f32 %rs36, %r404;
mov.b32 %r405, %f475;
cvt.rn.bf16.f32 %rs37, %r405;
mov.b32 %r406, %f476;
cvt.rn.bf16.f32 %rs38, %r406;
mov.b32 %r407, %f477;
cvt.rn.bf16.f32 %rs39, %r407;
mov.b32 %r408, %f478;
cvt.rn.bf16.f32 %rs40, %r408;
mov.b32 %r409, %f479;
cvt.rn.bf16.f32 %rs41, %r409;
mov.b32 %r410, %f480;
cvt.rn.bf16.f32 %rs42, %r410;
mov.b32 %r411, %f481;
cvt.rn.bf16.f32 %rs43, %r411;
mov.b32 %r412, %f482;
cvt.rn.bf16.f32 %rs44, %r412;
mov.b32 %r413, %f483;
cvt.rn.bf16.f32 %rs45, %r413;
mov.b32 %r414, %f484;
cvt.rn.bf16.f32 %rs46, %r414;
mov.b32 %r415, %f485;
cvt.rn.bf16.f32 %rs47, %r415;
mov.b32 %r416, %f486;
cvt.rn.bf16.f32 %rs48, %r416;
mov.b32 %r430, {%rs33, %rs34};
mov.b32 %r431, {%rs35, %rs36};
mov.b32 %r432, {%rs37, %rs38};
mov.b32 %r433, {%rs39, %rs40};
@%p113 st.global.v4.b32 [ %rd119 + 0 ], { %r430, %r431, %r432, %r433 };
mov.b32 %r434, {%rs41, %rs42};
mov.b32 %r435, {%rs43, %rs44};
mov.b32 %r436, {%rs45, %rs46};
mov.b32 %r437, {%rs47, %rs48};
@%p113 st.global.v4.b32 [ %rd120 + 0 ], { %r434, %r435, %r436, %r437 };
.loc 1 58 4
ret;
$L__tmp37:
$L__func_end0:
}
// .globl __nv_rsqrtf
// __nv_rsqrtf: single-precision reciprocal square root helper.
//   In : __nv_rsqrtf_param_0 (.b32) - operand, loaded as f32
//   Out: func_retval0 (.b32)        - result of rsqrt.approx.ftz.f32 on the operand
// The approximate, flush-to-zero form of rsqrt is used; no other work is done.
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
.param .b32 __nv_rsqrtf_param_0
)
{
.reg .f32 %arg;
.reg .f32 %res;
$L__func_begin1:
// Fetch the operand from parameter space.
ld.param.f32 %arg, [__nv_rsqrtf_param_0];
// res = 1 / sqrt(arg), approximate with subnormals flushed to zero.
rsqrt.approx.ftz.f32 %res, %arg;
// Publish the result through the return-value parameter.
st.param.f32 [func_retval0+0], %res;
ret;
$L__func_end1:
}
.file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
// DWARF abbreviation table used by the DIEs in .debug_info.
// Each entry is: abbreviation code, tag, has-children flag, then
// (attribute, form) pairs terminated by a 0, 0 pair. A final lone 0 ends the table.
// Names below are the standard DWARF constants for the encoded numbers.
.section .debug_abbrev
{
// Abbrev 1: DW_TAG_compile_unit (17), has children (1)
.b8 1
.b8 17
.b8 1
// DW_AT_producer (37), DW_FORM_string (8)
.b8 37
.b8 8
// DW_AT_language (19), DW_FORM_data2 (5)
.b8 19
.b8 5
// DW_AT_name (3), DW_FORM_string (8)
.b8 3
.b8 8
// DW_AT_stmt_list (16), DW_FORM_data4 (6)
.b8 16
.b8 6
// DW_AT_comp_dir (27), DW_FORM_string (8)
.b8 27
.b8 8
// ULEB128 bytes 180, 66 = 0x2134 (DW_AT_GNU_pubnames), DW_FORM_flag (12)
.b8 180
.b8 66
.b8 12
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// End of abbrev 1 attribute list
.b8 0
.b8 0
// Abbrev 2: DW_TAG_subprogram (46), no children (0)
.b8 2
.b8 46
.b8 0
// ULEB128 bytes 135, 64 = 0x2007 (DW_AT_MIPS_linkage_name), DW_FORM_string (8)
.b8 135
.b8 64
.b8 8
// DW_AT_name (3), DW_FORM_string (8)
.b8 3
.b8 8
// DW_AT_decl_file (58), DW_FORM_data1 (11)
.b8 58
.b8 11
// DW_AT_decl_line (59), DW_FORM_data1 (11)
.b8 59
.b8 11
// DW_AT_external (63), DW_FORM_flag (12)
.b8 63
.b8 12
// DW_AT_inline (32), DW_FORM_data1 (11)
.b8 32
.b8 11
// End of abbrev 2 attribute list
.b8 0
.b8 0
// Abbrev 3: DW_TAG_subprogram (46), has children (1)
.b8 3
.b8 46
.b8 1
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_frame_base (64), DW_FORM_block1 (10)
.b8 64
.b8 10
// DW_AT_abstract_origin (49), DW_FORM_ref4 (19)
.b8 49
.b8 19
// End of abbrev 3 attribute list
.b8 0
.b8 0
// Abbrev 4: DW_TAG_inlined_subroutine (29), no children (0)
.b8 4
.b8 29
.b8 0
// DW_AT_abstract_origin (49), DW_FORM_ref4 (19)
.b8 49
.b8 19
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_call_file (88), DW_FORM_data1 (11)
.b8 88
.b8 11
// DW_AT_call_line (89), DW_FORM_data1 (11)
.b8 89
.b8 11
// DW_AT_call_column (87), DW_FORM_data1 (11)
.b8 87
.b8 11
// End of abbrev 4 attribute list
.b8 0
.b8 0
// Abbrev 5: DW_TAG_inlined_subroutine (29), has children (1); same attributes as abbrev 4
.b8 5
.b8 29
.b8 1
// DW_AT_abstract_origin (49), DW_FORM_ref4 (19)
.b8 49
.b8 19
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_call_file (88), DW_FORM_data1 (11)
.b8 88
.b8 11
// DW_AT_call_line (89), DW_FORM_data1 (11)
.b8 89
.b8 11
// DW_AT_call_column (87), DW_FORM_data1 (11)
.b8 87
.b8 11
// End of abbrev 5 attribute list
.b8 0
.b8 0
// End of abbreviation table
.b8 0
}
// DWARF compilation-unit data. Values are laid out according to the abbreviations
// in .debug_abbrev; the byte offsets quoted in comments are relative to the start
// of this section. Do not insert or remove bytes without updating the unit length
// (302) and the DIE offset (125) that other fields and .debug_pubnames refer to.
.section .debug_info
{
// Unit header: length of the unit after this field (302 bytes; 306 including it)
.b32 302
// DWARF version 2 (16-bit, little-endian)
.b8 2
.b8 0
// Offset of this unit's abbreviations in .debug_abbrev
.b32 .debug_abbrev
// Address size in bytes
.b8 8
// DIE at offset 11, abbrev 1 (compile unit)
.b8 1
// DW_AT_producer: "triton"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// DW_AT_language: 2 (DW_LANG_C)
.b8 2
.b8 0
// DW_AT_name: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
.b8 99
.b8 99
.b8 105
.b8 103
.b8 54
.b8 102
.b8 107
.b8 105
.b8 54
.b8 112
.b8 52
.b8 108
.b8 120
.b8 114
.b8 100
.b8 109
.b8 103
.b8 103
.b8 54
.b8 101
.b8 117
.b8 100
.b8 97
.b8 104
.b8 105
.b8 101
.b8 120
.b8 99
.b8 118
.b8 117
.b8 101
.b8 101
.b8 111
.b8 108
.b8 50
.b8 112
.b8 52
.b8 113
.b8 112
.b8 53
.b8 51
.b8 50
.b8 112
.b8 118
.b8 118
.b8 101
.b8 50
.b8 121
.b8 52
.b8 54
.b8 51
.b8 121
.b8 46
.b8 112
.b8 121
.b8 0
// DW_AT_stmt_list: offset into .debug_line
.b32 .debug_line
// DW_AT_comp_dir: "/tmp/torchinductor_root/ci"
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 99
.b8 105
.b8 0
// DW_AT_GNU_pubnames: true
.b8 1
// DW_AT_low_pc / DW_AT_high_pc: code range of the kernel
.b64 $L__func_begin0
.b64 $L__func_end0
// DIE at offset 125, abbrev 2 (abstract subprogram; target of the ref4 value 125 below)
.b8 2
// DW_AT_MIPS_linkage_name: "triton__0d1d2d3d4d5d6de7de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
// DW_AT_name: "triton__0d1d2d3d4d5d6de7de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
// DW_AT_decl_file = 1, DW_AT_decl_line = 18, DW_AT_external = true, DW_AT_inline = 1 (DW_INL_inlined)
.b8 1
.b8 18
.b8 1
.b8 1
// DIE abbrev 3 (concrete subprogram, has children)
.b8 3
// DW_AT_low_pc / DW_AT_high_pc
.b64 $L__func_begin0
.b64 $L__func_end0
// DW_AT_frame_base: 1-byte block containing 156 (0x9c, DW_OP_call_frame_cfa)
.b8 1
.b8 156
// DW_AT_abstract_origin: DIE at offset 125
.b32 125
// DIE abbrev 4 (inlined subroutine): origin 125, range $L__tmp1..$L__tmp2,
// call_file 2, call_line 47, call_column 41
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp2
.b8 2
.b8 47
.b8 41
// DIE abbrev 5 (inlined subroutine, has children): origin 125, range $L__tmp2..$L__tmp36,
// call_file 2, call_line 53, call_column 44
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp36
.b8 2
.b8 53
.b8 44
// Child DIE abbrev 4: origin 125, range $L__tmp2..$L__tmp36,
// call_file 2, call_line 120, call_column 46
.b8 4
.b32 125
.b64 $L__tmp2
.b64 $L__tmp36
.b8 2
.b8 120
.b8 46
// End of children of the abbrev-5 DIE
.b8 0
// DIE abbrev 4 (inlined subroutine): origin 125, range $L__tmp3..$L__tmp31,
// call_file 2, call_line 53, call_column 44
.b8 4
.b32 125
.b64 $L__tmp3
.b64 $L__tmp31
.b8 2
.b8 53
.b8 44
// End of children of the concrete subprogram DIE
.b8 0
// End of children of the compile unit DIE
.b8 0
}
// DWARF public-names lookup table: maps a global name to the offset of its DIE
// inside the compilation unit in .debug_info.
.section .debug_pubnames
{
// Length of this table, computed from the two local labels
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// Table version 2 (16-bit, little-endian)
.b8 2
.b8 0
// Offset of the described compilation unit in .debug_info
.b32 .debug_info
// Size of that compilation unit in bytes (302-byte unit length + 4-byte length field)
.b32 306
// Entry: DIE offset 125 within the unit
.b32 125
// Entry name: "triton__0d1d2d3d4d5d6de7de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
// Zero offset terminates the entry list
.b32 0
$L__pubNames_end0:
}
// DWARF public-types lookup table. It carries only a header and the terminator,
// i.e. no type entries are listed.
.section .debug_pubtypes
{
// Length of this table, computed from the two local labels
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// Table version 2 (16-bit, little-endian)
.b8 2
.b8 0
// Offset of the described compilation unit in .debug_info
.b32 .debug_info
// Size of that compilation unit in bytes
.b32 306
// Zero offset terminates the (empty) entry list
.b32 0
$L__pubTypes_end0:
}
// Location-list section, emitted with an empty body.
.section .debug_loc { }