//
// Generated by LLVM NVPTX Back-End
//
// This is compiler output; the register numbering, the $L__tmp* labels and the
// .loc directives are referenced by the debug sections at the end of the file,
// so the instruction stream is kept exactly as generated and only comments are
// added.  ".loc 1 ..." refers to .file 1 (the generated Python kernel) and
// ".loc 2 ..." to .file 2 (triton_helpers.py); both are declared near the end.
//

.version 8.2
.target sm_89
.address_size 64

// .globl triton__0d1d2d3d4d5d6de7de

// External assertion handler.  The kernel calls it with
// (message, file, b32 value, function, b64 value) when an index check fails.
.extern .func __assertfail
(
    .param .b64 __assertfail_param_0,
    .param .b64 __assertfail_param_1,
    .param .b32 __assertfail_param_2,
    .param .b64 __assertfail_param_3,
    .param .b64 __assertfail_param_4
)
;

// Byte arrays below are ASCII text (no terminating NUL inside the arrays).
// assertFunc_1: "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
// assertFile_1: "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
// assertMessage_1: "index out of bounds: 0 <= tmp16 < 50257"
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
// assertFunc_0: "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
// assertFile_0: "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
// assertMessage_0: "index out of bounds: 0 <= tmp3 < 50257"
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
// Shared-memory scratch; the kernel below touches byte offsets 0..23 of it.
.extern .shared .align 1 .b8 global_smem[];
// "__CUDA_FTZ" followed by NUL; not referenced by any instruction in this file.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};

// ---------------------------------------------------------------------------
// Kernel triton__0d1d2d3d4d5d6de7de            (.maxntid 64, 1, 1)
//
// What the instruction stream does, with row = %ctaid.x and
// col = (%tid.x * 4) & 252 (each thread handles 4 consecutive elements):
//
//   idx = 64-bit integer loaded from param_0 + row*8
//   idx is wrapped (idx < 0 ? idx + 50257 : idx) and checked against
//   [0, 50257); on failure __assertfail is called (twice in the body, with
//   assertMessage_0 and assertMessage_1 respectively).
//
//   x[k] = f32 at param_2 + ((row_mod*256 | col) + k)*4     row_mod: see note
//        + f32 at param_1 + ((wrap(idx)*256 | col) + k)*4   at ".loc 1 27 18"
//        + f32(bf16 at param_3 + ((row*256 | col) + k)*2)          k = 0..3
//
//   A (mean, M2, weight) triple is accumulated over the 4 per-thread values,
//   then merged across lanes with shfl.sync.bfly (xor 16, 8, 4, 2, 1) and
//   across the two warp slots through global_smem.
//
//   out[k] = bf16( (x[k] - mean) * rsqrt(M2/256.0 + 0f3727C5AC)
//                  * f32 at param_4 + (col + k)*4 )
//   stored at param_5 + ((row*256 | col) + k)*2
//
// param_6 and param_7 are declared but never loaded in this body.
// NOTE(review): the reduction covers 256 values only if the launch uses the
// full 64 threads allowed by .maxntid -- confirm against the launcher.
// ---------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5d6de7de(
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
    .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
    .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
)
.maxntid 64, 1, 1
{
    .reg .pred %p<59>;
    .reg .b16 %rs<13>;
    .reg .b32 %r<176>;
    .reg .f32 %f<169>;
    .reg .b64 %rd<58>;
    .loc 1 18 0
$L__func_begin0:
    .loc 1 18 0
    ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
    ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
    ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
$L__tmp0:
    .loc 1 24 33
    // %r1 = tid.x, %r2 = tid.x & 31 (lane within the warp),
    // %r3 = bit 5 of tid.x (selects one of two shared-memory slots),
    // %r30 = tid.x * 4, %r4 = col = (tid.x * 4) & 252.
    mov.u32 %r1, %tid.x;
    and.b32 %r2, %r1, 31;
    ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
    ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
    bfe.u32 %r3, %r1, 5, 1;
    shl.b32 %r30, %r1, 2;
    and.b32 %r4, %r30, 252;
    .loc 1 21 28
    mov.u32 %r13, %ctaid.x;
    .loc 1 26 30
    // %rd11 = param_0 + row * 8
    mul.wide.s32 %rd25, %r13, 8;
    add.s64 %rd11, %rd22, %rd25;
    // %p53 is set to true here and never reassigned, so every "@%p53" access
    // below executes and every "@!%p53" fallback move is skipped.
    mov.pred %p53, -1;
    .loc 1 26 35
    // The same 64-bit value is loaded five times from %rd11; only %rd10 and
    // %rd18 are read later in this body.
    mov.u64 %rd10, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
    mov.u64 %rd12, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
    mov.u64 %rd14, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
    mov.u64 %rd16, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
    mov.u64 %rd18, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd11 + 0 ];
    .loc 1 27 18
    // %r35 = row - ((row + bias) & 0x00FFFE00), bias = 511 when row < 0.
    // This equals the signed remainder row % 512 as long as bits 24..31 of
    // (row + bias) are zero; the mask 16776704 clears them.
    // NOTE(review): presumably the compiler proved row < 2^24 -- confirm.
    shr.s32 %r31, %r13, 31;
    shr.u32 %r32, %r31, 23;
    add.s32 %r33, %r13, %r32;
    and.b32 %r34, %r33, 16776704;
    sub.s32 %r35, %r13, %r34;
    .loc 1 35 44
    shl.b32 %r36, %r35, 8;
    .loc 1 35 40
    or.b32 %r37, %r36, %r4;
    .loc 1 35 34
    // %rd37 = param_2 + (%r35*256 | col) * 4
    mul.wide.s32 %rd26, %r37, 4;
    add.s64 %rd37, %rd23, %rd26;
    mov.b32 %r151, 0;
    .loc 1 35 50
    // First read of the 4 f32 values from param_2 (used for the statistics).
    mov.u32 %r14, 0x0;
    mov.u32 %r15, 0x0;
    mov.u32 %r16, 0x0;
    mov.u32 %r17, 0x0;
    @%p53 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd37 + 0 ];
    @!%p53 mov.u32 %r14, %r151;
    @!%p53 mov.u32 %r15, %r151;
    @!%p53 mov.u32 %r16, %r151;
    @!%p53 mov.u32 %r17, %r151;
    // Note the naming: element 0 lands in %f2 and element 1 in %f1.
    mov.b32 %f2, %r14;
    mov.b32 %f1, %r15;
    mov.b32 %f3, %r16;
    mov.b32 %f4, %r17;
    .loc 1 36 44
    shl.b32 %r38, %r13, 8;
    .loc 1 36 40
    // %r39 = row*256 | col; reused later for the output address.
    or.b32 %r39, %r38, %r4;
    .loc 1 36 34
    // %rd38 = param_3 + %r39 * 2 (2-byte elements)
    mul.wide.s32 %rd27, %r39, 2;
    add.s64 %rd38, %rd24, %rd27;
    .loc 1 36 50
    // Two 32-bit words = four packed 16-bit values, split into %rs1..%rs4.
    mov.u32 %r22, 0x0;
    mov.u32 %r23, 0x0;
    @%p53 ld.global.L1::evict_last.v2.b32 { %r22, %r23 }, [ %rd38 + 0 ];
    @!%p53 mov.u32 %r22, %r151;
    @!%p53 mov.u32 %r23, %r151;
    cvt.u16.u32 %rs1, %r22;
    { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r22; }
    cvt.u16.u32 %rs3, %r23;
    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r23; }
    .loc 1 36 101
    // bf16 -> f32
    cvt.f32.bf16 %r26, %rs1;
    mov.b32 %f5, %r26;
    cvt.f32.bf16 %r27, %rs2;
    mov.b32 %f6, %r27;
    cvt.f32.bf16 %r28, %rs3;
    mov.b32 %f7, %r28;
    cvt.f32.bf16 %r29, %rs4;
    mov.b32 %f8, %r29;
    .loc 1 37 22
    // Index check: %rd5 = idx < 0 ? idx + 50257 : idx;
    // %p15 = (unsigned) %rd5 < 50257.  %p15 is reused by the second check.
    add.s64 %rd28, %rd18, 50257;
    .loc 1 38 22
    setp.lt.s64 %p14, %rd18, 0;
    .loc 1 39 36
    selp.b64 %rd5, %rd28, %rd18, %p14;
    .loc 1 40 40
    setp.lt.u64 %p15, %rd5, 50257;
    // %r175 (883) and %rd57 (1) are the b32 / trailing b64 arguments passed to
    // both __assertfail calls.
    mov.b32 %r175, 883;
    mov.u64 %rd57, 1;
    .loc 1 40 55
    @%p15 bra $L__BB0_2;
    // Out-of-range path: __assertfail(assertMessage_0, assertFile_0, 883,
    // assertFunc_0, 1); execution then continues at $L__BB0_2.
    mov.u64 %rd29, assertMessage_0;
    cvta.global.u64 %rd30, %rd29;
    mov.u64 %rd31, assertFile_0;
    cvta.global.u64 %rd32, %rd31;
    mov.u64 %rd33, assertFunc_0;
    cvta.global.u64 %rd34, %rd33;
    { // callseq 0, 0
    .reg .b32 temp_param_reg;
    .param .b64 param0;
    st.param.b64 [param0+0], %rd30;
    .param .b64 param1;
    st.param.b64 [param1+0], %rd32;
    .param .b32 param2;
    st.param.b32 [param2+0], %r175;
    .param .b64 param3;
    st.param.b64 [param3+0], %rd34;
    .param .b64 param4;
    st.param.b64 [param4+0], %rd57;
    call.uni __assertfail, ( param0, param1, param2, param3, param4 );
    } // callseq 0
$L__BB0_2:
    .loc 1 0 55
    ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
    cvt.s64.s32 %rd3, %r39;
    .loc 1 38 22
    setp.lt.s64 %p44, %rd10, 0;
    .loc 1 41 44
    // Gather offset: idx*256, plus 12865792 (= 50257*256) when idx < 0,
    // i.e. the same negative-index wrap applied in units of 256 elements.
    shl.b64 %rd40, %rd10, 8;
    add.s64 %rd41, %rd40, 12865792;
    selp.b64 %rd42, %rd41, %rd40, %p44;
    cvt.u64.u32 %rd43, %r4;
    .loc 1 41 40
    or.b64 %rd44, %rd42, %rd43;
    .loc 1 41 34
    // %rd54 = param_1 + (wrapped_idx*256 | col) * 4
    shl.b64 %rd45, %rd44, 2;
    add.s64 %rd54, %rd7, %rd45;
    .loc 1 41 52
    mov.u32 %r41, 0x0;
    mov.u32 %r42, 0x0;
    mov.u32 %r43, 0x0;
    mov.u32 %r44, 0x0;
    @%p53 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd54 + 0 ];
    @!%p53 mov.u32 %r41, %r151;
    @!%p53 mov.u32 %r42, %r151;
    @!%p53 mov.u32 %r43, %r151;
    @!%p53 mov.u32 %r44, %r151;
    // x[k] = (param_2 value + gathered param_1 value) + converted bf16 value.
    // Results: x[0] = %f25, x[1] = %f26, x[2] = %f19, x[3] = %f20.
    mov.b32 %f15, %r43;
    mov.b32 %f16, %r44;
    .loc 1 42 22
    add.f32 %f17, %f3, %f15;
    add.f32 %f18, %f4, %f16;
    .loc 1 44 22
    add.f32 %f19, %f7, %f17;
    add.f32 %f20, %f8, %f18;
    .loc 1 41 52
    mov.b32 %f21, %r41;
    mov.b32 %f22, %r42;
    .loc 1 42 22
    add.f32 %f23, %f1, %f22;
    add.f32 %f24, %f2, %f21;
    .loc 1 44 22
    add.f32 %f25, %f5, %f24;
    add.f32 %f26, %f6, %f23;
$L__tmp1:
    .loc 2 98 22
    // Per-element initialisation: mean = x + 0, M2 = x * (x - mean) + 0.
    add.f32 %f27, %f26, 0f00000000;
    add.f32 %f28, %f25, 0f00000000;
    add.f32 %f29, %f19, 0f00000000;
    add.f32 %f30, %f20, 0f00000000;
    .loc 2 101 30
    sub.f32 %f31, %f25, %f28;
    sub.f32 %f32, %f26, %f27;
    sub.f32 %f33, %f19, %f29;
    sub.f32 %f34, %f20, %f30;
    .loc 2 101 13
    fma.rn.f32 %f35, %f25, %f31, 0f00000000;
    fma.rn.f32 %f36, %f26, %f32, 0f00000000;
    fma.rn.f32 %f37, %f19, %f33, 0f00000000;
    fma.rn.f32 %f38, %f20, %f34, 0f00000000;
$L__tmp2:
    // In-thread merges.  Each merge step in this kernel has the form
    //   delta = mean_b - mean_a
    //   r     = w_b / (w_a + w_b)
    //   mean  = mean_a + r * delta
    //   M2    = M2_a + M2_b + r * (w_a * delta^2)
    // Merge x[1] into x[0]: weights 1 and 1, r = 1.0 / 2.0
    // (1065353216 = 0x3F800000 = 1.0f, 1073741824 = 0x40000000 = 2.0f).
    .loc 2 108 21
    sub.f32 %f39, %f27, %f28;
    mov.b32 %r50, 1065353216;
    mov.b32 %r51, 1073741824;
    .loc 2 110 60
    div.full.f32 %r49, %r50, %r51;
    mov.b32 %f40, %r49;
    .loc 2 112 17
    fma.rn.f32 %f41, %f40, %f39, %f28;
    .loc 2 113 15
    add.f32 %f42, %f35, %f36;
    .loc 2 113 30
    mul.f32 %f43, %f39, %f39;
    .loc 2 113 22
    fma.rn.f32 %f44, %f40, %f43, %f42;
    // Merge x[2]: weights 2 and 1, r = 1.0 / 3.0 (1077936128 = 0x40400000);
    // the "* 2" is expressed as delta*delta + delta^2.
    .loc 2 108 21
    sub.f32 %f45, %f29, %f41;
    mov.b32 %r54, 1077936128;
    .loc 2 110 60
    div.full.f32 %r52, %r50, %r54;
    mov.b32 %f46, %r52;
    .loc 2 112 17
    fma.rn.f32 %f47, %f46, %f45, %f41;
    .loc 2 113 15
    add.f32 %f48, %f37, %f44;
    .loc 2 113 30
    mul.f32 %f49, %f45, %f45;
    .loc 2 113 38
    fma.rn.f32 %f50, %f45, %f45, %f49;
    .loc 2 113 22
    fma.rn.f32 %f51, %f46, %f50, %f48;
    // Merge x[3]: weights 3 and 1, r = 1.0 / 4.0 (1082130432 = 0x40800000),
    // delta^2 scaled by 3.0 (0f40400000).
    .loc 2 108 21
    sub.f32 %f52, %f30, %f47;
    mov.b32 %r57, 1082130432;
    .loc 2 110 60
    div.full.f32 %r55, %r50, %r57;
    mov.b32 %f53, %r55;
    .loc 2 112 17
    fma.rn.f32 %f54, %f53, %f52, %f47;
    .loc 2 113 15
    add.f32 %f55, %f38, %f51;
    .loc 2 113 30
    mul.f32 %f56, %f52, %f52;
    .loc 2 113 38
    mul.f32 %f57, %f56, 0f40400000;
    .loc 2 113 22
    fma.rn.f32 %f58, %f53, %f57, %f55;
$L__tmp3:
    .loc 2 120 46
    // Per-thread state is now mean = %f54, M2 = %f58, weight = 4.0 (%r57).
    // Exchange all three with the lane whose id differs in bit 4 (xor 16);
    // member mask -1 means all 32 lanes participate.
    mov.b32 %r118, %f54;
    shfl.sync.bfly.b32 %r119, %r118, 16, 31, -1;
    mov.b32 %f59, %r119;
    mov.b32 %r120, %f58;
    shfl.sync.bfly.b32 %r121, %r120, 16, 31, -1;
    mov.b32 %f60, %r121;
    shfl.sync.bfly.b32 %r59, %r57, 16, 31, -1;
    mov.b32 %f61, %r59;
$L__tmp4:
    // Merge with the partner's triple.  From here on the ratio is forced to
    // 0 when the combined weight compares equal to 0.
    .loc 2 108 21
    sub.f32 %f62, %f59, %f54;
    .loc 2 109 28
    add.f32 %f63, %f61, 0f40800000;
    .loc 2 110 39
    setp.eq.f32 %p45, %f63, 0f00000000;
    .loc 2 110 60
    mov.b32 %r60, %f63;
    div.full.f32 %r58, %r59, %r60;
    mov.b32 %f64, %r58;
    .loc 2 110 49
    selp.f32 %f65, 0f00000000, %f64, %p45;
    .loc 2 112 17
    fma.rn.f32 %f66, %f65, %f62, %f54;
    .loc 2 113 15
    add.f32 %f67, %f58, %f60;
    .loc 2 113 30
    mul.f32 %f68, %f62, %f62;
    .loc 2 113 38
    mul.f32 %f69, %f68, 0f40800000;
    .loc 2 113 22
    fma.rn.f32 %f70, %f65, %f69, %f67;
$L__tmp5:
    .loc 2 120 46
    // Same exchange and merge with lane xor 8.
    mov.b32 %r122, %f66;
    shfl.sync.bfly.b32 %r123, %r122, 8, 31, -1;
    mov.b32 %f71, %r123;
    mov.b32 %r124, %f70;
    shfl.sync.bfly.b32 %r125, %r124, 8, 31, -1;
    mov.b32 %f72, %r125;
    shfl.sync.bfly.b32 %r62, %r60, 8, 31, -1;
    mov.b32 %f73, %r62;
$L__tmp6:
    .loc 2 108 21
    sub.f32 %f74, %f71, %f66;
    .loc 2 109 28
    add.f32 %f75, %f63, %f73;
    .loc 2 110 39
    setp.eq.f32 %p46, %f75, 0f00000000;
    .loc 2 110 60
    mov.b32 %r63, %f75;
    div.full.f32 %r61, %r62, %r63;
    mov.b32 %f76, %r61;
    .loc 2 110 49
    selp.f32 %f77, 0f00000000, %f76, %p46;
    .loc 2 112 17
    fma.rn.f32 %f78, %f77, %f74, %f66;
    .loc 2 113 15
    add.f32 %f79, %f70, %f72;
    .loc 2 113 30
    mul.f32 %f80, %f74, %f74;
    .loc 2 113 38
    mul.f32 %f81, %f63, %f80;
    .loc 2 113 22
    fma.rn.f32 %f82, %f77, %f81, %f79;
$L__tmp7:
    .loc 2 120 46
    // Lane xor 4.
    mov.b32 %r126, %f78;
    shfl.sync.bfly.b32 %r127, %r126, 4, 31, -1;
    mov.b32 %f83, %r127;
    mov.b32 %r128, %f82;
    shfl.sync.bfly.b32 %r129, %r128, 4, 31, -1;
    mov.b32 %f84, %r129;
    shfl.sync.bfly.b32 %r65, %r63, 4, 31, -1;
    mov.b32 %f85, %r65;
$L__tmp8:
    .loc 2 108 21
    sub.f32 %f86, %f83, %f78;
    .loc 2 109 28
    add.f32 %f87, %f75, %f85;
    .loc 2 110 39
    setp.eq.f32 %p47, %f87, 0f00000000;
    .loc 2 110 60
    mov.b32 %r66, %f87;
    div.full.f32 %r64, %r65, %r66;
    mov.b32 %f88, %r64;
    .loc 2 110 49
    selp.f32 %f89, 0f00000000, %f88, %p47;
    .loc 2 112 17
    fma.rn.f32 %f90, %f89, %f86, %f78;
    .loc 2 113 15
    add.f32 %f91, %f82, %f84;
    .loc 2 113 30
    mul.f32 %f92, %f86, %f86;
    .loc 2 113 38
    mul.f32 %f93, %f75, %f92;
    .loc 2 113 22
    fma.rn.f32 %f94, %f89, %f93, %f91;
$L__tmp9:
    .loc 2 120 46
    // Lane xor 2.
    mov.b32 %r130, %f90;
    shfl.sync.bfly.b32 %r131, %r130, 2, 31, -1;
    mov.b32 %f95, %r131;
    mov.b32 %r132, %f94;
    shfl.sync.bfly.b32 %r133, %r132, 2, 31, -1;
    mov.b32 %f96, %r133;
    shfl.sync.bfly.b32 %r68, %r66, 2, 31, -1;
    mov.b32 %f97, %r68;
$L__tmp10:
    .loc 2 108 21
    sub.f32 %f98, %f95, %f90;
    .loc 2 109 28
    add.f32 %f99, %f87, %f97;
    .loc 2 110 39
    setp.eq.f32 %p48, %f99, 0f00000000;
    .loc 2 110 60
    mov.b32 %r69, %f99;
    div.full.f32 %r67, %r68, %r69;
    mov.b32 %f100, %r67;
    .loc 2 110 49
    selp.f32 %f101, 0f00000000, %f100, %p48;
    .loc 2 112 17
    fma.rn.f32 %f102, %f101, %f98, %f90;
    .loc 2 113 15
    add.f32 %f103, %f94, %f96;
    .loc 2 113 30
    mul.f32 %f104, %f98, %f98;
    .loc 2 113 38
    mul.f32 %f105, %f87, %f104;
    .loc 2 113 22
    fma.rn.f32 %f106, %f101, %f105, %f103;
$L__tmp11:
    .loc 2 120 46
    // Lane xor 1.
    mov.b32 %r134, %f102;
    shfl.sync.bfly.b32 %r135, %r134, 1, 31, -1;
    mov.b32 %f107, %r135;
    mov.b32 %r136, %f106;
    shfl.sync.bfly.b32 %r137, %r136, 1, 31, -1;
    mov.b32 %f108, %r137;
    shfl.sync.bfly.b32 %r71, %r69, 1, 31, -1;
    mov.b32 %f109, %r71;
$L__tmp12:
    .loc 2 108 21
    sub.f32 %f110, %f107, %f102;
    .loc 2 109 28
    add.f32 %f111, %f99, %f109;
    .loc 2 110 39
    setp.eq.f32 %p49, %f111, 0f00000000;
    .loc 2 110 60
    mov.b32 %r72, %f111;
    div.full.f32 %r70, %r71, %r72;
    mov.b32 %f112, %r70;
    .loc 2 110 49
    selp.f32 %f113, 0f00000000, %f112, %p49;
    .loc 2 112 17
    fma.rn.f32 %f114, %f113, %f110, %f102;
    .loc 2 113 15
    add.f32 %f115, %f106, %f108;
    .loc 2 113 30
    mul.f32 %f116, %f110, %f110;
    .loc 2 113 38
    mul.f32 %f117, %f99, %f116;
    .loc 2 113 22
    fma.rn.f32 %f118, %f113, %f117, %f115;
$L__tmp13:
    .loc 2 120 46
    // Threads with (tid.x & 31) == 0 publish their triple to shared memory.
    // With slot = bit 5 of tid.x:
    //   mean   -> global_smem +  0 + slot*4
    //   M2     -> global_smem +  8 + slot*4
    //   weight -> global_smem + 16 + slot*4
    setp.eq.s32 %p21, %r2, 0;
    shl.b32 %r138, %r3, 2;
    mov.u32 %r139, global_smem;
    add.s32 %r73, %r139, %r138;
    mov.b32 %r74, %f114;
    @%p21 st.shared.b32 [ %r73 + 0 ], %r74;
    add.s32 %r140, %r139, 8;
    add.s32 %r75, %r140, %r138;
    mov.b32 %r76, %f118;
    @%p21 st.shared.b32 [ %r75 + 0 ], %r76;
    add.s32 %r141, %r139, 16;
    add.s32 %r77, %r141, %r138;
    @%p21 st.shared.b32 [ %r77 + 0 ], %r72;
    bar.sync 0;
    // Threads with tid.x < 2 read back one slot each (byte offset tid.x*4).
    // The loads are predicated, but the shuffles and arithmetic that follow
    // are executed by every thread; only the predicated stores further down
    // make the result visible.
    setp.lt.s32 %p24, %r1, 2;
    add.s32 %r80, %r139, %r30;
    @%p24 ld.shared.b32 %r79, [ %r80 + 0 ];
    mov.b32 %f119, %r79;
    add.s32 %r82, %r140, %r30;
    @%p24 ld.shared.b32 %r81, [ %r82 + 0 ];
    mov.b32 %f120, %r81;
    add.s32 %r84, %r141, %r30;
    @%p24 ld.shared.b32 %r83, [ %r84 + 0 ];
    mov.b32 %f121, %r83;
    shfl.sync.bfly.b32 %r143, %r79, 1, 31, -1;
    mov.b32 %f122, %r143;
    shfl.sync.bfly.b32 %r144, %r81, 1, 31, -1;
    mov.b32 %f123, %r144;
    shfl.sync.bfly.b32 %r86, %r83, 1, 31, -1;
    mov.b32 %f124, %r86;
$L__tmp14:
    // Final merge of the two slots (same formula as above).
    .loc 2 108 21
    sub.f32 %f125, %f122, %f119;
    .loc 2 109 28
    add.f32 %f126, %f121, %f124;
    .loc 2 110 39
    setp.eq.f32 %p50, %f126, 0f00000000;
    .loc 2 110 60
    mov.b32 %r87, %f126;
    div.full.f32 %r85, %r86, %r87;
    mov.b32 %f127, %r85;
    .loc 2 110 49
    selp.f32 %f128, 0f00000000, %f127, %p50;
    .loc 2 112 17
    fma.rn.f32 %f129, %f125, %f128, %f119;
    .loc 2 113 15
    add.f32 %f130, %f120, %f123;
    .loc 2 113 30
    mul.f32 %f131, %f125, %f125;
    .loc 2 113 38
    mul.f32 %f132, %f121, %f131;
    .loc 2 113 22
    fma.rn.f32 %f133, %f132, %f128, %f130;
$L__tmp15:
    .loc 2 120 46
    // %p27 = (tid.x < 2) && (tid.x & 1) == 0, i.e. only tid.x == 0 writes the
    // merged triple back to offsets 0 / 8 / 16.
    and.b32 %r145, %r1, 1;
    setp.eq.b32 %p51, %r145, 1;
    not.pred %p52, %p51;
    and.pred %p27, %p24, %p52;
    mov.b32 %r89, %f129;
    @%p27 st.shared.b32 [ %r80 + 0 ], %r89;
    mov.b32 %r91, %f133;
    @%p27 st.shared.b32 [ %r82 + 0 ], %r91;
    @%p27 st.shared.b32 [ %r84 + 0 ], %r87;
    bar.sync 0;
    // Every thread picks up the final mean (%f9) and M2 (%f10).
    ld.shared.f32 %f9, [global_smem];
    ld.shared.f32 %f10, [global_smem+8];
$L__tmp16:
    .loc 1 62 51
    // Second pass: the three inputs are loaded again from the same addresses
    // (%rd37, %rd38, %rd54) instead of reusing the registers from above.
    mov.u32 %r94, 0x0;
    mov.u32 %r95, 0x0;
    mov.u32 %r96, 0x0;
    mov.u32 %r97, 0x0;
    @%p53 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd37 + 0 ];
    @!%p53 mov.u32 %r94, %r151;
    @!%p53 mov.u32 %r95, %r151;
    @!%p53 mov.u32 %r96, %r151;
    @!%p53 mov.u32 %r97, %r151;
    .loc 1 63 51
    mov.u32 %r102, 0x0;
    mov.u32 %r103, 0x0;
    @%p53 ld.global.L1::evict_first.v2.b32 { %r102, %r103 }, [ %rd38 + 0 ];
    @!%p53 mov.u32 %r102, %r151;
    @!%p53 mov.u32 %r103, %r151;
    cvt.u16.u32 %rs5, %r102;
    { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r102; }
    cvt.u16.u32 %rs7, %r103;
    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r103; }
    .loc 1 63 103
    cvt.f32.bf16 %r106, %rs5;
    mov.b32 %f11, %r106;
    cvt.f32.bf16 %r107, %rs6;
    mov.b32 %f12, %r107;
    cvt.f32.bf16 %r108, %rs7;
    mov.b32 %f13, %r108;
    cvt.f32.bf16 %r109, %rs8;
    mov.b32 %f14, %r109;
    .loc 1 64 35
    // %rd39 = param_4 + col * 4: four f32 multipliers, indexed by column only.
    mul.wide.u32 %rd46, %r4, 4;
    add.s64 %rd39, %rd8, %rd46;
    .loc 1 64 40
    mov.u32 %r110, 0x0;
    mov.u32 %r111, 0x0;
    mov.u32 %r112, 0x0;
    mov.u32 %r113, 0x0;
    @%p53 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd39 + 0 ];
    @!%p53 mov.u32 %r110, %r151;
    @!%p53 mov.u32 %r111, %r151;
    @!%p53 mov.u32 %r112, %r151;
    @!%p53 mov.u32 %r113, %r151;
    .loc 1 68 57
    // Second index check: reuses %p15 from the first check and differs only
    // in the message/file/function symbols (the *_1 set).
    @%p15 bra $L__BB0_4;
    mov.u64 %rd47, assertMessage_1;
    cvta.global.u64 %rd48, %rd47;
    mov.u64 %rd49, assertFile_1;
    cvta.global.u64 %rd50, %rd49;
    mov.u64 %rd51, assertFunc_1;
    cvta.global.u64 %rd52, %rd51;
    { // callseq 1, 0
    .reg .b32 temp_param_reg;
    .param .b64 param0;
    st.param.b64 [param0+0], %rd48;
    .param .b64 param1;
    st.param.b64 [param1+0], %rd50;
    .param .b32 param2;
    st.param.b32 [param2+0], %r175;
    .param .b64 param3;
    st.param.b64 [param3+0], %rd52;
    .param .b64 param4;
    st.param.b64 [param4+0], %rd57;
    call.uni __assertfail, ( param0, param1, param2, param3, param4 );
    } // callseq 1
$L__BB0_4:
    .loc 1 69 54
    mov.u32 %r147, 0x0;
    mov.u32 %r148, 0x0;
    mov.u32 %r149, 0x0;
    mov.u32 %r150, 0x0;
    @%p53 ld.global.L1::evict_first.v4.b32 { %r147, %r148, %r149, %r150 }, [ %rd54 + 0 ];
    @!%p53 mov.u32 %r147, %r151;
    @!%p53 mov.u32 %r148, %r151;
    @!%p53 mov.u32 %r149, %r151;
    @!%p53 mov.u32 %r150, %r151;
    .loc 1 75 24
    // %f134 = M2 / 256.0 (1132462080 = 0x43800000 = 256.0f)
    mov.b32 %r156, %f10;
    mov.b32 %r157, 1132462080;
    div.full.f32 %r155, %r156, %r157;
    mov.b32 %f134, %r155;
    .loc 1 77 24
    // + 0f3727C5AC (approximately 1e-5), then approximate reciprocal sqrt.
    add.f32 %f135, %f134, 0f3727C5AC;
    .loc 1 78 30
    rsqrt.approx.ftz.f32 %f136, %f135;
    // Per element (emitted for k = 3, 2, 1, 0 in that order):
    //   centred = ((param_2 value + gathered value) + bf16 value) - mean
    .loc 1 69 54
    mov.b32 %f137, %r150;
    .loc 1 62 51
    mov.b32 %f138, %r97;
    .loc 1 70 24
    add.f32 %f139, %f138, %f137;
    .loc 1 72 24
    add.f32 %f140, %f14, %f139;
    .loc 1 73 24
    sub.f32 %f141, %f140, %f9;
    .loc 1 69 54
    mov.b32 %f142, %r149;
    .loc 1 62 51
    mov.b32 %f143, %r96;
    .loc 1 70 24
    add.f32 %f144, %f143, %f142;
    .loc 1 72 24
    add.f32 %f145, %f13, %f144;
    .loc 1 73 24
    sub.f32 %f146, %f145, %f9;
    .loc 1 69 54
    mov.b32 %f147, %r148;
    .loc 1 62 51
    mov.b32 %f148, %r95;
    .loc 1 70 24
    add.f32 %f149, %f148, %f147;
    .loc 1 72 24
    add.f32 %f150, %f12, %f149;
    .loc 1 73 24
    sub.f32 %f151, %f150, %f9;
    .loc 1 69 54
    mov.b32 %f152, %r147;
    .loc 1 62 51
    mov.b32 %f153, %r94;
    .loc 1 70 24
    add.f32 %f154, %f153, %f152;
    .loc 1 72 24
    add.f32 %f155, %f11, %f154;
    .loc 1 73 24
    sub.f32 %f156, %f155, %f9;
    .loc 1 64 40
    mov.b32 %f157, %r110;
    mov.b32 %f158, %r111;
    mov.b32 %f159, %r112;
    mov.b32 %f160, %r113;
    .loc 1 79 24
    // centred * rsqrt(...)
    mul.f32 %f161, %f156, %f136;
    mul.f32 %f162, %f151, %f136;
    mul.f32 %f163, %f146, %f136;
    mul.f32 %f164, %f141, %f136;
    .loc 1 80 24
    // ... * param_4 multiplier
    mul.f32 %f165, %f161, %f157;
    mul.f32 %f166, %f162, %f158;
    mul.f32 %f167, %f163, %f159;
    mul.f32 %f168, %f164, %f160;
    .loc 1 82 29
    // %rd55 = param_5 + (row*256 | col) * 2
    shl.b64 %rd56, %rd3, 1;
    add.s64 %rd55, %rd9, %rd56;
    .loc 1 82 52
    // Round each result to bf16, pack pairs into 32-bit words, store 8 bytes.
    mov.b32 %r167, %f165;
    cvt.rn.bf16.f32 %rs9, %r167;
    mov.b32 %r168, %f166;
    cvt.rn.bf16.f32 %rs10, %r168;
    mov.b32 %r169, %f167;
    cvt.rn.bf16.f32 %rs11, %r169;
    mov.b32 %r170, %f168;
    cvt.rn.bf16.f32 %rs12, %r170;
    mov.b32 %r173, {%rs9, %rs10};
    mov.b32 %r174, {%rs11, %rs12};
    @%p53 st.global.v2.b32 [ %rd55 + 0 ], { %r173, %r174 };
    .loc 1 58 4
    ret;
$L__tmp17:
$L__func_end0:
}

// .globl __nv_rsqrtf
// Device function: returns rsqrt.approx.ftz.f32 of its single f32 argument.
// The kernel above does not call it; it issues the same instruction inline.
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
    .param .b32 __nv_rsqrtf_param_0
)
{
    .reg .f32 %f<3>;
$L__func_begin1:
    ld.param.f32 %f1, [__nv_rsqrtf_param_0];
    rsqrt.approx.ftz.f32 %f2, %f1;
    st.param.f32 [func_retval0+0], %f2;
    ret;
$L__func_end1:
}

// Source files referenced by the ".loc <file> <line> <column>" directives.
.file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"

// Debug sections follow.  They are raw bytes plus references to the labels
// defined above; the values are kept exactly as generated and are only
// regrouped onto shorter lines.
.section .debug_abbrev
{
.b8 1 .b8 17 .b8 1 .b8 37 .b8 8 .b8 19 .b8 5 .b8 3 .b8 8 .b8 16 .b8 6 .b8 27 .b8 8 .b8 180 .b8 66 .b8 12 .b8 17 .b8 1 .b8 18 .b8 1 .b8 0 .b8 0
.b8 2 .b8 46 .b8 0 .b8 135 .b8 64 .b8 8 .b8 3 .b8 8 .b8 58 .b8 11 .b8 59 .b8 11 .b8 63 .b8 12 .b8 32 .b8 11 .b8 0 .b8 0
.b8 3 .b8 46 .b8 1 .b8 17 .b8 1 .b8 18 .b8 1 .b8 64 .b8 10 .b8 49 .b8 19 .b8 0 .b8 0
.b8 4 .b8 29 .b8 0 .b8 49 .b8 19 .b8 17 .b8 1 .b8 18 .b8 1 .b8 88 .b8 11 .b8 89 .b8 11 .b8 87 .b8 11 .b8 0 .b8 0
.b8 5 .b8 29 .b8 1 .b8 49 .b8 19 .b8 17 .b8 1 .b8 18 .b8 1 .b8 88 .b8 11 .b8 89 .b8 11 .b8 87 .b8 11 .b8 0 .b8 0
.b8 0
}
.section .debug_info
{
.b32 302
.b8 2 .b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
// ASCII "triton", NUL-terminated
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0
.b8 2 .b8 0
// ASCII "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", NUL-terminated
.b8 99 .b8 99 .b8 105 .b8 103 .b8 54 .b8 102 .b8 107 .b8 105 .b8 54 .b8 112 .b8 52 .b8 108 .b8 120 .b8 114
.b8 100 .b8 109 .b8 103 .b8 103 .b8 54 .b8 101 .b8 117 .b8 100 .b8 97 .b8 104 .b8 105 .b8 101 .b8 120 .b8 99
.b8 118 .b8 117 .b8 101 .b8 101 .b8 111 .b8 108 .b8 50 .b8 112 .b8 52 .b8 113 .b8 112 .b8 53 .b8 51 .b8 50
.b8 112 .b8 118 .b8 118 .b8 101 .b8 50 .b8 121 .b8 52 .b8 54 .b8 51 .b8 121 .b8 46 .b8 112 .b8 121 .b8 0
.b32 .debug_line
// ASCII "/tmp/torchinductor_root/ci", NUL-terminated
.b8 47 .b8 116 .b8 109 .b8 112 .b8 47 .b8 116 .b8 111 .b8 114 .b8 99 .b8 104 .b8 105 .b8 110 .b8 100 .b8 117
.b8 99 .b8 116 .b8 111 .b8 114 .b8 95 .b8 114 .b8 111 .b8 111 .b8 116 .b8 47 .b8 99 .b8 105 .b8 0
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 2
// ASCII "triton__0d1d2d3d4d5d6de7de", NUL-terminated (appears twice)
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100
.b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100 .b8 54 .b8 100 .b8 101 .b8 55 .b8 100 .b8 101 .b8 0
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100
.b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100 .b8 54 .b8 100 .b8 101 .b8 55 .b8 100 .b8 101 .b8 0
.b8 1 .b8 18 .b8 1 .b8 1
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1 .b8 156
.b32 125
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp2
.b8 2 .b8 47 .b8 41
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp15
.b8 2 .b8 53 .b8 44
.b8 4
.b32 125
.b64 $L__tmp2
.b64 $L__tmp15
.b8 2 .b8 120 .b8 46
.b8 0
.b8 4
.b32 125
.b64 $L__tmp3
.b64 $L__tmp16
.b8 2 .b8 53 .b8 44
.b8 0
.b8 0
}
.section .debug_pubnames
{
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2 .b8 0
.b32 .debug_info
.b32 306
.b32 125
// ASCII "triton__0d1d2d3d4d5d6de7de", NUL-terminated
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100
.b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100 .b8 54 .b8 100 .b8 101 .b8 55 .b8 100 .b8 101 .b8 0
.b32 0
$L__pubNames_end0:
}
.section .debug_pubtypes
{
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2 .b8 0
.b32 .debug_info
.b32 306
.b32 0
$L__pubTypes_end0:
}
.section .debug_loc { }