//
// Generated by LLVM NVPTX Back-End
//
// Layout: one directive or instruction per line. A PTX `//` comment extends to
// the end of its line, so the line breaks in this file are significant: if
// lines are joined, the comments below would hide the code that follows them.
//
// Contents:
//   * triton__0d1d2d3d4d5d6de7de : kernel entry point (documented at its definition)
//   * __nv_rsqrtf                : small device function wrapping rsqrt.approx.ftz.f32
//   * DWARF debug sections referring to the two Python files named in `.file`
//

.version 8.2
.target sm_89
.address_size 64

	// .globl	triton__0d1d2d3d4d5d6de7de
// External assertion handler. The kernel calls it with
// (message, file, 883, function, 1) when an index check fails.
.extern .func __assertfail
(
	.param .b64 __assertfail_param_0,
	.param .b64 __assertfail_param_1,
	.param .b32 __assertfail_param_2,
	.param .b64 __assertfail_param_3,
	.param .b64 __assertfail_param_4
)
;
// ASCII, not NUL-terminated: "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
// ASCII: "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
// ASCII: "index out of bounds: 0 <= tmp16 < 50257"
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
// ASCII: "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
// ASCII: "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
// ASCII: "index out of bounds: 0 <= tmp3 < 50257"
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
// Dynamically sized shared memory; the kernel touches byte offsets 0..3071 of it.
.extern .shared .align 1 .b8 global_smem[];
// ASCII: "__CUDA_FTZ" + NUL. Not referenced by any instruction in this file.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};

//-----------------------------------------------------------------------------
// triton__0d1d2d3d4d5d6de7de
//
// Purpose (as established by the instruction stream below):
//   For each row, form x = P2[...] + P1[idx, :] + float(P3[row, :]) over 256
//   columns, compute the row mean and variance with Welford's online update,
//   then store bf16((x - mean) * rsqrt(var + 1e-5) * P4[col]) to P5.
//
// Parameters (byte strides are taken from the address arithmetic):
//   param_0  u64  base of 64-bit integer indices, one per row (8-byte stride)
//   param_1  u64  base of an f32 table gathered at idx*256 elements; a negative
//                 idx is wrapped by adding 50257 rows (12865792 = 50257*256)
//   param_2  u64  base of an f32 array addressed at (row mod 512)*256 + col
//   param_3  u64  base of a bf16 array addressed at row*256 + col
//   param_4  u64  base of an f32 vector addressed by col
//   param_5  u64  base of the bf16 output, addressed at row*256 + col
//   param_6  u32  declared but never loaded in this body
//   param_7  u32  declared but never loaded in this body
//
// Launch shape: .maxntid 128,1,1. Each CTA covers 64 rows (row = ctaid.x*64 +
// row-in-CTA); two adjacent lanes share a row and each handles 4 of every 8
// columns, so both loops run 32 iterations of 8 columns.
//
// Predicate %p1 is set to true once and never rewritten, so every `@%p1`
// load/store executes and every `@!%p1` fallback move is skipped.
//
// Block order: each loop's tail ($L__BB0_3 / $L__BB0_7) is emitted before its
// head ($L__BB0_1 / $L__BB0_5); control enters each loop via bra.uni to the head.
//-----------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5d6de7de(
	.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
	.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
	.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
	.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
	.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
	.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
	.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
	.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
)
.maxntid 128, 1, 1
{
	.reg .pred %p<48>;
	.reg .b16 %rs<13>;
	.reg .b32 %r<158>;
	.reg .f32 %f<164>;
	.reg .b64 %rd<73>;
	.loc 1 18 0
$L__func_begin0:
	.loc 1 18 0

	// Fetch the six pointer arguments.
	ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6de7de_param_5];
	ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_4];
	ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_3];
	ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6de7de_param_0];
	ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6de7de_param_1];
$L__tmp0:
	.loc 1 22 44
	// Decompose tid.x:
	//   %r1  = lane (tid & 31)          %r14 = warp id (bits 5..6)
	//   %r15 = bits 1..4 of tid         %r2  = warp*16 + %r15 : row within the CTA
	//   %r17 = tid & 63                 : second row mapping, used for the checked index
	mov.u32 %r13, %tid.x;
	and.b32 %r1, %r13, 31;
	ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6de7de_param_2];
	bfe.u32 %r14, %r13, 5, 2;
	bfe.u32 %r15, %r13, 1, 4;
	shl.b32 %r16, %r14, 4;
	or.b32 %r2, %r16, %r15;
	and.b32 %r17, %r13, 63;
	.loc 1 24 33
	// %r3 = (tid & 1) * 4 : first of this lane's 4 columns inside each 8-column step.
	shl.b32 %r18, %r13, 2;
	and.b32 %r3, %r18, 4;
	and.b32 %r4, %r13, 7;
	.loc 1 31 36
	shl.b32 %r5, %r14, 2;
	.loc 1 21 28
	mov.u32 %r11, %ctaid.x;
	.loc 1 21 33
	// %r19 = ctaid.x * 64 : first row of this CTA.
	shl.b32 %r19, %r11, 6;
	.loc 1 22 23
	or.b32 %r20, %r19, %r2;
	or.b32 %r21, %r19, %r17;
	.loc 1 26 30
	mul.wide.s32 %rd33, %r20, 8;
	add.s64 %rd21, %rd30, %rd33;
	mul.wide.s32 %rd34, %r21, 8;
	add.s64 %rd29, %rd30, %rd34;
	mov.pred %p1, -1;
	.loc 1 26 35
	// Index loads. %rd20 (row %r20) feeds the gather address; %rd28 (row %r21)
	// feeds the bounds check. %rd22/%rd24/%rd26 repeat the %rd20 load and are
	// never read afterwards.
	mov.u64 %rd20, 0x0;
	@%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
	mov.u64 %rd22, 0x0;
	@%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd21 + 0 ];
	mov.u64 %rd24, 0x0;
	@%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd21 + 0 ];
	mov.u64 %rd26, 0x0;
	@%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd21 + 0 ];
	mov.u64 %rd28, 0x0;
	@%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd29 + 0 ];
	.loc 1 27 18
	// %r26 = %r20 - ((%r20 + adj) & 0xFFFE00), adj = 511 when bit 25 of ctaid.x
	// is set, else 0. NOTE(review): this has the shape of a signed `row % 512`
	// lowering -- confirm against the Python source referenced by .loc.
	bfe.s32 %r22, %r11, 25, 1;
	shr.u32 %r23, %r22, 23;
	add.s32 %r24, %r20, %r23;
	and.b32 %r25, %r24, 16776704;
	sub.s32 %r26, %r20, %r25;
	.loc 1 35 44
	shl.b32 %r27, %r26, 8;
	.loc 1 37 22
	add.s64 %rd35, %rd28, 50257;
	.loc 1 38 22
	setp.lt.s64 %p6, %rd20, 0;
	setp.lt.s64 %p7, %rd28, 0;
	.loc 1 39 36
	// %rd1 = wrapped index used only by the bounds check (idx < 0 ? idx + 50257 : idx).
	selp.b64 %rd1, %rd35, %rd28, %p7;
	.loc 1 41 44
	// %rd38 = wrapped index * 256 (element offset of the gathered row).
	shl.b64 %rd36, %rd20, 8;
	add.s64 %rd37, %rd36, 12865792;
	selp.b64 %rd38, %rd37, %rd36, %p6;
	.loc 1 31 36
	// %rd2 = (tid & 1) * 16 bytes : this lane's 4-float slot within each 32-byte step.
	and.b32 %r28, %r13, 1;
	mul.wide.u32 %rd2, %r28, 16;
	shl.b64 %rd39, %rd38, 2;
	or.b64 %rd40, %rd2, %rd39;
	// %rd72 = param_1 + gathered-row byte offset (also the start pointer of pass 2).
	add.s64 %rd72, %rd31, %rd40;
	// %r6 = (ctaid.x*64 + row-in-CTA) * 256 + %r3 : element offset into param_3 / param_5.
	shl.b32 %r29, %r11, 14;
	shl.b32 %r30, %r14, 12;
	or.b32 %r31, %r29, %r30;
	shl.b32 %r32, %r15, 8;
	or.b32 %r33, %r31, %r32;
	or.b32 %r6, %r33, %r3;
	or.b32 %r34, %r27, %r3;
	mul.wide.s32 %rd41, %r34, 4;
	// %rd70 = param_2 + ((row mod 512)*256 + %r3) * 4.
	add.s64 %rd70, %rd32, %rd41;
	// Loop state for pass 1:
	//   %r156        column counter, starts at -8 and advances by 8
	//   %rd68/%rd69  running pointers into param_2 / param_1
	//   %f148..%f155 element counts, %f156..%f159 M2 sums, %f160..%f163 means
	mov.f32 %f148, 0f00000000;
	mov.b32 %r156, -8;
	mov.u64 %rd68, %rd70;
	mov.u64 %rd69, %rd72;
	mov.f32 %f149, %f148;
	mov.f32 %f150, %f148;
	mov.f32 %f151, %f148;
	mov.f32 %f152, %f148;
	mov.f32 %f153, %f148;
	mov.f32 %f154, %f148;
	mov.f32 %f155, %f148;
	mov.f32 %f156, %f148;
	mov.f32 %f157, %f148;
	mov.f32 %f158, %f148;
	mov.f32 %f159, %f148;
	mov.f32 %f160, %f148;
	mov.f32 %f161, %f148;
	mov.f32 %f162, %f148;
	mov.f32 %f163, %f148;
	bra.uni $L__BB0_1;
// ---- Pass 1, loop tail: consume the values loaded in $L__BB0_1 --------------
$L__BB0_3:
	.loc 1 0 0
	mov.b32 %f17, %r35;
	mov.b32 %f18, %r36;
	mov.b32 %f19, %r37;
	mov.b32 %f20, %r38;
	// Split each 32-bit word into its low/high bf16 halves and widen to f32.
	cvt.u16.u32 %rs1, %r43;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r43; }
	cvt.u16.u32 %rs3, %r44;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r44; }
	cvt.f32.bf16 %r47, %rs1;
	mov.b32 %f21, %r47;
	cvt.f32.bf16 %r48, %rs2;
	mov.b32 %f22, %r48;
	cvt.f32.bf16 %r49, %rs3;
	mov.b32 %f23, %r49;
	cvt.f32.bf16 %r50, %rs4;
	mov.b32 %f24, %r50;
	.loc 1 41 52
	// Gather 4 floats from the index-selected row of param_1.
	mov.u32 %r54, 0x0;
	mov.u32 %r55, 0x0;
	mov.u32 %r56, 0x0;
	mov.u32 %r57, 0x0;
	@%p1 ld.global.L1::evict_last.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd69 + 0 ];
	@!%p1 mov.u32 %r54, %r143;
	@!%p1 mov.u32 %r55, %r143;
	@!%p1 mov.u32 %r56, %r143;
	@!%p1 mov.u32 %r57, %r143;
	mov.b32 %f56, %r54;
	mov.b32 %f57, %r55;
	mov.b32 %f58, %r56;
	mov.b32 %f59, %r57;
	.loc 1 42 22
	add.f32 %f60, %f17, %f56;
	add.f32 %f61, %f18, %f57;
	add.f32 %f62, %f19, %f58;
	add.f32 %f63, %f20, %f59;
	.loc 1 44 22
	// x = param_2 value + gathered value + widened bf16 value.
	add.f32 %f64, %f21, %f60;
	add.f32 %f65, %f22, %f61;
	add.f32 %f66, %f23, %f62;
	add.f32 %f67, %f24, %f63;
$L__tmp1:
	.loc 2 96 20
	// Welford update, 4 independent accumulators: delta = x - mean.
	sub.f32 %f68, %f64, %f160;
	sub.f32 %f69, %f65, %f161;
	sub.f32 %f70, %f66, %f162;
	sub.f32 %f71, %f67, %f163;
	.loc 2 97 26
	// count += 1.0 (%f152..%f155 mirror %f148..%f151 and are the copies later
	// written to shared memory).
	add.f32 %f148, %f148, 0f3F800000;
	add.f32 %f149, %f149, 0f3F800000;
	add.f32 %f150, %f150, 0f3F800000;
	add.f32 %f151, %f151, 0f3F800000;
	add.f32 %f152, %f152, 0f3F800000;
	add.f32 %f153, %f153, 0f3F800000;
	add.f32 %f154, %f154, 0f3F800000;
	add.f32 %f155, %f155, 0f3F800000;
	.loc 2 98 30
	// mean += delta / count.
	mov.b32 %r63, %f68;
	mov.b32 %r64, %f148;
	div.full.f32 %r62, %r63, %r64;
	mov.b32 %f72, %r62;
	mov.b32 %r66, %f69;
	mov.b32 %r67, %f149;
	div.full.f32 %r65, %r66, %r67;
	mov.b32 %f73, %r65;
	mov.b32 %r69, %f70;
	mov.b32 %r70, %f150;
	div.full.f32 %r68, %r69, %r70;
	mov.b32 %f74, %r68;
	mov.b32 %r72, %f71;
	mov.b32 %r73, %f151;
	div.full.f32 %r71, %r72, %r73;
	mov.b32 %f75, %r71;
	.loc 2 98 22
	add.f32 %f160, %f160, %f72;
	add.f32 %f161, %f161, %f73;
	add.f32 %f162, %f162, %f74;
	add.f32 %f163, %f163, %f75;
	.loc 2 101 30
	sub.f32 %f76, %f64, %f160;
	sub.f32 %f77, %f65, %f161;
	sub.f32 %f78, %f66, %f162;
	sub.f32 %f79, %f67, %f163;
$L__tmp2:
	.loc 1 50 50
	// M2 += delta * (x - updated mean).
	fma.rn.f32 %f156, %f68, %f76, %f156;
	fma.rn.f32 %f157, %f69, %f77, %f157;
	fma.rn.f32 %f158, %f70, %f78, %f158;
	fma.rn.f32 %f159, %f71, %f79, %f159;
	.loc 1 31 36
	// Advance 8 columns (32 bytes of f32); loop while the unsigned counter < 248,
	// i.e. 32 iterations in total.
	add.s32 %r156, %r156, 8;
	add.s64 %rd69, %rd69, 32;
	add.s64 %rd68, %rd68, 32;
	setp.lt.u32 %p22, %r156, 248;
	@%p22 bra $L__BB0_1;
	bra.uni $L__BB0_4;
// ---- Pass 1, loop head: issue loads and check the index ---------------------
$L__BB0_1:
	.loc 1 40 40
	// %p16 = wrapped index is in [0, 50257) (unsigned compare). It is recomputed
	// every iteration from the loop-invariant %rd1 and reused by pass 2.
	setp.lt.u64 %p16, %rd1, 50257;
	// %r143 = 0 is the fallback value named by all @!%p1 moves, including pass 2.
	mov.b32 %r143, 0;
	.loc 1 35 50
	mov.u32 %r35, 0x0;
	mov.u32 %r36, 0x0;
	mov.u32 %r37, 0x0;
	mov.u32 %r38, 0x0;
	@%p1 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd68 + 0 ];
	@!%p1 mov.u32 %r35, %r143;
	@!%p1 mov.u32 %r36, %r143;
	@!%p1 mov.u32 %r37, %r143;
	@!%p1 mov.u32 %r38, %r143;
	.loc 1 36 34
	// Element offset into param_3: %r6 + current column (counter + 8), 2 bytes each.
	add.s32 %r51, %r6, %r156;
	add.s32 %r52, %r51, 8;
	mul.wide.s32 %rd44, %r52, 2;
	add.s64 %rd43, %rd17, %rd44;
	.loc 1 36 50
	mov.u32 %r43, 0x0;
	mov.u32 %r44, 0x0;
	@%p1 ld.global.L1::evict_last.v2.b32 { %r43, %r44 }, [ %rd43 + 0 ];
	@!%p1 mov.u32 %r43, %r143;
	@!%p1 mov.u32 %r44, %r143;
	// Arguments 2 and 4 of __assertfail (883 and 1); also reused by pass 2.
	mov.b32 %r155, 883;
	mov.u64 %rd67, 1;
	.loc 1 40 55
	@%p16 bra $L__BB0_3;
	// Index out of range: report via __assertfail, then continue with the loop tail.
	mov.u64 %rd45, assertMessage_0;
	cvta.global.u64 %rd46, %rd45;
	mov.u64 %rd47, assertFile_0;
	cvta.global.u64 %rd48, %rd47;
	mov.u64 %rd49, assertFunc_0;
	cvta.global.u64 %rd50, %rd49;
	{ // callseq 2, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64 [param0+0], %rd46;
	.param .b64 param1;
	st.param.b64 [param1+0], %rd48;
	.param .b32 param2;
	st.param.b32 [param2+0], %r155;
	.param .b64 param3;
	st.param.b64 [param3+0], %rd50;
	.param .b64 param4;
	st.param.b64 [param4+0], %rd67;
	call.uni
	__assertfail,
	(
	param0,
	param1,
	param2,
	param3,
	param4
	);
	} // callseq 2
	bra.uni $L__BB0_3;
// ---- Reduction: merge the 4 per-lane accumulators, then the paired lane -----
$L__BB0_4:
	.loc 1 31 36
	// The four count copies (%f152..%f155) are written to shared memory at
	// 768-byte strides and, after the barrier, read back as one v4 from an
	// address derived from the row index (%r2) and column slot (%r3).
	// NOTE(review): this looks like a register-layout exchange for the counts.
	shr.u32 %r99, %r1, 3;
	or.b32 %r100, %r5, %r99;
	mad.lo.s32 %r101, %r100, 12, %r4;
	shl.b32 %r102, %r101, 2;
	mov.u32 %r103, global_smem;
	add.s32 %r104, %r103, %r102;
	st.shared.f32 [%r104], %f152;
	st.shared.f32 [%r104+768], %f153;
	st.shared.f32 [%r104+1536], %f154;
	st.shared.f32 [%r104+2304], %f155;
	bar.sync 0;
	mad.lo.s32 %r105, %r2, 12, %r3;
	shl.b32 %r106, %r105, 2;
	add.s32 %r107, %r103, %r106;
	ld.shared.v4.f32 {%f80, %f81, %f82, %f83}, [%r107];
$L__tmp3:
	// Combine step 1: (mean %f160, M2 %f156, weight %f80) with (%f161, %f157, %f81).
	//   delta = mean_b - mean_a;  w = w_a + w_b;  ratio = (w == 0) ? 0 : w_b / w
	//   mean  = mean_a + delta * ratio
	//   M2    = M2_a + M2_b + delta^2 * w_a * ratio
	.loc 2 108 21
	sub.f32 %f84, %f161, %f160;
	.loc 2 109 28
	add.f32 %f85, %f80, %f81;
	.loc 2 110 39
	setp.eq.f32 %p23, %f85, 0f00000000;
	.loc 2 110 60
	mov.b32 %r75, %f81;
	mov.b32 %r76, %f85;
	div.full.f32 %r74, %r75, %r76;
	mov.b32 %f86, %r74;
	.loc 2 110 49
	selp.f32 %f87, 0f00000000, %f86, %p23;
	.loc 2 112 17
	fma.rn.f32 %f88, %f84, %f87, %f160;
	.loc 2 113 15
	add.f32 %f89, %f156, %f157;
	.loc 2 113 30
	mul.f32 %f90, %f84, %f84;
	.loc 2 113 38
	mul.f32 %f91, %f90, %f80;
	.loc 2 113 22
	fma.rn.f32 %f92, %f91, %f87, %f89;
	// Combine step 2: running result with (%f162, %f158, %f82).
	.loc 2 108 21
	sub.f32 %f93, %f162, %f88;
	.loc 2 109 28
	add.f32 %f94, %f82, %f85;
	.loc 2 110 39
	setp.eq.f32 %p24, %f94, 0f00000000;
	.loc 2 110 60
	mov.b32 %r79, %f94;
	mov.b32 %r78, %f82;
	div.full.f32 %r77, %r78, %r79;
	mov.b32 %f95, %r77;
	.loc 2 110 49
	selp.f32 %f96, 0f00000000, %f95, %p24;
	.loc 2 112 17
	fma.rn.f32 %f97, %f96, %f93, %f88;
	.loc 2 113 15
	add.f32 %f98, %f158, %f92;
	.loc 2 113 30
	mul.f32 %f99, %f93, %f93;
	.loc 2 113 38
	mul.f32 %f100, %f85, %f99;
	.loc 2 113 22
	fma.rn.f32 %f101, %f96, %f100, %f98;
	// Combine step 3: running result with (%f163, %f159, %f83).
	.loc 2 108 21
	sub.f32 %f102, %f163, %f97;
	.loc 2 109 28
	add.f32 %f103, %f83, %f94;
	.loc 2 110 39
	setp.eq.f32 %p25, %f103, 0f00000000;
	.loc 2 110 60
	mov.b32 %r82, %f103;
	mov.b32 %r81, %f83;
	div.full.f32 %r80, %r81, %r82;
	mov.b32 %f104, %r80;
	.loc 2 110 49
	selp.f32 %f105, 0f00000000, %f104, %p25;
	.loc 2 112 17
	fma.rn.f32 %f106, %f105, %f102, %f97;
	.loc 2 113 15
	add.f32 %f107, %f159, %f101;
	.loc 2 113 30
	mul.f32 %f108, %f102, %f102;
	.loc 2 113 38
	mul.f32 %f109, %f94, %f108;
	.loc 2 113 22
	fma.rn.f32 %f110, %f105, %f109, %f107;
$L__tmp4:
	.loc 2 120 46
	// Butterfly shuffle with lane mask 1 (full member mask): fetch mean, M2 and
	// weight from the neighbouring lane that holds the other half of the row.
	mov.b32 %r108, %f106;
	shfl.sync.bfly.b32 %r109, %r108, 1, 31, -1;
	mov.b32 %f111, %r109;
	mov.b32 %r110, %f110;
	shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1;
	mov.b32 %f112, %r111;
	shfl.sync.bfly.b32 %r84, %r82, 1, 31, -1;
	mov.b32 %f113, %r84;
$L__tmp5:
	// Final combine with the neighbour's triple: %f41 = row mean, %f121 = row M2.
	.loc 2 108 21
	sub.f32 %f114, %f111, %f106;
	.loc 2 109 28
	add.f32 %f115, %f103, %f113;
	.loc 2 110 39
	setp.eq.f32 %p26, %f115, 0f00000000;
	.loc 2 110 60
	mov.b32 %r85, %f115;
	div.full.f32 %r83, %r84, %r85;
	mov.b32 %f116, %r83;
	.loc 2 110 49
	selp.f32 %f117, 0f00000000, %f116, %p26;
	.loc 2 112 17
	fma.rn.f32 %f41, %f117, %f114, %f106;
	.loc 2 113 15
	add.f32 %f118, %f110, %f112;
	.loc 2 113 30
	mul.f32 %f119, %f114, %f114;
	.loc 2 113 38
	mul.f32 %f120, %f103, %f119;
	.loc 2 113 22
	fma.rn.f32 %f121, %f117, %f120, %f118;
$L__tmp6:
	.loc 1 75 24
	// variance = M2 / 256.0 (1132462080 == 0x43800000 == 256.0f).
	mov.b32 %r87, %f121;
	mov.b32 %r88, 1132462080;
	div.full.f32 %r86, %r87, %r88;
	mov.b32 %f122, %r86;
	.loc 1 77 24
	// + epsilon (0f3727C5AC == 1e-5f).
	add.f32 %f42, %f122, 0f3727C5AC;
	.loc 1 58 36
	// %rd71 = param_4 + this lane's 16-byte slot; %r157 = pass-2 column counter.
	add.s64 %rd71, %rd18, %rd2;
	mov.b32 %r157, -8;
	// %f139 = approximate 1/sqrt(variance + eps), flush-to-zero variant.
	rsqrt.approx.ftz.f32 %f139, %f42;
	bra.uni $L__BB0_5;
// ---- Pass 2, loop tail: normalise, scale, convert and store -----------------
$L__BB0_7:
	.loc 1 0 0
	mov.b32 %f43, %r112;
	mov.b32 %f44, %r113;
	mov.b32 %f45, %r114;
	mov.b32 %f46, %r115;
	cvt.s64.s32 %rd13, %r137;
	mov.b32 %f47, %r124;
	mov.b32 %f48, %r125;
	mov.b32 %f49, %r126;
	mov.b32 %f50, %r127;
	mov.b32 %f51, %r128;
	mov.b32 %f52, %r129;
	mov.b32 %f53, %r130;
	mov.b32 %f54, %r131;
	.loc 1 69 54
	// Re-gather the same param_1 row as in pass 1, now walking %rd72.
	mov.u32 %r139, 0x0;
	mov.u32 %r140, 0x0;
	mov.u32 %r141, 0x0;
	mov.u32 %r142, 0x0;
	@%p1 ld.global.L1::evict_first.v4.b32 { %r139, %r140, %r141, %r142 }, [ %rd72 + 0 ];
	@!%p1 mov.u32 %r139, %r143;
	@!%p1 mov.u32 %r140, %r143;
	@!%p1 mov.u32 %r141, %r143;
	@!%p1 mov.u32 %r142, %r143;
	mov.b32 %f123, %r139;
	mov.b32 %f124, %r140;
	mov.b32 %f125, %r141;
	mov.b32 %f126, %r142;
	.loc 1 70 24
	add.f32 %f127, %f43, %f123;
	add.f32 %f128, %f44, %f124;
	add.f32 %f129, %f45, %f125;
	add.f32 %f130, %f46, %f126;
	.loc 1 72 24
	add.f32 %f131, %f47, %f127;
	add.f32 %f132, %f48, %f128;
	add.f32 %f133, %f49, %f129;
	add.f32 %f134, %f50, %f130;
	.loc 1 73 24
	// x - mean
	sub.f32 %f135, %f131, %f41;
	sub.f32 %f136, %f132, %f41;
	sub.f32 %f137, %f133, %f41;
	sub.f32 %f138, %f134, %f41;
	.loc 1 79 24
	// * rsqrt(var + eps)
	mul.f32 %f140, %f135, %f139;
	mul.f32 %f141, %f136, %f139;
	mul.f32 %f142, %f137, %f139;
	mul.f32 %f143, %f138, %f139;
	.loc 1 80 24
	// * per-column factor loaded from param_4
	mul.f32 %f144, %f140, %f51;
	mul.f32 %f145, %f141, %f52;
	mul.f32 %f146, %f142, %f53;
	mul.f32 %f147, %f143, %f54;
	.loc 1 82 29
	// Output address: param_5 + element offset * 2 bytes.
	shl.b64 %rd66, %rd13, 1;
	add.s64 %rd65, %rd19, %rd66;
	.loc 1 82 52
	// Round to bf16 (round-to-nearest-even), pack pairs into 32-bit words, store 4 values.
	mov.b32 %r147, %f144;
	cvt.rn.bf16.f32 %rs9, %r147;
	mov.b32 %r148, %f145;
	cvt.rn.bf16.f32 %rs10, %r148;
	mov.b32 %r149, %f146;
	cvt.rn.bf16.f32 %rs11, %r149;
	mov.b32 %r150, %f147;
	cvt.rn.bf16.f32 %rs12, %r150;
	mov.b32 %r153, {%rs9, %rs10};
	mov.b32 %r154, {%rs11, %rs12};
	@%p1 st.global.v2.b32 [ %rd65 + 0 ], { %r153, %r154 };
	.loc 1 58 36
	// Advance 8 columns; 32 iterations in total, as in pass 1.
	add.s32 %r157, %r157, 8;
	add.s64 %rd72, %rd72, 32;
	add.s64 %rd71, %rd71, 32;
	add.s64 %rd70, %rd70, 32;
	setp.lt.u32 %p47, %r157, 248;
	@%p47 bra $L__BB0_5;
	bra.uni $L__BB0_8;
// ---- Pass 2, loop head: issue loads and re-check the index ------------------
$L__BB0_5:
	.loc 1 62 51
	mov.u32 %r112, 0x0;
	mov.u32 %r113, 0x0;
	mov.u32 %r114, 0x0;
	mov.u32 %r115, 0x0;
	@%p1 ld.global.L1::evict_last.v4.b32 { %r112, %r113, %r114, %r115 }, [ %rd70 + 0 ];
	@!%p1 mov.u32 %r112, %r143;
	@!%p1 mov.u32 %r113, %r143;
	@!%p1 mov.u32 %r114, %r143;
	@!%p1 mov.u32 %r115, %r143;
	.loc 1 63 35
	// %r137 = element offset for both the param_3 load and the param_5 store.
	add.s32 %r136, %r6, %r157;
	add.s32 %r137, %r136, 8;
	mul.wide.s32 %rd56, %r137, 2;
	add.s64 %rd54, %rd17, %rd56;
	.loc 1 63 51
	mov.u32 %r120, 0x0;
	mov.u32 %r121, 0x0;
	@%p1 ld.global.L1::evict_first.v2.b32 { %r120, %r121 }, [ %rd54 + 0 ];
	@!%p1 mov.u32 %r120, %r143;
	@!%p1 mov.u32 %r121, %r143;
	cvt.u16.u32 %rs5, %r120;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r120; }
	cvt.u16.u32 %rs7, %r121;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r121; }
	.loc 1 63 103
	cvt.f32.bf16 %r124, %rs5;
	cvt.f32.bf16 %r125, %rs6;
	cvt.f32.bf16 %r126, %rs7;
	cvt.f32.bf16 %r127, %rs8;
	.loc 1 64 40
	// Per-column factors from param_4.
	mov.u32 %r128, 0x0;
	mov.u32 %r129, 0x0;
	mov.u32 %r130, 0x0;
	mov.u32 %r131, 0x0;
	@%p1 ld.global.L1::evict_last.v4.b32 { %r128, %r129, %r130, %r131 }, [ %rd71 + 0 ];
	@!%p1 mov.u32 %r128, %r143;
	@!%p1 mov.u32 %r129, %r143;
	@!%p1 mov.u32 %r130, %r143;
	@!%p1 mov.u32 %r131, %r143;
	.loc 1 68 57
	// Same predicate as pass 1 (%p16); on failure report with the tmp16 message.
	@%p16 bra $L__BB0_7;
	mov.u64 %rd57, assertMessage_1;
	cvta.global.u64 %rd58, %rd57;
	mov.u64 %rd59, assertFile_1;
	cvta.global.u64 %rd60, %rd59;
	mov.u64 %rd61, assertFunc_1;
	cvta.global.u64 %rd62, %rd61;
	{ // callseq 3, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64 [param0+0], %rd58;
	.param .b64 param1;
	st.param.b64 [param1+0], %rd60;
	.param .b32 param2;
	st.param.b32 [param2+0], %r155;
	.param .b64 param3;
	st.param.b64 [param3+0], %rd62;
	.param .b64 param4;
	st.param.b64 [param4+0], %rd67;
	call.uni
	__assertfail,
	(
	param0,
	param1,
	param2,
	param3,
	param4
	);
	} // callseq 3
	bra.uni $L__BB0_7;
$L__BB0_8:
	.loc 1 58 4
	ret;
$L__tmp7:
$L__func_end0:

}
	// .globl	__nv_rsqrtf
//-----------------------------------------------------------------------------
// __nv_rsqrtf
//   In : __nv_rsqrtf_param_0  (.b32, read as f32)
//   Out: func_retval0         (.b32) = rsqrt.approx.ftz.f32 of the input
// No call to this function appears in this file; the kernel above issues the
// rsqrt instruction directly.
//-----------------------------------------------------------------------------
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
	.param .b32 __nv_rsqrtf_param_0
)
{
	.reg .f32 %f<3>;
$L__func_begin1:

	ld.param.f32 %f1, [__nv_rsqrtf_param_0];
	rsqrt.approx.ftz.f32 %f2, %f1;
	st.param.f32 [func_retval0+0], %f2;
	ret;
$L__func_end1:

}
	// Source files referenced by the `.loc <file> <line> <col>` directives above.
	.file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
	.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
	// DWARF abbreviation table: five declarations, each terminated by a 0,0
	// pair, followed by a final 0 that ends the table.
	.section .debug_abbrev
	{
// Abbreviation 1 (tag 17, has children)
.b8 1
.b8 17
.b8 1
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
.b8 180
.b8 66
.b8 12
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
// Abbreviation 2 (tag 46, no children)
.b8 2
.b8 46
.b8 0
.b8 135
.b8 64
.b8 8
.b8 3
.b8 8
.b8 58
.b8 11
.b8 59
.b8 11
.b8 63
.b8 12
.b8 32
.b8 11
.b8 0
.b8 0
// Abbreviation 3 (tag 46, has children)
.b8 3
.b8 46
.b8 1
.b8 17
.b8 1
.b8 18
.b8 1
.b8 64
.b8 10
.b8 49
.b8 19
.b8 0
.b8 0
// Abbreviation 4 (tag 29, no children)
.b8 4
.b8 29
.b8 0
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
// Abbreviation 5 (tag 29, has children)
.b8 5
.b8 29
.b8 1
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
// End of table
.b8 0
	}
	// DWARF debug information entries. Byte runs that spell text are
	// NUL-terminated ASCII strings; their decoded value is given above each run.
	.section .debug_info
	{
.b32 302
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// Entry using abbreviation 1
.b8 1
// "triton"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2
.b8 0
// "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
.b8 99
.b8 99
.b8 105
.b8 103
.b8 54
.b8 102
.b8 107
.b8 105
.b8 54
.b8 112
.b8 52
.b8 108
.b8 120
.b8 114
.b8 100
.b8 109
.b8 103
.b8 103
.b8 54
.b8 101
.b8 117
.b8 100
.b8 97
.b8 104
.b8 105
.b8 101
.b8 120
.b8 99
.b8 118
.b8 117
.b8 101
.b8 101
.b8 111
.b8 108
.b8 50
.b8 112
.b8 52
.b8 113
.b8 112
.b8 53
.b8 51
.b8 50
.b8 112
.b8 118
.b8 118
.b8 101
.b8 50
.b8 121
.b8 52
.b8 54
.b8 51
.b8 121
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line
// "/tmp/torchinductor_root/ci"
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 99
.b8 105
.b8 0
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
// Entry using abbreviation 2
.b8 2
// "triton__0d1d2d3d4d5d6de7de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
// "triton__0d1d2d3d4d5d6de7de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
.b8 1
.b8 18
.b8 1
.b8 1
// Entry using abbreviation 3: covers $L__func_begin0 .. $L__func_end0
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b32 125
// Entry using abbreviation 4: range $L__tmp1 .. $L__tmp2
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp2
.b8 2
.b8 47
.b8 41
// Entry using abbreviation 5: range $L__tmp3 .. $L__tmp6
.b8 5
.b32 125
.b64 $L__tmp3
.b64 $L__tmp6
.b8 2
.b8 53
.b8 44
// Entry using abbreviation 4: range $L__tmp3 .. $L__tmp6
.b8 4
.b32 125
.b64 $L__tmp3
.b64 $L__tmp6
.b8 2
.b8 120
.b8 46
.b8 0
// Entry using abbreviation 4: range $L__tmp4 .. $L__tmp5
.b8 4
.b32 125
.b64 $L__tmp4
.b64 $L__tmp5
.b8 2
.b8 53
.b8 44
.b8 0
.b8 0
	}
	.section .debug_pubnames
	{
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 306
.b32 125
// "triton__0d1d2d3d4d5d6de7de"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 101
.b8 55
.b8 100
.b8 101
.b8 0
.b32 0
$L__pubNames_end0:
	}
	.section .debug_pubtypes
	{
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 306
.b32 0
$L__pubTypes_end0:
	}
	.section .debug_loc { }