//
// Generated by LLVM NVPTX Back-End
//

// Module overview (added by review; every statement below is unchanged, only
// laid out one per line and annotated):
//   * triton__0d1d2d3d4d5d6d7d8de9de : 64-thread kernel. Per CTA it reads one
//     64-bit index, bounds-checks it against 50257, adds two 256-element f32
//     rows element-wise, reduces the sums to a mean and a mean squared
//     deviation, and writes the sums, the mean, an rsqrt value and a scaled
//     bf16 row.
//     NOTE(review): this looks like a fused embedding-gather + add +
//     layer-norm; that reading is inferred from the arithmetic only --
//     confirm against the Triton source named in `.file 1`.
//   * __nv_rsqrtf : one-instruction wrapper around rsqrt.approx.ftz.f32.
//   * DWARF sections describing the kernel and its inlined helper ranges.

.version 8.2
.target sm_89
.address_size 64

    // .globl triton__0d1d2d3d4d5d6d7d8de9de

// Device-side assertion hook; called below with (message, file, line,
// function, 1).
.extern .func __assertfail ( .param .b64 __assertfail_param_0, .param .b64 __assertfail_param_1, .param .b32 __assertfail_param_2, .param .b64 __assertfail_param_3, .param .b64 __assertfail_param_4 ) ;

// ASCII: "_call_with_frames_removed" (25 bytes, no terminator)
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
// ASCII: "<frozen importlib._bootstrap_external>" (38 bytes, no terminator)
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
// ASCII: "index out of bounds: 0 <= tmp3 < 50257" (38 bytes, no terminator)
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
// Shared-memory scratch declared extern with no size here; the kernel
// touches byte offsets 0 and 4 of it.
.extern .shared .align 1 .b8 global_smem[];
// ASCII: "__CUDA_FTZ" followed by a 0 byte; not referenced by any instruction
// in this file.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};

//-----------------------------------------------------------------------
// .entry triton__0d1d2d3d4d5d6d7d8de9de   (.maxntid 64, 1, 1)
//
// How each parameter is used by the code below (t = %tid.x & 63,
// b = %ctaid.x, idx = 64-bit value loaded from param_1[b], wrapped by
// +50257 when negative):
//   param_0 : b32 store of the rsqrt result at element b      (t == 0 only)
//   param_1 : 64-bit load at element b                        (the index)
//   param_2 : 4 x b32 load at element idx*256 + 4*t
//   param_3 : 4 x b32 load at element (b rem 512)*256 + 4*t
//   param_4 : 4 x b32 load at element 4*t
//   param_5 : 4 x b32 store of (param_3 value + param_2 value) at b*256 + 4*t
//   param_6 : b32 store of the mean at element b              (t == 0 only)
//   param_7 : 4 x bf16 store at element b*256 + 4*t
//   param_8, param_9 : declared but never read in this body
// Element types/shapes of the buffers are not stated in this file beyond
// the access widths above.
//-----------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
    .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
    .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
    .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
)
.maxntid 64, 1, 1
{
    .reg .pred %p<36>;
    .reg .b16 %rs<5>;
    .reg .b32 %r<109>;
    .reg .f32 %f<70>;
    .reg .b64 %rd<49>;
    .loc 1 18 0
$L__func_begin0:
    .loc 1 18 0
    // ---- Fetch the pointer arguments --------------------------------
    ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
    ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
    ld.param.u64 %rd6, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
    ld.param.u64 %rd5, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
    ld.param.u64 %rd4, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
$L__tmp0:
    .loc 1 26 26
    mov.u32 %r1, %tid.x;
    ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
    and.b32 %r2, %r1, 63;                   // t = tid & 63
    shl.b32 %r28, %r2, 2;                   // %r28 = 4*t: first of this thread's 4 elements
    ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
    ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
    .loc 1 23 28
    mov.u32 %r11, %ctaid.x;                 // b = block index
    .loc 1 30 18
    // Signed remainder-by-512 idiom: bias by 511 when b is negative, clear
    // the low 9 bits with mask 16776704 (0x00FFFE00), subtract from b.
    // Equals b & 511 for 0 <= b < 2^24.
    shr.s32 %r29, %r11, 31;
    shr.u32 %r30, %r29, 23;
    add.s32 %r31, %r11, %r30;
    and.b32 %r32, %r31, 16776704;
    sub.s32 %r33, %r11, %r32;               // %r33 = b rem 512
    .loc 1 31 30
    cvt.s64.s32 %rd1, %r11;                 // %rd1 = sign-extended b, reused for the stores
    mul.wide.s32 %rd24, %r11, 8;
    add.s64 %rd10, %rd21, %rd24;            // &param_1[b] with 8-byte elements
    mov.pred %p18, -1;                      // always-true predicate; never rewritten below
    .loc 1 31 35
    // The same 64-bit word is loaded five times from %rd10.
    // NOTE(review): only %rd9 and %rd17 are read later; %rd11, %rd13 and
    // %rd15 have no further uses in this kernel.
    mov.u64 %rd9, 0x0;
    @%p18 ld.global.L1::evict_last.b64 { %rd9 }, [ %rd10 + 0 ];
    mov.u64 %rd11, 0x0;
    @%p18 ld.global.L1::evict_last.b64 { %rd11 }, [ %rd10 + 0 ];
    mov.u64 %rd13, 0x0;
    @%p18 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd10 + 0 ];
    mov.u64 %rd15, 0x0;
    @%p18 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd10 + 0 ];
    mov.u64 %rd17, 0x0;
    @%p18 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd10 + 0 ];
    .loc 1 32 40
    shl.b32 %r34, %r33, 8;                  // (b rem 512) * 256
    .loc 1 32 36
    or.b32 %r35, %r34, %r28;                // low 8 bits of %r34 are zero, so OR == add of 4*t
    .loc 1 32 30
    mul.wide.s32 %rd25, %r35, 4;            // byte offset, 4-byte elements
    add.s64 %rd19, %rd22, %rd25;
    mov.b32 %r41, 0;                        // fallback value for the predicated-off moves
    .loc 1 32 46
    // 16-byte vector load from param_3. The @!%p18 moves cannot execute
    // because %p18 is constant true.
    mov.u32 %r12, 0x0;
    mov.u32 %r13, 0x0;
    mov.u32 %r14, 0x0;
    mov.u32 %r15, 0x0;
    @%p18 ld.global.L1::evict_last.v4.b32 { %r12, %r13, %r14, %r15 }, [ %rd19 + 0 ];
    @!%p18 mov.u32 %r12, %r41;
    @!%p18 mov.u32 %r13, %r41;
    @!%p18 mov.u32 %r14, %r41;
    @!%p18 mov.u32 %r15, %r41;
    .loc 1 33 31
    cvt.u64.u32 %rd3, %r28;                 // %rd3 = zero-extended 4*t, reused below
    mul.wide.u32 %rd26, %r28, 4;
    add.s64 %rd20, %rd23, %rd26;            // &param_4[4*t] with 4-byte elements
    .loc 1 33 36
    // 16-byte vector load from param_4; values are used as per-element
    // multipliers near the end of the kernel.
    mov.u32 %r20, 0x0;
    mov.u32 %r21, 0x0;
    mov.u32 %r22, 0x0;
    mov.u32 %r23, 0x0;
    @%p18 ld.global.L1::evict_last.v4.b32 { %r20, %r21, %r22, %r23 }, [ %rd20 + 0 ];
    @!%p18 mov.u32 %r20, %r41;
    @!%p18 mov.u32 %r21, %r41;
    @!%p18 mov.u32 %r22, %r41;
    @!%p18 mov.u32 %r23, %r41;
    // ---- Bounds check on the loaded index ---------------------------
    .loc 1 34 18
    add.s64 %rd27, %rd17, 50257;
    .loc 1 35 18
    setp.lt.s64 %p16, %rd17, 0;
    .loc 1 36 32
    selp.b64 %rd28, %rd27, %rd17, %p16;     // negative index wraps by +50257
    .loc 1 37 36
    setp.lt.u64 %p17, %rd28, 50257;         // unsigned compare also rejects still-negative values
    .loc 1 37 51
    @%p17 bra $L__BB0_2;
    // Out-of-range path: __assertfail(assertMessage_0, assertFile_0, 883,
    // assertFunc_0, 1). No ret/trap follows the call in this file, so if it
    // returns, control falls through into $L__BB0_2.
    mov.u64 %rd29, assertMessage_0;
    cvta.global.u64 %rd30, %rd29;
    mov.u64 %rd31, assertFile_0;
    cvta.global.u64 %rd32, %rd31;
    mov.u64 %rd33, assertFunc_0;
    cvta.global.u64 %rd34, %rd33;
    mov.b32 %r36, 883;
    mov.u64 %rd35, 1;
    { // callseq 0, 0
    .reg .b32 temp_param_reg;
    .param .b64 param0;
    st.param.b64 [param0+0], %rd30;
    .param .b64 param1;
    st.param.b64 [param1+0], %rd32;
    .param .b32 param2;
    st.param.b32 [param2+0], %r36;
    .param .b64 param3;
    st.param.b64 [param3+0], %rd34;
    .param .b64 param4;
    st.param.b64 [param4+0], %rd35;
    call.uni __assertfail, ( param0, param1, param2, param3, param4 );
    } // callseq 0
$L__BB0_2:
    // ---- Load the row selected by the index (from %rd9) -------------
    .loc 1 35 18
    setp.lt.s64 %p33, %rd9, 0;
    .loc 1 26 26
    and.b32 %r75, %r1, 31;                  // lane within the 32-thread warp
    .loc 1 38 40
    shl.b64 %rd41, %rd9, 8;                 // idx * 256
    add.s64 %rd42, %rd41, 12865792;         // 12865792 = 50257 * 256: wrap applied after scaling
    selp.b64 %rd43, %rd42, %rd41, %p33;
    .loc 1 38 36
    or.b64 %rd44, %rd43, %rd3;              // low 8 bits are zero, so OR == add of 4*t
    .loc 1 38 30
    shl.b64 %rd45, %rd44, 2;                // byte offset, 4-byte elements
    add.s64 %rd36, %rd5, %rd45;
    .loc 1 38 48
    // 16-byte vector load from param_2 (no eviction hint on this one).
    mov.u32 %r37, 0x0;
    mov.u32 %r38, 0x0;
    mov.u32 %r39, 0x0;
    mov.u32 %r40, 0x0;
    @%p18 ld.global.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd36 + 0 ];
    @!%p18 mov.u32 %r37, %r41;
    @!%p18 mov.u32 %r38, %r41;
    @!%p18 mov.u32 %r39, %r41;
    @!%p18 mov.u32 %r40, %r41;
    // ---- Element-wise sum: param_3 value + param_2 value ------------
    // Results by element: [0]=%f6, [1]=%f5, [2]=%f11, [3]=%f12.
    .loc 1 32 46
    mov.b32 %f1, %r12;
    mov.b32 %f2, %r13;
    .loc 1 38 48
    mov.b32 %f3, %r37;
    mov.b32 %f4, %r38;
    .loc 1 39 18
    add.f32 %f5, %f2, %f4;
    mov.b32 %r64, %f5;                      // kept as b32 for the param_5 store
    add.f32 %f6, %f1, %f3;
    .loc 1 32 46
    mov.b32 %f7, %r15;
    mov.b32 %f8, %r14;
    .loc 1 38 48
    mov.b32 %f9, %r40;
    mov.b32 %f10, %r39;
    .loc 1 39 18
    add.f32 %f11, %f8, %f10;
    mov.b32 %r65, %f11;                     // kept as b32 for the param_5 store
    add.f32 %f12, %f7, %f9;
    // ---- Reduction 1: total of the sums across the block ------------
$L__tmp1:
    .loc 2 233 15
    add.f32 %f13, %f6, %f5;                 // per-thread partial over its 4 elements
    add.f32 %f14, %f11, %f13;
    add.f32 %f15, %f12, %f14;
$L__tmp2:
    .loc 2 243 36
    // Butterfly shuffles with lane masks 16, 8, 4, 2, 1 over the full warp
    // (member mask -1): afterwards every lane holds its warp's total.
    mov.b32 %r76, %f15;
    shfl.sync.bfly.b32 %r77, %r76, 16, 31, -1;
    mov.b32 %f16, %r77;
$L__tmp3:
    .loc 2 233 15
    add.f32 %f17, %f15, %f16;
$L__tmp4:
    .loc 2 243 36
    mov.b32 %r78, %f17;
    shfl.sync.bfly.b32 %r79, %r78, 8, 31, -1;
    mov.b32 %f18, %r79;
$L__tmp5:
    .loc 2 233 15
    add.f32 %f19, %f17, %f18;
$L__tmp6:
    .loc 2 243 36
    mov.b32 %r80, %f19;
    shfl.sync.bfly.b32 %r81, %r80, 4, 31, -1;
    mov.b32 %f20, %r81;
$L__tmp7:
    .loc 2 233 15
    add.f32 %f21, %f19, %f20;
$L__tmp8:
    .loc 2 243 36
    mov.b32 %r82, %f21;
    shfl.sync.bfly.b32 %r83, %r82, 2, 31, -1;
    mov.b32 %f22, %r83;
$L__tmp9:
    .loc 2 233 15
    add.f32 %f23, %f21, %f22;
$L__tmp10:
    .loc 2 243 36
    mov.b32 %r84, %f23;
    shfl.sync.bfly.b32 %r85, %r84, 1, 31, -1;
    mov.b32 %f24, %r85;
$L__tmp11:
    .loc 2 233 15
    add.f32 %f25, %f23, %f24;               // warp total
$L__tmp12:
    .loc 2 243 36
    setp.eq.s32 %p23, %r75, 0;              // lane 0 of each warp publishes its total
    shr.u32 %r86, %r1, 3;
    and.b32 %r87, %r86, 4;                  // (tid >> 3) & 4: bit 5 of tid as byte offset 0 or 4
    mov.u32 %r88, global_smem;
    add.s32 %r45, %r88, %r87;
    mov.b32 %r46, %f25;
    @%p23 st.shared.b32 [ %r45 + 0 ], %r46;
    bar.sync 0;
    setp.lt.s32 %p24, %r1, 2;               // threads 0 and 1 pick up the two warp totals
    shl.b32 %r89, %r1, 2;
    add.s32 %r48, %r88, %r89;               // global_smem + 4*tid
    // %r47 is written only where %p24 holds; other lanes shuffle whatever
    // the register contains, but only a %p25 lane stores the result.
    @%p24 ld.shared.b32 %r47, [ %r48 + 0 ];
    mov.b32 %f26, %r47;
    shfl.sync.bfly.b32 %r90, %r47, 1, 31, -1;
    mov.b32 %f27, %r90;
$L__tmp13:
    .loc 2 233 15
    add.f32 %f28, %f26, %f27;
$L__tmp14:
    .loc 2 243 36
    and.b32 %r91, %r1, 1;
    setp.eq.b32 %p34, %r91, 1;
    not.pred %p35, %p34;
    and.pred %p25, %p24, %p35;              // tid < 2 and tid even
    mov.b32 %r50, %f28;
    @%p25 st.shared.b32 [ %r48 + 0 ], %r50;
    bar.sync 0;
    ld.shared.f32 %f29, [global_smem];      // every thread reads the value at offset 0
$L__tmp15:
    .loc 3 8 15
    add.f32 %f30, %f29, 0f00000000;         // + 0.0f
$L__tmp16:
    .loc 1 47 20
    mov.b32 %r52, %f30;
    mov.b32 %r53, 1132462080;               // 0x43800000 = bit pattern of 256.0f
    div.full.f32 %r74, %r52, %r53;          // mean = total / 256; %r74 is also stored to param_6
    mov.b32 %f31, %r74;
    .loc 1 48 19
    // Deviations from the mean, element order [0]=%f32 [1]=%f33 [2]=%f34 [3]=%f35.
    sub.f32 %f32, %f6, %f31;
    sub.f32 %f33, %f5, %f31;
    sub.f32 %f34, %f11, %f31;
    sub.f32 %f35, %f12, %f31;
    .loc 1 49 20
    mul.f32 %f36, %f33, %f33;
    // ---- Reduction 2: total of squared deviations -------------------
$L__tmp17:
    .loc 2 243 36
    // Block-wide barrier; presumably here so all reads of the first total
    // finish before the shared slots are rewritten below.
    bar.sync 0;
$L__tmp18:
    .loc 2 233 15
    fma.rn.f32 %f37, %f32, %f32, %f36;
    fma.rn.f32 %f38, %f34, %f34, %f37;
    fma.rn.f32 %f39, %f35, %f35, %f38;      // per-thread sum of 4 squared deviations
$L__tmp19:
    .loc 2 243 36
    // Same butterfly pattern as reduction 1 (lane masks 16, 8, 4, 2, 1).
    mov.b32 %r92, %f39;
    shfl.sync.bfly.b32 %r93, %r92, 16, 31, -1;
    mov.b32 %f40, %r93;
$L__tmp20:
    .loc 2 233 15
    add.f32 %f41, %f39, %f40;
$L__tmp21:
    .loc 2 243 36
    mov.b32 %r94, %f41;
    shfl.sync.bfly.b32 %r95, %r94, 8, 31, -1;
    mov.b32 %f42, %r95;
$L__tmp22:
    .loc 2 233 15
    add.f32 %f43, %f41, %f42;
$L__tmp23:
    .loc 2 243 36
    mov.b32 %r96, %f43;
    shfl.sync.bfly.b32 %r97, %r96, 4, 31, -1;
    mov.b32 %f44, %r97;
$L__tmp24:
    .loc 2 233 15
    add.f32 %f45, %f43, %f44;
$L__tmp25:
    .loc 2 243 36
    mov.b32 %r98, %f45;
    shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1;
    mov.b32 %f46, %r99;
$L__tmp26:
    .loc 2 233 15
    add.f32 %f47, %f45, %f46;
$L__tmp27:
    .loc 2 243 36
    mov.b32 %r100, %f47;
    shfl.sync.bfly.b32 %r101, %r100, 1, 31, -1;
    mov.b32 %f48, %r101;
$L__tmp28:
    .loc 2 233 15
    add.f32 %f49, %f47, %f48;               // warp total
$L__tmp29:
    .loc 2 243 36
    // Cross-warp combine through shared memory, reusing the addresses and
    // predicates (%r45/%p23, %r48/%p24/%p25) set up for reduction 1.
    mov.b32 %r55, %f49;
    @%p23 st.shared.b32 [ %r45 + 0 ], %r55;
    bar.sync 0;
    @%p24 ld.shared.b32 %r56, [ %r48 + 0 ];
    mov.b32 %f50, %r56;
    shfl.sync.bfly.b32 %r102, %r56, 1, 31, -1;
    mov.b32 %f51, %r102;
$L__tmp30:
    .loc 2 233 15
    add.f32 %f52, %f50, %f51;
$L__tmp31:
    .loc 2 243 36
    mov.b32 %r59, %f52;
    @%p25 st.shared.b32 [ %r48 + 0 ], %r59;
    bar.sync 0;
    ld.shared.f32 %f53, [global_smem];
$L__tmp32:
    .loc 3 8 15
    add.f32 %f54, %f53, 0f00000000;         // + 0.0f
$L__tmp33:
    // ---- Scale factor and outputs -----------------------------------
    .loc 1 54 20
    mov.b32 %r61, %f54;
    div.full.f32 %r60, %r61, %r53;          // total / 256 (%r53 still holds 256.0f)
    mov.b32 %f55, %r60;
    .loc 1 56 20
    add.f32 %f56, %f55, 0f3727C5AC;         // 0x3727C5AC is approximately 1.0e-5f
    .loc 1 57 26
    rsqrt.approx.ftz.f32 %f57, %f56;        // approximate 1/sqrt, flush-to-zero variant
    cvt.u32.u64 %r103, %rd3;                // 4*t, back to 32 bits
    cvt.u32.u64 %r104, %rd1;                // b, back to 32 bits
    .loc 1 33 36
    mov.b32 %f58, %r20;                     // param_4 values as f32
    mov.b32 %f59, %r21;
    mov.b32 %f60, %r22;
    mov.b32 %f61, %r23;
    .loc 1 59 20
    mul.f32 %f62, %f32, %f57;               // deviation * rsqrt
    mul.f32 %f63, %f33, %f57;
    mul.f32 %f64, %f34, %f57;
    mul.f32 %f65, %f35, %f57;
    .loc 1 60 20
    mul.f32 %f66, %f62, %f58;               // ... * param_4 value
    mul.f32 %f67, %f63, %f59;
    mul.f32 %f68, %f64, %f60;
    mul.f32 %f69, %f65, %f61;
    .loc 1 62 35
    shl.b32 %r105, %r104, 8;                // b * 256
    .loc 1 62 31
    or.b32 %r106, %r105, %r103;             // b*256 + 4*t (low 8 bits of %r105 are zero)
    .loc 1 62 25
    mul.wide.s32 %rd46, %r106, 4;
    add.s64 %rd37, %rd6, %rd46;             // param_5 destination, 4-byte elements
    .loc 1 39 18
    mov.b32 %r63, %f6;
    mov.b32 %r66, %f12;
    .loc 1 62 47
    // Store the four element-wise sums (not the scaled values) to param_5.
    @%p18 st.global.v4.b32 [ %rd37 + 0 ], { %r63, %r64, %r65, %r66 };
    .loc 1 63 4
    bar.sync 0;
    .loc 1 64 28
    shl.b64 %rd47, %rd1, 2;                 // b * 4 bytes; reused for param_6 below
    add.s64 %rd38, %rd4, %rd47;
    .loc 1 64 40
    setp.eq.s32 %p30, %r2, 0;               // a single thread (t == 0) writes per-block scalars
    mov.b32 %r67, %f57;
    @%p30 st.global.b32 [ %rd38 + 0 ], { %r67 };   // param_0[b] = rsqrt value
    .loc 1 65 25
    mul.wide.s32 %rd48, %r106, 2;           // same element index, 2-byte elements
    add.s64 %rd39, %rd8, %rd48;
    .loc 1 65 48
    // Round each scaled value to bf16 and pack pairs into 32-bit words.
    mov.b32 %r68, %f66;
    cvt.rn.bf16.f32 %rs1, %r68;
    mov.b32 %r69, %f67;
    cvt.rn.bf16.f32 %rs2, %r69;
    mov.b32 %r70, %f68;
    cvt.rn.bf16.f32 %rs3, %r70;
    mov.b32 %r71, %f69;
    cvt.rn.bf16.f32 %rs4, %r71;
    mov.b32 %r107, {%rs1, %rs2};
    mov.b32 %r108, {%rs3, %rs4};
    @%p18 st.global.v2.b32 [ %rd39 + 0 ], { %r107, %r108 };
    .loc 1 66 25
    add.s64 %rd40, %rd7, %rd47;
    .loc 1 66 37
    @%p30 st.global.b32 [ %rd40 + 0 ], { %r74 };   // param_6[b] = mean
    .loc 1 66 4
    ret;
$L__tmp34:
$L__func_end0:
}

    // .globl __nv_rsqrtf
// __nv_rsqrtf: returns rsqrt.approx.ftz.f32 of its single f32 argument.
// Not called by the kernel above, which issues the instruction inline.
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
    .param .b32 __nv_rsqrtf_param_0
)
{
    .reg .f32 %f<3>;
$L__func_begin1:
    ld.param.f32 %f1, [__nv_rsqrtf_param_0];
    rsqrt.approx.ftz.f32 %f2, %f1;
    st.param.f32 [func_retval0+0], %f2;
    ret;
$L__func_end1:
}

// Source files referenced by the `.loc <file> <line> <column>` directives.
    .file 1 "/tmp/torchinductor_root/pd/cpdqiwgwgnzx7tsvbieui7kffx5dt43uhgvg7z7egekxcsybpv34.py"
    .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
    .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"

// DWARF abbreviation table: five abbreviations, one per group of rows below.
// Tag numbers follow the DWARF standard (17 compile_unit, 46 subprogram,
// 29 inlined_subroutine).
    .section .debug_abbrev
    {
// abbreviation 1: tag 17, has children
.b8 1 .b8 17 .b8 1
.b8 37 .b8 8 .b8 19 .b8 5 .b8 3 .b8 8 .b8 16 .b8 6 .b8 27 .b8 8
.b8 180 .b8 66 .b8 12 .b8 17 .b8 1 .b8 18 .b8 1
.b8 0 .b8 0
// abbreviation 2: tag 46, no children
.b8 2 .b8 46 .b8 0
.b8 135 .b8 64 .b8 8 .b8 3 .b8 8 .b8 58 .b8 11 .b8 59 .b8 11
.b8 63 .b8 12 .b8 32 .b8 11
.b8 0 .b8 0
// abbreviation 3: tag 46, has children
.b8 3 .b8 46 .b8 1
.b8 17 .b8 1 .b8 18 .b8 1 .b8 64 .b8 10 .b8 49 .b8 19
.b8 0 .b8 0
// abbreviation 4: tag 29, has children
.b8 4 .b8 29 .b8 1
.b8 49 .b8 19 .b8 17 .b8 1 .b8 18 .b8 1 .b8 88 .b8 11 .b8 89 .b8 11
.b8 87 .b8 11
.b8 0 .b8 0
// abbreviation 5: tag 29, no children
.b8 5 .b8 29 .b8 0
.b8 49 .b8 19 .b8 17 .b8 1 .b8 18 .b8 1 .b8 88 .b8 11 .b8 89 .b8 11
.b8 87 .b8 11
.b8 0 .b8 0
// end of table
.b8 0
    }

// DWARF .debug_info: one compile unit. Code ranges are expressed with the
// $L__func_* / $L__tmpN labels defined inside the kernel, so those labels
// must stay where they are.
    .section .debug_info
    {
// unit header: length 407, version 2, abbrev-table offset, address size 8
.b32 407
.b8 2 .b8 0
.b32 .debug_abbrev
.b8 8
// entry using abbreviation 1
.b8 1
// ASCII "triton", 0-terminated
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0
.b8 2 .b8 0
// ASCII "cpdqiwgwgnzx7tsvbieui7kffx5dt43uhgvg7z7egekxcsybpv34.py", 0-terminated
.b8 99 .b8 112 .b8 100 .b8 113 .b8 105 .b8 119 .b8 103 .b8 119 .b8 103 .b8 110
.b8 122 .b8 120 .b8 55 .b8 116 .b8 115 .b8 118 .b8 98 .b8 105 .b8 101 .b8 117
.b8 105 .b8 55 .b8 107 .b8 102 .b8 102 .b8 120 .b8 53 .b8 100 .b8 116 .b8 52
.b8 51 .b8 117 .b8 104 .b8 103 .b8 118 .b8 103 .b8 55 .b8 122 .b8 55 .b8 101
.b8 103 .b8 101 .b8 107 .b8 120 .b8 99 .b8 115 .b8 121 .b8 98 .b8 112 .b8 118
.b8 51 .b8 52 .b8 46 .b8 112 .b8 121 .b8 0
.b32 .debug_line
// ASCII "/tmp/torchinductor_root/pd", 0-terminated
.b8 47 .b8 116 .b8 109 .b8 112 .b8 47 .b8 116 .b8 111 .b8 114 .b8 99 .b8 104
.b8 105 .b8 110 .b8 100 .b8 117 .b8 99 .b8 116 .b8 111 .b8 114 .b8 95 .b8 114
.b8 111 .b8 111 .b8 116 .b8 47 .b8 112 .b8 100 .b8 0
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
// entry using abbreviation 2; the kernel name appears twice, each
// 0-terminated: ASCII "triton__0d1d2d3d4d5d6d7d8de9de"
.b8 2
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100
.b8 54 .b8 100 .b8 55 .b8 100 .b8 56 .b8 100 .b8 101 .b8 57 .b8 100 .b8 101
.b8 0
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100
.b8 54 .b8 100 .b8 55 .b8 100 .b8 56 .b8 100 .b8 101 .b8 57 .b8 100 .b8 101
.b8 0
.b8 1 .b8 18 .b8 1 .b8 1
// entry using abbreviation 3: the kernel's code range
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1 .b8 156
.b32 125
// entries using abbreviations 4/5: label ranges attributed to inlined code,
// each followed by three bytes (file, line, column per the abbreviations)
.b8 4 .b32 125 .b64 $L__tmp1 .b64 $L__tmp14 .b8 2 .b8 44 .b8 59
.b8 5 .b32 125 .b64 $L__tmp1 .b64 $L__tmp14 .b8 2 .b8 243 .b8 36
.b8 0
.b8 5 .b32 125 .b64 $L__tmp2 .b64 $L__tmp15 .b8 2 .b8 44 .b8 59
.b8 5 .b32 125 .b64 $L__tmp15 .b64 $L__tmp16 .b8 3 .b8 44 .b8 45
.b8 5 .b32 125 .b64 $L__tmp17 .b64 $L__tmp32 .b8 2 .b8 52 .b8 59
.b8 4 .b32 125 .b64 $L__tmp18 .b64 $L__tmp31 .b8 2 .b8 52 .b8 59
.b8 5 .b32 125 .b64 $L__tmp18 .b64 $L__tmp31 .b8 2 .b8 243 .b8 36
.b8 0
.b8 5 .b32 125 .b64 $L__tmp32 .b64 $L__tmp33 .b8 3 .b8 52 .b8 45
.b8 0
.b8 0
    }

// Public-name index: a single entry (offset 125) carrying the kernel name.
    .section .debug_pubnames
    {
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2 .b8 0
.b32 .debug_info
.b32 411
.b32 125
// ASCII "triton__0d1d2d3d4d5d6d7d8de9de", 0-terminated
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 52 .b8 100 .b8 53 .b8 100
.b8 54 .b8 100 .b8 55 .b8 100 .b8 56 .b8 100 .b8 101 .b8 57 .b8 100 .b8 101
.b8 0
.b32 0
$L__pubNames_end0:
    }

// Public-type index: header only, no entries.
    .section .debug_pubtypes
    {
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2 .b8 0
.b32 .debug_info
.b32 411
.b32 0
$L__pubTypes_end0:
    }

// Empty location-list section.
    .section .debug_loc { }