//
// Generated by LLVM NVPTX Back-End
//
// Syntax : NVIDIA PTX, ISA version 8.2, target sm_89, 64-bit addresses.
// Layout : one statement per line; DWARF data is written as comma-separated
//          .b8 lists (same bytes, same order as the generator emitted).
//
// Kernel triton__0d1d2d3de4de
// ---------------------------
//   param_0 (.u64)  base address read with 16-bit elements (converted
//                   bf16 -> f32 after loading)
//   param_1 (.u64)  base address read with 32-bit elements (used as f32)
//   param_2 (.u64)  base address written with 32-bit elements
//   param_3 (.u32)  declared but never loaded in this body
//   param_4 (.u32)  declared but never loaded in this body
//
//   Launch shape: at most 128 threads per CTA (.maxntid 128, 1, 1).
//   Each CTA produces 64 consecutive outputs starting at ctaid.x * 64.
//   For output index x the body accumulates, over r = 0..127,
//       f32(in0[off]) * in1[off]
//   with element offset
//       off = x + 32512 * (x / 256) + 256 * r
//   and stores the sum to out[x].
//
//   Thread mapping (tid = %tid.x):
//       tid & 15        -> group of 4 consecutive x columns (4 * (tid & 15))
//       tid bit 4       -> adds 256 elements to the offset (one r step)
//       tid bits 5..6   -> adds 512 elements per step (two r steps)
//   The loop advances 2048 elements (8 r steps) per iteration, 16 iterations.
//
//   Shared memory: bytes [0, 1024) of the extern array global_smem are used
//   as scratch for the cross-thread reduction and the final re-layout.
//

.version 8.2
.target sm_89
.address_size 64

    // .globl triton__0d1d2d3de4de
.extern .shared .align 1 .b8 global_smem[];

.visible .entry triton__0d1d2d3de4de(
    .param .u64 triton__0d1d2d3de4de_param_0,
    .param .u64 triton__0d1d2d3de4de_param_1,
    .param .u64 triton__0d1d2d3de4de_param_2,
    .param .u32 triton__0d1d2d3de4de_param_3,
    .param .u32 triton__0d1d2d3de4de_param_4
)
.maxntid 128, 1, 1
{
    .reg .pred %p<20>;
    .reg .b16 %rs<5>;
    .reg .b32 %r<98>;
    .reg .f32 %f<47>;
    .reg .b64 %rd<10>;

    .loc 1 18 0
$L__func_begin0:
    .loc 1 18 0

    // ---- Kernel arguments -------------------------------------------------
    ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2];      // rd3 = output base
    ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1];      // rd2 = 32-bit input base
    ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0];      // rd1 = 16-bit input base
$L__tmp0:

    // ---- Thread / block index decomposition -------------------------------
    .loc 1 22 44
    mov.u32 %r1, %tid.x;
    and.b32 %r2, %r1, 31;                   // r2  = lane within the warp
    shl.b32 %r13, %r1, 2;                   // r13 = tid * 4 (also reused as a byte offset later)
    and.b32 %r3, %r13, 60;                  // r3  = 4 * (tid & 15): first of this thread's 4 columns
    .loc 1 24 33
    bfe.u32 %r4, %r1, 5, 2;                 // r4  = tid bits [5..6]
    .loc 1 21 28
    mov.u32 %r11, %ctaid.x;
    .loc 1 21 33
    shl.b32 %r5, %r11, 6;                   // r5  = ctaid.x * 64
    .loc 1 22 23
    or.b32 %r14, %r5, %r3;                  // r14 = first x index handled by this thread

    // r19 = r14 / 256 as a signed division rounding toward zero
    // (adds 255 to negative values before the arithmetic shift).
    .loc 1 26 20
    shr.s32 %r16, %r14, 31;
    shr.u32 %r17, %r16, 24;
    add.s32 %r18, %r14, %r17;
    shr.s32 %r19, %r18, 8;

    // ---- Initial element offset for the reduction loop --------------------
    .loc 1 29 36
    mad.lo.s32 %r20, %r19, 32512, %r14;     // x + 32512 * (x / 256)
    shl.b32 %r21, %r4, 9;                   // + 512 * tid[5..6]
    add.s32 %r22, %r20, %r21;
    shl.b32 %r23, %r1, 4;
    and.b32 %r24, %r23, 256;                // + 256 when tid bit 4 is set
    add.s32 %r96, %r22, %r24;               // r96 = running element offset

    mov.f32 %f43, 0f00000000;               // four f32 accumulators, one per column
    mov.b32 %r97, -8;                       // loop counter; incremented before the test
    mov.pred %p1, -1;                       // load mask: constant true, never rewritten
    mov.f32 %f44, %f43;
    mov.f32 %f45, %f43;
    mov.f32 %f46, %f43;

    // ---- Reduction loop: 16 iterations (counter 0, 8, ..., 120) -----------
$L__r_loop:
    .loc 1 33 34
    mul.wide.s32 %rd6, %r96, 2;             // byte offset for 2-byte elements
    add.s64 %rd4, %rd1, %rd6;
    mov.b32 %r27, 0;                        // fallback value for masked-off lanes
    .loc 1 33 63
    mov.u32 %r25, 0x0;
    mov.u32 %r26, 0x0;
    // 8-byte vector load = four packed 16-bit values.
    @%p1 ld.global.L1::evict_first.v2.b32 { %r25, %r26 }, [ %rd4 + 0 ];
    @!%p1 mov.u32 %r25, %r27;
    @!%p1 mov.u32 %r26, %r27;
    // Split each 32-bit word into its low (cvt) and high (mov.b32 unpack) halves.
    cvt.u16.u32 %rs1, %r25;
    { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r25; }
    cvt.u16.u32 %rs3, %r26;
    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r26; }
    .loc 1 33 115
    cvt.f32.bf16 %r29, %rs1;                // bf16 -> f32 for each of the four values
    mov.b32 %f13, %r29;
    cvt.f32.bf16 %r30, %rs2;
    mov.b32 %f14, %r30;
    cvt.f32.bf16 %r31, %rs3;
    mov.b32 %f15, %r31;
    cvt.f32.bf16 %r32, %rs4;
    mov.b32 %f16, %r32;

    .loc 1 34 34
    mul.wide.s32 %rd7, %r96, 4;             // same element offset, 4-byte elements
    add.s64 %rd5, %rd2, %rd7;
    .loc 1 34 63
    mov.u32 %r33, 0x0;
    mov.u32 %r34, 0x0;
    mov.u32 %r35, 0x0;
    mov.u32 %r36, 0x0;
    // 16-byte vector load = four 32-bit values.
    @%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ];
    @!%p1 mov.u32 %r33, %r27;
    @!%p1 mov.u32 %r34, %r27;
    @!%p1 mov.u32 %r35, %r27;
    @!%p1 mov.u32 %r36, %r27;
    mov.b32 %f17, %r33;
    mov.b32 %f18, %r34;
    mov.b32 %f19, %r35;
    mov.b32 %f20, %r36;

    .loc 1 39 38
    fma.rn.f32 %f46, %f16, %f20, %f46;      // acc[j] += a[j] * b[j], j = 3..0
    fma.rn.f32 %f45, %f15, %f19, %f45;
    fma.rn.f32 %f44, %f14, %f18, %f44;
    fma.rn.f32 %f43, %f13, %f17, %f43;

    .loc 1 29 36
    add.s32 %r97, %r97, 8;
    add.s32 %r96, %r96, 2048;               // advance 8 steps of 256 elements
    setp.lt.u32 %p9, %r97, 120;
    @%p9 bra $L__r_loop;

    // ---- Output index for the final store ---------------------------------
    .loc 1 22 44
    and.b32 %r58, %r1, 63;
    .loc 1 22 23
    or.b32 %r59, %r5, %r58;                 // r59 = ctaid.x * 64 + (tid & 63)

    // ---- Stage 1: combine lane pairs (lane, lane ^ 16) --------------------
$L__tmp1:
    .loc 2 243 36
    mov.b32 %r60, %f43;
    shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1;
    mov.b32 %f21, %r61;
$L__tmp2:
    .loc 2 233 15
    add.f32 %f22, %f43, %f21;
$L__tmp3:
    .loc 2 243 36
    mov.b32 %r62, %f44;
    shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
    mov.b32 %f23, %r63;
$L__tmp4:
    .loc 2 233 15
    add.f32 %f24, %f44, %f23;
$L__tmp5:
    .loc 2 243 36
    mov.b32 %r64, %f45;
    shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
    mov.b32 %f25, %r65;
$L__tmp6:
    .loc 2 233 15
    add.f32 %f26, %f45, %f25;
$L__tmp7:
    .loc 2 243 36
    mov.b32 %r66, %f46;
    shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
    mov.b32 %f27, %r67;
$L__tmp8:
    .loc 2 233 15
    add.f32 %f28, %f46, %f27;

    // ---- Stage 2: lanes 0..15 publish partial sums to shared memory -------
    // Word index written for column j (0..3): (r3 + j) * 4 + tid[5..6].
$L__tmp9:
    .loc 2 243 36
    setp.lt.u32 %p10, %r2, 16;
    shl.b32 %r68, %r3, 2;                   // r68 = r3 * 4  (reused below as a byte offset)
    or.b32 %r69, %r68, %r4;
    shl.b32 %r70, %r69, 2;                  // byte offset for column 0
    mov.u32 %r71, global_smem;              // r71 = shared scratch base
    add.s32 %r41, %r71, %r70;
    mov.b32 %r42, %f22;
    @%p10 st.shared.b32 [ %r41 + 0 ], %r42;
    shl.b32 %r72, %r4, 2;                   // r72 = tid[5..6] * 4 bytes
    shl.b32 %r73, %r3, 4;                   // r73 = r3 * 16 bytes (column 0 row start)
    or.b32 %r74, %r73, 16;                  // column 1 row start
    or.b32 %r75, %r74, %r72;
    add.s32 %r43, %r71, %r75;
    mov.b32 %r44, %f24;
    @%p10 st.shared.b32 [ %r43 + 0 ], %r44;
    or.b32 %r76, %r73, 32;                  // column 2 row start
    or.b32 %r77, %r76, %r72;
    add.s32 %r45, %r71, %r77;
    mov.b32 %r46, %f26;
    @%p10 st.shared.b32 [ %r45 + 0 ], %r46;
    or.b32 %r78, %r73, 48;                  // column 3 row start
    or.b32 %r79, %r78, %r72;
    add.s32 %r47, %r71, %r79;
    mov.b32 %r48, %f28;
    @%p10 st.shared.b32 [ %r47 + 0 ], %r48;
    bar.sync 0;

    // ---- Stage 3: sum each aligned group of 4 shared words ----------------
    // Every thread reads word tid, then word tid + 128; lane xor 2 and xor 1
    // shuffles add the four neighbours, and threads with (tid & 3) == 0 write
    // the total back to the first word of the group.
    setp.lt.s32 %p14, %r1, 256;             // always true under .maxntid 128
    add.s32 %r50, %r71, %r13;
    @%p14 ld.shared.b32 %r49, [ %r50 + 0 ];
    mov.b32 %f29, %r49;
    shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1;
    mov.b32 %f30, %r81;
$L__tmp10:
    .loc 2 233 15
    add.f32 %f31, %f29, %f30;
$L__tmp11:
    .loc 2 243 36
    mov.b32 %r82, %f31;
    shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
    mov.b32 %f32, %r83;
$L__tmp12:
    .loc 2 233 15
    add.f32 %f33, %f31, %f32;
$L__tmp13:
    .loc 2 243 36
    and.b32 %r84, %r1, 3;
    setp.eq.s32 %p19, %r84, 0;
    and.pred %p15, %p14, %p19;
    mov.b32 %r52, %f33;
    @%p15 st.shared.b32 [ %r50 + 0 ], %r52;
    add.s32 %r54, %r50, 512;                // second half: words 128..255
    @%p14 ld.shared.b32 %r53, [ %r54 + 0 ];
    mov.b32 %f34, %r53;
    shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1;
    mov.b32 %f35, %r85;
$L__tmp14:
    .loc 2 233 15
    add.f32 %f36, %f34, %f35;
$L__tmp15:
    .loc 2 243 36
    mov.b32 %r86, %f36;
    shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
    mov.b32 %f37, %r87;
$L__tmp16:
    .loc 2 233 15
    add.f32 %f38, %f36, %f37;
$L__tmp17:
    .loc 2 243 36
    mov.b32 %r56, %f38;
    @%p15 st.shared.b32 [ %r54 + 0 ], %r56;
    bar.sync 0;

    // ---- Stage 4: read back the four totals for this thread's columns -----
    add.s32 %r88, %r71, %r73;
    ld.shared.f32 %f39, [%r88];
    add.s32 %r89, %r71, %r74;
    ld.shared.f32 %f40, [%r89];
    add.s32 %r90, %r71, %r76;
    ld.shared.f32 %f41, [%r90];
    add.s32 %r91, %r71, %r78;
    ld.shared.f32 %f42, [%r91];

    // ---- Stage 5: re-layout through shared memory -------------------------
    // Totals are packed densely (word index = column), then each thread picks
    // up the single word (tid & 63) it will store.
$L__tmp18:
    .loc 1 40 28
    bar.sync 0;                             // all stage-4 reads done before overwriting
    add.s32 %r92, %r71, %r68;
    st.shared.f32 [%r92], %f39;
    st.shared.f32 [%r92+4], %f40;
    st.shared.f32 [%r92+8], %f41;
    st.shared.f32 [%r92+12], %f42;
    bar.sync 0;
    shl.b32 %r93, %r58, 2;
    add.s32 %r94, %r71, %r93;
    ld.shared.u32 %r57, [%r94];

    // ---- Final store: only threads with tid bit 6 clear (tid 0..63) -------
    .loc 1 41 25
    mul.wide.s32 %rd9, %r59, 4;
    add.s64 %rd8, %rd3, %rd9;
    .loc 1 41 36
    and.b32 %r95, %r1, 64;
    setp.eq.s32 %p18, %r95, 0;
    @%p18 st.global.b32 [ %rd8 + 0 ], { %r57 };
    .loc 1 41 4
    ret;
$L__tmp19:
$L__func_end0:
}

    // Source files referenced by the .loc directives above.
    .file 1 "/tmp/torchinductor_root/sj/csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py"
    .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"

    // ------------------------------------------------------------------------
    // DWARF abbreviation table. Each entry: code, tag, has-children flag,
    // then (attribute, form) pairs terminated by 0, 0.
    // ------------------------------------------------------------------------
    .section .debug_abbrev
    {
        // Abbreviation 1: tag 17 (DW_TAG_compile_unit), has children.
        .b8 1, 17, 1
        .b8 37, 8                           // DW_AT_producer, DW_FORM_string
        .b8 19, 5                           // DW_AT_language, DW_FORM_data2
        .b8 3, 8                            // DW_AT_name, DW_FORM_string
        .b8 16, 6                           // DW_AT_stmt_list, DW_FORM_data4
        .b8 27, 8                           // DW_AT_comp_dir, DW_FORM_string
        .b8 180, 66, 12                     // attribute 0x2134 (ULEB128), DW_FORM_flag
        .b8 17, 1                           // DW_AT_low_pc, DW_FORM_addr
        .b8 18, 1                           // DW_AT_high_pc, DW_FORM_addr
        .b8 0, 0

        // Abbreviation 2: tag 46 (DW_TAG_subprogram), no children.
        .b8 2, 46, 0
        .b8 135, 64, 8                      // attribute 0x2007 (ULEB128), DW_FORM_string
        .b8 3, 8                            // DW_AT_name, DW_FORM_string
        .b8 58, 11                          // DW_AT_decl_file, DW_FORM_data1
        .b8 59, 11                          // DW_AT_decl_line, DW_FORM_data1
        .b8 63, 12                          // DW_AT_external, DW_FORM_flag
        .b8 32, 11                          // DW_AT_inline, DW_FORM_data1
        .b8 0, 0

        // Abbreviation 3: tag 46 (DW_TAG_subprogram), has children.
        .b8 3, 46, 1
        .b8 17, 1                           // DW_AT_low_pc, DW_FORM_addr
        .b8 18, 1                           // DW_AT_high_pc, DW_FORM_addr
        .b8 64, 10                          // DW_AT_frame_base, DW_FORM_block1
        .b8 49, 19                          // DW_AT_abstract_origin, DW_FORM_ref4
        .b8 0, 0

        // Abbreviation 4: tag 29 (DW_TAG_inlined_subroutine), no children.
        .b8 4, 29, 0
        .b8 49, 19                          // DW_AT_abstract_origin, DW_FORM_ref4
        .b8 17, 1                           // DW_AT_low_pc, DW_FORM_addr
        .b8 18, 1                           // DW_AT_high_pc, DW_FORM_addr
        .b8 88, 11                          // DW_AT_call_file, DW_FORM_data1
        .b8 89, 11                          // DW_AT_call_line, DW_FORM_data1
        .b8 87, 11                          // DW_AT_call_column, DW_FORM_data1
        .b8 0, 0

        // Abbreviation 5: tag 29 (DW_TAG_inlined_subroutine), has children.
        .b8 5, 29, 1
        .b8 49, 19                          // DW_AT_abstract_origin, DW_FORM_ref4
        .b8 17, 1                           // DW_AT_low_pc, DW_FORM_addr
        .b8 18, 1                           // DW_AT_high_pc, DW_FORM_addr
        .b8 88, 11                          // DW_AT_call_file, DW_FORM_data1
        .b8 89, 11                          // DW_AT_call_line, DW_FORM_data1
        .b8 87, 11                          // DW_AT_call_column, DW_FORM_data1
        .b8 0, 0

        .b8 0                               // end of abbreviation table
    }

    // ------------------------------------------------------------------------
    // DWARF debug info. The byte layout (and therefore the 266-byte length
    // and the offset 125 used as a DIE reference) must not change.
    // ------------------------------------------------------------------------
    .section .debug_info
    {
        // Compilation-unit header.
        .b32 266                            // unit length (bytes following this field)
        .b8 2, 0                            // 16-bit version field = 2
        .b32 .debug_abbrev
        .b8 8                               // address size

        // DIE using abbreviation 1.
        .b8 1
        .b8 116, 114, 105, 116, 111, 110, 0                         // "triton"
        .b8 2, 0                                                    // 16-bit language code = 2
        // "csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py"
        .b8 99, 115, 106, 100, 55, 109, 108, 114
        .b8 106, 117, 106, 100, 52, 117, 119, 122
        .b8 101, 53, 116, 107, 103, 55, 112, 116
        .b8 116, 101, 97, 103, 112, 105, 104, 103
        .b8 116, 53, 122, 116, 97, 116, 102, 113
        .b8 99, 104, 112, 114, 99, 114, 97, 120
        .b8 50, 50, 108, 115, 46, 112, 121, 0
        .b32 .debug_line
        // "/tmp/torchinductor_root/sj"
        .b8 47, 116, 109, 112, 47, 116, 111, 114
        .b8 99, 104, 105, 110, 100, 117, 99, 116
        .b8 111, 114, 95, 114, 111, 111, 116, 47
        .b8 115, 106, 0
        .b8 1
        .b64 $L__func_begin0
        .b64 $L__func_end0

        // DIE using abbreviation 2 (starts at unit offset 125).
        .b8 2
        // "triton__0d1d2d3de4de"
        .b8 116, 114, 105, 116, 111, 110, 95, 95
        .b8 48, 100, 49, 100, 50, 100, 51, 100
        .b8 101, 52, 100, 101, 0
        // "triton__0d1d2d3de4de"
        .b8 116, 114, 105, 116, 111, 110, 95, 95
        .b8 48, 100, 49, 100, 50, 100, 51, 100
        .b8 101, 52, 100, 101, 0
        .b8 1                               // file index
        .b8 18                              // line
        .b8 1
        .b8 1

        // DIE using abbreviation 3: covers the whole kernel body.
        .b8 3
        .b64 $L__func_begin0
        .b64 $L__func_end0
        .b8 1, 156                          // 1-byte block: opcode 156
        .b32 125                            // reference to the DIE at offset 125

        // DIE using abbreviation 4: range $L__tmp1 .. $L__tmp18.
        .b8 4
        .b32 125
        .b64 $L__tmp1
        .b64 $L__tmp18
        .b8 2, 40, 25                       // file index, line, column

        // DIE using abbreviation 5: range $L__tmp2 .. $L__tmp17.
        .b8 5
        .b32 125
        .b64 $L__tmp2
        .b64 $L__tmp17
        .b8 2, 40, 25                       // file index, line, column

        // DIE using abbreviation 4 (child of the previous DIE).
        .b8 4
        .b32 125
        .b64 $L__tmp2
        .b64 $L__tmp17
        .b8 2, 243, 36                      // file index, line, column

        .b8 0, 0, 0                         // close the three open child lists
    }

    .section .debug_pubnames
    {
        .b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
        .b8 2, 0                            // 16-bit version field = 2
        .b32 .debug_info
        .b32 270                            // 266 + 4-byte length field
        .b32 125                            // DIE offset of the named entry
        // "triton__0d1d2d3de4de"
        .b8 116, 114, 105, 116, 111, 110, 95, 95
        .b8 48, 100, 49, 100, 50, 100, 51, 100
        .b8 101, 52, 100, 101, 0
        .b32 0                              // terminator
$L__pubNames_end0:
    }

    .section .debug_pubtypes
    {
        .b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
        .b8 2, 0                            // 16-bit version field = 2
        .b32 .debug_info
        .b32 270
        .b32 0                              // terminator (no entries)
$L__pubTypes_end0:
    }

    .section .debug_loc
    {
    }