//
// Generated by LLVM NVPTX Back-End
//
// Module layout (reader's guide):
//   1. module header (.version / .target / .address_size)
//   2. kernel triton__0d1d2d3de4e
//   3. .file table referenced by the .loc directives
//   4. DWARF sections emitted as raw bytes
// The producer string in .debug_info decodes to "triton"; file 1 is a
// Python source under /tmp/torchinductor_root, file 2 is
// triton/language/standard.py.

.version 8.2
.target sm_89
.address_size 64

// .globl triton__0d1d2d3de4e

// Unsized extern shared array: its byte size is not fixed in this file.
// The kernel touches byte offsets 4*tid.x (and the equivalent
// 16*(tid.x & 63) + 4*((tid.x >> 6) & 3)), i.e. 1024 bytes when
// tid.x ranges over [0, 256).
.extern .shared .align 1 .b8 global_smem[];

//-----------------------------------------------------------------------
// triton__0d1d2d3de4e
//
// Parameters
//   param_0 (.u64)  base address; 32-bit values are loaded from it and
//                   accumulated as f32.
//   param_1 (.u64)  base address; one 64-bit value is loaded from it and
//                   used as a row index.
//   param_2 (.u64)  base address; target of the f32 atomic add.
//   param_3 (.u32)  declared but never loaded in this body.
//   param_4 (.u32)  declared but never loaded in this body.
//
// Per-thread values (names used in the comments below)
//   tid   = %r1  = %tid.x
//   col   = %r2  = tid & 63
//   part  = %r3  = bits [7:6] of tid (0..3 for tid < 256)
//   base  = %r12 = %ctaid.x * 64
//   xidx  = %r4  = base | col
//
// Outline
//   1. Each thread sums 30 f32 values read from param_0.
//   2. The four `part` partial sums that share a `col` are combined
//      through shared memory plus two butterfly shuffles.
//   3. Threads with part == 0 atomically add the combined sum into
//      param_2 at a row selected by the 64-bit value read from param_1.
//
// %p4 is set to true once and never rewritten, so every "@%p4" guard
// always executes and every "@!%p4" fallback never does.
//-----------------------------------------------------------------------
.visible .entry triton__0d1d2d3de4e(
    .param .u64 triton__0d1d2d3de4e_param_0,
    .param .u64 triton__0d1d2d3de4e_param_1,
    .param .u64 triton__0d1d2d3de4e_param_2,
    .param .u32 triton__0d1d2d3de4e_param_3,
    .param .u32 triton__0d1d2d3de4e_param_4
)
.maxntid 256, 1, 1
{
    .reg .pred %p<10>;
    .reg .b32 %r<44>;
    .reg .f32 %f<11>;
    .reg .b64 %rd<16>;
    .loc 1 18 0
$L__func_begin0:
    .loc 1 18 0

    // Load the three pointer parameters; param_3 / param_4 are not read.
    ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2];
    ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1];
    ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0];
$L__tmp0:
    // ---- Thread / block index decomposition --------------------------
    .loc 1 22 44
    mov.u32 %r1, %tid.x;
    and.b32 %r2, %r1, 63;                   // col = tid & 63
    .loc 1 24 33
    bfe.u32 %r3, %r1, 6, 2;                 // part = 2-bit field of tid starting at bit 6
    .loc 1 21 28
    mov.u32 %r10, %ctaid.x;
    .loc 1 21 33
    shl.b32 %r12, %r10, 6;                  // base = ctaid.x * 64
    .loc 1 22 23
    or.b32 %r4, %r12, %r2;                  // xidx = base | col (low 6 bits of base are 0)
    .loc 1 27 36
    // Initial element index into param_0: ((part << 17) + base) | col,
    // i.e. part * 131072 + xidx as long as base stays below 1 << 17
    // (not checked here).
    shl.b32 %r13, %r3, 17;
    add.s32 %r14, %r13, %r12;
    or.b32 %r42, %r14, %r2;
    mov.f32 %f10, 0f00000000;               // running sum = 0.0f
    mov.b32 %r43, -4;                       // loop counter, bumped by 4 before the compare
    mov.pred %p4, -1;                       // constant-true predicate
$L__BB0_1:
    // ---- Accumulation loop -------------------------------------------
    // The counter takes post-increment values 0, 4, ..., 116 and the
    // branch is taken while it is < 116 (unsigned), so the body runs
    // 30 times. Each pass advances the element index by 524288.
    .loc 1 31 34
    mul.wide.s32 %rd5, %r42, 4;             // byte offset = element index * 4 (sign-extended)
    add.s64 %rd4, %rd1, %rd5;
    mov.b32 %r16, 0;                        // fallback value for the never-taken !%p4 path
    .loc 1 31 53
    mov.u32 %r15, 0x0;
    @%p4 ld.global.L1::evict_first.b32 { %r15 }, [ %rd4 + 0 ];
    @!%p4 mov.u32 %r15, %r16;
    mov.b32 %f4, %r15;                      // reinterpret the loaded bits as f32
    .loc 1 34 38
    add.f32 %f10, %f10, %f4;
    .loc 1 27 36
    add.s32 %r43, %r43, 4;
    add.s32 %r42, %r42, 524288;
    setp.lt.u32 %p3, %r43, 116;
    @%p3 bra $L__BB0_1;
$L__tmp1:
    // ---- Combine the four `part` partials per `col` -------------------
    // Stage 1: every thread stores its partial at byte offset
    //          (col << 4) | (part << 2) = 16*col + 4*part.
    .loc 2 243 36
    shl.b32 %r25, %r3, 2;
    shl.b32 %r26, %r2, 4;
    or.b32 %r27, %r26, %r25;
    mov.u32 %r28, global_smem;
    add.s32 %r17, %r28, %r27;
    mov.b32 %r18, %f10;
    @%p4 st.shared.b32 [ %r17 + 0 ], %r18;
    bar.sync 0;
    // Stage 2: thread t reloads the word at byte offset 4*t, which is
    //          the slot written for col = t >> 2, part = t & 3. The four
    //          partials of one col therefore sit in four adjacent lanes.
    // %p5 (tid < 256) holds for every thread when the CTA has at most
    // 256 threads, as .maxntid above requests; %r19 has no other
    // definition if it were false.
    setp.lt.s32 %p5, %r1, 256;
    shl.b32 %r29, %r1, 2;
    add.s32 %r20, %r28, %r29;
    @%p5 ld.shared.b32 %r19, [ %r20 + 0 ];
    mov.b32 %f5, %r19;
    // Butterfly exchange with lane ^ 2, full member mask (-1).
    shfl.sync.bfly.b32 %r30, %r19, 2, 31, -1;
    mov.b32 %f6, %r30;
$L__tmp2:
    .loc 2 233 15
    add.f32 %f7, %f5, %f6;
$L__tmp3:
    .loc 2 243 36
    mov.b32 %r31, %f7;
    // Butterfly exchange with lane ^ 1: afterwards each lane of a group
    // of four holds the sum of that group.
    shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1;
    mov.b32 %f8, %r32;
$L__tmp4:
    .loc 2 233 15
    add.f32 %f9, %f7, %f8;
$L__tmp5:
    // Stage 3: lanes with (tid & 3) == 0 write the group sum back to
    //          their own slot (byte offset 4*tid = 16 * (tid >> 2)).
    .loc 2 243 36
    and.b32 %r33, %r1, 3;
    setp.eq.s32 %p9, %r33, 0;
    and.pred %p6, %p5, %p9;
    mov.b32 %r22, %f9;
    @%p6 st.shared.b32 [ %r20 + 0 ], %r22;
    bar.sync 0;
    add.s32 %r34, %r28, %r26;               // address of slot 16*col, read back below
$L__tmp6:
    // ---- Split xidx into quotient / remainder by 256 ------------------
    // Signed division rounding toward zero: add 255 to negative values
    // before the arithmetic shift, then derive the remainder from the
    // rounded multiple of 256.
    .loc 1 36 20
    shr.s32 %r36, %r4, 31;                  // all ones if xidx < 0, else 0
    shr.u32 %r37, %r36, 24;                 // 255 if xidx < 0, else 0
    add.s32 %r38, %r4, %r37;
    shr.s32 %r39, %r38, 8;                  // quot = xidx / 256
    and.b32 %r40, %r38, -256;               // quot * 256
    sub.s32 %r41, %r4, %r40;                // rem  = xidx - quot * 256
    .loc 1 38 30
    mul.wide.s32 %rd9, %r39, 8;             // 8-byte elements in param_1
    add.s64 %rd7, %rd2, %rd9;
    .loc 1 45 55
    ld.shared.u32 %r24, [%r34];             // combined sum for this thread's col
    .loc 1 38 35
    mov.u64 %rd6, 0x0;
    @%p4 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ];
    // ---- Wrap a negative row index by +512 ----------------------------
    // (row >> 54) & 512 isolates bit 63 of row as the value 512, so
    // %rd12 = row + 512 when row is negative, row otherwise.
    // NOTE(review): no range check on the loaded value is visible before
    // it forms the atomic's address - confirm the producer guarantees it.
    .loc 1 41 32
    shr.u64 %rd10, %rd6, 54;
    and.b64 %rd11, %rd10, 512;
    add.s64 %rd12, %rd11, %rd6;
    // ---- Destination address: param_2 + row * 1024 + rem * 4 bytes ----
    .loc 1 45 30
    shl.b64 %rd13, %rd12, 10;
    add.s64 %rd14, %rd3, %rd13;
    mul.wide.s32 %rd15, %r41, 4;
    add.s64 %rd8, %rd14, %rd15;
    // Only threads with part == 0 issue the atomic, one per col. The
    // value returned into %r23 is not used afterwards.
    .loc 1 45 55
    setp.eq.s32 %p8, %r3, 0;
    mov.u32 %r23, 0x0;
    @%p8 atom.global.gpu.acq_rel.add.f32 %r23, [ %rd8 + 0 ], %r24;
    .loc 1 45 4
    ret;
$L__tmp7:
$L__func_end0:
}

// Source files referenced by the ".loc <file> <line> <column>" directives.
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"

// DWARF abbreviation table. Decoding below follows the standard DWARF
// numeric constants; each declaration is <code> <tag> <has-children>
// followed by <attribute> <form> pairs and a 0 0 terminator.
.section .debug_abbrev
{
    // Abbrev 1: DW_TAG_compile_unit (17), has children
    .b8 1 .b8 17 .b8 1
    .b8 37 .b8 8                            // DW_AT_producer, string
    .b8 19 .b8 5                            // DW_AT_language, data2
    .b8 3 .b8 8                             // DW_AT_name, string
    .b8 16 .b8 6                            // DW_AT_stmt_list, data4
    .b8 27 .b8 8                            // DW_AT_comp_dir, string
    .b8 180 .b8 66 .b8 12                   // ULEB128 0x2134 (DW_AT_GNU_pubnames), flag
    .b8 17 .b8 1                            // DW_AT_low_pc, addr
    .b8 18 .b8 1                            // DW_AT_high_pc, addr
    .b8 0 .b8 0
    // Abbrev 2: DW_TAG_subprogram (46), no children
    .b8 2 .b8 46 .b8 0
    .b8 135 .b8 64 .b8 8                    // ULEB128 0x2007 (DW_AT_MIPS_linkage_name), string
    .b8 3 .b8 8                             // DW_AT_name, string
    .b8 58 .b8 11                           // DW_AT_decl_file, data1
    .b8 59 .b8 11                           // DW_AT_decl_line, data1
    .b8 63 .b8 12                           // DW_AT_external, flag
    .b8 32 .b8 11                           // DW_AT_inline, data1
    .b8 0 .b8 0
    // Abbrev 3: DW_TAG_subprogram (46), has children
    .b8 3 .b8 46 .b8 1
    .b8 17 .b8 1                            // DW_AT_low_pc, addr
    .b8 18 .b8 1                            // DW_AT_high_pc, addr
    .b8 64 .b8 10                           // DW_AT_frame_base, block1
    .b8 49 .b8 19                           // DW_AT_abstract_origin, ref4
    .b8 0 .b8 0
    // Abbrev 4: DW_TAG_inlined_subroutine (29), no children
    .b8 4 .b8 29 .b8 0
    .b8 49 .b8 19                           // DW_AT_abstract_origin, ref4
    .b8 17 .b8 1                            // DW_AT_low_pc, addr
    .b8 18 .b8 1                            // DW_AT_high_pc, addr
    .b8 88 .b8 11                           // DW_AT_call_file, data1
    .b8 89 .b8 11                           // DW_AT_call_line, data1
    .b8 87 .b8 11                           // DW_AT_call_column, data1
    .b8 0 .b8 0
    // Abbrev 5: DW_TAG_inlined_subroutine (29), has children
    .b8 5 .b8 29 .b8 1
    .b8 49 .b8 19                           // DW_AT_abstract_origin, ref4
    .b8 17 .b8 1                            // DW_AT_low_pc, addr
    .b8 18 .b8 1                            // DW_AT_high_pc, addr
    .b8 88 .b8 11                           // DW_AT_call_file, data1
    .b8 89 .b8 11                           // DW_AT_call_line, data1
    .b8 87 .b8 11                           // DW_AT_call_column, data1
    .b8 0 .b8 0
    // End of abbreviation table
    .b8 0
}

// DWARF debugging information entries, laid out per the abbreviations
// above. Strings are NUL-terminated ASCII.
.section .debug_info
{
    // Compilation-unit header: unit length 264, version 2, abbrev table
    // offset, address size 8.
    .b32 264
    .b8 2 .b8 0
    .b32 .debug_abbrev
    .b8 8
    // DIE (abbrev 1, compile unit)
    .b8 1
    // producer = "triton"
    .b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0
    // language = 2
    .b8 2 .b8 0
    // name = "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
    .b8 99 .b8 54 .b8 105 .b8 107 .b8 53 .b8 118 .b8 120 .b8 55
    .b8 112 .b8 50 .b8 50 .b8 102 .b8 112 .b8 107 .b8 52 .b8 100
    .b8 99 .b8 118 .b8 104 .b8 53 .b8 53 .b8 122 .b8 105 .b8 109
    .b8 119 .b8 52 .b8 116 .b8 53 .b8 110 .b8 114 .b8 53 .b8 122
    .b8 110 .b8 50 .b8 98 .b8 55 .b8 105 .b8 110 .b8 117 .b8 106
    .b8 120 .b8 106 .b8 97 .b8 117 .b8 120 .b8 115 .b8 104 .b8 108
    .b8 106 .b8 117 .b8 109 .b8 109 .b8 46 .b8 112 .b8 121 .b8 0
    // stmt_list -> .debug_line
    .b32 .debug_line
    // comp_dir = "/tmp/torchinductor_root/6i"
    .b8 47 .b8 116 .b8 109 .b8 112 .b8 47 .b8 116 .b8 111 .b8 114
    .b8 99 .b8 104 .b8 105 .b8 110 .b8 100 .b8 117 .b8 99 .b8 116
    .b8 111 .b8 114 .b8 95 .b8 114 .b8 111 .b8 111 .b8 116 .b8 47
    .b8 54 .b8 105 .b8 0
    // GNU_pubnames flag = 1, then low_pc / high_pc of the kernel
    .b8 1
    .b64 $L__func_begin0
    .b64 $L__func_end0
    // DIE (abbrev 2, subprogram) - starts at unit offset 125, the value
    // the later ".b32 125" references point to.
    .b8 2
    // linkage name = "triton__0d1d2d3de4e"
    .b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
    .b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 101 .b8 52 .b8 101 .b8 0
    // name = "triton__0d1d2d3de4e"
    .b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
    .b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 101 .b8 52 .b8 101 .b8 0
    // decl_file 1, decl_line 18, external 1, inline 1
    .b8 1 .b8 18 .b8 1 .b8 1
    // DIE (abbrev 3, subprogram with children): pc range of the kernel,
    // frame_base = 1-byte block { 156 }, abstract_origin -> offset 125.
    .b8 3
    .b64 $L__func_begin0
    .b64 $L__func_end0
    .b8 1 .b8 156
    .b32 125
    // DIE (abbrev 4, inlined subroutine): range $L__tmp1..$L__tmp6,
    // call site file 2, line 35, column 25.
    .b8 4
    .b32 125
    .b64 $L__tmp1
    .b64 $L__tmp6
    .b8 2 .b8 35 .b8 25
    // DIE (abbrev 5, inlined subroutine with children): range
    // $L__tmp2..$L__tmp5, call site file 2, line 35, column 25.
    .b8 5
    .b32 125
    .b64 $L__tmp2
    .b64 $L__tmp5
    .b8 2 .b8 35 .b8 25
    // DIE (abbrev 4, inlined subroutine): range $L__tmp2..$L__tmp5,
    // call site file 2, line 243, column 36.
    .b8 4
    .b32 125
    .b64 $L__tmp2
    .b64 $L__tmp5
    .b8 2 .b8 243 .b8 36
    // Three terminators closing the open child lists
    .b8 0
    .b8 0
    .b8 0
}

// Public-names lookup table: version 2, owning unit in .debug_info,
// unit size 268 (= 264 + 4-byte length field), then one
// (DIE offset 125, "triton__0d1d2d3de4e") entry and a 0 terminator.
.section .debug_pubnames
{
    .b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
    .b8 2 .b8 0
    .b32 .debug_info
    .b32 268
    .b32 125
    .b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
    .b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 101 .b8 52 .b8 101 .b8 0
    .b32 0
$L__pubNames_end0:
}

// Public-types lookup table: same header as above, no entries.
.section .debug_pubtypes
{
    .b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
    .b8 2 .b8 0
    .b32 .debug_info
    .b32 268
    .b32 0
$L__pubTypes_end0:
}

// Empty location-list section.
.section .debug_loc { }