//
// Generated by LLVM NVPTX Back-End
//
// Module overview (derived from the instructions below):
//   * One entry kernel, triton__0d1d2d3de4e, limited to 256 threads per CTA.
//   * Each CTA handles 16 consecutive "x" columns (ctaid.x * 16 .. +15) and
//     sums up to 120 f32 rows per column from the buffer in param_0
//     (row stride 1 << 17 elements).
//   * The 16 column sums are then added atomically into the f32 buffer in
//     param_2, at a row chosen by a 64-bit index loaded from param_1.
//   * param_3 and param_4 are declared but never loaded by the body.
// NOTE(review): the tensor-level meaning (reduction length, index table,
// scatter-add) is inferred from the arithmetic only - confirm against the
// Python source named in `.file 1`.
//

.version 8.2
.target sm_89
.address_size 64

    // .globl triton__0d1d2d3de4e
// Dynamically sized shared-memory scratch area; the kernel addresses it as
// f32 words at byte offsets 0..511.
.extern .shared .align 1 .b8 global_smem[];

.visible .entry triton__0d1d2d3de4e(
    .param .u64 triton__0d1d2d3de4e_param_0,   // base address of the f32 data that is loaded and summed
    .param .u64 triton__0d1d2d3de4e_param_1,   // base address of the 64-bit row indices
    .param .u64 triton__0d1d2d3de4e_param_2,   // base address of the f32 buffer updated by atom.add
    .param .u32 triton__0d1d2d3de4e_param_3,   // not read by the kernel body
    .param .u32 triton__0d1d2d3de4e_param_4    // not read by the kernel body
)
.maxntid 256, 1, 1
{
    .reg .pred %p<20>;
    .reg .b32 %r<107>;
    .reg .f32 %f<60>;
    .reg .b64 %rd<18>;
    .loc 1 18 0
$L__func_begin0:
    .loc 1 18 0
    ld.param.u64 %rd6, [triton__0d1d2d3de4e_param_0];
    ld.param.u64 %rd7, [triton__0d1d2d3de4e_param_1];
$L__tmp0:
    // ---- Thread-index decomposition ------------------------------------
    .loc 1 22 44
    mov.u32 %r32, %tid.x;                       // r32 = tid
    and.b32 %r33, %r32, 31;                     // r33 = lane id (tid % 32)
    ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_2];
    shl.b32 %r34, %r32, 2;                      // r34 = tid * 4 (reused later as a byte offset)
    and.b32 %r35, %r34, 12;                     // r35 = (tid & 3) * 4: first of 4 columns this thread loads
    and.b32 %r36, %r32, 15;                     // r36 = tid & 15: column this thread handles in the epilogue
    .loc 1 24 33
    bfe.u32 %r37, %r32, 5, 3;                   // r37 = tid bits 7:5 (tid / 32 for tid < 256)
    bfe.u32 %r38, %r32, 2, 3;                   // r38 = tid bits 4:2
    shl.b32 %r39, %r37, 3;
    or.b32 %r40, %r39, %r38;                    // r40 = tid bits 7:2 -> first row index, 0..63
    or.b32 %r41, %r40, 64;                      // r41 = second row index, 64..127
    .loc 1 21 28
    mov.u32 %r1, %ctaid.x;
    .loc 1 21 33
    shl.b32 %r42, %r1, 4;                       // r42 = ctaid.x * 16: first column of this CTA
    .loc 1 22 23
    or.b32 %r43, %r42, %r35;                    // column used for the vector loads
    or.b32 %r44, %r42, %r36;                    // column used for the final atomic update
    .loc 1 29 25
    setp.lt.u32 %p6, %r41, 120;                 // p6: second row is in range (< 120); first row always is
    // ---- Global addresses: (row << 17) + column, 4 bytes per element ----
    .loc 1 31 47
    shl.b32 %r45, %r40, 17;
    shl.b32 %r46, %r41, 17;
    .loc 1 31 40
    add.s32 %r47, %r43, %r45;
    add.s32 %r48, %r43, %r46;
    .loc 1 31 34
    mul.wide.s32 %rd9, %r47, 4;
    add.s64 %rd1, %rd6, %rd9;                   // rd1 = &param_0[row0 element]
    mul.wide.s32 %rd10, %r48, 4;
    add.s64 %rd2, %rd6, %rd10;                  // rd2 = &param_0[row1 element]
    mov.b32 %r6, 0;                             // fallback value for masked-off loads
    mov.pred %p1, -1;                           // p1 = always true
    // ---- Load 4 consecutive f32 from each of the two rows ---------------
    .loc 1 31 53
    mov.u32 %r2, 0x0;
    mov.u32 %r3, 0x0;
    mov.u32 %r4, 0x0;
    mov.u32 %r5, 0x0;
    @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
    @!%p1 mov.u32 %r2, %r6;                     // never executes: p1 is constant true
    @!%p1 mov.u32 %r3, %r6;
    @!%p1 mov.u32 %r4, %r6;
    @!%p1 mov.u32 %r5, %r6;
    mov.b32 %f1, %r2;
    mov.b32 %f2, %r3;
    mov.b32 %f3, %r4;
    mov.b32 %f4, %r5;
    mov.u32 %r10, 0x0;
    mov.u32 %r11, 0x0;
    mov.u32 %r12, 0x0;
    mov.u32 %r13, 0x0;
    @%p6 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
    @!%p6 mov.u32 %r10, %r6;                    // rows >= 120 contribute 0
    @!%p6 mov.u32 %r11, %r6;
    @!%p6 mov.u32 %r12, %r6;
    @!%p6 mov.u32 %r13, %r6;
    mov.b32 %f5, %r10;
    mov.b32 %f6, %r11;
    mov.b32 %f7, %r12;
    mov.b32 %f8, %r13;
    // Add each loaded value to the constant 0.0.
    .loc 1 33 23
    add.f32 %f9, %f1, 0f00000000;
    add.f32 %f10, %f2, 0f00000000;
    add.f32 %f11, %f3, 0f00000000;
    add.f32 %f12, %f4, 0f00000000;
    add.f32 %f13, %f5, 0f00000000;
    add.f32 %f14, %f6, 0f00000000;
    add.f32 %f15, %f7, 0f00000000;
    add.f32 %f16, %f8, 0f00000000;
    // Re-apply the row mask to the second row's values.
    .loc 1 34 38
    selp.f32 %f17, %f13, 0f00000000, %p6;
    selp.f32 %f18, %f14, 0f00000000, %p6;
    selp.f32 %f19, %f15, 0f00000000, %p6;
    selp.f32 %f20, %f16, 0f00000000, %p6;
$L__tmp1:
    // Per-thread partial sums: f21..f24 = row0 + row1 for the 4 columns.
    .loc 2 233 15
    add.f32 %f21, %f9, %f17;
    add.f32 %f22, %f10, %f18;
    add.f32 %f23, %f11, %f19;
    add.f32 %f24, %f12, %f20;
$L__tmp2:
    // ---- Stage 1: intra-warp butterfly reduction -------------------------
    // XOR-shuffles with lane masks 16, 8, 4 combine the 8 lanes that share
    // the same low two lane bits (i.e. the same 4 columns). Result per
    // column: f30, f36, f42, f48.
    // Column 0 of 4 (f21 -> f30)
    .loc 2 243 36
    mov.b32 %r49, %f21;
    shfl.sync.bfly.b32 %r50, %r49, 16, 31, -1;
    mov.b32 %f25, %r50;
$L__tmp3:
    .loc 2 233 15
    add.f32 %f26, %f21, %f25;
$L__tmp4:
    .loc 2 243 36
    mov.b32 %r51, %f26;
    shfl.sync.bfly.b32 %r52, %r51, 8, 31, -1;
    mov.b32 %f27, %r52;
$L__tmp5:
    .loc 2 233 15
    add.f32 %f28, %f26, %f27;
$L__tmp6:
    .loc 2 243 36
    mov.b32 %r53, %f28;
    shfl.sync.bfly.b32 %r54, %r53, 4, 31, -1;
    mov.b32 %f29, %r54;
$L__tmp7:
    .loc 2 233 15
    add.f32 %f30, %f28, %f29;
$L__tmp8:
    // Column 1 of 4 (f22 -> f36)
    .loc 2 243 36
    mov.b32 %r55, %f22;
    shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1;
    mov.b32 %f31, %r56;
$L__tmp9:
    .loc 2 233 15
    add.f32 %f32, %f22, %f31;
$L__tmp10:
    .loc 2 243 36
    mov.b32 %r57, %f32;
    shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1;
    mov.b32 %f33, %r58;
$L__tmp11:
    .loc 2 233 15
    add.f32 %f34, %f32, %f33;
$L__tmp12:
    .loc 2 243 36
    mov.b32 %r59, %f34;
    shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1;
    mov.b32 %f35, %r60;
$L__tmp13:
    .loc 2 233 15
    add.f32 %f36, %f34, %f35;
$L__tmp14:
    // Column 2 of 4 (f23 -> f42)
    .loc 2 243 36
    mov.b32 %r61, %f23;
    shfl.sync.bfly.b32 %r62, %r61, 16, 31, -1;
    mov.b32 %f37, %r62;
$L__tmp15:
    .loc 2 233 15
    add.f32 %f38, %f23, %f37;
$L__tmp16:
    .loc 2 243 36
    mov.b32 %r63, %f38;
    shfl.sync.bfly.b32 %r64, %r63, 8, 31, -1;
    mov.b32 %f39, %r64;
$L__tmp17:
    .loc 2 233 15
    add.f32 %f40, %f38, %f39;
$L__tmp18:
    .loc 2 243 36
    mov.b32 %r65, %f40;
    shfl.sync.bfly.b32 %r66, %r65, 4, 31, -1;
    mov.b32 %f41, %r66;
$L__tmp19:
    .loc 2 233 15
    add.f32 %f42, %f40, %f41;
$L__tmp20:
    // Column 3 of 4 (f24 -> f48)
    .loc 2 243 36
    mov.b32 %r67, %f24;
    shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1;
    mov.b32 %f43, %r68;
$L__tmp21:
    .loc 2 233 15
    add.f32 %f44, %f24, %f43;
$L__tmp22:
    .loc 2 243 36
    mov.b32 %r69, %f44;
    shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1;
    mov.b32 %f45, %r70;
$L__tmp23:
    .loc 2 233 15
    add.f32 %f46, %f44, %f45;
$L__tmp24:
    .loc 2 243 36
    mov.b32 %r71, %f46;
    shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1;
    mov.b32 %f47, %r72;
$L__tmp25:
    .loc 2 233 15
    add.f32 %f48, %f46, %f47;
$L__tmp26:
    // ---- Publish per-warp partials to shared memory ----------------------
    // Lanes 0..3 of every warp write their 4 column sums. Byte offset is
    // column * 32 + (tid / 32) * 4, i.e. 8 consecutive f32 slots per column.
    .loc 2 243 36
    setp.lt.u32 %p11, %r33, 4;                  // p11: lane < 4
    shl.b32 %r73, %r37, 2;                      // r73 = (tid / 32) * 4 bytes
    shl.b32 %r74, %r35, 5;                      // r74 = first column * 32 bytes
    or.b32 %r75, %r74, %r73;
    mov.u32 %r76, global_smem;                  // r76 = shared-memory base address
    add.s32 %r18, %r76, %r75;
    mov.b32 %r19, %f30;
    @%p11 st.shared.b32 [ %r18 + 0 ], %r19;
    or.b32 %r77, %r74, 32;                      // next column: +32 bytes
    or.b32 %r78, %r77, %r73;
    add.s32 %r20, %r76, %r78;
    mov.b32 %r21, %f36;
    @%p11 st.shared.b32 [ %r20 + 0 ], %r21;
    or.b32 %r79, %r74, 64;
    or.b32 %r80, %r79, %r73;
    add.s32 %r22, %r76, %r80;
    mov.b32 %r23, %f42;
    @%p11 st.shared.b32 [ %r22 + 0 ], %r23;
    or.b32 %r81, %r74, 96;
    or.b32 %r82, %r81, %r73;
    add.s32 %r24, %r76, %r82;
    mov.b32 %r25, %f48;
    @%p11 st.shared.b32 [ %r24 + 0 ], %r25;
    bar.sync 0;                                 // all partials visible before stage 2
    // ---- Stage 2: combine the 8 per-warp partials of each column --------
    // Threads 0..127 each read one f32 (word index = tid); 8 consecutive
    // lanes hold one column's partials and are reduced with masks 4, 2, 1.
    // NOTE(review): for tid >= 128 %r26 is never written before it is
    // shuffled; those results are discarded because the store below is
    // guarded by p16, which includes p15.
    setp.lt.s32 %p15, %r32, 128;
    add.s32 %r27, %r76, %r34;                   // r27 = smem + tid * 4
    @%p15 ld.shared.b32 %r26, [ %r27 + 0 ];
    mov.b32 %f49, %r26;
    shfl.sync.bfly.b32 %r83, %r26, 4, 31, -1;
    mov.b32 %f50, %r83;
$L__tmp27:
    .loc 2 233 15
    add.f32 %f51, %f49, %f50;
$L__tmp28:
    .loc 2 243 36
    mov.b32 %r84, %f51;
    shfl.sync.bfly.b32 %r85, %r84, 2, 31, -1;
    mov.b32 %f52, %r85;
$L__tmp29:
    .loc 2 233 15
    add.f32 %f53, %f51, %f52;
$L__tmp30:
    .loc 2 243 36
    mov.b32 %r86, %f53;
    shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
    mov.b32 %f54, %r87;
$L__tmp31:
    .loc 2 233 15
    add.f32 %f55, %f53, %f54;
$L__tmp32:
    // The first thread of each group of 8 (tid % 8 == 0, tid < 128) writes
    // the column total back into slot 0 of that column.
    .loc 2 243 36
    and.b32 %r88, %r32, 7;
    setp.eq.s32 %p19, %r88, 0;
    and.pred %p16, %p15, %p19;
    mov.b32 %r29, %f55;
    @%p16 st.shared.b32 [ %r27 + 0 ], %r29;
    bar.sync 0;
    // Every thread reads back the totals of its own 4 columns
    // (byte offsets column * 32).
    add.s32 %r89, %r76, %r74;
    ld.shared.f32 %f56, [%r89];
    add.s32 %r90, %r76, %r77;
    ld.shared.f32 %f57, [%r90];
    add.s32 %r91, %r76, %r79;
    ld.shared.f32 %f58, [%r91];
    add.s32 %r92, %r76, %r81;
    ld.shared.f32 %f59, [%r92];
$L__tmp33:
    // ---- Repack: 16 column totals as a dense f32[16] at smem[0..63] -----
    // The barrier before the stores keeps them from overwriting slots other
    // threads are still reading above.
    .loc 1 35 28
    bar.sync 0;
    shl.b32 %r93, %r35, 2;                      // r93 = first column * 4 bytes
    add.s32 %r94, %r76, %r93;
    st.shared.f32 [%r94], %f56;
    st.shared.f32 [%r94+4], %f57;
    st.shared.f32 [%r94+8], %f58;
    st.shared.f32 [%r94+12], %f59;
    bar.sync 0;
    shl.b32 %r95, %r36, 2;                      // r95 = (tid & 15) * 4 bytes
    add.s32 %r96, %r76, %r95;
    // ---- Split the output column into quotient / remainder by 256 -------
    // Signed division rounding toward zero: a bias of 255 is added only
    // when r44 is negative.
    .loc 1 36 20
    shr.s32 %r98, %r44, 31;                     // all ones if r44 < 0, else 0
    shr.u32 %r99, %r98, 24;                     // 255 if r44 < 0, else 0
    add.s32 %r100, %r44, %r99;
    shr.s32 %r101, %r100, 8;                    // r101 = r44 / 256
    and.b32 %r102, %r100, -256;
    sub.s32 %r103, %r44, %r102;                 // r103 = r44 % 256
    .loc 1 38 30
    mul.wide.s32 %rd11, %r101, 8;               // 8-byte index entries
    add.s64 %rd4, %rd7, %rd11;                  // rd4 = &param_1[r44 / 256]
    .loc 1 45 55
    ld.shared.u32 %r31, [%r96];                 // r31 = total for column (tid & 15), raw f32 bits
    .loc 1 38 35
    mov.u64 %rd3, 0x0;
    @%p1 ld.global.L1::evict_last.b64 { %rd3 }, [ %rd4 + 0 ];
    // Wrap a negative index: bit 63 of rd3 lands on bit 9 after the shift,
    // so 512 is added exactly when rd3 < 0.
    .loc 1 41 32
    shr.u64 %rd12, %rd3, 54;
    and.b64 %rd13, %rd12, 512;
    add.s64 %rd14, %rd13, %rd3;
    // Destination = param_2 + index * 1024 bytes + (r44 % 256) * 4 bytes.
    .loc 1 45 30
    shl.b64 %rd15, %rd14, 10;
    add.s64 %rd16, %rd8, %rd15;
    mul.wide.s32 %rd17, %r103, 4;
    add.s64 %rd5, %rd16, %rd17;
    // Only threads with tid bits 7:4 == 0 (tid < 16) issue the atomic, one
    // per column of this CTA.
    .loc 1 45 55
    bfe.u32 %r104, %r32, 4, 1;
    shl.b32 %r105, %r37, 1;
    or.b32 %r106, %r105, %r104;
    setp.eq.s32 %p18, %r106, 0;
    mov.u32 %r30, 0x0;
    @%p18 atom.global.gpu.acq_rel.add.f32 %r30, [ %rd5 + 0 ], %r31;   // previous value in r30 is unused
    .loc 1 45 4
    ret;
$L__tmp34:
$L__func_end0:
}
    // Source files referenced by the `.loc` directives above.
    .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
    .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
    // DWARF abbreviation table. Decoded names assume the standard DWARF
    // tag / attribute / form numbering.
    .section .debug_abbrev
    {
.b8 1                       // abbreviation 1
.b8 17                      // DW_TAG_compile_unit
.b8 1                       // has children
.b8 37 .b8 8                // DW_AT_producer, DW_FORM_string
.b8 19 .b8 5                // DW_AT_language, DW_FORM_data2
.b8 3 .b8 8                 // DW_AT_name, DW_FORM_string
.b8 16 .b8 6                // DW_AT_stmt_list, DW_FORM_data4
.b8 27 .b8 8                // DW_AT_comp_dir, DW_FORM_string
.b8 180 .b8 66 .b8 12       // ULEB128 0x2134 (DW_AT_GNU_pubnames), DW_FORM_flag
.b8 17 .b8 1                // DW_AT_low_pc, DW_FORM_addr
.b8 18 .b8 1                // DW_AT_high_pc, DW_FORM_addr
.b8 0 .b8 0                 // end of abbreviation 1
.b8 2                       // abbreviation 2
.b8 46                      // DW_TAG_subprogram
.b8 0                       // no children
.b8 135 .b8 64 .b8 8        // ULEB128 0x2007 (DW_AT_MIPS_linkage_name), DW_FORM_string
.b8 3 .b8 8                 // DW_AT_name, DW_FORM_string
.b8 58 .b8 11               // DW_AT_decl_file, DW_FORM_data1
.b8 59 .b8 11               // DW_AT_decl_line, DW_FORM_data1
.b8 63 .b8 12               // DW_AT_external, DW_FORM_flag
.b8 32 .b8 11               // DW_AT_inline, DW_FORM_data1
.b8 0 .b8 0                 // end of abbreviation 2
.b8 3                       // abbreviation 3
.b8 46                      // DW_TAG_subprogram
.b8 1                       // has children
.b8 17 .b8 1                // DW_AT_low_pc, DW_FORM_addr
.b8 18 .b8 1                // DW_AT_high_pc, DW_FORM_addr
.b8 64 .b8 10               // DW_AT_frame_base, DW_FORM_block1
.b8 49 .b8 19               // DW_AT_abstract_origin, DW_FORM_ref4
.b8 0 .b8 0                 // end of abbreviation 3
.b8 4                       // abbreviation 4
.b8 29                      // DW_TAG_inlined_subroutine
.b8 1                       // has children
.b8 49 .b8 19               // DW_AT_abstract_origin, DW_FORM_ref4
.b8 17 .b8 1                // DW_AT_low_pc, DW_FORM_addr
.b8 18 .b8 1                // DW_AT_high_pc, DW_FORM_addr
.b8 88 .b8 11               // DW_AT_call_file, DW_FORM_data1
.b8 89 .b8 11               // DW_AT_call_line, DW_FORM_data1
.b8 87 .b8 11               // DW_AT_call_column, DW_FORM_data1
.b8 0 .b8 0                 // end of abbreviation 4
.b8 5                       // abbreviation 5
.b8 29                      // DW_TAG_inlined_subroutine
.b8 0                       // no children
.b8 49 .b8 19               // DW_AT_abstract_origin, DW_FORM_ref4
.b8 17 .b8 1                // DW_AT_low_pc, DW_FORM_addr
.b8 18 .b8 1                // DW_AT_high_pc, DW_FORM_addr
.b8 88 .b8 11               // DW_AT_call_file, DW_FORM_data1
.b8 89 .b8 11               // DW_AT_call_line, DW_FORM_data1
.b8 87 .b8 11               // DW_AT_call_column, DW_FORM_data1
.b8 0 .b8 0                 // end of abbreviation 5
.b8 0                       // end of abbreviation table
    }
    // DWARF debug information for the kernel. Field names assume the
    // standard DWARF v2 layout and the abbreviation codes defined in this
    // file's .debug_abbrev section.
    .section .debug_info
    {
.b32 264                    // unit length in bytes (excluding this field)
.b8 2 .b8 0                 // DWARF version 2
.b32 .debug_abbrev          // offset into the abbreviation table
.b8 8                       // address size in bytes
.b8 1                       // DIE: abbreviation 1 (compile unit)
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0    // producer: "triton"
.b8 2 .b8 0                 // language code 2
// name: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
.b8 99 .b8 54 .b8 105 .b8 107 .b8 53 .b8 118 .b8 120 .b8 55
.b8 112 .b8 50 .b8 50 .b8 102 .b8 112 .b8 107 .b8 52 .b8 100
.b8 99 .b8 118 .b8 104 .b8 53 .b8 53 .b8 122 .b8 105 .b8 109
.b8 119 .b8 52 .b8 116 .b8 53 .b8 110 .b8 114 .b8 53 .b8 122
.b8 110 .b8 50 .b8 98 .b8 55 .b8 105 .b8 110 .b8 117 .b8 106
.b8 120 .b8 106 .b8 97 .b8 117 .b8 120 .b8 115 .b8 104 .b8 108
.b8 106 .b8 117 .b8 109 .b8 109 .b8 46 .b8 112 .b8 121 .b8 0
.b32 .debug_line            // statement list (line table) offset
// compilation directory: "/tmp/torchinductor_root/6i"
.b8 47 .b8 116 .b8 109 .b8 112 .b8 47 .b8 116 .b8 111 .b8 114
.b8 99 .b8 104 .b8 105 .b8 110 .b8 100 .b8 117 .b8 99 .b8 116
.b8 111 .b8 114 .b8 95 .b8 114 .b8 111 .b8 111 .b8 116 .b8 47
.b8 54 .b8 105 .b8 0
.b8 1                       // flag attribute (0x2134) = true
.b64 $L__func_begin0        // low_pc
.b64 $L__func_end0          // high_pc
.b8 2                       // DIE at offset 125: abbreviation 2 (abstract subprogram)
// linkage name: "triton__0d1d2d3de4e"
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 101 .b8 52 .b8 101 .b8 0
// name: "triton__0d1d2d3de4e"
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 101 .b8 52 .b8 101 .b8 0
.b8 1                       // decl_file = 1
.b8 18                      // decl_line = 18
.b8 1                       // external = true
.b8 1                       // inline = 1
.b8 3                       // DIE: abbreviation 3 (concrete subprogram)
.b64 $L__func_begin0        // low_pc
.b64 $L__func_end0          // high_pc
.b8 1 .b8 156               // frame_base: 1-byte block, opcode 0x9c
.b32 125                    // abstract_origin -> DIE at offset 125
.b8 4                       // DIE: abbreviation 4 (inlined subroutine, has children)
.b32 125                    // abstract_origin
.b64 $L__tmp1               // low_pc
.b64 $L__tmp32              // high_pc
.b8 2 .b8 35 .b8 25         // call_file = 2, call_line = 35, call_column = 25
.b8 5                       // DIE: abbreviation 5 (inlined subroutine, leaf)
.b32 125                    // abstract_origin
.b64 $L__tmp1               // low_pc
.b64 $L__tmp32              // high_pc
.b8 2 .b8 243 .b8 36        // call_file = 2, call_line = 243, call_column = 36
.b8 0                       // end of children of the abbreviation-4 DIE
.b8 5                       // DIE: abbreviation 5 (inlined subroutine, leaf)
.b32 125                    // abstract_origin
.b64 $L__tmp2               // low_pc
.b64 $L__tmp33              // high_pc
.b8 2 .b8 35 .b8 25         // call_file = 2, call_line = 35, call_column = 25
.b8 0                       // end of children of the abbreviation-3 DIE
.b8 0                       // end of children of the compile unit
    }
    // Public-name lookup table: maps the kernel name to its DIE offset.
    .section .debug_pubnames
    {
.b32 $L__pubNames_end0-$L__pubNames_start0    // table length
$L__pubNames_start0:
.b8 2 .b8 0                 // version 2
.b32 .debug_info            // offset of the referenced unit
.b32 268                    // size of that unit (264 + 4-byte length field)
.b32 125                    // DIE offset of the entry below
// "triton__0d1d2d3de4e"
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100
.b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100 .b8 101 .b8 52 .b8 101 .b8 0
.b32 0                      // terminator
$L__pubNames_end0:
    }
    // Public-type lookup table: header and terminator only, no entries.
    .section .debug_pubtypes
    {
.b32 $L__pubTypes_end0-$L__pubTypes_start0    // table length
$L__pubTypes_start0:
.b8 2 .b8 0                 // version 2
.b32 .debug_info            // offset of the referenced unit
.b32 268                    // size of that unit
.b32 0                      // terminator
$L__pubTypes_end0:
    }
    // Empty location-list section.
    .section .debug_loc { }