//
// Generated by LLVM NVPTX Back-End
//
// Overview (derived from the instructions below):
//   * One entry kernel, triton__0d1d2de, limited to 128 threads per CTA.
//   * Each CTA covers 512 consecutive element positions
//     (base = %ctaid.x * 512); each thread handles 4 of them.
//   * For every position a 64-bit value is loaded from param_0, the value -1
//     is replaced by 0, negative values get 50257 added, and the result is
//     checked against the unsigned range [0, 50257).
//   * If any checked value is out of range, __assertfail is called.
//   * Finally a single 32-bit constant is stored to
//     param_1 + position * 201028 + value * 4   (201028 = 4 * 50257).
//   * param_2 is declared but never read, and every load/store predicate is
//     the constant "true" (%p20), so no bounds masking is done here.
//

.version 8.2
.target sm_89
.address_size 64

	// .globl	triton__0d1d2de

// External assertion handler; called only on the failure path of the
// range check inside the kernel.
.extern .func __assertfail
(
	.param .b64 __assertfail_param_0,
	.param .b64 __assertfail_param_1,
	.param .b32 __assertfail_param_2,
	.param .b64 __assertfail_param_3,
	.param .b64 __assertfail_param_4
)
;

// ASCII bytes of "_call_with_frames_removed" (25 bytes, no terminating NUL).
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
// ASCII bytes of "<frozen importlib._bootstrap_external>" (38 bytes, no terminating NUL).
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
// ASCII bytes of "index out of bounds: 0 <= tmp7 < 50257" (38 bytes, no terminating NUL).
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55};

// Externally sized shared buffer used as staging space by the kernel.
// The kernel touches bytes [0, 256) in the flag exchange and bytes [0, 2048)
// in the address exchange.
// NOTE(review): declared with .align 1 but accessed below with
// st/ld.shared.u64; correctness relies on where the buffer is actually
// placed -- confirm the base is at least 8-byte aligned.
.extern .shared .align 1 .b8 global_smem[];

// ---------------------------------------------------------------------------
// triton__0d1d2de
//   param_0 : base address of the 64-bit input values (read, 8-byte stride)
//   param_1 : base address of the 32-bit output (written)
//   param_2 : not read by this kernel
//
// Long-lived registers:
//   %r1        = %tid.x & 127                     (thread index t)
//   %r2        = 2 * t
//   %rd1       = %ctaid.x * 512                   (first position of this CTA)
//   %rd2/%rd41 = rd1 | 2t, rd1 | (2t | 1)         (positions in first half)
//   %rd3/%rd40 = rd1 | (2t | 256), rd1 | (2t|257) (positions in second half)
//   %rd7,%rd6,%rd5,%rd4 = adjusted values for the four positions above
//   %r7        = address of global_smem
//   %p20       = constant true
// ---------------------------------------------------------------------------
.visible .entry triton__0d1d2de(
	.param .u64 triton__0d1d2de_param_0,
	.param .u64 triton__0d1d2de_param_1,
	.param .u64 triton__0d1d2de_param_2
)
.maxntid 128, 1, 1
{
	.reg .pred 	%p<24>;
	.reg .b16 	%rs<21>;
	.reg .b32 	%r<21>;
	.reg .b64 	%rd<58>;
	.loc	1 18 0
$L__func_begin0:
	.loc	1 18 0

	// Fetch the two pointer arguments that are actually used.
	ld.param.u64 	%rd9, [triton__0d1d2de_param_1];
	ld.param.u64 	%rd16, [triton__0d1d2de_param_0];
$L__tmp0:
	.loc	1 21 36
	// Per-thread offsets inside the 512-wide tile: 2t, 2t+1, 2t+256.
	mov.u32 	%r4, %tid.x;
	and.b32  	%r1, %r4, 127;
	shl.b32 	%r2, %r1, 1;
	or.b32  	%r5, %r2, 1;
	or.b32  	%r6, %r2, 256;
	.loc	1 20 28
	mov.u32 %r3, %ctaid.x;
	.loc	1 20 46
	// Tile base, widened to 64 bits (signed 32 x 32 -> 64 multiply).
	mul.wide.s32 	%rd1, %r3, 512;
	cvt.u64.u32 	%rd17, %r2;
	cvt.u64.u32 	%rd18, %r6;
	.loc	1 21 23
	// The OR acts as an add here: the tile base is a multiple of 512 and the
	// offsets are below 512.
	or.b64  	%rd2, %rd1, %rd17;
	or.b64  	%rd3, %rd1, %rd18;
	.loc	1 24 30
	// Byte address of position rd2 in the input (8 bytes per element); the
	// second load is 2048 bytes = 256 elements further on.
	shl.b64 	%rd19, %rd2, 3;
	add.s64 	%rd12, %rd16, %rd19;
	add.s64 	%rd15, %rd12, 2048;
	mov.pred 	%p20, -1;
	.loc	1 24 35
	// Two 2 x 64-bit vector loads. Destinations are zeroed first; with %p20
	// always true the loads always execute.
	mov.u64 %rd10, 0x0;
	mov.u64 %rd11, 0x0;
	@%p20 ld.global.v2.b64 { %rd10, %rd11 }, [ %rd12 + 0 ];
	mov.u64 %rd13, 0x0;
	mov.u64 %rd14, 0x0;
	@%p20 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ];
	.loc	1 26 19
	// Which loaded values are exactly -1?
	setp.eq.s64 	%p3, %rd14, -1;
	setp.eq.s64 	%p4, %rd13, -1;
	setp.eq.s64 	%p5, %rd11, -1;
	setp.eq.s64 	%p6, %rd10, -1;
	.loc	1 28 32
	// Replace -1 by 0, keep everything else.
	selp.b64 	%rd20, 0, %rd10, %p6;
	selp.b64 	%rd21, 0, %rd11, %p5;
	selp.b64 	%rd22, 0, %rd13, %p4;
	selp.b64 	%rd23, 0, %rd14, %p3;
	.loc	1 29 18
	// Candidate "value + 50257" for the negative case.
	add.s64 	%rd24, %rd23, 50257;
	add.s64 	%rd25, %rd22, 50257;
	add.s64 	%rd26, %rd21, 50257;
	add.s64 	%rd27, %rd20, 50257;
	.loc	1 30 18
	setp.lt.s64 	%p7, %rd23, 0;
	setp.lt.s64 	%p8, %rd22, 0;
	setp.lt.s64 	%p9, %rd21, 0;
	setp.lt.s64 	%p10, %rd20, 0;
	.loc	1 31 32
	// Negative values take the +50257 variant; others stay unchanged.
	selp.b64 	%rd7, %rd27, %rd20, %p10;
	selp.b64 	%rd6, %rd26, %rd21, %p9;
	selp.b64 	%rd5, %rd25, %rd22, %p8;
	selp.b64 	%rd4, %rd24, %rd23, %p7;
	.loc	1 32 36
	// Unsigned compare: true only for 0 <= value < 50257.
	setp.lt.u64 	%p11, %rd4, 50257;
	setp.lt.u64 	%p12, %rd5, 50257;
	setp.lt.u64 	%p13, %rd6, 50257;
	setp.lt.u64 	%p14, %rd7, 50257;
	// Exchange the in-range flags through shared memory.
	// Round 1: thread t writes flags for offsets 2t and 2t+1 as bytes at
	// global_smem[2t] and [2t+1], then reads back bytes [t] and [t+128].
	mov.u32 	%r7, global_smem;
	add.s32 	%r8, %r7, %r2;
	selp.u16 	%rs1, 1, 0, %p14;
	st.shared.u8 	[%r8], %rs1;
	cvt.u64.u32 	%rd8, %r5;
	selp.u16 	%rs2, 1, 0, %p13;
	st.shared.u8 	[%r8+1], %rs2;
	bar.sync 	0;
	add.s32 	%r9, %r7, %r1;
	ld.shared.u8 	%rs3, [%r9];
	ld.shared.u8 	%rs4, [%r9+128];
	// Barrier before the same slots are overwritten for round 2.
	bar.sync 	0;
	// Round 2: same exchange for the flags of offsets 2t+256 and 2t+257.
	selp.u16 	%rs5, 1, 0, %p12;
	st.shared.u8 	[%r8], %rs5;
	selp.u16 	%rs6, 1, 0, %p11;
	st.shared.u8 	[%r8+1], %rs6;
	bar.sync 	0;
	ld.shared.u8 	%rs7, [%r9];
	ld.shared.u8 	%rs8, [%r9+128];
	// Build a 4-bit failure mask: one bit per flag byte that read back as 0.
	//   bit 2 <- %rs7 == 0, bit 3 <- %rs8 == 0,
	//   bit 0 <- %rs4 == 0, bit 1 <- %rs3 == 0.
	setp.eq.s16 	%p15, %rs7, 0;
	selp.u16 	%rs9, 1, 0, %p15;
	shl.b16 	%rs10, %rs9, 2;
	setp.eq.s16 	%p16, %rs8, 0;
	selp.u16 	%rs11, -1, 0, %p16;
	shl.b16 	%rs12, %rs11, 3;
	or.b16  	%rs13, %rs12, %rs10;
	setp.eq.s16 	%p17, %rs4, 0;
	selp.u16 	%rs14, 1, 0, %p17;
	setp.eq.s16 	%p18, %rs3, 0;
	selp.u16 	%rs15, -1, 0, %p18;
	shl.b16 	%rs16, %rs15, 1;
	or.b16  	%rs17, %rs14, %rs16;
	and.b16  	%rs18, %rs17, 3;
	or.b16  	%rs19, %rs18, %rs13;
	.loc	1 32 51
	// Mask to the low 4 bits; skip the assertion call when none is set.
	and.b16  	%rs20, %rs19, 15;
	setp.eq.s16 	%p19, %rs20, 0;
	@%p19 bra 	$L__BB0_2;
	// Failure path: pass generic addresses of the message, file and function
	// byte arrays plus the constants 883 and 1 to __assertfail.
	// NOTE(review): 883 and 1 are presumably a line number and a character
	// size -- confirm against the __assertfail contract.
	mov.u64 	%rd28, assertMessage_0;
	cvta.global.u64 	%rd29, %rd28;
	mov.u64 	%rd30, assertFile_0;
	cvta.global.u64 	%rd31, %rd30;
	mov.u64 	%rd32, assertFunc_0;
	cvta.global.u64 	%rd33, %rd32;
	mov.b32 	%r10, 883;
	mov.u64 	%rd34, 1;
	{ // callseq 0, 0
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64 	[param0+0], %rd29;
	.param .b64 param1;
	st.param.b64 	[param1+0], %rd31;
	.param .b32 param2;
	st.param.b32 	[param2+0], %r10;
	.param .b64 param3;
	st.param.b64 	[param3+0], %rd33;
	.param .b64 param4;
	st.param.b64 	[param4+0], %rd34;
	call.uni
	__assertfail,
	(
	param0,
	param1,
	param2,
	param3,
	param4
	);
	} // callseq 0
	// NOTE(review): there is no exit or trap after the call; control falls
	// through to the stores below. Presumably __assertfail does not return --
	// confirm.
$L__BB0_2:
	.loc	1 21 36
	// Remaining two positions of this thread: 2t+257 and 2t+1.
	or.b32  	%r15, %r2, 257;
	cvt.u64.u32 	%rd39, %r15;
	.loc	1 21 23
	or.b64  	%rd40, %rd1, %rd39;
	or.b64  	%rd41, %rd1, %rd8;
	.loc	1 34 25
	// Destination address per position:
	//   param_1 + value * 4 + position * 201028
	// (201028 = 4 * 50257, i.e. a stride of 50257 four-byte elements).
	shl.b64 	%rd42, %rd7, 2;
	add.s64 	%rd43, %rd9, %rd42;
	mul.lo.s64 	%rd44, %rd2, 201028;
	add.s64 	%rd45, %rd43, %rd44;
	shl.b64 	%rd46, %rd6, 2;
	add.s64 	%rd47, %rd9, %rd46;
	mul.lo.s64 	%rd48, %rd41, 201028;
	add.s64 	%rd49, %rd47, %rd48;
	shl.b64 	%rd50, %rd5, 2;
	add.s64 	%rd51, %rd9, %rd50;
	mul.lo.s64 	%rd52, %rd3, 201028;
	add.s64 	%rd53, %rd51, %rd52;
	shl.b64 	%rd54, %rd4, 2;
	add.s64 	%rd55, %rd9, %rd54;
	mul.lo.s64 	%rd56, %rd40, 201028;
	add.s64 	%rd57, %rd55, %rd56;
	.loc	1 34 51
	// Exchange the 64-bit destination addresses through shared memory with
	// the same pattern as the flags: thread t writes slots 2t and 2t+1
	// (8 bytes each) and reads slots t and t+128 (byte offset +1024).
	// Presumably this makes neighbouring threads store to neighbouring tile
	// positions -- the code itself only shows the permutation.
	bar.sync 	0;
	shl.b32 	%r16, %r2, 3;
	add.s32 	%r18, %r7, %r16;
	st.shared.u64 	[%r18], %rd45;
	st.shared.u64 	[%r18+8], %rd49;
	bar.sync 	0;
	shl.b32 	%r19, %r1, 3;
	add.s32 	%r20, %r7, %r19;
	ld.shared.u64 	%rd35, [%r20];
	ld.shared.u64 	%rd36, [%r20+1024];
	// Barrier before the slots are reused for the second half of the tile.
	bar.sync 	0;
	st.shared.u64 	[%r18], %rd53;
	st.shared.u64 	[%r18+8], %rd57;
	bar.sync 	0;
	ld.shared.u64 	%rd37, [%r20];
	ld.shared.u64 	%rd38, [%r20+1024];
	// Constant to store: -1082130432 has the bit pattern 0xBF800000, which is
	// -1.0 if the destination is read as IEEE-754 binary32 (the store itself
	// is untyped .b32).
	mov.b32 	%r11, -1082130432;
	@%p20 st.global.b32 [ %rd35 + 0 ], { %r11 };
	@%p20 st.global.b32 [ %rd36 + 0 ], { %r11 };
	@%p20 st.global.b32 [ %rd37 + 0 ], { %r11 };
	@%p20 st.global.b32 [ %rd38 + 0 ], { %r11 };
	.loc	1 34 4
	ret;
$L__tmp1:
$L__func_end0:

}
	// Source file that the ".loc 1 <line> <col>" directives above refer to.
	.file	1 "/tmp/torchinductor_root/hl/chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py"
	// -----------------------------------------------------------------------
	// Debug sections (raw DWARF bytes).
	// NOTE(review): the DW_* names in the comments below were decoded by hand
	// from the byte values using the standard DWARF constants; confirm with a
	// DWARF dumper before relying on them. The ASCII decodings are taken
	// directly from the bytes.
	// -----------------------------------------------------------------------
	.section	.debug_abbrev
	{
.b8 1                                    // abbreviation code 1
.b8 17                                   // tag 0x11 (DW_TAG_compile_unit)
.b8 1                                    // has children
.b8 37 .b8 8                             // attr 0x25 (DW_AT_producer), form 0x08 (string)
.b8 19 .b8 5                             // attr 0x13 (DW_AT_language), form 0x05 (data2)
.b8 3 .b8 8                              // attr 0x03 (DW_AT_name), form 0x08 (string)
.b8 16 .b8 6                             // attr 0x10 (DW_AT_stmt_list), form 0x06 (data4)
.b8 27 .b8 8                             // attr 0x1b (DW_AT_comp_dir), form 0x08 (string)
.b8 180 .b8 66 .b8 12                    // attr ULEB128 0x2134, form 0x0c (flag)
.b8 17 .b8 1                             // attr 0x11 (DW_AT_low_pc), form 0x01 (addr)
.b8 18 .b8 1                             // attr 0x12 (DW_AT_high_pc), form 0x01 (addr)
.b8 0 .b8 0                              // end of abbreviation 1
.b8 2                                    // abbreviation code 2
.b8 46                                   // tag 0x2e (DW_TAG_subprogram)
.b8 0                                    // no children
.b8 17 .b8 1                             // attr 0x11 (DW_AT_low_pc), form 0x01 (addr)
.b8 18 .b8 1                             // attr 0x12 (DW_AT_high_pc), form 0x01 (addr)
.b8 64 .b8 10                            // attr 0x40 (DW_AT_frame_base), form 0x0a (block1)
.b8 135 .b8 64 .b8 8                     // attr ULEB128 0x2007, form 0x08 (string)
.b8 3 .b8 8                              // attr 0x03 (DW_AT_name), form 0x08 (string)
.b8 58 .b8 11                            // attr 0x3a (DW_AT_decl_file), form 0x0b (data1)
.b8 59 .b8 11                            // attr 0x3b (DW_AT_decl_line), form 0x0b (data1)
.b8 63 .b8 12                            // attr 0x3f (DW_AT_external), form 0x0c (flag)
.b8 0 .b8 0                              // end of abbreviation 2
.b8 0                                    // end of abbreviation table
	}
	.section	.debug_info
	{
.b32 176                                 // length of the unit that follows (bytes)
.b8 2 .b8 0                              // 16-bit value 2 (DWARF version)
.b32 .debug_abbrev                       // offset into .debug_abbrev
.b8 8                                    // address size in bytes
.b8 1                                    // entry using abbreviation 1
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0    // "triton\0"
.b8 2 .b8 0                              // 16-bit value 2 (language code)
// "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py\0" in four runs:
.b8 99 .b8 104 .b8 108 .b8 114 .b8 107 .b8 103 .b8 112 .b8 118 .b8 118 .b8 98 .b8 100 .b8 105 .b8 122 .b8 100 .b8 122 .b8 55    // "chlrkgpvvbdizdz7"
.b8 115 .b8 108 .b8 108 .b8 113 .b8 117 .b8 101 .b8 116 .b8 50 .b8 106 .b8 55 .b8 122 .b8 104 .b8 116 .b8 101 .b8 115 .b8 54    // "sllquet2j7zhtes6"
.b8 109 .b8 101 .b8 104 .b8 54 .b8 107 .b8 101 .b8 110 .b8 114 .b8 113 .b8 120 .b8 111 .b8 118 .b8 50 .b8 54 .b8 109 .b8 115    // "meh6kenrqxov26ms"
.b8 119 .b8 118 .b8 119 .b8 55 .b8 46 .b8 112 .b8 121 .b8 0    // "wvw7.py\0"
.b32 .debug_line                         // offset into .debug_line
.b8 47 .b8 116 .b8 109 .b8 112 .b8 47    // "/tmp/"
.b8 116 .b8 111 .b8 114 .b8 99 .b8 104 .b8 105 .b8 110 .b8 100 .b8 117 .b8 99 .b8 116 .b8 111 .b8 114 .b8 95 .b8 114 .b8 111 .b8 111 .b8 116    // "torchinductor_root"
.b8 47 .b8 104 .b8 108 .b8 0             // "/hl\0"
.b8 1                                    // flag value 1
.b64 $L__func_begin0                     // start label of the kernel body
.b64 $L__func_end0                       // end label of the kernel body
.b8 2                                    // entry using abbreviation 2
.b64 $L__func_begin0                     // start label of the kernel body
.b64 $L__func_end0                       // end label of the kernel body
.b8 1 .b8 156                            // 1-byte block containing 0x9c
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100 .b8 101 .b8 0    // "triton__0d1d2de\0"
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100 .b8 101 .b8 0    // "triton__0d1d2de\0"
.b8 1                                    // value 1 (file index)
.b8 18                                   // value 18 (matches ".loc 1 18 0" at kernel entry)
.b8 1                                    // flag value 1
.b8 0                                    // terminator for the children of the first entry
	}
	.section	.debug_pubnames
	{
.b32 $L__pubNames_end0-$L__pubNames_start0    // length of this table
$L__pubNames_start0:
.b8 2 .b8 0                              // 16-bit value 2 (version)
.b32 .debug_info                         // offset into .debug_info
.b32 180                                 // 176 + 4: size of the .debug_info unit including its length field
.b32 125                                 // byte offset of the second entry inside the .debug_info unit
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95 .b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100 .b8 101 .b8 0    // "triton__0d1d2de\0"
.b32 0                                   // end-of-list marker
$L__pubNames_end0:
	}
	.section	.debug_pubtypes
	{
.b32 $L__pubTypes_end0-$L__pubTypes_start0    // length of this table
$L__pubTypes_start0:
.b8 2 .b8 0                              // 16-bit value 2 (version)
.b32 .debug_info                         // offset into .debug_info
.b32 180                                 // same unit size as in .debug_pubnames
.b32 0                                   // end-of-list marker (no entries)
$L__pubTypes_end0:
	}
	// Empty location-list section.
	.section	.debug_loc	{	}