|
// |
|
// Generated by LLVM NVPTX Back-End |
|
// |
|
|
|
// Module-level directives. They must precede any code in the module.
.version 8.2            // PTX ISA version this module is written against
.target sm_89           // target architecture the module is compiled for
.address_size 64        // addresses are 64 bits wide
|
|
|
// .globl triton__0d1d2d3d4de |
|
|
|
// -----------------------------------------------------------------------------
// triton__0d1d2d3d4de
//
// Kernel parameters:
//   param_0, param_1, param_2 : 64-bit addresses of the three source buffers
//   param_3                   : 64-bit address of the destination buffer
//   param_4                   : 32-bit value; declared but never read below
//
// Work split: at most 128 threads per CTA (.maxntid); every thread handles
// 8 consecutive 16-bit elements, so one CTA covers 1024 elements:
//   x   = ctaid.x * 1024 + tid.x * 8
//   sel = (x / 256) % 3
//   src = (x / 768) * 256 + (x % 256)
// x is a multiple of 8, so all eight elements of a thread share sel and the
// same 256-element row.
//
// Result (arithmetic in f32, rounded back to bf16 with cvt.rn), i = 0..7:
//   out[x + i] = (sel == 2 ? in0[src + i] : 0)
//              + (sel == 1 ? in1[src + i] : 0)
//              + (sel == 0 ? in2[src + i] : 0)
//
// Every memory access is predicated on %p1, which is set to constant true,
// so there is no bounds check in this code.
// NOTE(review): presumably the launcher only creates CTAs that are entirely
// in range and passes 16-byte aligned buffers (the accesses are 16-byte
// vector loads/stores) - confirm against the caller.
// -----------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4de(
    .param .u64 triton__0d1d2d3d4de_param_0,
    .param .u64 triton__0d1d2d3d4de_param_1,
    .param .u64 triton__0d1d2d3d4de_param_2,
    .param .u64 triton__0d1d2d3d4de_param_3,
    .param .u32 triton__0d1d2d3d4de_param_4
)
.maxntid 128, 1, 1
{
    .reg .pred  %p<8>;
    .reg .b16   %rs<33>;
    .reg .b32   %r<77>;
    .reg .f32   %f<65>;
    .reg .b64   %rd<11>;
    .loc 1 18 0
$L__func_begin0:
    .loc 1 18 0

    // Base addresses of the first two source buffers.
    ld.param.u64    %rd5, [triton__0d1d2d3d4de_param_0];
    ld.param.u64    %rd6, [triton__0d1d2d3d4de_param_1];
$L__tmp0:
    .loc 1 21 36
    // %r52 = (tid.x * 8) & 1016 : first element of this thread inside the CTA.
    mov.u32         %r50, %tid.x;
    shl.b32         %r51, %r50, 3;
    ld.param.u64    %rd7, [triton__0d1d2d3d4de_param_2];
    and.b32         %r52, %r51, 1016;
    ld.param.u64    %rd8, [triton__0d1d2d3d4de_param_3];
    .loc 1 20 28
    mov.u32         %r1, %ctaid.x;
    .loc 1 20 33
    shl.b32         %r53, %r1, 10;              // ctaid.x * 1024
    .loc 1 21 23
    or.b32          %r54, %r53, %r52;           // x: first element index of this thread
    .loc 1 23 20
    // %r59 = x / 256 as a signed division that truncates toward zero:
    // add 255 to negative values before the arithmetic shift.
    shr.s32         %r56, %r54, 31;
    shr.u32         %r57, %r56, 24;
    add.s32         %r58, %r54, %r57;
    shr.s32         %r59, %r58, 8;
    .loc 1 23 27
    // %r64 = (x / 256) % 3.  1431655766 = ceil(2^32 / 3) is the multiply-high
    // constant for a signed division by 3; the remainder is n - 3 * (n / 3).
    mul.hi.s32      %r60, %r59, 1431655766;
    shr.u32         %r61, %r60, 31;
    add.s32         %r62, %r60, %r61;
    mul.lo.s32      %r63, %r62, 3;
    sub.s32         %r64, %r59, %r63;
    // %r66 = x % 256 (signed remainder, reusing the biased value %r58).
    and.b32         %r65, %r58, -256;
    sub.s32         %r66, %r54, %r65;
    .loc 1 25 20
    // %r70 = x / 768.  715827883 = ceil(2^32 / 6); the extra shift by 7
    // supplies the remaining factor of 128.  A logical shift is enough here:
    // %r70 is only consumed by the shl by 8 below, which drops the top bits.
    mul.hi.s32      %r67, %r54, 715827883;
    shr.u32         %r68, %r67, 31;
    shr.u32         %r69, %r67, 7;
    add.s32         %r70, %r69, %r68;
    .loc 1 27 40
    shl.b32         %r71, %r70, 8;              // (x / 768) * 256
    .loc 1 27 36
    add.s32         %r72, %r71, %r66;           // src element index
    .loc 1 27 30
    mul.wide.s32    %rd9, %r72, 2;              // byte offset: 2 bytes per element
    add.s64         %rd1, %rd5, %rd9;
    mov.pred        %p1, -1;                    // always-true predicate for all accesses
    .loc 1 27 46
    // Source 0: one 16-byte load = 8 packed 16-bit values in %r2..%r5.
    mov.u32         %r2, 0x0;
    mov.u32         %r3, 0x0;
    mov.u32         %r4, 0x0;
    mov.u32         %r5, 0x0;
    @%p1 ld.global.L1::evict_last.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
    // Split each 32-bit word: cvt.u16.u32 keeps the low half, the scoped
    // mov.b32 unpack discards the low half into tmp and keeps the high half.
    cvt.u16.u32     %rs1, %r2;
    { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
    cvt.u16.u32     %rs3, %r3;
    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
    cvt.u16.u32     %rs5, %r4;
    { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
    cvt.u16.u32     %rs7, %r5;
    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
    .loc 1 27 85
    // Widen bf16 -> f32; the result lands in a .b32 register and is then
    // moved bit-for-bit into an .f32 register.
    cvt.f32.bf16    %r6, %rs1;
    mov.b32         %f1, %r6;
    cvt.f32.bf16    %r7, %rs2;
    mov.b32         %f2, %r7;
    cvt.f32.bf16    %r8, %rs3;
    mov.b32         %f3, %r8;
    cvt.f32.bf16    %r9, %rs4;
    mov.b32         %f4, %r9;
    cvt.f32.bf16    %r10, %rs5;
    mov.b32         %f5, %r10;
    cvt.f32.bf16    %r11, %rs6;
    mov.b32         %f6, %r11;
    cvt.f32.bf16    %r12, %rs7;
    mov.b32         %f7, %r12;
    cvt.f32.bf16    %r13, %rs8;
    mov.b32         %f8, %r13;
    .loc 1 28 30
    add.s64         %rd2, %rd6, %rd9;
    .loc 1 28 46
    // Source 1: same offset, same unpack and widen sequence.
    mov.u32         %r14, 0x0;
    mov.u32         %r15, 0x0;
    mov.u32         %r16, 0x0;
    mov.u32         %r17, 0x0;
    @%p1 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
    cvt.u16.u32     %rs9, %r14;
    { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
    cvt.u16.u32     %rs11, %r15;
    { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
    cvt.u16.u32     %rs13, %r16;
    { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
    cvt.u16.u32     %rs15, %r17;
    { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
    .loc 1 28 85
    cvt.f32.bf16    %r18, %rs9;
    mov.b32         %f9, %r18;
    cvt.f32.bf16    %r19, %rs10;
    mov.b32         %f10, %r19;
    cvt.f32.bf16    %r20, %rs11;
    mov.b32         %f11, %r20;
    cvt.f32.bf16    %r21, %rs12;
    mov.b32         %f12, %r21;
    cvt.f32.bf16    %r22, %rs13;
    mov.b32         %f13, %r22;
    cvt.f32.bf16    %r23, %rs14;
    mov.b32         %f14, %r23;
    cvt.f32.bf16    %r24, %rs15;
    mov.b32         %f15, %r24;
    cvt.f32.bf16    %r25, %rs16;
    mov.b32         %f16, %r25;
    .loc 1 29 31
    add.s64         %rd3, %rd7, %rd9;
    .loc 1 29 47
    // Source 2: same offset, same unpack and widen sequence.
    mov.u32         %r26, 0x0;
    mov.u32         %r27, 0x0;
    mov.u32         %r28, 0x0;
    mov.u32         %r29, 0x0;
    @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd3 + 0 ];
    cvt.u16.u32     %rs17, %r26;
    { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r26; }
    cvt.u16.u32     %rs19, %r27;
    { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r27; }
    cvt.u16.u32     %rs21, %r28;
    { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r28; }
    cvt.u16.u32     %rs23, %r29;
    { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r29; }
    .loc 1 29 86
    cvt.f32.bf16    %r30, %rs17;
    mov.b32         %f17, %r30;
    cvt.f32.bf16    %r31, %rs18;
    mov.b32         %f18, %r31;
    cvt.f32.bf16    %r32, %rs19;
    mov.b32         %f19, %r32;
    cvt.f32.bf16    %r33, %rs20;
    mov.b32         %f20, %r33;
    cvt.f32.bf16    %r34, %rs21;
    mov.b32         %f21, %r34;
    cvt.f32.bf16    %r35, %rs22;
    mov.b32         %f22, %r35;
    cvt.f32.bf16    %r36, %rs23;
    mov.b32         %f23, %r36;
    cvt.f32.bf16    %r37, %rs24;
    mov.b32         %f24, %r37;
    .loc 1 32 19
    setp.eq.s32     %p5, %r64, 2;               // sel == 2
    .loc 1 34 32
    // Keep source 0 only where sel == 2, otherwise contribute 0.0.
    selp.f32        %f25, %f1, 0f00000000, %p5;
    selp.f32        %f26, %f2, 0f00000000, %p5;
    selp.f32        %f27, %f3, 0f00000000, %p5;
    selp.f32        %f28, %f4, 0f00000000, %p5;
    selp.f32        %f29, %f5, 0f00000000, %p5;
    selp.f32        %f30, %f6, 0f00000000, %p5;
    selp.f32        %f31, %f7, 0f00000000, %p5;
    selp.f32        %f32, %f8, 0f00000000, %p5;
    .loc 1 36 19
    setp.eq.s32     %p6, %r64, 1;               // sel == 1
    .loc 1 37 32
    // Keep source 1 only where sel == 1.
    selp.f32        %f33, %f9, 0f00000000, %p6;
    selp.f32        %f34, %f10, 0f00000000, %p6;
    selp.f32        %f35, %f11, 0f00000000, %p6;
    selp.f32        %f36, %f12, 0f00000000, %p6;
    selp.f32        %f37, %f13, 0f00000000, %p6;
    selp.f32        %f38, %f14, 0f00000000, %p6;
    selp.f32        %f39, %f15, 0f00000000, %p6;
    selp.f32        %f40, %f16, 0f00000000, %p6;
    .loc 1 38 19
    add.f32         %f41, %f25, %f33;
    add.f32         %f42, %f26, %f34;
    add.f32         %f43, %f27, %f35;
    add.f32         %f44, %f28, %f36;
    add.f32         %f45, %f29, %f37;
    add.f32         %f46, %f30, %f38;
    add.f32         %f47, %f31, %f39;
    add.f32         %f48, %f32, %f40;
    .loc 1 40 20
    setp.eq.s32     %p7, %r64, 0;               // sel == 0
    .loc 1 41 35
    // Keep source 2 only where sel == 0.
    selp.f32        %f49, %f17, 0f00000000, %p7;
    selp.f32        %f50, %f18, 0f00000000, %p7;
    selp.f32        %f51, %f19, 0f00000000, %p7;
    selp.f32        %f52, %f20, 0f00000000, %p7;
    selp.f32        %f53, %f21, 0f00000000, %p7;
    selp.f32        %f54, %f22, 0f00000000, %p7;
    selp.f32        %f55, %f23, 0f00000000, %p7;
    selp.f32        %f56, %f24, 0f00000000, %p7;
    .loc 1 42 20
    add.f32         %f57, %f41, %f49;
    add.f32         %f58, %f42, %f50;
    add.f32         %f59, %f43, %f51;
    add.f32         %f60, %f44, %f52;
    add.f32         %f61, %f45, %f53;
    add.f32         %f62, %f46, %f54;
    add.f32         %f63, %f47, %f55;
    add.f32         %f64, %f48, %f56;
    .loc 1 43 25
    // Destination is indexed by x itself (not by src).
    mul.wide.s32    %rd10, %r54, 2;
    add.s64         %rd4, %rd8, %rd10;
    .loc 1 43 37
    // Narrow each f32 sum to bf16 (round to nearest), then pack pairs of
    // 16-bit values into 32-bit words for a single 16-byte store.
    mov.b32         %r38, %f57;
    cvt.rn.bf16.f32 %rs25, %r38;
    mov.b32         %r39, %f58;
    cvt.rn.bf16.f32 %rs26, %r39;
    mov.b32         %r40, %f59;
    cvt.rn.bf16.f32 %rs27, %r40;
    mov.b32         %r41, %f60;
    cvt.rn.bf16.f32 %rs28, %r41;
    mov.b32         %r42, %f61;
    cvt.rn.bf16.f32 %rs29, %r42;
    mov.b32         %r43, %f62;
    cvt.rn.bf16.f32 %rs30, %r43;
    mov.b32         %r44, %f63;
    cvt.rn.bf16.f32 %rs31, %r44;
    mov.b32         %r45, %f64;
    cvt.rn.bf16.f32 %rs32, %r45;
    mov.b32         %r73, {%rs25, %rs26};
    mov.b32         %r74, {%rs27, %rs28};
    mov.b32         %r75, {%rs29, %rs30};
    mov.b32         %r76, {%rs31, %rs32};
    @%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r73, %r74, %r75, %r76 };
    .loc 1 43 4
    ret;
$L__tmp1:
$L__func_end0:

}
|
// Source file table entry: index 1 is the file that the kernel's
// `.loc 1 <line> <col>` directives refer to.
.file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py"
|
// DWARF abbreviation table: describes the layout of the two entries that
// .debug_info emits.  Each declaration is: code, tag, children flag, then
// (attribute, form) pairs terminated by 0, 0.  The symbolic names in the
// comments are the standard DWARF constants for the byte values shown.
.section .debug_abbrev
{
    // Abbreviation 1
    .b8 1               // abbreviation code
    .b8 17              // tag 0x11: DW_TAG_compile_unit
    .b8 1               // entry has children
    .b8 37, 8           // DW_AT_producer,      DW_FORM_string
    .b8 19, 5           // DW_AT_language,      DW_FORM_data2
    .b8 3, 8            // DW_AT_name,          DW_FORM_string
    .b8 16, 6           // DW_AT_stmt_list,     DW_FORM_data4
    .b8 27, 8           // DW_AT_comp_dir,      DW_FORM_string
    .b8 180, 66, 12     // ULEB128 0x2134 (DW_AT_GNU_pubnames), DW_FORM_flag
    .b8 17, 1           // DW_AT_low_pc,        DW_FORM_addr
    .b8 18, 1           // DW_AT_high_pc,       DW_FORM_addr
    .b8 0, 0            // end of attribute list

    // Abbreviation 2
    .b8 2               // abbreviation code
    .b8 46              // tag 0x2e: DW_TAG_subprogram
    .b8 0               // entry has no children
    .b8 17, 1           // DW_AT_low_pc,        DW_FORM_addr
    .b8 18, 1           // DW_AT_high_pc,       DW_FORM_addr
    .b8 64, 10          // DW_AT_frame_base,    DW_FORM_block1
    .b8 135, 64, 8      // ULEB128 0x2007 (DW_AT_MIPS_linkage_name), DW_FORM_string
    .b8 3, 8            // DW_AT_name,          DW_FORM_string
    .b8 58, 11          // DW_AT_decl_file,     DW_FORM_data1
    .b8 59, 11          // DW_AT_decl_line,     DW_FORM_data1
    .b8 63, 12          // DW_AT_external,      DW_FORM_flag
    .b8 0, 0            // end of attribute list

    .b8 0               // end of abbreviation table
}
|
// DWARF compilation unit: one compile-unit entry (abbreviation 1) with one
// child subprogram entry (abbreviation 2) covering the kernel's address
// range.  Strings are emitted as NUL-terminated ASCII byte lists; the
// decoded text is given in the comment above each list.
.section .debug_info
{
    // Unit header
    .b32 184                    // unit length: number of bytes after this field
    .b8 2, 0                    // DWARF version 2 (16-bit, low byte first)
    .b32 .debug_abbrev          // offset of this unit's abbreviation table
    .b8 8                       // address size in bytes

    // Entry using abbreviation 1 (compile unit)
    .b8 1
    // producer: "triton"
    .b8 116, 114, 105, 116, 111, 110, 0
    // language: code 2 (DW_LANG_C in the DWARF language table)
    .b8 2, 0
    // name: "c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py"
    .b8 99, 54, 51, 114, 55, 105, 117, 114, 119, 107
    .b8 53, 121, 100, 108, 115, 119, 104, 55, 114, 118
    .b8 104, 99, 109, 108, 120, 50, 99, 102, 114, 101
    .b8 116, 108, 114, 101, 119, 103, 119, 54, 116, 108
    .b8 106, 108, 117, 114, 115, 115, 104, 103, 116, 102
    .b8 112, 112, 46, 112, 121, 0
    // statement list: reference to the line-number table
    .b32 .debug_line
    // compilation directory: "/tmp/torchinductor_root/63"
    .b8 47, 116, 109, 112, 47
    .b8 116, 111, 114, 99, 104, 105, 110, 100, 117, 99, 116, 111, 114
    .b8 95, 114, 111, 111, 116, 47, 54, 51, 0
    .b8 1                       // pubnames flag: true
    .b64 $L__func_begin0        // low pc
    .b64 $L__func_end0          // high pc

    // Entry using abbreviation 2 (subprogram), at offset 125 in this section
    .b8 2
    .b64 $L__func_begin0        // low pc
    .b64 $L__func_end0          // high pc
    .b8 1, 156                  // frame base: 1-byte block, opcode 0x9c (DW_OP_call_frame_cfa)
    // linkage name: "triton__0d1d2d3d4de"
    .b8 116, 114, 105, 116, 111, 110, 95, 95
    .b8 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 101, 0
    // name: "triton__0d1d2d3d4de"
    .b8 116, 114, 105, 116, 111, 110, 95, 95
    .b8 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 101, 0
    .b8 1                       // declared in file 1
    .b8 18                      // declared at line 18
    .b8 1                       // external: true

    .b8 0                       // end of the compile unit's children
}
|
// Public-name lookup table: maps the kernel's name to the offset of its
// subprogram entry inside .debug_info.
.section .debug_pubnames
{
    .b32 $L__pubNames_end0-$L__pubNames_start0  // length of the set that follows
$L__pubNames_start0:
    .b8 2, 0                    // table version 2 (16-bit, low byte first)
    .b32 .debug_info            // the unit this table describes
    .b32 188                    // size of that unit: 184 bytes plus its 4-byte length field
    .b32 125                    // offset of the subprogram entry within .debug_info
    // name: "triton__0d1d2d3d4de"
    .b8 116, 114, 105, 116, 111, 110, 95, 95
    .b8 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 101, 0
    .b32 0                      // terminator: offset 0 ends the list
$L__pubNames_end0:
}
|
// Public-type lookup table for the same unit.  It holds only the header and
// the terminator, i.e. no type entries are listed.
.section .debug_pubtypes
{
    .b32 $L__pubTypes_end0-$L__pubTypes_start0  // length of the set that follows
$L__pubTypes_start0:
    .b8 2, 0                    // table version 2 (16-bit, low byte first)
    .b32 .debug_info            // the unit this table describes
    .b32 188                    // size of that unit: 184 bytes plus its 4-byte length field
    .b32 0                      // terminator: offset 0 ends the list
$L__pubTypes_end0:
}
|
// Location-list section; emitted empty.
.section .debug_loc { }
|
|