|
|
|
|
|
|
|
|
|
// PTX module preamble: ISA version 8.2, target sm_89, 64-bit addressing.
// NOTE(review): every line of this file ends in '|' (blank lines are a lone '|'),
// which is not PTX syntax; presumably a table/extraction artifact -- confirm and
// strip before handing the file to an assembler.
.version 8.2 |
|
.target sm_89 |
|
.address_size 64 |
|
|
|
|
|
// Unsized extern shared byte array; the kernel below uses its first words as
// scratch for the cross-warp stage of its reductions.
.extern .shared .align 1 .b8 global_smem[]; |
|
// 11-byte global whose initialiser bytes are ASCII "__CUDA_FTZ" followed by NUL.
// It is not referenced by any instruction visible in this file.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; |
|
|
|
// triton__0d1d2d3d4de5de: fused add + mean/variance normalisation + scale, bf16 output.
//   param_0 : base address, read as 4 x 32-bit values per thread (used as f32)
//   param_1 : base address, read as 4 x 16-bit values per thread (converted bf16 -> f32)
//   param_2 : base address, read as 4 x 32-bit values per thread, indexed only by the
//             thread's offset inside the CTA span (no ctaid term)
//   param_3 : base address, written as 4 x bf16 per thread
//   param_4, param_5 : declared but never loaded in this body
// Each thread handles 4 consecutive elements at index ctaid.x*256 + ((tid.x*4) & 252).
// Per element: out = bf16( (x - S1/256) * rsqrt(S2/256 + eps) * w ), where
//   x  = param_0[i] + f32(param_1[i]),  w = param_2[offset],
//   S1 = CTA-wide sum of x,  S2 = CTA-wide sum of (x - S1/256)^2.
// NOTE(review): the cross-warp stage only combines two shared slots (tid.x < 2), which
// matches two warps under .maxntid 64 -- confirm the launch always uses 64 threads.
.visible .entry triton__0d1d2d3d4de5de( |
|
.param .u64 triton__0d1d2d3d4de5de_param_0, |
|
.param .u64 triton__0d1d2d3d4de5de_param_1, |
|
.param .u64 triton__0d1d2d3d4de5de_param_2, |
|
.param .u64 triton__0d1d2d3d4de5de_param_3, |
|
.param .u32 triton__0d1d2d3d4de5de_param_4, |
|
.param .u32 triton__0d1d2d3d4de5de_param_5 |
|
) |
|
.maxntid 64, 1, 1 |
|
{ |
|
.reg .pred %p<23>; |
|
.reg .b16 %rs<9>; |
|
.reg .b32 %r<84>; |
|
.reg .f32 %f<70>; |
|
.reg .b64 %rd<12>; |
|
.loc 1 18 0 |
|
$L__func_begin0: |
|
.loc 1 18 0 |
|
|
|
ld.param.u64 %rd5, [triton__0d1d2d3d4de5de_param_0]; |
|
ld.param.u64 %rd6, [triton__0d1d2d3d4de5de_param_1]; |
|
$L__tmp0: |
|
.loc 1 26 26 |
|
// ---- Index computation ----
// %r50 = tid.x, %r51 = tid.x & 31 (position within the warp),
// %r52 = tid.x * 4, %r53 = (tid.x * 4) & 252 (offset inside the 256-element span).
mov.u32 %r50, %tid.x; |
|
and.b32 %r51, %r50, 31; |
|
ld.param.u64 %rd7, [triton__0d1d2d3d4de5de_param_2]; |
|
ld.param.u64 %rd8, [triton__0d1d2d3d4de5de_param_3]; |
|
shl.b32 %r52, %r50, 2; |
|
and.b32 %r53, %r52, 252; |
|
.loc 1 23 28 |
|
mov.u32 %r1, %ctaid.x; |
|
.loc 1 30 40 |
|
// %r55 = ctaid.x * 256 + offset (OR is equivalent to ADD: low 8 bits of %r54 are zero).
shl.b32 %r54, %r1, 8; |
|
.loc 1 30 36 |
|
or.b32 %r55, %r54, %r53; |
|
.loc 1 30 30 |
|
// %rd1 = param_0 + index * 4 bytes.
mul.wide.s32 %rd9, %r55, 4; |
|
add.s64 %rd1, %rd5, %rd9; |
|
mov.b32 %r6, 0; |
|
// %p1 is constant true, so every "@%p1" access below always executes and every
// "@!%p1" fallback move never does.
mov.pred %p1, -1; |
|
.loc 1 30 46 |
|
// ---- Load 4 x 32-bit from param_0 -> %f1..%f4 ----
mov.u32 %r2, 0x0; |
|
mov.u32 %r3, 0x0; |
|
mov.u32 %r4, 0x0; |
|
mov.u32 %r5, 0x0; |
|
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; |
|
@!%p1 mov.u32 %r2, %r6; |
|
@!%p1 mov.u32 %r3, %r6; |
|
@!%p1 mov.u32 %r4, %r6; |
|
@!%p1 mov.u32 %r5, %r6; |
|
mov.b32 %f1, %r2; |
|
mov.b32 %f2, %r3; |
|
mov.b32 %f3, %r4; |
|
mov.b32 %f4, %r5; |
|
.loc 1 31 30 |
|
// %rd10 = index * 2 bytes; reused at the end for the param_3 store address.
mul.wide.s32 %rd10, %r55, 2; |
|
add.s64 %rd2, %rd6, %rd10; |
|
.loc 1 31 46 |
|
// ---- Load 2 x 32-bit (= 4 x 16-bit) from param_1 and split into halves ----
mov.u32 %r10, 0x0; |
|
mov.u32 %r11, 0x0; |
|
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; |
|
@!%p1 mov.u32 %r10, %r6; |
|
@!%p1 mov.u32 %r11, %r6; |
|
// %rs1/%rs3 take the low 16 bits, %rs2/%rs4 the high 16 bits of each word.
cvt.u16.u32 %rs1, %r10; |
|
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } |
|
cvt.u16.u32 %rs3, %r11; |
|
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } |
|
.loc 1 31 67 |
|
// Widen the four bf16 values to f32 -> %f5..%f8.
cvt.f32.bf16 %r14, %rs1; |
|
mov.b32 %f5, %r14; |
|
cvt.f32.bf16 %r15, %rs2; |
|
mov.b32 %f6, %r15; |
|
cvt.f32.bf16 %r16, %rs3; |
|
mov.b32 %f7, %r16; |
|
cvt.f32.bf16 %r17, %rs4; |
|
mov.b32 %f8, %r17; |
|
.loc 1 32 31 |
|
// %rd3 = param_2 + offset * 4 bytes (no ctaid term: same data for every CTA).
mul.wide.u32 %rd11, %r53, 4; |
|
add.s64 %rd3, %rd7, %rd11; |
|
.loc 1 32 36 |
|
// ---- Load 4 x 32-bit from param_2 (evict_last hint) -> %r18..%r21 ----
// Consumed much later as the per-element scale factors %f61..%f58.
mov.u32 %r18, 0x0; |
|
mov.u32 %r19, 0x0; |
|
mov.u32 %r20, 0x0; |
|
mov.u32 %r21, 0x0; |
|
@%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; |
|
@!%p1 mov.u32 %r18, %r6; |
|
@!%p1 mov.u32 %r19, %r6; |
|
@!%p1 mov.u32 %r20, %r6; |
|
@!%p1 mov.u32 %r21, %r6; |
|
.loc 1 34 18 |
|
// x[k] = f32(param_1 value) + param_0 value -> %f9..%f12.
add.f32 %f9, %f5, %f1; |
|
add.f32 %f10, %f6, %f2; |
|
add.f32 %f11, %f7, %f3; |
|
add.f32 %f12, %f8, %f4; |
|
$L__tmp1: |
|
.loc 2 233 15 |
|
// ---- First reduction: sum of x ----
// Per-thread partial sum of the four values -> %f15.
add.f32 %f13, %f9, %f10; |
|
add.f32 %f14, %f13, %f11; |
|
add.f32 %f15, %f14, %f12; |
|
$L__tmp2: |
|
.loc 2 243 36 |
|
// Butterfly shuffle-and-add with strides 16, 8, 4, 2, 1; afterwards %f25 holds the
// sum over the whole warp in every participating thread.
mov.b32 %r56, %f15; |
|
shfl.sync.bfly.b32 %r57, %r56, 16, 31, -1; |
|
mov.b32 %f16, %r57; |
|
$L__tmp3: |
|
.loc 2 233 15 |
|
add.f32 %f17, %f15, %f16; |
|
$L__tmp4: |
|
.loc 2 243 36 |
|
mov.b32 %r58, %f17; |
|
shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1; |
|
mov.b32 %f18, %r59; |
|
$L__tmp5: |
|
.loc 2 233 15 |
|
add.f32 %f19, %f17, %f18; |
|
$L__tmp6: |
|
.loc 2 243 36 |
|
mov.b32 %r60, %f19; |
|
shfl.sync.bfly.b32 %r61, %r60, 4, 31, -1; |
|
mov.b32 %f20, %r61; |
|
$L__tmp7: |
|
.loc 2 233 15 |
|
add.f32 %f21, %f19, %f20; |
|
$L__tmp8: |
|
.loc 2 243 36 |
|
mov.b32 %r62, %f21; |
|
shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1; |
|
mov.b32 %f22, %r63; |
|
$L__tmp9: |
|
.loc 2 233 15 |
|
add.f32 %f23, %f21, %f22; |
|
$L__tmp10: |
|
.loc 2 243 36 |
|
mov.b32 %r64, %f23; |
|
shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1; |
|
mov.b32 %f24, %r65; |
|
$L__tmp11: |
|
.loc 2 233 15 |
|
add.f32 %f25, %f23, %f24; |
|
$L__tmp12: |
|
.loc 2 243 36 |
|
// Cross-warp stage. %p14 = (tid.x & 31) == 0. %r67 = (tid.x >> 3) & 4, i.e. byte
// offset 0 for tid.x 0..31 and 4 for tid.x 32..63, so each warp's first thread
// writes its warp sum to its own 32-bit slot in global_smem.
setp.eq.s32 %p14, %r51, 0; |
|
shr.u32 %r66, %r50, 3; |
|
and.b32 %r67, %r66, 4; |
|
mov.u32 %r68, global_smem; |
|
add.s32 %r26, %r68, %r67; |
|
mov.b32 %r27, %f25; |
|
@%p14 st.shared.b32 [ %r26 + 0 ], %r27; |
|
bar.sync 0; |
|
// %p15 = tid.x < 2: threads 0 and 1 each read one slot (address global_smem + tid.x*4).
// NOTE(review): %r28 is not written when %p15 is false yet is still fed to the
// shuffle below; the result is only stored under %p16 (a subset of %p15), so the
// undefined lanes appear to be harmless -- confirm.
setp.lt.s32 %p15, %r50, 2; |
|
add.s32 %r29, %r68, %r52; |
|
@%p15 ld.shared.b32 %r28, [ %r29 + 0 ]; |
|
mov.b32 %f26, %r28; |
|
shfl.sync.bfly.b32 %r69, %r28, 1, 31, -1; |
|
mov.b32 %f27, %r69; |
|
$L__tmp13: |
|
.loc 2 233 15 |
|
add.f32 %f28, %f26, %f27; |
|
$L__tmp14: |
|
.loc 2 243 36 |
|
// %p16 = (tid.x < 2) && (tid.x is even), i.e. only tid.x == 0 writes the combined
// total back to slot 0.
and.b32 %r70, %r50, 1; |
|
setp.eq.b32 %p21, %r70, 1; |
|
not.pred %p22, %p21; |
|
and.pred %p16, %p15, %p22; |
|
mov.b32 %r31, %f28; |
|
@%p16 st.shared.b32 [ %r29 + 0 ], %r31; |
|
bar.sync 0; |
|
// Every thread reads the total from slot 0.
ld.shared.f32 %f29, [global_smem]; |
|
$L__tmp15: |
|
.loc 3 8 15 |
|
add.f32 %f30, %f29, 0f00000000; |
|
$L__tmp16: |
|
.loc 1 42 20 |
|
// mean = total / 256.0 (1132462080 == 0x43800000, the f32 bit pattern of 256.0).
// %r34 is reused as the divisor for the second division further down.
mov.b32 %r33, %f30; |
|
mov.b32 %r34, 1132462080; |
|
div.full.f32 %r32, %r33, %r34; |
|
mov.b32 %f31, %r32; |
|
.loc 1 43 19 |
|
// Centered values d[k] = x[k] - mean -> %f32..%f35.
sub.f32 %f32, %f9, %f31; |
|
sub.f32 %f33, %f10, %f31; |
|
sub.f32 %f34, %f11, %f31; |
|
sub.f32 %f35, %f12, %f31; |
|
.loc 1 44 20 |
|
mul.f32 %f36, %f33, %f33; |
|
$L__tmp17: |
|
.loc 2 243 36 |
|
// Barrier between the read of slot 0 above and the second round of stores to the
// same shared addresses (%r26 / %r29) below.
bar.sync 0; |
|
$L__tmp18: |
|
.loc 2 233 15 |
|
// ---- Second reduction: sum of d^2 ----
// Per-thread partial: d1^2 + d0^2 + d2^2 + d3^2 accumulated with fma -> %f39.
fma.rn.f32 %f37, %f32, %f32, %f36; |
|
fma.rn.f32 %f38, %f34, %f34, %f37; |
|
fma.rn.f32 %f39, %f35, %f35, %f38; |
|
$L__tmp19: |
|
.loc 2 243 36 |
|
// Same butterfly pattern as the first reduction (strides 16, 8, 4, 2, 1) -> %f49.
mov.b32 %r71, %f39; |
|
shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1; |
|
mov.b32 %f40, %r72; |
|
$L__tmp20: |
|
.loc 2 233 15 |
|
add.f32 %f41, %f39, %f40; |
|
$L__tmp21: |
|
.loc 2 243 36 |
|
mov.b32 %r73, %f41; |
|
shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1; |
|
mov.b32 %f42, %r74; |
|
$L__tmp22: |
|
.loc 2 233 15 |
|
add.f32 %f43, %f41, %f42; |
|
$L__tmp23: |
|
.loc 2 243 36 |
|
mov.b32 %r75, %f43; |
|
shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1; |
|
mov.b32 %f44, %r76; |
|
$L__tmp24: |
|
.loc 2 233 15 |
|
add.f32 %f45, %f43, %f44; |
|
$L__tmp25: |
|
.loc 2 243 36 |
|
mov.b32 %r77, %f45; |
|
shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1; |
|
mov.b32 %f46, %r78; |
|
$L__tmp26: |
|
.loc 2 233 15 |
|
add.f32 %f47, %f45, %f46; |
|
$L__tmp27: |
|
.loc 2 243 36 |
|
mov.b32 %r79, %f47; |
|
shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1; |
|
mov.b32 %f48, %r80; |
|
$L__tmp28: |
|
.loc 2 233 15 |
|
add.f32 %f49, %f47, %f48; |
|
$L__tmp29: |
|
.loc 2 243 36 |
|
// Cross-warp stage again, reusing the predicates and shared addresses computed for
// the first reduction (%p14/%r26, %p15/%r29, %p16).
mov.b32 %r36, %f49; |
|
@%p14 st.shared.b32 [ %r26 + 0 ], %r36; |
|
bar.sync 0; |
|
@%p15 ld.shared.b32 %r37, [ %r29 + 0 ]; |
|
mov.b32 %f50, %r37; |
|
shfl.sync.bfly.b32 %r81, %r37, 1, 31, -1; |
|
mov.b32 %f51, %r81; |
|
$L__tmp30: |
|
.loc 2 233 15 |
|
add.f32 %f52, %f50, %f51; |
|
$L__tmp31: |
|
.loc 2 243 36 |
|
mov.b32 %r40, %f52; |
|
@%p16 st.shared.b32 [ %r29 + 0 ], %r40; |
|
bar.sync 0; |
|
ld.shared.f32 %f53, [global_smem]; |
|
$L__tmp32: |
|
.loc 3 8 15 |
|
add.f32 %f54, %f53, 0f00000000; |
|
$L__tmp33: |
|
.loc 1 50 20 |
|
// variance = sum(d^2) / 256.0 (same divisor register %r34 as the mean).
mov.b32 %r42, %f54; |
|
div.full.f32 %r41, %r42, %r34; |
|
mov.b32 %f55, %r41; |
|
.loc 1 52 20 |
|
// Add the epsilon constant 0f3727C5AC (approximately 1e-5 as f32) before the rsqrt.
add.f32 %f56, %f55, 0f3727C5AC; |
|
.loc 1 53 26 |
|
// %f57 = approximate, flush-to-zero reciprocal square root of (variance + eps).
rsqrt.approx.ftz.f32 %f57, %f56; |
|
.loc 1 32 36 |
|
// Reinterpret the param_2 words loaded earlier as f32 scale factors.
// Note the reversed numbering: %f61 <- %r18 (element 0) ... %f58 <- %r21 (element 3).
mov.b32 %f58, %r21; |
|
mov.b32 %f59, %r20; |
|
mov.b32 %f60, %r19; |
|
mov.b32 %f61, %r18; |
|
.loc 1 54 20 |
|
// normalised[k] = d[k] * rsqrt(...)
mul.f32 %f62, %f32, %f57; |
|
mul.f32 %f63, %f33, %f57; |
|
mul.f32 %f64, %f34, %f57; |
|
mul.f32 %f65, %f35, %f57; |
|
.loc 1 55 20 |
|
// scaled[k] = normalised[k] * w[k]
mul.f32 %f66, %f62, %f61; |
|
mul.f32 %f67, %f63, %f60; |
|
mul.f32 %f68, %f64, %f59; |
|
mul.f32 %f69, %f65, %f58; |
|
.loc 1 57 25 |
|
// %rd4 = param_3 + index * 2 bytes (same byte offset as the param_1 load).
add.s64 %rd4, %rd8, %rd10; |
|
.loc 1 57 48 |
|
// Round each result to bf16 (round-to-nearest), pack pairs into 32-bit words
// (first element of each pair in the low half) and store 2 x 32-bit.
mov.b32 %r44, %f66; |
|
cvt.rn.bf16.f32 %rs5, %r44; |
|
mov.b32 %r45, %f67; |
|
cvt.rn.bf16.f32 %rs6, %r45; |
|
mov.b32 %r46, %f68; |
|
cvt.rn.bf16.f32 %rs7, %r46; |
|
mov.b32 %r47, %f69; |
|
cvt.rn.bf16.f32 %rs8, %r47; |
|
mov.b32 %r82, {%rs5, %rs6}; |
|
mov.b32 %r83, {%rs7, %rs8}; |
|
@%p1 st.global.v2.b32 [ %rd4 + 0 ], { %r82, %r83 }; |
|
.loc 1 57 4 |
|
ret; |
|
$L__tmp34: |
|
$L__func_end0: |
|
|
|
} |
|
|
|
// __nv_rsqrtf: device function taking one 32-bit parameter, read as f32.
// Returns rsqrt.approx.ftz.f32 of that value (approximate reciprocal square root,
// flush-to-zero) through the return parameter func_retval0.
// The kernel in this file issues the rsqrt instruction directly and does not call it.
.visible .func (.param .b32 func_retval0) __nv_rsqrtf( |
|
.param .b32 __nv_rsqrtf_param_0 |
|
) |
|
{ |
|
// %x = input value, %inv_sqrt = result
.reg .f32 %x, %inv_sqrt; |
|
$L__func_begin1: |
|
|
|
ld.param.f32 %x, [__nv_rsqrtf_param_0]; |
|
rsqrt.approx.ftz.f32 %inv_sqrt, %x; |
|
st.param.f32 [func_retval0], %inv_sqrt; |
|
ret; |
|
$L__func_end1: |
|
|
|
} |
|
// Source-file table for the ".loc <file> <line> <col>" directives in the kernel:
//   1 = the generated TorchInductor kernel source, 2 = Triton's standard.py,
//   3 = torch/_inductor/triton_helpers.py.
.file 1 "/tmp/torchinductor_root/qh/cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py" |
|
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" |
|
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" |
|
// DWARF abbreviation table, emitted as raw bytes. Each declaration is:
//   code, tag, has-children flag, then (attribute, form) pairs terminated by 0, 0.
// The tag/attribute names in the comments below are decoded from the DWARF encoding
// tables, not from anything in this file -- verify against the DWARF specification.
.section .debug_abbrev |
|
{ |
|
// Abbrev 1: tag 17 (compile unit), has children.
// Attributes: producer, language, name, stmt_list, comp_dir, 0x2134 (two-byte
// ULEB128 180,66), low_pc, high_pc.
.b8 1 |
|
.b8 17 |
|
.b8 1 |
|
.b8 37 |
|
.b8 8 |
|
.b8 19 |
|
.b8 5 |
|
.b8 3 |
|
.b8 8 |
|
.b8 16 |
|
.b8 6 |
|
.b8 27 |
|
.b8 8 |
|
.b8 180 |
|
.b8 66 |
|
.b8 12 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 0 |
|
.b8 0 |
|
// Abbrev 2: tag 46 (subprogram), no children.
// Attributes: 0x2007 (two-byte ULEB128 135,64; linkage name), name, decl_file,
// decl_line, external, inline.
.b8 2 |
|
.b8 46 |
|
.b8 0 |
|
.b8 135 |
|
.b8 64 |
|
.b8 8 |
|
.b8 3 |
|
.b8 8 |
|
.b8 58 |
|
.b8 11 |
|
.b8 59 |
|
.b8 11 |
|
.b8 63 |
|
.b8 12 |
|
.b8 32 |
|
.b8 11 |
|
.b8 0 |
|
.b8 0 |
|
// Abbrev 3: tag 46 (subprogram), has children.
// Attributes: low_pc, high_pc, frame_base, abstract_origin.
.b8 3 |
|
.b8 46 |
|
.b8 1 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 64 |
|
.b8 10 |
|
.b8 49 |
|
.b8 19 |
|
.b8 0 |
|
.b8 0 |
|
// Abbrev 4: tag 29 (inlined subroutine), has children.
// Attributes: abstract_origin, low_pc, high_pc, call_file, call_line, call_column.
.b8 4 |
|
.b8 29 |
|
.b8 1 |
|
.b8 49 |
|
.b8 19 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 88 |
|
.b8 11 |
|
.b8 89 |
|
.b8 11 |
|
.b8 87 |
|
.b8 11 |
|
.b8 0 |
|
.b8 0 |
|
// Abbrev 5: same as abbrev 4 but without children.
.b8 5 |
|
.b8 29 |
|
.b8 0 |
|
.b8 49 |
|
.b8 19 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 88 |
|
.b8 11 |
|
.b8 89 |
|
.b8 11 |
|
.b8 87 |
|
.b8 11 |
|
.b8 0 |
|
.b8 0 |
|
// End of abbreviation table.
.b8 0 |
|
} |
|
// DWARF .debug_info contents as raw bytes. Entries start with an abbreviation code
// that selects a layout from .debug_abbrev; the field interpretation in the comments
// below follows that table and should be verified against the DWARF specification.
// The ASCII decodings of the strings are taken directly from the byte values.
.section .debug_info |
|
{ |
|
// Unit header: length 391, version 2 (16-bit little-endian), abbrev table reference,
// address size 8.
.b32 391 |
|
.b8 2 |
|
.b8 0 |
|
.b32 .debug_abbrev |
|
.b8 8 |
|
// Entry using abbrev 1 (compile unit). First string: "triton" + NUL.
.b8 1 |
|
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 0 |
|
// 16-bit value 2.
.b8 2 |
|
.b8 0 |
|
// String: "cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py" + NUL
// (the basename of .file 1).
.b8 99 |
|
.b8 113 |
|
.b8 104 |
|
.b8 50 |
|
.b8 100 |
|
.b8 106 |
|
.b8 51 |
|
.b8 53 |
|
.b8 53 |
|
.b8 105 |
|
.b8 97 |
|
.b8 116 |
|
.b8 106 |
|
.b8 122 |
|
.b8 118 |
|
.b8 105 |
|
.b8 53 |
|
.b8 99 |
|
.b8 109 |
|
.b8 122 |
|
.b8 52 |
|
.b8 116 |
|
.b8 120 |
|
.b8 118 |
|
.b8 106 |
|
.b8 100 |
|
.b8 51 |
|
.b8 97 |
|
.b8 112 |
|
.b8 53 |
|
.b8 50 |
|
.b8 115 |
|
.b8 104 |
|
.b8 103 |
|
.b8 97 |
|
.b8 115 |
|
.b8 104 |
|
.b8 52 |
|
.b8 99 |
|
.b8 122 |
|
.b8 105 |
|
.b8 102 |
|
.b8 100 |
|
.b8 99 |
|
.b8 110 |
|
.b8 97 |
|
.b8 102 |
|
.b8 110 |
|
.b8 107 |
|
.b8 107 |
|
.b8 97 |
|
.b8 109 |
|
.b8 46 |
|
.b8 112 |
|
.b8 121 |
|
.b8 0 |
|
// Reference to the line-number section.
.b32 .debug_line |
|
// String: "/tmp/torchinductor_root/qh" + NUL (the directory of .file 1).
.b8 47 |
|
.b8 116 |
|
.b8 109 |
|
.b8 112 |
|
.b8 47 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 99 |
|
.b8 104 |
|
.b8 105 |
|
.b8 110 |
|
.b8 100 |
|
.b8 117 |
|
.b8 99 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 95 |
|
.b8 114 |
|
.b8 111 |
|
.b8 111 |
|
.b8 116 |
|
.b8 47 |
|
.b8 113 |
|
.b8 104 |
|
.b8 0 |
|
// One-byte flag, then the address range of the kernel body.
.b8 1 |
|
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
// Entry using abbrev 2. By byte count it starts at offset 125 of this section, the
// value that the ".b32 125" references below point to.
// Two copies of the string "triton__0d1d2d3d4de5de" + NUL follow.
.b8 2 |
|
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 51 |
|
.b8 100 |
|
.b8 52 |
|
.b8 100 |
|
.b8 101 |
|
.b8 53 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 51 |
|
.b8 100 |
|
.b8 52 |
|
.b8 100 |
|
.b8 101 |
|
.b8 53 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// File index 1, line 18 (matches ".loc 1 18 0" at kernel entry), then two one-byte
// fields both set to 1.
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 1 |
|
// Entry using abbrev 3: kernel address range, a 1-byte block containing 156, and a
// reference to offset 125.
.b8 3 |
|
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
.b8 1 |
|
.b8 156 |
|
.b32 125 |
|
// The remaining entries use abbrevs 4/5. Each one is: reference 125, a start label,
// an end label, then three one-byte values (file index, line, column per the
// abbreviation table). A lone ".b8 0" closes a list of child entries.
.b8 4 |
|
.b32 125 |
|
.b64 $L__tmp1 |
|
.b64 $L__tmp14 |
|
.b8 2 |
|
.b8 39 |
|
.b8 58 |
|
.b8 5 |
|
.b32 125 |
|
.b64 $L__tmp1 |
|
.b64 $L__tmp14 |
|
.b8 2 |
|
.b8 243 |
|
.b8 36 |
|
.b8 0 |
|
.b8 5 |
|
.b32 125 |
|
.b64 $L__tmp2 |
|
.b64 $L__tmp15 |
|
.b8 2 |
|
.b8 39 |
|
.b8 58 |
|
.b8 5 |
|
.b32 125 |
|
.b64 $L__tmp15 |
|
.b64 $L__tmp16 |
|
.b8 3 |
|
.b8 39 |
|
.b8 45 |
|
.b8 5 |
|
.b32 125 |
|
.b64 $L__tmp17 |
|
.b64 $L__tmp32 |
|
.b8 2 |
|
.b8 47 |
|
.b8 59 |
|
.b8 4 |
|
.b32 125 |
|
.b64 $L__tmp18 |
|
.b64 $L__tmp31 |
|
.b8 2 |
|
.b8 47 |
|
.b8 59 |
|
.b8 5 |
|
.b32 125 |
|
.b64 $L__tmp18 |
|
.b64 $L__tmp31 |
|
.b8 2 |
|
.b8 243 |
|
.b8 36 |
|
.b8 0 |
|
.b8 5 |
|
.b32 125 |
|
.b64 $L__tmp32 |
|
.b64 $L__tmp33 |
|
.b8 3 |
|
.b8 47 |
|
.b8 45 |
|
// Two terminators: end of the abbrev-3 entry's children, then end of the unit's.
.b8 0 |
|
.b8 0 |
|
} |
|
// DWARF .debug_pubnames: one name entry for the kernel.
.section .debug_pubnames |
|
{ |
|
// Length of the data between the start and end labels below.
.b32 $L__pubNames_end0-$L__pubNames_start0 |
|
$L__pubNames_start0: |
|
// Version 2 (16-bit little-endian), reference to .debug_info, then 395
// (= the .debug_info unit length 391 plus its 4-byte length field).
.b8 2 |
|
.b8 0 |
|
.b32 .debug_info |
|
.b32 395 |
|
// Entry: offset 125 into .debug_info, then the string
// "triton__0d1d2d3d4de5de" + NUL.
.b32 125 |
|
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 51 |
|
.b8 100 |
|
.b8 52 |
|
.b8 100 |
|
.b8 101 |
|
.b8 53 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Zero offset terminates the list.
.b32 0 |
|
$L__pubNames_end0: |
|
} |
|
// DWARF .debug_pubtypes: header only, with no type entries (the list is just the
// terminating zero).
.section .debug_pubtypes |
|
{ |
|
.b32 $L__pubTypes_end0-$L__pubTypes_start0 |
|
$L__pubTypes_start0: |
|
// Version 2, reference to .debug_info, 395 (same value as in .debug_pubnames).
.b8 2 |
|
.b8 0 |
|
.b32 .debug_info |
|
.b32 395 |
|
.b32 0 |
|
$L__pubTypes_end0: |
|
} |
|
// Empty location-list section.
.section .debug_loc { } |
|
|