|
|
|
|
|
|
|
|
|
// PTX module header: ISA version, target architecture and address width. |
.version 8.2 |
|
.target sm_89 |
|
.address_size 64 |
|
|
|
|
|
// Module-scope 11-byte array. The initialiser bytes are the ASCII codes of |
// "__CUDA_FTZ" followed by a 0 terminator. |
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; |
|
|
|
// Kernel entry point triton__0d1d2de. |
// Parameters: |
//   param_0 (.u64): global address that is loaded from and later stored back to (in-place update). |
//   param_1 (.u64): global address that is only loaded from. |
//   param_2 (.u32): declared but never read anywhere in this body. |
// Work split: each thread handles two adjacent 16-bit elements through one 32-bit |
// load/store; a CTA covers 512 elements (ctaid.x << 9) with at most 256 threads (.maxntid). |
// Per element, with g = bf16 value from param_0 and x = bf16 value from param_1: |
//   out = g * (0.5 * (1 + E(x * 0.70710677)) + x * 0.3989423 * 2^(-0.5 * x * x * 1.442695)) |
// where E() is the inlined polynomial/ex2 sequence that uses the same constants as the |
// __nv_erff function defined later in this file. The result is rounded to bf16 and |
// written over the param_0 data. |
// NOTE(review): this has the shape of grad * d/dx GELU(x) in its erf form - confirm |
// against the Python source named by the .file directive that the .loc lines refer to. |
.visible .entry triton__0d1d2de( |
|
.param .u64 triton__0d1d2de_param_0, |
|
.param .u64 triton__0d1d2de_param_1, |
|
.param .u32 triton__0d1d2de_param_2 |
|
) |
|
.maxntid 256, 1, 1 |
|
{ |
|
.reg .pred %p<10>; |
|
.reg .b16 %rs<7>; |
|
.reg .b32 %r<25>; |
|
.reg .f32 %f<127>; |
|
.reg .b64 %rd<8>; |
|
.loc 1 18 0 |
|
$L__func_begin0: |
|
.loc 1 18 0 |
|
|
|
// Fetch the two pointer parameters: %rd4 = param_0, %rd5 = param_1. |
ld.param.u64 %rd4, [triton__0d1d2de_param_0]; |
|
ld.param.u64 %rd5, [triton__0d1d2de_param_1]; |
|
$L__tmp0: |
|
.loc 1 21 36 |
|
// %r10 = (tid.x * 2) & 510: even element offset inside this CTA's 512-element tile. |
mov.u32 %r8, %tid.x; |
|
shl.b32 %r9, %r8, 1; |
|
and.b32 %r10, %r9, 510; |
|
.loc 1 20 28 |
|
// %r11 = ctaid.x * 512: index of the first element of this CTA's tile. |
mov.u32 %r1, %ctaid.x; |
|
.loc 1 20 33 |
|
shl.b32 %r11, %r1, 9; |
|
.loc 1 21 23 |
|
// %r12 = index of the first of this thread's two elements. |
or.b32 %r12, %r11, %r10; |
|
.loc 1 24 34 |
|
// Byte offset = index * 2 (16-bit elements, sign-extended to 64 bits); %rd7 = param_0 + offset. |
mul.wide.s32 %rd6, %r12, 2; |
|
add.s64 %rd7, %rd4, %rd6; |
|
// %p1 is constant true, so the predicated loads and the store below always execute. |
mov.pred %p1, -1; |
|
.loc 1 24 39 |
|
// %r2 = one 32-bit word (two packed 16-bit values) from the param_0 buffer. |
mov.u32 %r2, 0x0; |
|
@%p1 ld.global.b32 { %r2 }, [ %rd7 + 0 ]; |
|
.loc 1 25 30 |
|
// %rd3 = param_1 + the same byte offset. |
add.s64 %rd3, %rd5, %rd6; |
|
.loc 1 25 35 |
|
// %r5 = one 32-bit word (two packed 16-bit values) from the param_1 buffer. |
mov.u32 %r5, 0x0; |
|
@%p1 ld.global.b32 { %r5 }, [ %rd3 + 0 ]; |
|
// Split %r5: %rs3 = low 16 bits, %rs4 = high 16 bits. |
cvt.u16.u32 %rs3, %r5; |
|
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; } |
|
.loc 1 25 44 |
|
// Widen both bf16 halves to f32: %f3 = x0 (low half), %f4 = x1 (high half). |
cvt.f32.bf16 %r6, %rs3; |
|
mov.b32 %f3, %r6; |
|
cvt.f32.bf16 %r7, %rs4; |
|
mov.b32 %f4, %r7; |
|
.loc 1 29 18 |
|
// %f5 = x0 * 0f3F3504F3 (approx. 0.70710677). |
mul.f32 %f5, %f3, 0f3F3504F3; |
|
.loc 1 30 23 |
|
// ---- Lane 0, E(%f5): pick coefficients and polynomial argument ---- |
// %p3 = |%f5| >= 0f3F8060FE (approx. 1.00296). |
abs.ftz.f32 %f7, %f5; |
|
setp.ge.f32 %p3, %f7, 0f3F8060FE; |
|
// Defaults used when %p3 is true: first coefficient set, argument %f116 = |%f5|. |
mov.f32 %f115, 0f3789CA3C; |
|
mov.f32 %f114, 0fB9F560B9; |
|
mov.f32 %f113, 0f3BAC840B; |
|
mov.f32 %f112, 0fBD0C8162; |
|
mov.f32 %f111, 0f3E1CF906; |
|
mov.f32 %f110, 0f3F6A937E; |
|
mov.f32 %f109, 0f3F20D842; |
|
mov.f32 %f116, %f7; |
|
@%p3 bra $L__BB0_2; |
|
.loc 1 0 23 |
|
// Fall-through (%p3 false: below the threshold, or NaN): second coefficient set, |
// argument %f116 = %f5 * %f5. |
mov.f32 %f115, 0f38B1E96A; |
|
mov.f32 %f114, 0fBA574D20; |
|
mov.f32 %f113, 0f3BAAD5EA; |
|
mov.f32 %f112, 0fBCDC1BE7; |
|
mov.f32 %f111, 0f3DE718AF; |
|
mov.f32 %f110, 0fBEC093AC; |
|
mov.f32 %f109, 0f3E0375D3; |
|
.loc 1 30 23 |
|
mul.f32 %f116, %f5, %f5; |
|
$L__BB0_2: |
|
.loc 1 0 0 |
|
// Interleaved independent work: split the param_0 word (%rs1 = low, %rs2 = high) |
// and scale the second input, %f6 = x1 * 0f3F3504F3. |
cvt.u16.u32 %rs1, %r2; |
|
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } |
|
mul.f32 %f6, %f4, 0f3F3504F3; |
|
.loc 1 30 23 |
|
// %p4 = |%f5| < threshold, or unordered (NaN); used below to skip the ex2 tail. |
setp.ltu.f32 %p4, %f7, 0f3F8060FE; |
|
// Horner evaluation of the 7-coefficient polynomial in %f116 -> %f52. |
fma.rn.ftz.f32 %f47, %f115, %f116, %f114; |
|
fma.rn.ftz.f32 %f48, %f47, %f116, %f113; |
|
fma.rn.ftz.f32 %f49, %f48, %f116, %f112; |
|
fma.rn.ftz.f32 %f50, %f49, %f116, %f111; |
|
fma.rn.ftz.f32 %f51, %f50, %f116, %f110; |
|
fma.rn.ftz.f32 %f52, %f51, %f116, %f109; |
|
// %f54 = -%f116 when %p3 is true, otherwise %f5; %f117 = %f52 * %f54 + %f54. |
neg.f32 %f53, %f116; |
|
selp.f32 %f54, %f53, %f5, %p3; |
|
fma.rn.ftz.f32 %f117, %f52, %f54, %f54; |
|
// %f108 = 1.0 (0f3F800000); reused by both lanes' ex2 tails. |
mov.f32 %f108, 0f3F800000; |
|
@%p4 bra $L__BB0_4; |
|
// Tail for |%f5| >= threshold: %f117 = (1 - 2^%f117) with the sign bit of %f5 OR-ed in. |
ex2.approx.ftz.f32 %f55, %f117; |
|
sub.f32 %f57, %f108, %f55; |
|
mov.b32 %r13, %f57; |
|
mov.b32 %r14, %f5; |
|
and.b32 %r15, %r14, -2147483648; |
|
or.b32 %r16, %r15, %r13; |
|
mov.b32 %f117, %r16; |
|
$L__BB0_4: |
|
.loc 1 0 0 |
|
// Widen the two param_0 bf16 halves to f32 bit patterns (%r3 from low, %r4 from high). |
cvt.f32.bf16 %r3, %rs1; |
|
cvt.f32.bf16 %r4, %rs2; |
|
.loc 1 30 23 |
|
// ---- Lane 1, E(%f6): same sequence as lane 0, result ends up in %f126 ---- |
// %p6 = |%f6| >= 0f3F8060FE. |
abs.ftz.f32 %f20, %f6; |
|
setp.ge.f32 %p6, %f20, 0f3F8060FE; |
|
// Defaults used when %p6 is true: first coefficient set, argument %f125 = |%f6|. |
mov.f32 %f124, 0f3789CA3C; |
|
mov.f32 %f123, 0fB9F560B9; |
|
mov.f32 %f122, 0f3BAC840B; |
|
mov.f32 %f121, 0fBD0C8162; |
|
mov.f32 %f120, 0f3E1CF906; |
|
mov.f32 %f119, 0f3F6A937E; |
|
mov.f32 %f118, 0f3F20D842; |
|
mov.f32 %f125, %f20; |
|
@%p6 bra $L__BB0_6; |
|
// Fall-through (%p6 false): argument %f125 = %f6 * %f6 and the second coefficient set. |
mul.f32 %f125, %f6, %f6; |
|
mov.f32 %f124, 0f38B1E96A; |
|
mov.f32 %f123, 0fBA574D20; |
|
mov.f32 %f122, 0f3BAAD5EA; |
|
mov.f32 %f121, 0fBCDC1BE7; |
|
mov.f32 %f120, 0f3DE718AF; |
|
mov.f32 %f119, 0fBEC093AC; |
|
mov.f32 %f118, 0f3E0375D3; |
|
$L__BB0_6: |
|
.loc 1 0 0 |
|
// %f1 = g0, %f2 = g1: the two param_0 values as f32. |
mov.b32 %f1, %r3; |
|
mov.b32 %f2, %r4; |
|
.loc 1 30 23 |
|
// %p7 = |%f6| < threshold, or unordered (NaN); used below to skip the ex2 tail. |
setp.ltu.f32 %p7, %f20, 0f3F8060FE; |
|
// Horner evaluation of the 7-coefficient polynomial in %f125 -> %f77. |
fma.rn.ftz.f32 %f72, %f124, %f125, %f123; |
|
fma.rn.ftz.f32 %f73, %f72, %f125, %f122; |
|
fma.rn.ftz.f32 %f74, %f73, %f125, %f121; |
|
fma.rn.ftz.f32 %f75, %f74, %f125, %f120; |
|
fma.rn.ftz.f32 %f76, %f75, %f125, %f119; |
|
fma.rn.ftz.f32 %f77, %f76, %f125, %f118; |
|
// %f79 = -%f125 when %p6 is true, otherwise %f6; %f126 = %f77 * %f79 + %f79. |
neg.f32 %f78, %f125; |
|
selp.f32 %f79, %f78, %f6, %p6; |
|
fma.rn.ftz.f32 %f126, %f77, %f79, %f79; |
|
@%p7 bra $L__BB0_8; |
|
// Tail for |%f6| >= threshold: %f126 = (1 - 2^%f126) with the sign bit of %f6 OR-ed in. |
ex2.approx.ftz.f32 %f80, %f126; |
|
sub.f32 %f82, %f108, %f80; |
|
mov.b32 %r17, %f82; |
|
mov.b32 %r18, %f6; |
|
and.b32 %r19, %r18, -2147483648; |
|
or.b32 %r20, %r19, %r17; |
|
mov.b32 %f126, %r20; |
|
$L__BB0_8: |
|
.loc 1 32 18 |
|
// %f87 / %f88 = E(...) + 1.0 for lane 0 / lane 1. |
add.f32 %f87, %f117, 0f3F800000; |
|
add.f32 %f88, %f126, 0f3F800000; |
|
.loc 1 35 19 |
|
// x * x for both lanes. |
mul.f32 %f89, %f3, %f3; |
|
mul.f32 %f90, %f4, %f4; |
|
.loc 1 37 20 |
|
// Multiply by -0.5 (0fBF000000). |
mul.f32 %f91, %f89, 0fBF000000; |
|
mul.f32 %f92, %f90, 0fBF000000; |
|
.loc 1 38 19 |
|
// Scale by 0f3FB8AA3B (approx. 1.442695) and apply ex2, i.e. 2^(-0.5 * x * x * 1.442695). |
mul.f32 %f84, %f91, 0f3FB8AA3B; |
|
ex2.approx.f32 %f83, %f84; |
|
mul.f32 %f86, %f92, 0f3FB8AA3B; |
|
ex2.approx.f32 %f85, %f86; |
|
.loc 1 40 20 |
|
// Multiply by 0f3ECC422A (approx. 0.3989423). |
mul.f32 %f93, %f83, 0f3ECC422A; |
|
mul.f32 %f94, %f85, 0f3ECC422A; |
|
.loc 1 41 19 |
|
// Multiply by x. |
mul.f32 %f95, %f3, %f93; |
|
mul.f32 %f96, %f4, %f94; |
|
.loc 1 42 20 |
|
// %f97 / %f98 = (1 + E) * 0.5 + the x-weighted exponential term. |
fma.rn.f32 %f97, %f87, 0f3F000000, %f95; |
|
fma.rn.f32 %f98, %f88, 0f3F000000, %f96; |
|
.loc 1 43 19 |
|
// Multiply by the value originally loaded from param_0 (g0, g1). |
mul.f32 %f99, %f1, %f97; |
|
mul.f32 %f100, %f2, %f98; |
|
.loc 1 45 40 |
|
// Round both results to bf16, pack them (lane 0 in the low half, lane 1 in the high half) |
// and store to %rd7, the same address the param_0 word was loaded from. |
mov.b32 %r21, %f99; |
|
cvt.rn.bf16.f32 %rs5, %r21; |
|
mov.b32 %r22, %f100; |
|
cvt.rn.bf16.f32 %rs6, %r22; |
|
mov.b32 %r24, {%rs5, %rs6}; |
|
@%p1 st.global.b32 [ %rd7 + 0 ], { %r24 }; |
|
.loc 1 45 4 |
|
ret; |
|
$L__tmp1: |
|
$L__func_end0: |
|
|
|
} |
|
|
|
// __nv_erff: device function with one .b32 parameter (read as f32) that returns |
// one f32 through func_retval0. |
// NOTE(review): the name and constants suggest libdevice's single-precision erf - confirm. |
// Algorithm as written here, with x = the input and a = |x|: |
//   a >= 0f3F8060FE (approx. 1.00296): t = a,     first coefficient set,  s = -a |
//   otherwise (including NaN):          t = x * x, second coefficient set, s = x |
//   r = P(t) * s + s, where P is a 7-coefficient Horner polynomial. |
//   When a >= threshold, r is replaced by (1 - 2^r) with the sign bit of x OR-ed in. |
// No call instruction in triton__0d1d2de targets this function; that kernel carries |
// its own inlined copies of the same sequence. |
.visible .func (.param .b32 func_retval0) __nv_erff( |
|
.param .b32 __nv_erff_param_0 |
|
) |
|
{ |
|
.reg .pred %p<4>; |
|
.reg .b32 %r<5>; |
|
.reg .f32 %f<49>; |
|
$L__func_begin1: |
|
|
|
// %f14 = x, %f1 = |x|, %p1 = |x| >= 0f3F8060FE. |
ld.param.f32 %f14, [__nv_erff_param_0]; |
|
abs.ftz.f32 %f1, %f14; |
|
setp.ge.f32 %p1, %f1, 0f3F8060FE; |
|
// Defaults used when %p1 is true: first coefficient set, argument %f47 = |x|. |
mov.f32 %f46, 0f3789CA3C; |
|
mov.f32 %f45, 0fB9F560B9; |
|
mov.f32 %f44, 0f3BAC840B; |
|
mov.f32 %f43, 0fBD0C8162; |
|
mov.f32 %f42, 0f3E1CF906; |
|
mov.f32 %f41, 0f3F6A937E; |
|
mov.f32 %f40, 0f3F20D842; |
|
mov.f32 %f47, %f1; |
|
@%p1 bra $L__BB1_2; |
|
// Fall-through (%p1 false): argument %f47 = x * x and the second coefficient set. |
mul.f32 %f47, %f14, %f14; |
|
mov.f32 %f46, 0f38B1E96A; |
|
mov.f32 %f45, 0fBA574D20; |
|
mov.f32 %f44, 0f3BAAD5EA; |
|
mov.f32 %f43, 0fBCDC1BE7; |
|
mov.f32 %f42, 0f3DE718AF; |
|
mov.f32 %f41, 0fBEC093AC; |
|
mov.f32 %f40, 0f3E0375D3; |
|
$L__BB1_2: |
|
// %p2 = |x| < threshold, or unordered (NaN); used below to skip the ex2 tail. |
setp.ltu.f32 %p2, %f1, 0f3F8060FE; |
|
// Horner evaluation of the polynomial in %f47 -> %f34. |
fma.rn.ftz.f32 %f29, %f46, %f47, %f45; |
|
fma.rn.ftz.f32 %f30, %f29, %f47, %f44; |
|
fma.rn.ftz.f32 %f31, %f30, %f47, %f43; |
|
fma.rn.ftz.f32 %f32, %f31, %f47, %f42; |
|
fma.rn.ftz.f32 %f33, %f32, %f47, %f41; |
|
fma.rn.ftz.f32 %f34, %f33, %f47, %f40; |
|
// %f36 = -%f47 when %p1 is true, otherwise x; %f48 = %f34 * %f36 + %f36. |
neg.f32 %f35, %f47; |
|
selp.f32 %f36, %f35, %f14, %p1; |
|
fma.rn.ftz.f32 %f48, %f34, %f36, %f36; |
|
@%p2 bra $L__BB1_4; |
|
// Tail for |x| >= threshold: %f48 = (1.0 - 2^%f48) with the sign bit of x OR-ed in. |
ex2.approx.ftz.f32 %f37, %f48; |
|
mov.f32 %f38, 0f3F800000; |
|
sub.f32 %f39, %f38, %f37; |
|
mov.b32 %r1, %f39; |
|
mov.b32 %r2, %f14; |
|
and.b32 %r3, %r2, -2147483648; |
|
or.b32 %r4, %r3, %r1; |
|
mov.b32 %f48, %r4; |
|
$L__BB1_4: |
|
// Write the result to the return parameter. |
st.param.f32 [func_retval0+0], %f48; |
|
ret; |
|
$L__func_end1: |
|
|
|
} |
|
// Source file number 1; the ".loc 1 <line> <col>" directives in triton__0d1d2de refer to it. |
.file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py" |
|
// Raw bytes of the .debug_abbrev section. |
// NOTE(review): read as DWARF abbreviation declarations the bytes appear to define |
// two entries (code 1 with tag 17 and children, code 2 with tag 46 and no children), |
// each a list of (attribute, form) pairs ending in 0, 0 - confirm against the DWARF spec. |
.section .debug_abbrev |
|
{ |
|
// Entry with code 1. |
.b8 1 |
|
.b8 17 |
|
.b8 1 |
|
.b8 37 |
|
.b8 8 |
|
.b8 19 |
|
.b8 5 |
|
.b8 3 |
|
.b8 8 |
|
.b8 16 |
|
.b8 6 |
|
.b8 27 |
|
.b8 8 |
|
.b8 180 |
|
.b8 66 |
|
.b8 12 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 0 |
|
.b8 0 |
|
// Entry with code 2. |
.b8 2 |
|
.b8 46 |
|
.b8 0 |
|
.b8 17 |
|
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 64 |
|
.b8 10 |
|
.b8 135 |
|
.b8 64 |
|
.b8 8 |
|
.b8 3 |
|
.b8 8 |
|
.b8 58 |
|
.b8 11 |
|
.b8 59 |
|
.b8 11 |
|
.b8 63 |
|
.b8 12 |
|
.b8 0 |
|
.b8 0 |
|
// Final 0 byte closing the list. |
.b8 0 |
|
} |
|
// Raw bytes of the .debug_info section. The string decodings noted below are the |
// ASCII values of the bytes shown. |
// NOTE(review): the field meanings (length, version, attribute order) assume the |
// DWARF layout implied by .debug_abbrev - confirm against the DWARF spec. |
.section .debug_info |
|
{ |
|
// 176 = number of bytes emitted in this section after this 4-byte field. |
.b32 176 |
|
// 16-bit little-endian value 2. |
.b8 2 |
|
.b8 0 |
|
.b32 .debug_abbrev |
|
.b8 8 |
|
// First entry (abbreviation code 1). |
.b8 1 |
|
// ASCII "triton", 0-terminated. |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 0 |
|
// 16-bit little-endian value 2. |
.b8 2 |
|
.b8 0 |
|
// ASCII "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", 0-terminated |
// (same basename as the .file 1 directive). |
.b8 99 |
|
.b8 53 |
|
.b8 106 |
|
.b8 120 |
|
.b8 97 |
|
.b8 103 |
|
.b8 117 |
|
.b8 120 |
|
.b8 104 |
|
.b8 111 |
|
.b8 51 |
|
.b8 110 |
|
.b8 104 |
|
.b8 114 |
|
.b8 108 |
|
.b8 116 |
|
.b8 53 |
|
.b8 118 |
|
.b8 99 |
|
.b8 105 |
|
.b8 110 |
|
.b8 110 |
|
.b8 122 |
|
.b8 53 |
|
.b8 102 |
|
.b8 101 |
|
.b8 118 |
|
.b8 111 |
|
.b8 100 |
|
.b8 117 |
|
.b8 109 |
|
.b8 108 |
|
.b8 112 |
|
.b8 119 |
|
.b8 110 |
|
.b8 52 |
|
.b8 119 |
|
.b8 121 |
|
.b8 98 |
|
.b8 50 |
|
.b8 118 |
|
.b8 120 |
|
.b8 51 |
|
.b8 120 |
|
.b8 114 |
|
.b8 118 |
|
.b8 101 |
|
.b8 105 |
|
.b8 99 |
|
.b8 101 |
|
.b8 114 |
|
.b8 108 |
|
.b8 46 |
|
.b8 112 |
|
.b8 121 |
|
.b8 0 |
|
.b32 .debug_line |
|
// ASCII "/tmp/torchinductor_root/5j", 0-terminated (directory part of the .file 1 path). |
.b8 47 |
|
.b8 116 |
|
.b8 109 |
|
.b8 112 |
|
.b8 47 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 99 |
|
.b8 104 |
|
.b8 105 |
|
.b8 110 |
|
.b8 100 |
|
.b8 117 |
|
.b8 99 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 95 |
|
.b8 114 |
|
.b8 111 |
|
.b8 111 |
|
.b8 116 |
|
.b8 47 |
|
.b8 53 |
|
.b8 106 |
|
.b8 0 |
|
.b8 1 |
|
// Address range of triton__0d1d2de (its begin/end labels). |
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
// Second entry (abbreviation code 2), starting 125 bytes into the section. |
.b8 2 |
|
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
.b8 1 |
|
.b8 156 |
|
// ASCII "triton__0d1d2de", 0-terminated. |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// ASCII "triton__0d1d2de" again, 0-terminated. |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Values 1 and 18 match the file number and line of the kernel's first ".loc 1 18 0". |
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 0 |
|
} |
|
// Raw contents of the .debug_pubnames section: a label-computed length, a header |
// that references .debug_info, and one name record. |
// NOTE(review): 180 equals the .debug_info length field (176) plus its own 4 bytes, |
// and 125 equals the byte offset of the second .debug_info entry - confirm the |
// intended field meanings against the DWARF spec. |
.section .debug_pubnames |
|
{ |
|
.b32 $L__pubNames_end0-$L__pubNames_start0 |
|
$L__pubNames_start0: |
|
.b8 2 |
|
.b8 0 |
|
.b32 .debug_info |
|
.b32 180 |
|
.b32 125 |
|
// ASCII "triton__0d1d2de", 0-terminated. |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Trailing 32-bit zero after the single record. |
.b32 0 |
|
$L__pubNames_end0: |
|
} |
|
// Raw contents of the .debug_pubtypes section: same header shape as .debug_pubnames |
// (label-computed length, 16-bit value 2, .debug_info reference, 180) followed |
// directly by a 32-bit zero, i.e. no records. |
.section .debug_pubtypes |
|
{ |
|
.b32 $L__pubTypes_end0-$L__pubTypes_start0 |
|
$L__pubTypes_start0: |
|
.b8 2 |
|
.b8 0 |
|
.b32 .debug_info |
|
.b32 180 |
|
.b32 0 |
|
$L__pubTypes_end0: |
|
} |
|
// Empty .debug_loc section (no contents between the braces). |
.section .debug_loc { } |
|
|