//
// Generated by LLVM NVPTX Back-End
//

// Module-level directives: PTX ISA version, target architecture and pointer
// width. Stray '|' tokens that followed every line were removed; '|' is not
// valid PTX syntax.
.version 8.2
.target sm_89
.address_size 64

    // .globl triton__0d1d2d3d4d5d6de7de
|
// External routine called by the kernel in this file when its index bounds
// check fails. The call sites in this file fill the parameters as follows:
//   param_0 (b64)  generic address of an assertMessage_N byte array
//   param_1 (b64)  generic address of an assertFile_N byte array
//   param_2 (b32)  the constant 883
//   param_3 (b64)  generic address of an assertFunc_N byte array
//   param_4 (b64)  the constant 1
// NOTE(review): this matches the CUDA device-runtime
// __assertfail(message, file, line, function, charSize) convention -- confirm
// against the toolkit headers in use.
.extern .func __assertfail
(
    .param .b64 __assertfail_param_0,
    .param .b64 __assertfail_param_1,
    .param .b32 __assertfail_param_2,
    .param .b64 __assertfail_param_3,
    .param .b64 __assertfail_param_4
)
;
|
// Byte strings handed to __assertfail by address. Each array is one byte
// longer than its text and ends in an explicit 0: the callee receives only a
// pointer, so without a terminator it would read past the end of the array.

// "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_1[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_1[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// "index out of bounds: 0 <= tmp16 < 50257"
.global .align 1 .b8 assertMessage_1[40] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55, 0};
// "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_0[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_0[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// "index out of bounds: 0 <= tmp3 < 50257"
.global .align 1 .b8 assertMessage_0[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55, 0};

// Dynamically sized shared-memory window; the kernel uses byte offsets 0..23
// of it as scratch for the cross-warp stage of its reduction.
.extern .shared .align 1 .b8 global_smem[];

// "__CUDA_FTZ" (already NUL-terminated); not referenced by any code in this file.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
|
|
// ---------------------------------------------------------------------------
// Kernel entry. Each CTA (at most 64 threads, see .maxntid) handles the
// 256-element row selected by %ctaid.x; every thread owns 4 consecutive
// columns, col = (tid.x * 4) & 252.
//
// Parameters, as used by the body below:
//   param_0  base of an array of 64-bit indices; element [ctaid.x] is read
//   param_1  base of an f32 table, 256 values per row; the row is the loaded
//            index (plus 50257 when the index is negative)
//   param_2  base of an f32 array, 256 per row, row = ctaid.x reduced mod 512
//   param_3  base of a bf16 array, 256 per row, row = ctaid.x
//   param_4  base of an f32 vector addressed by col only
//   param_5  base of the bf16 output, 256 per row, row = ctaid.x
//   param_6, param_7  declared but never read in this body
//
// Flow:
//   1. load the index; if not 0 <= index < 50257 call __assertfail
//   2. x = param_2[..] + param_1[..] + f32(param_3[..]) for the 4 owned columns
//   3. combine a (mean, second-moment, weight) triple over the whole row:
//      in-thread merge, warp butterfly shuffles, then a merge of the two warps
//      through global_smem (appears to be a Welford-style combine -- the
//      .loc 2 lines point into torch/_inductor/triton_helpers.py)
//   4. reload the inputs, form (x - mean) * rsqrt(m2 / 256 + 0f3727C5AC)
//      * param_4[col], convert to bf16 and store to param_5
//
// Stray '|' tokens that followed every line of this block were removed; the
// instruction stream itself is unchanged.
// ---------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d4d5d6de7de(
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
    .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
    .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
    .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
)
.maxntid 64, 1, 1
{
    .reg .pred %p<59>;
    .reg .b16 %rs<13>;
    .reg .b32 %r<176>;
    .reg .f32 %f<169>;
    .reg .b64 %rd<58>;
    .loc 1 18 0
$L__func_begin0:
    .loc 1 18 0

    ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
    ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
    ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
$L__tmp0:
    .loc 1 24 33
    // %r1 = thread id, %r2 = lane within warp, %r3 = warp id (bit 5 of tid),
    // %r30 = tid * 4, %r4 = first owned column (0, 4, ..., 252).
    mov.u32 %r1, %tid.x;
    and.b32 %r2, %r1, 31;
    ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
    ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
    bfe.u32 %r3, %r1, 5, 1;
    shl.b32 %r30, %r1, 2;
    and.b32 %r4, %r30, 252;
    .loc 1 21 28
    mov.u32 %r13, %ctaid.x;
    .loc 1 26 30
    // %rd11 = &param_0[ctaid.x] (8-byte elements).
    mul.wide.s32 %rd25, %r13, 8;
    add.s64 %rd11, %rd22, %rd25;
    // %p53 is constant true: every "@%p53" access below always executes and
    // every "@!%p53" fallback move never does.
    mov.pred %p53, -1;
    .loc 1 26 35
    // The same 64-bit index is loaded five times; only %rd10 and %rd18 are
    // consumed later in this kernel.
    mov.u64 %rd10, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
    mov.u64 %rd12, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
    mov.u64 %rd14, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
    mov.u64 %rd16, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
    mov.u64 %rd18, 0x0;
    @%p53 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd11 + 0 ];
    .loc 1 27 18
    // Signed remainder-by-512 pattern on ctaid.x (bias negatives by 511, clear
    // the low 9 bits, subtract). The mask 16776704 = 0x00FFFE00 keeps only
    // bits 9..23; the bits above are shifted out by the "shl ..., 8" below.
    shr.s32 %r31, %r13, 31;
    shr.u32 %r32, %r31, 23;
    add.s32 %r33, %r13, %r32;
    and.b32 %r34, %r33, 16776704;
    sub.s32 %r35, %r13, %r34;
    .loc 1 35 44
    shl.b32 %r36, %r35, 8;
    .loc 1 35 40
    or.b32 %r37, %r36, %r4;
    .loc 1 35 34
    // %rd37 = &param_2[(ctaid.x rem 512) * 256 + col] (4-byte elements).
    mul.wide.s32 %rd26, %r37, 4;
    add.s64 %rd37, %rd23, %rd26;
    mov.b32 %r151, 0;
    .loc 1 35 50
    mov.u32 %r14, 0x0;
    mov.u32 %r15, 0x0;
    mov.u32 %r16, 0x0;
    mov.u32 %r17, 0x0;
    @%p53 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd37 + 0 ];
    @!%p53 mov.u32 %r14, %r151;
    @!%p53 mov.u32 %r15, %r151;
    @!%p53 mov.u32 %r16, %r151;
    @!%p53 mov.u32 %r17, %r151;
    mov.b32 %f2, %r14;
    mov.b32 %f1, %r15;
    mov.b32 %f3, %r16;
    mov.b32 %f4, %r17;
    .loc 1 36 44
    shl.b32 %r38, %r13, 8;
    .loc 1 36 40
    or.b32 %r39, %r38, %r4;
    .loc 1 36 34
    // %rd38 = &param_3[ctaid.x * 256 + col] (2-byte elements); %r39 is reused
    // for the output address at the end of the kernel.
    mul.wide.s32 %rd27, %r39, 2;
    add.s64 %rd38, %rd24, %rd27;
    .loc 1 36 50
    mov.u32 %r22, 0x0;
    mov.u32 %r23, 0x0;
    @%p53 ld.global.L1::evict_last.v2.b32 { %r22, %r23 }, [ %rd38 + 0 ];
    @!%p53 mov.u32 %r22, %r151;
    @!%p53 mov.u32 %r23, %r151;
    // Split each 32-bit word into its low (%rs1/%rs3) and high (%rs2/%rs4)
    // 16-bit halves, i.e. the four bf16 values.
    cvt.u16.u32 %rs1, %r22;
    { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r22; }
    cvt.u16.u32 %rs3, %r23;
    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r23; }
    .loc 1 36 101
    cvt.f32.bf16 %r26, %rs1;
    mov.b32 %f5, %r26;
    cvt.f32.bf16 %r27, %rs2;
    mov.b32 %f6, %r27;
    cvt.f32.bf16 %r28, %rs3;
    mov.b32 %f7, %r28;
    cvt.f32.bf16 %r29, %rs4;
    mov.b32 %f8, %r29;
    .loc 1 37 22
    // Bounds check: wrap a negative index by +50257, then require the result
    // to be < 50257 as an unsigned value (%p15 = in range).
    add.s64 %rd28, %rd18, 50257;
    .loc 1 38 22
    setp.lt.s64 %p14, %rd18, 0;
    .loc 1 39 36
    selp.b64 %rd5, %rd28, %rd18, %p14;
    .loc 1 40 40
    setp.lt.u64 %p15, %rd5, 50257;
    // Arguments shared by both __assertfail call sites: 883 and 1.
    mov.b32 %r175, 883;
    mov.u64 %rd57, 1;
    .loc 1 40 55
    @%p15 bra $L__BB0_2;
    // Out-of-range path: report assertMessage_0 / assertFile_0 / assertFunc_0.
    mov.u64 %rd29, assertMessage_0;
    cvta.global.u64 %rd30, %rd29;
    mov.u64 %rd31, assertFile_0;
    cvta.global.u64 %rd32, %rd31;
    mov.u64 %rd33, assertFunc_0;
    cvta.global.u64 %rd34, %rd33;
    { // callseq 0, 0
    .reg .b32 temp_param_reg;
    .param .b64 param0;
    st.param.b64 [param0+0], %rd30;
    .param .b64 param1;
    st.param.b64 [param1+0], %rd32;
    .param .b32 param2;
    st.param.b32 [param2+0], %r175;
    .param .b64 param3;
    st.param.b64 [param3+0], %rd34;
    .param .b64 param4;
    st.param.b64 [param4+0], %rd57;
    call.uni
    __assertfail,
    (
    param0,
    param1,
    param2,
    param3,
    param4
    );
    } // callseq 0
$L__BB0_2:
    .loc 1 0 55
    ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
    cvt.s64.s32 %rd3, %r39;
    .loc 1 38 22
    setp.lt.s64 %p44, %rd10, 0;
    .loc 1 41 44
    // Table row offset = index * 256, plus 12865792 (= 50257 * 256) when the
    // index is negative; then OR in the column and scale by 4 bytes.
    shl.b64 %rd40, %rd10, 8;
    add.s64 %rd41, %rd40, 12865792;
    selp.b64 %rd42, %rd41, %rd40, %p44;
    cvt.u64.u32 %rd43, %r4;
    .loc 1 41 40
    or.b64 %rd44, %rd42, %rd43;
    .loc 1 41 34
    shl.b64 %rd45, %rd44, 2;
    add.s64 %rd54, %rd7, %rd45;
    .loc 1 41 52
    mov.u32 %r41, 0x0;
    mov.u32 %r42, 0x0;
    mov.u32 %r43, 0x0;
    mov.u32 %r44, 0x0;
    @%p53 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd54 + 0 ];
    @!%p53 mov.u32 %r41, %r151;
    @!%p53 mov.u32 %r42, %r151;
    @!%p53 mov.u32 %r43, %r151;
    @!%p53 mov.u32 %r44, %r151;
    // x[k] = param_2 value + table value + f32(bf16 value); results are
    // %f25 (k=0), %f26 (k=1), %f19 (k=2), %f20 (k=3).
    mov.b32 %f15, %r43;
    mov.b32 %f16, %r44;
    .loc 1 42 22
    add.f32 %f17, %f3, %f15;
    add.f32 %f18, %f4, %f16;
    .loc 1 44 22
    add.f32 %f19, %f7, %f17;
    add.f32 %f20, %f8, %f18;
    .loc 1 41 52
    mov.b32 %f21, %r41;
    mov.b32 %f22, %r42;
    .loc 1 42 22
    add.f32 %f23, %f1, %f22;
    add.f32 %f24, %f2, %f21;
    .loc 1 44 22
    add.f32 %f25, %f5, %f24;
    add.f32 %f26, %f6, %f23;
$L__tmp1:
    .loc 2 98 22
    // Per-element initial state: mean = x + 0.0, m2 = x * (x - mean) + 0.0.
    add.f32 %f27, %f26, 0f00000000;
    add.f32 %f28, %f25, 0f00000000;
    add.f32 %f29, %f19, 0f00000000;
    add.f32 %f30, %f20, 0f00000000;
    .loc 2 101 30
    sub.f32 %f31, %f25, %f28;
    sub.f32 %f32, %f26, %f27;
    sub.f32 %f33, %f19, %f29;
    sub.f32 %f34, %f20, %f30;
    .loc 2 101 13
    fma.rn.f32 %f35, %f25, %f31, 0f00000000;
    fma.rn.f32 %f36, %f26, %f32, 0f00000000;
    fma.rn.f32 %f37, %f19, %f33, 0f00000000;
    fma.rn.f32 %f38, %f20, %f34, 0f00000000;
$L__tmp2:
    .loc 2 108 21
    // In-thread merge of the four elements. Each step computes
    //   delta = b - a; mean = a + delta * (1 / n); m2 = m2a + m2b + delta^2 * (n-1) / n
    // with n = 2, 3, 4. Integer literals are f32 bit patterns:
    //   1065353216 = 1.0, 1073741824 = 2.0, 1077936128 = 3.0, 1082130432 = 4.0.
    sub.f32 %f39, %f27, %f28;
    mov.b32 %r50, 1065353216;
    mov.b32 %r51, 1073741824;
    .loc 2 110 60
    div.full.f32 %r49, %r50, %r51;
    mov.b32 %f40, %r49;
    .loc 2 112 17
    fma.rn.f32 %f41, %f40, %f39, %f28;
    .loc 2 113 15
    add.f32 %f42, %f35, %f36;
    .loc 2 113 30
    mul.f32 %f43, %f39, %f39;
    .loc 2 113 22
    fma.rn.f32 %f44, %f40, %f43, %f42;
    .loc 2 108 21
    sub.f32 %f45, %f29, %f41;
    mov.b32 %r54, 1077936128;
    .loc 2 110 60
    div.full.f32 %r52, %r50, %r54;
    mov.b32 %f46, %r52;
    .loc 2 112 17
    fma.rn.f32 %f47, %f46, %f45, %f41;
    .loc 2 113 15
    add.f32 %f48, %f37, %f44;
    .loc 2 113 30
    mul.f32 %f49, %f45, %f45;
    .loc 2 113 38
    // delta^2 + delta^2, i.e. delta^2 * 2.
    fma.rn.f32 %f50, %f45, %f45, %f49;
    .loc 2 113 22
    fma.rn.f32 %f51, %f46, %f50, %f48;
    .loc 2 108 21
    sub.f32 %f52, %f30, %f47;
    mov.b32 %r57, 1082130432;
    .loc 2 110 60
    div.full.f32 %r55, %r50, %r57;
    mov.b32 %f53, %r55;
    .loc 2 112 17
    fma.rn.f32 %f54, %f53, %f52, %f47;
    .loc 2 113 15
    add.f32 %f55, %f38, %f51;
    .loc 2 113 30
    mul.f32 %f56, %f52, %f52;
    .loc 2 113 38
    mul.f32 %f57, %f56, 0f40400000;
    .loc 2 113 22
    fma.rn.f32 %f58, %f53, %f57, %f55;
$L__tmp3:
    .loc 2 120 46
    // Warp-level merge. Thread state is (mean %f54, m2 %f58, weight 4.0 in
    // %r57). Each stage exchanges the triple with the lane at XOR distance
    // 16, 8, 4, 2, 1 and merges it:
    //   delta = mean_b - mean_a; w = w_a + w_b;
    //   ratio = (w == 0) ? 0 : w_b / w;
    //   mean = mean_a + ratio * delta;
    //   m2   = m2_a + m2_b + ratio * (delta^2 * w_a)
    // --- XOR distance 16 ---
    mov.b32 %r118, %f54;
    shfl.sync.bfly.b32 %r119, %r118, 16, 31, -1;
    mov.b32 %f59, %r119;
    mov.b32 %r120, %f58;
    shfl.sync.bfly.b32 %r121, %r120, 16, 31, -1;
    mov.b32 %f60, %r121;
    shfl.sync.bfly.b32 %r59, %r57, 16, 31, -1;
    mov.b32 %f61, %r59;
$L__tmp4:
    .loc 2 108 21
    sub.f32 %f62, %f59, %f54;
    .loc 2 109 28
    add.f32 %f63, %f61, 0f40800000;
    .loc 2 110 39
    setp.eq.f32 %p45, %f63, 0f00000000;
    .loc 2 110 60
    mov.b32 %r60, %f63;
    div.full.f32 %r58, %r59, %r60;
    mov.b32 %f64, %r58;
    .loc 2 110 49
    selp.f32 %f65, 0f00000000, %f64, %p45;
    .loc 2 112 17
    fma.rn.f32 %f66, %f65, %f62, %f54;
    .loc 2 113 15
    add.f32 %f67, %f58, %f60;
    .loc 2 113 30
    mul.f32 %f68, %f62, %f62;
    .loc 2 113 38
    mul.f32 %f69, %f68, 0f40800000;
    .loc 2 113 22
    fma.rn.f32 %f70, %f65, %f69, %f67;
$L__tmp5:
    .loc 2 120 46
    // --- XOR distance 8 ---
    mov.b32 %r122, %f66;
    shfl.sync.bfly.b32 %r123, %r122, 8, 31, -1;
    mov.b32 %f71, %r123;
    mov.b32 %r124, %f70;
    shfl.sync.bfly.b32 %r125, %r124, 8, 31, -1;
    mov.b32 %f72, %r125;
    shfl.sync.bfly.b32 %r62, %r60, 8, 31, -1;
    mov.b32 %f73, %r62;
$L__tmp6:
    .loc 2 108 21
    sub.f32 %f74, %f71, %f66;
    .loc 2 109 28
    add.f32 %f75, %f63, %f73;
    .loc 2 110 39
    setp.eq.f32 %p46, %f75, 0f00000000;
    .loc 2 110 60
    mov.b32 %r63, %f75;
    div.full.f32 %r61, %r62, %r63;
    mov.b32 %f76, %r61;
    .loc 2 110 49
    selp.f32 %f77, 0f00000000, %f76, %p46;
    .loc 2 112 17
    fma.rn.f32 %f78, %f77, %f74, %f66;
    .loc 2 113 15
    add.f32 %f79, %f70, %f72;
    .loc 2 113 30
    mul.f32 %f80, %f74, %f74;
    .loc 2 113 38
    mul.f32 %f81, %f63, %f80;
    .loc 2 113 22
    fma.rn.f32 %f82, %f77, %f81, %f79;
$L__tmp7:
    .loc 2 120 46
    // --- XOR distance 4 ---
    mov.b32 %r126, %f78;
    shfl.sync.bfly.b32 %r127, %r126, 4, 31, -1;
    mov.b32 %f83, %r127;
    mov.b32 %r128, %f82;
    shfl.sync.bfly.b32 %r129, %r128, 4, 31, -1;
    mov.b32 %f84, %r129;
    shfl.sync.bfly.b32 %r65, %r63, 4, 31, -1;
    mov.b32 %f85, %r65;
$L__tmp8:
    .loc 2 108 21
    sub.f32 %f86, %f83, %f78;
    .loc 2 109 28
    add.f32 %f87, %f75, %f85;
    .loc 2 110 39
    setp.eq.f32 %p47, %f87, 0f00000000;
    .loc 2 110 60
    mov.b32 %r66, %f87;
    div.full.f32 %r64, %r65, %r66;
    mov.b32 %f88, %r64;
    .loc 2 110 49
    selp.f32 %f89, 0f00000000, %f88, %p47;
    .loc 2 112 17
    fma.rn.f32 %f90, %f89, %f86, %f78;
    .loc 2 113 15
    add.f32 %f91, %f82, %f84;
    .loc 2 113 30
    mul.f32 %f92, %f86, %f86;
    .loc 2 113 38
    mul.f32 %f93, %f75, %f92;
    .loc 2 113 22
    fma.rn.f32 %f94, %f89, %f93, %f91;
$L__tmp9:
    .loc 2 120 46
    // --- XOR distance 2 ---
    mov.b32 %r130, %f90;
    shfl.sync.bfly.b32 %r131, %r130, 2, 31, -1;
    mov.b32 %f95, %r131;
    mov.b32 %r132, %f94;
    shfl.sync.bfly.b32 %r133, %r132, 2, 31, -1;
    mov.b32 %f96, %r133;
    shfl.sync.bfly.b32 %r68, %r66, 2, 31, -1;
    mov.b32 %f97, %r68;
$L__tmp10:
    .loc 2 108 21
    sub.f32 %f98, %f95, %f90;
    .loc 2 109 28
    add.f32 %f99, %f87, %f97;
    .loc 2 110 39
    setp.eq.f32 %p48, %f99, 0f00000000;
    .loc 2 110 60
    mov.b32 %r69, %f99;
    div.full.f32 %r67, %r68, %r69;
    mov.b32 %f100, %r67;
    .loc 2 110 49
    selp.f32 %f101, 0f00000000, %f100, %p48;
    .loc 2 112 17
    fma.rn.f32 %f102, %f101, %f98, %f90;
    .loc 2 113 15
    add.f32 %f103, %f94, %f96;
    .loc 2 113 30
    mul.f32 %f104, %f98, %f98;
    .loc 2 113 38
    mul.f32 %f105, %f87, %f104;
    .loc 2 113 22
    fma.rn.f32 %f106, %f101, %f105, %f103;
$L__tmp11:
    .loc 2 120 46
    // --- XOR distance 1 ---
    mov.b32 %r134, %f102;
    shfl.sync.bfly.b32 %r135, %r134, 1, 31, -1;
    mov.b32 %f107, %r135;
    mov.b32 %r136, %f106;
    shfl.sync.bfly.b32 %r137, %r136, 1, 31, -1;
    mov.b32 %f108, %r137;
    shfl.sync.bfly.b32 %r71, %r69, 1, 31, -1;
    mov.b32 %f109, %r71;
$L__tmp12:
    .loc 2 108 21
    sub.f32 %f110, %f107, %f102;
    .loc 2 109 28
    add.f32 %f111, %f99, %f109;
    .loc 2 110 39
    setp.eq.f32 %p49, %f111, 0f00000000;
    .loc 2 110 60
    mov.b32 %r72, %f111;
    div.full.f32 %r70, %r71, %r72;
    mov.b32 %f112, %r70;
    .loc 2 110 49
    selp.f32 %f113, 0f00000000, %f112, %p49;
    .loc 2 112 17
    fma.rn.f32 %f114, %f113, %f110, %f102;
    .loc 2 113 15
    add.f32 %f115, %f106, %f108;
    .loc 2 113 30
    mul.f32 %f116, %f110, %f110;
    .loc 2 113 38
    mul.f32 %f117, %f99, %f116;
    .loc 2 113 22
    fma.rn.f32 %f118, %f113, %f117, %f115;
$L__tmp13:
    .loc 2 120 46
    // Cross-warp stage. Lane 0 of each warp (%p21) publishes its triple to
    // shared memory: mean at global_smem + warp*4, m2 at global_smem + 8 +
    // warp*4, weight at global_smem + 16 + warp*4.
    setp.eq.s32 %p21, %r2, 0;
    shl.b32 %r138, %r3, 2;
    mov.u32 %r139, global_smem;
    add.s32 %r73, %r139, %r138;
    mov.b32 %r74, %f114;
    @%p21 st.shared.b32 [ %r73 + 0 ], %r74;
    add.s32 %r140, %r139, 8;
    add.s32 %r75, %r140, %r138;
    mov.b32 %r76, %f118;
    @%p21 st.shared.b32 [ %r75 + 0 ], %r76;
    add.s32 %r141, %r139, 16;
    add.s32 %r77, %r141, %r138;
    @%p21 st.shared.b32 [ %r77 + 0 ], %r72;
    bar.sync 0;
    // Threads 0 and 1 (%p24) read back one warp's triple each. For all other
    // threads %r79/%r81/%r83 are not written by these loads; whatever they
    // compute below is discarded because the stores are guarded by %p27.
    setp.lt.s32 %p24, %r1, 2;
    add.s32 %r80, %r139, %r30;
    @%p24 ld.shared.b32 %r79, [ %r80 + 0 ];
    mov.b32 %f119, %r79;
    add.s32 %r82, %r140, %r30;
    @%p24 ld.shared.b32 %r81, [ %r82 + 0 ];
    mov.b32 %f120, %r81;
    add.s32 %r84, %r141, %r30;
    @%p24 ld.shared.b32 %r83, [ %r84 + 0 ];
    mov.b32 %f121, %r83;
    shfl.sync.bfly.b32 %r143, %r79, 1, 31, -1;
    mov.b32 %f122, %r143;
    shfl.sync.bfly.b32 %r144, %r81, 1, 31, -1;
    mov.b32 %f123, %r144;
    shfl.sync.bfly.b32 %r86, %r83, 1, 31, -1;
    mov.b32 %f124, %r86;
$L__tmp14:
    .loc 2 108 21
    // Same merge formula as the warp stages, applied to the two warp results.
    sub.f32 %f125, %f122, %f119;
    .loc 2 109 28
    add.f32 %f126, %f121, %f124;
    .loc 2 110 39
    setp.eq.f32 %p50, %f126, 0f00000000;
    .loc 2 110 60
    mov.b32 %r87, %f126;
    div.full.f32 %r85, %r86, %r87;
    mov.b32 %f127, %r85;
    .loc 2 110 49
    selp.f32 %f128, 0f00000000, %f127, %p50;
    .loc 2 112 17
    fma.rn.f32 %f129, %f125, %f128, %f119;
    .loc 2 113 15
    add.f32 %f130, %f120, %f123;
    .loc 2 113 30
    mul.f32 %f131, %f125, %f125;
    .loc 2 113 38
    mul.f32 %f132, %f121, %f131;
    .loc 2 113 22
    fma.rn.f32 %f133, %f132, %f128, %f130;
$L__tmp15:
    .loc 2 120 46
    // %p27 = (tid < 2) && (tid is even), i.e. thread 0 only: it writes the
    // final mean / m2 / weight to global_smem + 0 / + 8 / + 16.
    and.b32 %r145, %r1, 1;
    setp.eq.b32 %p51, %r145, 1;
    not.pred %p52, %p51;
    and.pred %p27, %p24, %p52;
    mov.b32 %r89, %f129;
    @%p27 st.shared.b32 [ %r80 + 0 ], %r89;
    mov.b32 %r91, %f133;
    @%p27 st.shared.b32 [ %r82 + 0 ], %r91;
    @%p27 st.shared.b32 [ %r84 + 0 ], %r87;
    bar.sync 0;
    // Every thread picks up the row-wide mean (%f9) and m2 (%f10).
    ld.shared.f32 %f9, [global_smem];
    ld.shared.f32 %f10, [global_smem+8];
$L__tmp16:
    .loc 1 62 51
    // Second pass: reload the same three inputs (addresses %rd37, %rd38 and,
    // further down, %rd54 are reused) plus the param_4 vector.
    mov.u32 %r94, 0x0;
    mov.u32 %r95, 0x0;
    mov.u32 %r96, 0x0;
    mov.u32 %r97, 0x0;
    @%p53 ld.global.L1::evict_last.v4.b32 { %r94, %r95, %r96, %r97 }, [ %rd37 + 0 ];
    @!%p53 mov.u32 %r94, %r151;
    @!%p53 mov.u32 %r95, %r151;
    @!%p53 mov.u32 %r96, %r151;
    @!%p53 mov.u32 %r97, %r151;
    .loc 1 63 51
    mov.u32 %r102, 0x0;
    mov.u32 %r103, 0x0;
    @%p53 ld.global.L1::evict_first.v2.b32 { %r102, %r103 }, [ %rd38 + 0 ];
    @!%p53 mov.u32 %r102, %r151;
    @!%p53 mov.u32 %r103, %r151;
    cvt.u16.u32 %rs5, %r102;
    { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r102; }
    cvt.u16.u32 %rs7, %r103;
    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r103; }
    .loc 1 63 103
    cvt.f32.bf16 %r106, %rs5;
    mov.b32 %f11, %r106;
    cvt.f32.bf16 %r107, %rs6;
    mov.b32 %f12, %r107;
    cvt.f32.bf16 %r108, %rs7;
    mov.b32 %f13, %r108;
    cvt.f32.bf16 %r109, %rs8;
    mov.b32 %f14, %r109;
    .loc 1 64 35
    // %rd39 = &param_4[col] (4-byte elements, no row term).
    mul.wide.u32 %rd46, %r4, 4;
    add.s64 %rd39, %rd8, %rd46;
    .loc 1 64 40
    mov.u32 %r110, 0x0;
    mov.u32 %r111, 0x0;
    mov.u32 %r112, 0x0;
    mov.u32 %r113, 0x0;
    @%p53 ld.global.L1::evict_last.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd39 + 0 ];
    @!%p53 mov.u32 %r110, %r151;
    @!%p53 mov.u32 %r111, %r151;
    @!%p53 mov.u32 %r112, %r151;
    @!%p53 mov.u32 %r113, %r151;
    .loc 1 68 57
    // Second bounds assertion; reuses the predicate %p15 computed earlier.
    @%p15 bra $L__BB0_4;
    mov.u64 %rd47, assertMessage_1;
    cvta.global.u64 %rd48, %rd47;
    mov.u64 %rd49, assertFile_1;
    cvta.global.u64 %rd50, %rd49;
    mov.u64 %rd51, assertFunc_1;
    cvta.global.u64 %rd52, %rd51;
    { // callseq 1, 0
    .reg .b32 temp_param_reg;
    .param .b64 param0;
    st.param.b64 [param0+0], %rd48;
    .param .b64 param1;
    st.param.b64 [param1+0], %rd50;
    .param .b32 param2;
    st.param.b32 [param2+0], %r175;
    .param .b64 param3;
    st.param.b64 [param3+0], %rd52;
    .param .b64 param4;
    st.param.b64 [param4+0], %rd57;
    call.uni
    __assertfail,
    (
    param0,
    param1,
    param2,
    param3,
    param4
    );
    } // callseq 1
$L__BB0_4:
    .loc 1 69 54
    mov.u32 %r147, 0x0;
    mov.u32 %r148, 0x0;
    mov.u32 %r149, 0x0;
    mov.u32 %r150, 0x0;
    @%p53 ld.global.L1::evict_first.v4.b32 { %r147, %r148, %r149, %r150 }, [ %rd54 + 0 ];
    @!%p53 mov.u32 %r147, %r151;
    @!%p53 mov.u32 %r148, %r151;
    @!%p53 mov.u32 %r149, %r151;
    @!%p53 mov.u32 %r150, %r151;
    .loc 1 75 24
    // %f136 = rsqrt(m2 / 256.0 + 0f3727C5AC); 1132462080 is the f32 bit
    // pattern of 256.0 and 0f3727C5AC is approximately 1e-5.
    mov.b32 %r156, %f10;
    mov.b32 %r157, 1132462080;
    div.full.f32 %r155, %r156, %r157;
    mov.b32 %f134, %r155;
    .loc 1 77 24
    add.f32 %f135, %f134, 0f3727C5AC;
    .loc 1 78 30
    rsqrt.approx.ftz.f32 %f136, %f135;
    // Centered values, column k=3 first, then k=2, k=1, k=0:
    // (param_2 value + table value + f32(bf16 value)) - mean.
    .loc 1 69 54
    mov.b32 %f137, %r150;
    .loc 1 62 51
    mov.b32 %f138, %r97;
    .loc 1 70 24
    add.f32 %f139, %f138, %f137;
    .loc 1 72 24
    add.f32 %f140, %f14, %f139;
    .loc 1 73 24
    sub.f32 %f141, %f140, %f9;
    .loc 1 69 54
    mov.b32 %f142, %r149;
    .loc 1 62 51
    mov.b32 %f143, %r96;
    .loc 1 70 24
    add.f32 %f144, %f143, %f142;
    .loc 1 72 24
    add.f32 %f145, %f13, %f144;
    .loc 1 73 24
    sub.f32 %f146, %f145, %f9;
    .loc 1 69 54
    mov.b32 %f147, %r148;
    .loc 1 62 51
    mov.b32 %f148, %r95;
    .loc 1 70 24
    add.f32 %f149, %f148, %f147;
    .loc 1 72 24
    add.f32 %f150, %f12, %f149;
    .loc 1 73 24
    sub.f32 %f151, %f150, %f9;
    .loc 1 69 54
    mov.b32 %f152, %r147;
    .loc 1 62 51
    mov.b32 %f153, %r94;
    .loc 1 70 24
    add.f32 %f154, %f153, %f152;
    .loc 1 72 24
    add.f32 %f155, %f11, %f154;
    .loc 1 73 24
    sub.f32 %f156, %f155, %f9;
    .loc 1 64 40
    mov.b32 %f157, %r110;
    mov.b32 %f158, %r111;
    mov.b32 %f159, %r112;
    mov.b32 %f160, %r113;
    .loc 1 79 24
    // Scale by the reciprocal square root, then by the param_4 values.
    mul.f32 %f161, %f156, %f136;
    mul.f32 %f162, %f151, %f136;
    mul.f32 %f163, %f146, %f136;
    mul.f32 %f164, %f141, %f136;
    .loc 1 80 24
    mul.f32 %f165, %f161, %f157;
    mul.f32 %f166, %f162, %f158;
    mul.f32 %f167, %f163, %f159;
    mul.f32 %f168, %f164, %f160;
    .loc 1 82 29
    // %rd55 = &param_5[ctaid.x * 256 + col] (2-byte elements).
    shl.b64 %rd56, %rd3, 1;
    add.s64 %rd55, %rd9, %rd56;
    .loc 1 82 52
    // Round to bf16, pack two values per 32-bit word, store all four at once.
    mov.b32 %r167, %f165;
    cvt.rn.bf16.f32 %rs9, %r167;
    mov.b32 %r168, %f166;
    cvt.rn.bf16.f32 %rs10, %r168;
    mov.b32 %r169, %f167;
    cvt.rn.bf16.f32 %rs11, %r169;
    mov.b32 %r170, %f168;
    cvt.rn.bf16.f32 %rs12, %r170;
    mov.b32 %r173, {%rs9, %rs10};
    mov.b32 %r174, {%rs11, %rs12};
    @%p53 st.global.v2.b32 [ %rd55 + 0 ], { %r173, %r174 };
    .loc 1 58 4
    ret;
$L__tmp17:
$L__func_end0:

}
|
// .globl __nv_rsqrtf |
|
.visible .func (.param .b32 func_retval0) __nv_rsqrtf( |
|
.param .b32 __nv_rsqrtf_param_0 |
|
) |
|
{ |
|
.reg .f32 %f<3>; |
|
$L__func_begin1: |
|
|
|
ld.param.f32 %f1, [__nv_rsqrtf_param_0]; |
|
rsqrt.approx.ftz.f32 %f2, %f1; |
|
st.param.f32 [func_retval0+0], %f2; |
|
ret; |
|
$L__func_end1: |
|
|
|
} |
|
// Source-file table referenced by the ".loc <file> <line> <col>" directives:
// file 1 is the generated Inductor kernel source, file 2 the helper module.
    .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
    .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
// DWARF abbreviation table. The byte sequence is identical to the original
// one-value-per-line listing; it is regrouped as one (attribute, form) pair per
// line. Names in the comments follow the DWARF encoding tables.
    .section .debug_abbrev
    {
// Abbrev 1: DW_TAG_compile_unit, has children
.b8 1, 17, 1
.b8 37, 8                               // DW_AT_producer, DW_FORM_string
.b8 19, 5                               // DW_AT_language, DW_FORM_data2
.b8 3, 8                                // DW_AT_name, DW_FORM_string
.b8 16, 6                               // DW_AT_stmt_list, DW_FORM_data4
.b8 27, 8                               // DW_AT_comp_dir, DW_FORM_string
.b8 180, 66, 12                         // attribute 0x2134 (ULEB128), DW_FORM_flag
.b8 17, 1                               // DW_AT_low_pc, DW_FORM_addr
.b8 18, 1                               // DW_AT_high_pc, DW_FORM_addr
.b8 0, 0                                // end of abbrev 1
// Abbrev 2: DW_TAG_subprogram, no children
.b8 2, 46, 0
.b8 135, 64, 8                          // attribute 0x2007 (ULEB128), DW_FORM_string
.b8 3, 8                                // DW_AT_name, DW_FORM_string
.b8 58, 11                              // DW_AT_decl_file, DW_FORM_data1
.b8 59, 11                              // DW_AT_decl_line, DW_FORM_data1
.b8 63, 12                              // DW_AT_external, DW_FORM_flag
.b8 32, 11                              // DW_AT_inline, DW_FORM_data1
.b8 0, 0                                // end of abbrev 2
// Abbrev 3: DW_TAG_subprogram, has children
.b8 3, 46, 1
.b8 17, 1                               // DW_AT_low_pc, DW_FORM_addr
.b8 18, 1                               // DW_AT_high_pc, DW_FORM_addr
.b8 64, 10                              // DW_AT_frame_base, DW_FORM_block1
.b8 49, 19                              // DW_AT_abstract_origin, DW_FORM_ref4
.b8 0, 0                                // end of abbrev 3
// Abbrev 4: DW_TAG_inlined_subroutine, no children
.b8 4, 29, 0
.b8 49, 19                              // DW_AT_abstract_origin, DW_FORM_ref4
.b8 17, 1                               // DW_AT_low_pc, DW_FORM_addr
.b8 18, 1                               // DW_AT_high_pc, DW_FORM_addr
.b8 88, 11                              // DW_AT_call_file, DW_FORM_data1
.b8 89, 11                              // DW_AT_call_line, DW_FORM_data1
.b8 87, 11                              // DW_AT_call_column, DW_FORM_data1
.b8 0, 0                                // end of abbrev 4
// Abbrev 5: DW_TAG_inlined_subroutine, has children
.b8 5, 29, 1
.b8 49, 19                              // DW_AT_abstract_origin, DW_FORM_ref4
.b8 17, 1                               // DW_AT_low_pc, DW_FORM_addr
.b8 18, 1                               // DW_AT_high_pc, DW_FORM_addr
.b8 88, 11                              // DW_AT_call_file, DW_FORM_data1
.b8 89, 11                              // DW_AT_call_line, DW_FORM_data1
.b8 87, 11                              // DW_AT_call_column, DW_FORM_data1
.b8 0, 0                                // end of abbrev 5
.b8 0                                   // end of abbreviation table
    }
|
// DWARF .debug_info for the single compile unit. The byte sequence is
// identical to the original one-value-per-line listing (unit length 302, the
// abstract subprogram DIE at section offset 125); it is regrouped per DIE and
// the embedded strings are decoded in comments.
    .section .debug_info
    {
.b32 302                                // unit length (bytes after this field)
.b8 2, 0                                // DWARF version 2
.b32 .debug_abbrev                      // abbreviation table offset
.b8 8                                   // address size
// DIE, abbrev 1 (compile unit)
.b8 1
.b8 116, 114, 105, 116, 111, 110, 0     // producer: "triton"
.b8 2, 0                                // language code 2
// name: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
.b8 99, 99, 105, 103, 54, 102, 107, 105
.b8 54, 112, 52, 108, 120, 114, 100, 109
.b8 103, 103, 54, 101, 117, 100, 97, 104
.b8 105, 101, 120, 99, 118, 117, 101, 101
.b8 111, 108, 50, 112, 52, 113, 112, 53
.b8 51, 50, 112, 118, 118, 101, 50, 121
.b8 52, 54, 51, 121, 46, 112, 121, 0
.b32 .debug_line                        // statement list
// compilation directory: "/tmp/torchinductor_root/ci"
.b8 47, 116, 109, 112, 47, 116, 111, 114, 99, 104, 105, 110, 100, 117
.b8 99, 116, 111, 114, 95, 114, 111, 111, 116, 47, 99, 105, 0
.b8 1                                   // flag attribute 0x2134 = true
.b64 $L__func_begin0                    // low pc
.b64 $L__func_end0                      // high pc
// DIE, abbrev 2 (abstract subprogram) -- section offset 125
.b8 2
// linkage name: "triton__0d1d2d3d4d5d6de7de"
.b8 116, 114, 105, 116, 111, 110, 95, 95, 48, 100, 49, 100, 50, 100
.b8 51, 100, 52, 100, 53, 100, 54, 100, 101, 55, 100, 101, 0
// name: "triton__0d1d2d3d4d5d6de7de"
.b8 116, 114, 105, 116, 111, 110, 95, 95, 48, 100, 49, 100, 50, 100
.b8 51, 100, 52, 100, 53, 100, 54, 100, 101, 55, 100, 101, 0
.b8 1                                   // decl file 1
.b8 18                                  // decl line 18
.b8 1                                   // external
.b8 1                                   // inline code 1
// DIE, abbrev 3 (concrete subprogram)
.b8 3
.b64 $L__func_begin0                    // low pc
.b64 $L__func_end0                      // high pc
.b8 1, 156                              // frame base: 1-byte block, opcode 0x9c
.b32 125                                // abstract origin -> offset 125
// DIE, abbrev 4 (inlined subroutine)
.b8 4
.b32 125                                // abstract origin
.b64 $L__tmp1                           // low pc
.b64 $L__tmp2                           // high pc
.b8 2, 47, 41                           // call file 2, line 47, column 41
// DIE, abbrev 5 (inlined subroutine with children)
.b8 5
.b32 125                                // abstract origin
.b64 $L__tmp2                           // low pc
.b64 $L__tmp15                          // high pc
.b8 2, 53, 44                           // call file 2, line 53, column 44
// DIE, abbrev 4 (inlined subroutine), child of the DIE above
.b8 4
.b32 125                                // abstract origin
.b64 $L__tmp2                           // low pc
.b64 $L__tmp15                          // high pc
.b8 2, 120, 46                          // call file 2, line 120, column 46
.b8 0                                   // end of children of the abbrev-5 DIE
// DIE, abbrev 4 (inlined subroutine)
.b8 4
.b32 125                                // abstract origin
.b64 $L__tmp3                           // low pc
.b64 $L__tmp16                          // high pc
.b8 2, 53, 44                           // call file 2, line 53, column 44
.b8 0                                   // end of children of the subprogram DIE
.b8 0                                   // end of children of the compile unit
    }
|
// DWARF .debug_pubnames: one entry mapping the kernel name to the DIE at
// .debug_info offset 125. Byte sequence identical to the original listing.
    .section .debug_pubnames
    {
.b32 $L__pubNames_end0-$L__pubNames_start0  // length of this set
$L__pubNames_start0:
.b8 2, 0                                // version 2
.b32 .debug_info                        // offset of the compile unit
.b32 306                                // size of the compile unit in .debug_info
.b32 125                                // DIE offset
// "triton__0d1d2d3d4d5d6de7de"
.b8 116, 114, 105, 116, 111, 110, 95, 95, 48, 100, 49, 100, 50, 100
.b8 51, 100, 52, 100, 53, 100, 54, 100, 101, 55, 100, 101, 0
.b32 0                                  // terminator
$L__pubNames_end0:
    }
|
// DWARF .debug_pubtypes: header plus terminator only (no type entries).
// Byte sequence identical to the original listing.
    .section .debug_pubtypes
    {
.b32 $L__pubTypes_end0-$L__pubTypes_start0  // length of this set
$L__pubTypes_start0:
.b8 2, 0                                // version 2
.b32 .debug_info                        // offset of the compile unit
.b32 306                                // size of the compile unit in .debug_info
.b32 0                                  // terminator
$L__pubTypes_end0:
    }
// Empty location-list section.
    .section .debug_loc { }
|
|