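// NOTE: the following six lines are web-page residue captured together with this file
// (uploader caption, commit message, commit hash and file-viewer controls). They are
// not part of the PTX module and are kept here only as comments.
// 0-hero's picture
// Add files using upload-large-folder tool
// 9ab9a5e verified
// raw
// history blame
// 36.6 kB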
//
// Generated by LLVM NVPTX Back-End
//
// Module-level directives: PTX ISA version 8.2, target sm_89, 64-bit addresses.
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4d5d6d7de8
// Externally-sized shared-memory byte array. The kernel below uses its first 256 bytes
// (64 x 32-bit slots) as scratch space when combining per-warp partial sums.
.extern .shared .align 1 .b8 global_smem[];
// Kernel entry point and prologue.
//
// Nine 64-bit parameters are declared; this listing reads only param_0 .. param_6
// (param_7 and param_8 are never loaded). How each one is used below:
//   param_0 : base address of the 32-bit values read in both column loops
//   param_1 : base address of eight consecutive 64-bit values per block (block id * 64 bytes)
//   param_2 : address of one 32-bit value (numerator of the division below)
//   param_3 : address of one 32-bit value (denominator of the division below)
//   param_4 : base address of 16-bit values read in the second loop
//   param_5 : base address of 16-bit values read and exponentiated in the second loop
//   param_6 : base address of the 16-bit values written by the second loop
// NOTE(review): the surrounding Python source is not visible here, so the higher-level
// meaning of these buffers is not stated; confirm against the file named in `.file 1`.
//
// Each block handles eight runs of 50257 elements (indices ctaid.x*8 .. ctaid.x*8+7) and
// each thread handles two columns per loop iteration (tid and tid+256).
.visible .entry triton__0d1d2d3d4d5d6d7de8(
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
)
.maxntid 256, 1, 1
{
// Virtual register pools; the back-end numbers registers per type.
.reg .pred %p<176>;
.reg .b16 %rs<129>;
.reg .b32 %r<238>;
.reg .f32 %f<393>;
.reg .b64 %rd<166>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// Pointer parameters kept live for the whole kernel.
ld.param.u64 %rd39, [triton__0d1d2d3d4d5d6d7de8_param_6];
ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6d7de8_param_5];
ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6d7de8_param_4];
ld.param.u64 %rd36, [triton__0d1d2d3d4d5d6d7de8_param_0];
$L__tmp0:
.loc 1 22 44
// %r1 = thread index, %r2 = thread index / 32 (warp index within the block).
mov.u32 %r1, %tid.x;
ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6d7de8_param_1];
shr.u32 %r2, %r1, 5;
ld.param.u64 %rd56, [triton__0d1d2d3d4d5d6d7de8_param_2];
.loc 1 24 33
// %r9 = tid & 255 is the first column handled by this thread; %r10 = %r9 | 256 is the
// second one (the OR equals +256 because bit 8 of %r9 is always clear).
and.b32 %r9, %r1, 255;
ld.param.u64 %rd57, [triton__0d1d2d3d4d5d6d7de8_param_3];
or.b32 %r10, %r9, 256;
.loc 1 21 28
mov.u32 %r3, %ctaid.x;
.loc 1 21 34
cvt.s64.s32 %rd1, %r3;
.loc 1 21 46
// %rd60 = block id * 8: index of the first of the eight runs owned by this block.
mul.wide.s32 %rd60, %r3, 8;
.loc 1 22 23
// %rd61 = %rd60 + 1 (OR is exact because %rd60 is a multiple of 8).
or.b64 %rd61, %rd60, 1;
cvt.u64.u32 %rd2, %r9;
cvt.u64.u32 %rd3, %r10;
.loc 1 26 30
// Byte offset of this block's eight 64-bit entries in the param_1 buffer, followed by
// the eight consecutive element addresses.
shl.b64 %rd62, %rd60, 3;
add.s64 %rd41, %rd59, %rd62;
add.s64 %rd43, %rd41, 8;
add.s64 %rd45, %rd41, 16;
add.s64 %rd47, %rd41, 24;
add.s64 %rd49, %rd41, 32;
add.s64 %rd51, %rd41, 40;
add.s64 %rd53, %rd41, 48;
add.s64 %rd55, %rd41, 56;
// %p1 is constant true, so the "predicated" loads below always execute.
mov.pred %p1, -1;
.loc 1 26 35
mov.u64 %rd40, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd41 + 0 ];
mov.u64 %rd42, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd43 + 0 ];
mov.u64 %rd44, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd45 + 0 ];
mov.u64 %rd46, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
mov.u64 %rd48, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd48 }, [ %rd49 + 0 ];
mov.u64 %rd50, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd50 }, [ %rd51 + 0 ];
mov.u64 %rd52, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd52 }, [ %rd53 + 0 ];
mov.u64 %rd54, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd54 }, [ %rd55 + 0 ];
.loc 1 27 19
// Single 32-bit values read through the param_2 and param_3 pointers.
mov.u32 %r7, 0x0;
@%p1 ld.global.b32 { %r7 }, [ %rd56 + 0 ];
.loc 1 29 19
mov.u32 %r8, 0x0;
@%p1 ld.global.b32 { %r8 }, [ %rd57 + 0 ];
.loc 1 36 46
// Element offsets of the first two runs: 402056 = 8 * 50257, so %rd4 = (block*8) * 50257
// and %rd5 = (block*8 + 1) * 50257.
mul.wide.s32 %rd4, %r3, 402056;
mul.lo.s64 %rd5, %rd61, 50257;
.loc 1 38 23
// One predicate per run: true when the corresponding 64-bit value equals -1.
setp.eq.s64 %p11, %rd40, -1;
setp.eq.s64 %p12, %rd42, -1;
setp.eq.s64 %p13, %rd44, -1;
setp.eq.s64 %p14, %rd46, -1;
setp.eq.s64 %p15, %rd48, -1;
setp.eq.s64 %p16, %rd50, -1;
setp.eq.s64 %p17, %rd52, -1;
setp.eq.s64 %p18, %rd54, -1;
.loc 1 39 22
// %f89 = (value at param_2) / (value at param_3), both treated as f32.
div.full.f32 %r6, %r7, %r8;
mov.b32 %f89, %r6;
.loc 1 41 37
// Per-run scale factors %f1 .. %f8: 0.0 where the run's 64-bit value is -1, else %f89.
selp.f32 %f8, 0f00000000, %f89, %p18;
selp.f32 %f7, 0f00000000, %f89, %p17;
selp.f32 %f6, 0f00000000, %f89, %p16;
selp.f32 %f5, 0f00000000, %f89, %p15;
selp.f32 %f4, 0f00000000, %f89, %p14;
selp.f32 %f3, 0f00000000, %f89, %p13;
selp.f32 %f2, 0f00000000, %f89, %p12;
selp.f32 %f1, 0f00000000, %f89, %p11;
// Loop state for the first column loop: %rd157 is the column base (steps of 512),
// %rd83 / %rd86 are the byte offsets (4 bytes per element) of the first two runs, and
// %f377 .. %f392 are sixteen accumulators (8 runs x 2 columns per thread), all zeroed.
mov.f32 %f377, 0f00000000;
mov.u64 %rd157, 0;
shl.b64 %rd83, %rd4, 2;
shl.b64 %rd86, %rd5, 2;
mov.f32 %f378, %f377;
mov.f32 %f379, %f377;
mov.f32 %f380, %f377;
mov.f32 %f381, %f377;
mov.f32 %f382, %f377;
mov.f32 %f383, %f377;
mov.f32 %f384, %f377;
mov.f32 %f385, %f377;
mov.f32 %f386, %f377;
mov.f32 %f387, %f377;
mov.f32 %f388, %f377;
mov.f32 %f389, %f377;
mov.f32 %f390, %f377;
mov.f32 %f391, %f377;
mov.f32 %f392, %f377;
// First column loop. Each iteration covers 512 columns (two per thread) of the eight
// runs owned by this block, multiplies the loaded 32-bit values by the per-run scale
// factor %f1 .. %f8 and adds them into the sixteen accumulators %f377 .. %f392.
$L__BB0_1:
.loc 1 33 27
// Column indices for this iteration. %rd157 is a multiple of 512 and %rd2 / %rd3 are
// below 512, so OR acts as addition.
or.b64 %rd79, %rd157, %rd2;
or.b64 %rd80, %rd157, %rd3;
.loc 1 34 25
// Bounds masks: a column is valid when its index is below 50257.
setp.lt.u64 %p22, %rd80, 50257;
setp.lt.u64 %p20, %rd79, 50257;
.loc 1 36 34
// Addresses into the param_0 buffer (4 bytes per element). %rd63/%rd64 address the
// first run, %rd65/%rd66 the second; the remaining runs are reached by adding multiples
// of 201028 = 4 * 50257 bytes to the second run's addresses.
shl.b64 %rd81, %rd79, 2;
add.s64 %rd82, %rd36, %rd81;
add.s64 %rd63, %rd82, %rd83;
shl.b64 %rd84, %rd80, 2;
add.s64 %rd85, %rd36, %rd84;
add.s64 %rd64, %rd85, %rd83;
add.s64 %rd65, %rd82, %rd86;
add.s64 %rd66, %rd85, %rd86;
add.s64 %rd67, %rd65, 201028;
add.s64 %rd68, %rd66, 201028;
add.s64 %rd69, %rd65, 402056;
add.s64 %rd70, %rd66, 402056;
add.s64 %rd71, %rd65, 603084;
add.s64 %rd72, %rd66, 603084;
add.s64 %rd73, %rd65, 804112;
add.s64 %rd74, %rd66, 804112;
add.s64 %rd75, %rd65, 1005140;
add.s64 %rd76, %rd66, 1005140;
add.s64 %rd77, %rd65, 1206168;
add.s64 %rd78, %rd66, 1206168;
// %r173 = 0 is the fallback value for masked-off loads. It is also read by the second
// column loop later in the kernel.
mov.b32 %r173, 0;
.loc 1 36 52
// Sixteen masked 32-bit loads (L1 evict_last hint); out-of-range lanes get 0.
mov.u32 %r11, 0x0;
@%p20 ld.global.L1::evict_last.b32 { %r11 }, [ %rd63 + 0 ];
@!%p20 mov.u32 %r11, %r173;
mov.u32 %r13, 0x0;
@%p22 ld.global.L1::evict_last.b32 { %r13 }, [ %rd64 + 0 ];
@!%p22 mov.u32 %r13, %r173;
mov.u32 %r15, 0x0;
@%p20 ld.global.L1::evict_last.b32 { %r15 }, [ %rd65 + 0 ];
@!%p20 mov.u32 %r15, %r173;
mov.u32 %r17, 0x0;
@%p22 ld.global.L1::evict_last.b32 { %r17 }, [ %rd66 + 0 ];
@!%p22 mov.u32 %r17, %r173;
mov.u32 %r19, 0x0;
@%p20 ld.global.L1::evict_last.b32 { %r19 }, [ %rd67 + 0 ];
@!%p20 mov.u32 %r19, %r173;
mov.u32 %r21, 0x0;
@%p22 ld.global.L1::evict_last.b32 { %r21 }, [ %rd68 + 0 ];
@!%p22 mov.u32 %r21, %r173;
mov.u32 %r23, 0x0;
@%p20 ld.global.L1::evict_last.b32 { %r23 }, [ %rd69 + 0 ];
@!%p20 mov.u32 %r23, %r173;
mov.u32 %r25, 0x0;
@%p22 ld.global.L1::evict_last.b32 { %r25 }, [ %rd70 + 0 ];
@!%p22 mov.u32 %r25, %r173;
mov.u32 %r27, 0x0;
@%p20 ld.global.L1::evict_last.b32 { %r27 }, [ %rd71 + 0 ];
@!%p20 mov.u32 %r27, %r173;
mov.u32 %r29, 0x0;
@%p22 ld.global.L1::evict_last.b32 { %r29 }, [ %rd72 + 0 ];
@!%p22 mov.u32 %r29, %r173;
mov.u32 %r31, 0x0;
@%p20 ld.global.L1::evict_last.b32 { %r31 }, [ %rd73 + 0 ];
@!%p20 mov.u32 %r31, %r173;
mov.u32 %r33, 0x0;
@%p22 ld.global.L1::evict_last.b32 { %r33 }, [ %rd74 + 0 ];
@!%p22 mov.u32 %r33, %r173;
mov.u32 %r35, 0x0;
@%p20 ld.global.L1::evict_last.b32 { %r35 }, [ %rd75 + 0 ];
@!%p20 mov.u32 %r35, %r173;
mov.u32 %r37, 0x0;
@%p22 ld.global.L1::evict_last.b32 { %r37 }, [ %rd76 + 0 ];
@!%p22 mov.u32 %r37, %r173;
mov.u32 %r39, 0x0;
@%p20 ld.global.L1::evict_last.b32 { %r39 }, [ %rd77 + 0 ];
@!%p20 mov.u32 %r39, %r173;
mov.u32 %r41, 0x0;
@%p22 ld.global.L1::evict_last.b32 { %r41 }, [ %rd78 + 0 ];
@!%p22 mov.u32 %r41, %r173;
// Reinterpret the loaded bit patterns as f32.
mov.b32 %f90, %r41;
mov.b32 %f91, %r39;
mov.b32 %f92, %r37;
mov.b32 %f93, %r35;
mov.b32 %f94, %r33;
mov.b32 %f95, %r31;
mov.b32 %f96, %r29;
mov.b32 %f97, %r27;
mov.b32 %f98, %r25;
mov.b32 %f99, %r23;
mov.b32 %f100, %r21;
mov.b32 %f101, %r19;
mov.b32 %f102, %r17;
mov.b32 %f103, %r15;
mov.b32 %f104, %r13;
mov.b32 %f105, %r11;
.loc 1 42 23
// Scale each value by the factor of the run it belongs to.
mul.f32 %f106, %f1, %f105;
mul.f32 %f107, %f1, %f104;
mul.f32 %f108, %f2, %f103;
mul.f32 %f109, %f2, %f102;
mul.f32 %f110, %f3, %f101;
mul.f32 %f111, %f3, %f100;
mul.f32 %f112, %f4, %f99;
mul.f32 %f113, %f4, %f98;
mul.f32 %f114, %f5, %f97;
mul.f32 %f115, %f5, %f96;
mul.f32 %f116, %f6, %f95;
mul.f32 %f117, %f6, %f94;
mul.f32 %f118, %f7, %f93;
mul.f32 %f119, %f7, %f92;
mul.f32 %f120, %f8, %f91;
mul.f32 %f121, %f8, %f90;
.loc 1 45 40
// Masked accumulate: out-of-range lanes contribute -0.0 (0f80000000), which leaves the
// running sum unchanged when added.
selp.f32 %f122, %f121, 0f80000000, %p22;
selp.f32 %f123, %f120, 0f80000000, %p20;
selp.f32 %f124, %f119, 0f80000000, %p22;
selp.f32 %f125, %f118, 0f80000000, %p20;
selp.f32 %f126, %f117, 0f80000000, %p22;
selp.f32 %f127, %f116, 0f80000000, %p20;
selp.f32 %f128, %f115, 0f80000000, %p22;
selp.f32 %f129, %f114, 0f80000000, %p20;
selp.f32 %f130, %f113, 0f80000000, %p22;
selp.f32 %f131, %f112, 0f80000000, %p20;
selp.f32 %f132, %f111, 0f80000000, %p22;
selp.f32 %f133, %f110, 0f80000000, %p20;
selp.f32 %f134, %f109, 0f80000000, %p22;
selp.f32 %f135, %f108, 0f80000000, %p20;
selp.f32 %f136, %f107, 0f80000000, %p22;
selp.f32 %f137, %f106, 0f80000000, %p20;
add.f32 %f377, %f377, %f137;
add.f32 %f378, %f378, %f136;
add.f32 %f379, %f379, %f135;
add.f32 %f380, %f380, %f134;
add.f32 %f381, %f381, %f133;
add.f32 %f382, %f382, %f132;
add.f32 %f383, %f383, %f131;
add.f32 %f384, %f384, %f130;
add.f32 %f385, %f385, %f129;
add.f32 %f386, %f386, %f128;
add.f32 %f387, %f387, %f127;
add.f32 %f388, %f388, %f126;
add.f32 %f389, %f389, %f125;
add.f32 %f390, %f390, %f124;
add.f32 %f391, %f391, %f123;
add.f32 %f392, %f392, %f122;
.loc 1 32 36
// Advance the column base by 512. %r44 is the base just processed; the loop repeats
// while it is below 49745, i.e. while the next base is still below 50257.
add.s64 %rd157, %rd157, 512;
cvt.u32.u64 %r43, %rd157;
add.s32 %r44, %r43, -512;
setp.lt.u32 %p51, %r44, 49745;
@%p51 bra $L__BB0_1;
// Block-wide sum of the per-thread accumulators, one total per run (eight totals).
// Stage 1: add each thread's two column accumulators per run.
// Stage 2: butterfly shuffles (offsets 16, 8, 4, 2, 1) combine the 32 lanes of a warp.
// Stage 3: lane 0 of every warp writes its eight partial sums to shared memory, the
//          first 64 threads combine the eight warps' values per run, and every thread
//          reads the eight totals back into %f57 .. %f64.
.loc 1 22 44
// %r65 = lane index within the warp.
and.b32 %r65, %r1, 31;
.loc 1 24 33
// %r66 = warp index within the block (0..7).
and.b32 %r66, %r2, 7;
$L__tmp1:
.loc 2 233 15
// Stage 1: per-thread partial sum for each of the eight runs.
add.f32 %f138, %f377, %f378;
add.f32 %f139, %f379, %f380;
add.f32 %f140, %f381, %f382;
add.f32 %f141, %f383, %f384;
add.f32 %f142, %f385, %f386;
add.f32 %f143, %f387, %f388;
add.f32 %f144, %f389, %f390;
add.f32 %f145, %f391, %f392;
$L__tmp2:
.loc 2 243 36
// Stage 2, run 0: warp butterfly reduction of %f138 -> %f155.
mov.b32 %r67, %f138;
shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1;
mov.b32 %f146, %r68;
$L__tmp3:
.loc 2 233 15
add.f32 %f147, %f138, %f146;
$L__tmp4:
.loc 2 243 36
mov.b32 %r69, %f147;
shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1;
mov.b32 %f148, %r70;
$L__tmp5:
.loc 2 233 15
add.f32 %f149, %f147, %f148;
$L__tmp6:
.loc 2 243 36
mov.b32 %r71, %f149;
shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1;
mov.b32 %f150, %r72;
$L__tmp7:
.loc 2 233 15
add.f32 %f151, %f149, %f150;
$L__tmp8:
.loc 2 243 36
mov.b32 %r73, %f151;
shfl.sync.bfly.b32 %r74, %r73, 2, 31, -1;
mov.b32 %f152, %r74;
$L__tmp9:
.loc 2 233 15
add.f32 %f153, %f151, %f152;
$L__tmp10:
.loc 2 243 36
mov.b32 %r75, %f153;
shfl.sync.bfly.b32 %r76, %r75, 1, 31, -1;
mov.b32 %f154, %r76;
$L__tmp11:
.loc 2 233 15
add.f32 %f155, %f153, %f154;
$L__tmp12:
.loc 2 243 36
// Stage 2, run 1: warp butterfly reduction of %f139 -> %f165.
mov.b32 %r77, %f139;
shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1;
mov.b32 %f156, %r78;
$L__tmp13:
.loc 2 233 15
add.f32 %f157, %f139, %f156;
$L__tmp14:
.loc 2 243 36
mov.b32 %r79, %f157;
shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1;
mov.b32 %f158, %r80;
$L__tmp15:
.loc 2 233 15
add.f32 %f159, %f157, %f158;
$L__tmp16:
.loc 2 243 36
mov.b32 %r81, %f159;
shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1;
mov.b32 %f160, %r82;
$L__tmp17:
.loc 2 233 15
add.f32 %f161, %f159, %f160;
$L__tmp18:
.loc 2 243 36
mov.b32 %r83, %f161;
shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1;
mov.b32 %f162, %r84;
$L__tmp19:
.loc 2 233 15
add.f32 %f163, %f161, %f162;
$L__tmp20:
.loc 2 243 36
mov.b32 %r85, %f163;
shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1;
mov.b32 %f164, %r86;
$L__tmp21:
.loc 2 233 15
add.f32 %f165, %f163, %f164;
$L__tmp22:
.loc 2 243 36
// Stage 2, run 2: warp butterfly reduction of %f140 -> %f175.
mov.b32 %r87, %f140;
shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1;
mov.b32 %f166, %r88;
$L__tmp23:
.loc 2 233 15
add.f32 %f167, %f140, %f166;
$L__tmp24:
.loc 2 243 36
mov.b32 %r89, %f167;
shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1;
mov.b32 %f168, %r90;
$L__tmp25:
.loc 2 233 15
add.f32 %f169, %f167, %f168;
$L__tmp26:
.loc 2 243 36
mov.b32 %r91, %f169;
shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1;
mov.b32 %f170, %r92;
$L__tmp27:
.loc 2 233 15
add.f32 %f171, %f169, %f170;
$L__tmp28:
.loc 2 243 36
mov.b32 %r93, %f171;
shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
mov.b32 %f172, %r94;
$L__tmp29:
.loc 2 233 15
add.f32 %f173, %f171, %f172;
$L__tmp30:
.loc 2 243 36
mov.b32 %r95, %f173;
shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
mov.b32 %f174, %r96;
$L__tmp31:
.loc 2 233 15
add.f32 %f175, %f173, %f174;
$L__tmp32:
.loc 2 243 36
// Stage 2, run 3: warp butterfly reduction of %f141 -> %f185.
mov.b32 %r97, %f141;
shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
mov.b32 %f176, %r98;
$L__tmp33:
.loc 2 233 15
add.f32 %f177, %f141, %f176;
$L__tmp34:
.loc 2 243 36
mov.b32 %r99, %f177;
shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
mov.b32 %f178, %r100;
$L__tmp35:
.loc 2 233 15
add.f32 %f179, %f177, %f178;
$L__tmp36:
.loc 2 243 36
mov.b32 %r101, %f179;
shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
mov.b32 %f180, %r102;
$L__tmp37:
.loc 2 233 15
add.f32 %f181, %f179, %f180;
$L__tmp38:
.loc 2 243 36
mov.b32 %r103, %f181;
shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
mov.b32 %f182, %r104;
$L__tmp39:
.loc 2 233 15
add.f32 %f183, %f181, %f182;
$L__tmp40:
.loc 2 243 36
mov.b32 %r105, %f183;
shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
mov.b32 %f184, %r106;
$L__tmp41:
.loc 2 233 15
add.f32 %f185, %f183, %f184;
$L__tmp42:
.loc 2 243 36
// Stage 2, run 4: warp butterfly reduction of %f142 -> %f195.
mov.b32 %r107, %f142;
shfl.sync.bfly.b32 %r108, %r107, 16, 31, -1;
mov.b32 %f186, %r108;
$L__tmp43:
.loc 2 233 15
add.f32 %f187, %f142, %f186;
$L__tmp44:
.loc 2 243 36
mov.b32 %r109, %f187;
shfl.sync.bfly.b32 %r110, %r109, 8, 31, -1;
mov.b32 %f188, %r110;
$L__tmp45:
.loc 2 233 15
add.f32 %f189, %f187, %f188;
$L__tmp46:
.loc 2 243 36
mov.b32 %r111, %f189;
shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1;
mov.b32 %f190, %r112;
$L__tmp47:
.loc 2 233 15
add.f32 %f191, %f189, %f190;
$L__tmp48:
.loc 2 243 36
mov.b32 %r113, %f191;
shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1;
mov.b32 %f192, %r114;
$L__tmp49:
.loc 2 233 15
add.f32 %f193, %f191, %f192;
$L__tmp50:
.loc 2 243 36
mov.b32 %r115, %f193;
shfl.sync.bfly.b32 %r116, %r115, 1, 31, -1;
mov.b32 %f194, %r116;
$L__tmp51:
.loc 2 233 15
add.f32 %f195, %f193, %f194;
$L__tmp52:
.loc 2 243 36
// Stage 2, run 5: warp butterfly reduction of %f143 -> %f205.
mov.b32 %r117, %f143;
shfl.sync.bfly.b32 %r118, %r117, 16, 31, -1;
mov.b32 %f196, %r118;
$L__tmp53:
.loc 2 233 15
add.f32 %f197, %f143, %f196;
$L__tmp54:
.loc 2 243 36
mov.b32 %r119, %f197;
shfl.sync.bfly.b32 %r120, %r119, 8, 31, -1;
mov.b32 %f198, %r120;
$L__tmp55:
.loc 2 233 15
add.f32 %f199, %f197, %f198;
$L__tmp56:
.loc 2 243 36
mov.b32 %r121, %f199;
shfl.sync.bfly.b32 %r122, %r121, 4, 31, -1;
mov.b32 %f200, %r122;
$L__tmp57:
.loc 2 233 15
add.f32 %f201, %f199, %f200;
$L__tmp58:
.loc 2 243 36
mov.b32 %r123, %f201;
shfl.sync.bfly.b32 %r124, %r123, 2, 31, -1;
mov.b32 %f202, %r124;
$L__tmp59:
.loc 2 233 15
add.f32 %f203, %f201, %f202;
$L__tmp60:
.loc 2 243 36
mov.b32 %r125, %f203;
shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1;
mov.b32 %f204, %r126;
$L__tmp61:
.loc 2 233 15
add.f32 %f205, %f203, %f204;
$L__tmp62:
.loc 2 243 36
// Stage 2, run 6: warp butterfly reduction of %f144 -> %f215.
mov.b32 %r127, %f144;
shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1;
mov.b32 %f206, %r128;
$L__tmp63:
.loc 2 233 15
add.f32 %f207, %f144, %f206;
$L__tmp64:
.loc 2 243 36
mov.b32 %r129, %f207;
shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1;
mov.b32 %f208, %r130;
$L__tmp65:
.loc 2 233 15
add.f32 %f209, %f207, %f208;
$L__tmp66:
.loc 2 243 36
mov.b32 %r131, %f209;
shfl.sync.bfly.b32 %r132, %r131, 4, 31, -1;
mov.b32 %f210, %r132;
$L__tmp67:
.loc 2 233 15
add.f32 %f211, %f209, %f210;
$L__tmp68:
.loc 2 243 36
mov.b32 %r133, %f211;
shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1;
mov.b32 %f212, %r134;
$L__tmp69:
.loc 2 233 15
add.f32 %f213, %f211, %f212;
$L__tmp70:
.loc 2 243 36
mov.b32 %r135, %f213;
shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1;
mov.b32 %f214, %r136;
$L__tmp71:
.loc 2 233 15
add.f32 %f215, %f213, %f214;
$L__tmp72:
.loc 2 243 36
// Stage 2, run 7: warp butterfly reduction of %f145 -> %f225.
mov.b32 %r137, %f145;
shfl.sync.bfly.b32 %r138, %r137, 16, 31, -1;
mov.b32 %f216, %r138;
$L__tmp73:
.loc 2 233 15
add.f32 %f217, %f145, %f216;
$L__tmp74:
.loc 2 243 36
mov.b32 %r139, %f217;
shfl.sync.bfly.b32 %r140, %r139, 8, 31, -1;
mov.b32 %f218, %r140;
$L__tmp75:
.loc 2 233 15
add.f32 %f219, %f217, %f218;
$L__tmp76:
.loc 2 243 36
mov.b32 %r141, %f219;
shfl.sync.bfly.b32 %r142, %r141, 4, 31, -1;
mov.b32 %f220, %r142;
$L__tmp77:
.loc 2 233 15
add.f32 %f221, %f219, %f220;
$L__tmp78:
.loc 2 243 36
mov.b32 %r143, %f221;
shfl.sync.bfly.b32 %r144, %r143, 2, 31, -1;
mov.b32 %f222, %r144;
$L__tmp79:
.loc 2 233 15
add.f32 %f223, %f221, %f222;
$L__tmp80:
.loc 2 243 36
mov.b32 %r145, %f223;
shfl.sync.bfly.b32 %r146, %r145, 1, 31, -1;
mov.b32 %f224, %r146;
$L__tmp81:
.loc 2 233 15
add.f32 %f225, %f223, %f224;
$L__tmp82:
.loc 2 243 36
// Stage 3a: lane 0 of each warp (%p52) stores its eight warp totals. Slot address is
// global_smem + warp*4 + run*32, so each run owns eight consecutive 32-bit slots, one
// per warp.
setp.eq.s32 %p52, %r65, 0;
shl.b32 %r147, %r66, 2;
mov.u32 %r148, global_smem;
add.s32 %r45, %r148, %r147;
mov.b32 %r46, %f155;
@%p52 st.shared.b32 [ %r45 + 0 ], %r46;
add.s32 %r47, %r45, 32;
mov.b32 %r48, %f165;
@%p52 st.shared.b32 [ %r47 + 0 ], %r48;
add.s32 %r49, %r45, 64;
mov.b32 %r50, %f175;
@%p52 st.shared.b32 [ %r49 + 0 ], %r50;
add.s32 %r51, %r45, 96;
mov.b32 %r52, %f185;
@%p52 st.shared.b32 [ %r51 + 0 ], %r52;
add.s32 %r53, %r45, 128;
mov.b32 %r54, %f195;
@%p52 st.shared.b32 [ %r53 + 0 ], %r54;
add.s32 %r55, %r45, 160;
mov.b32 %r56, %f205;
@%p52 st.shared.b32 [ %r55 + 0 ], %r56;
add.s32 %r57, %r45, 192;
mov.b32 %r58, %f215;
@%p52 st.shared.b32 [ %r57 + 0 ], %r58;
add.s32 %r59, %r45, 224;
mov.b32 %r60, %f225;
@%p52 st.shared.b32 [ %r59 + 0 ], %r60;
// Make the per-warp partial sums visible to the whole block.
bar.sync 0;
// Stage 3b: threads 0..63 (%p60) each pick up one of the 64 slots. %r61 is written only
// when %p60 holds; the shuffles below still execute in every thread, but only threads
// satisfying %p61 store a result.
setp.lt.s32 %p60, %r1, 64;
shl.b32 %r149, %r1, 2;
add.s32 %r62, %r148, %r149;
@%p60 ld.shared.b32 %r61, [ %r62 + 0 ];
mov.b32 %f226, %r61;
// Butterfly offsets 4, 2, 1 combine each group of eight neighbouring lanes, i.e. the
// eight warps' values for one run.
shfl.sync.bfly.b32 %r150, %r61, 4, 31, -1;
mov.b32 %f227, %r150;
$L__tmp83:
.loc 2 233 15
add.f32 %f228, %f226, %f227;
$L__tmp84:
.loc 2 243 36
mov.b32 %r151, %f228;
shfl.sync.bfly.b32 %r152, %r151, 2, 31, -1;
mov.b32 %f229, %r152;
$L__tmp85:
.loc 2 233 15
add.f32 %f230, %f228, %f229;
$L__tmp86:
.loc 2 243 36
mov.b32 %r153, %f230;
shfl.sync.bfly.b32 %r154, %r153, 1, 31, -1;
mov.b32 %f231, %r154;
$L__tmp87:
.loc 2 233 15
add.f32 %f232, %f230, %f231;
$L__tmp88:
.loc 2 243 36
// %p61 = (tid < 64) && (tid % 8 == 0): the first thread of each group of eight writes
// the run total back to its own slot, which is global_smem + run*32.
and.b32 %r155, %r1, 7;
setp.eq.s32 %p62, %r155, 0;
and.pred %p61, %p60, %p62;
mov.b32 %r64, %f232;
@%p61 st.shared.b32 [ %r62 + 0 ], %r64;
bar.sync 0;
// Stage 3c: every thread reads the eight run totals (%f57 = run 0 .. %f64 = run 7).
ld.shared.f32 %f57, [global_smem];
ld.shared.f32 %f58, [global_smem+32];
ld.shared.f32 %f59, [global_smem+64];
ld.shared.f32 %f60, [global_smem+96];
ld.shared.f32 %f61, [global_smem+128];
ld.shared.f32 %f62, [global_smem+160];
ld.shared.f32 %f63, [global_smem+192];
ld.shared.f32 %f64, [global_smem+224];
// Second column loop and kernel exit. For every run k (0..7) owned by this block and
// every valid column c, the loop computes in f32
//     result = a + (scale_k * x - total_k * exp(b))
// and stores it rounded to bf16, where
//     a        = bf16 value read from the param_4 buffer,
//     x        = 32-bit value read from the param_0 buffer,
//     b        = bf16 value read from the param_5 buffer,
//     scale_k  = %f1 .. %f8 (set up in the prologue),
//     total_k  = %f57 .. %f64 (block-wide sums produced by the reduction),
// with the destination in the param_6 buffer.
$L__tmp89:
.loc 1 51 36
// Loop-invariant setup. bf16 buffers use 2 bytes per element and the param_0 buffer 4:
//   %rd10 = block * 804112  (= block*8 * 50257 * 2) byte offset of the block's first run
//   %rd89 = block * 1608224 (= block*8 * 50257 * 4) same offset for 4-byte elements
// %rd158 .. %rd164 are running pointers that already include the thread's column offset
// and are advanced at the bottom of the loop.
mul.lo.s64 %rd10, %rd1, 804112;
shl.b64 %rd88, %rd3, 1;
add.s64 %rd164, %rd39, %rd88;
add.s64 %rd163, %rd38, %rd88;
shl.b64 %rd13, %rd3, 2;
mul.lo.s64 %rd89, %rd1, 1608224;
add.s64 %rd162, %rd36, %rd89;
add.s64 %rd161, %rd37, %rd88;
shl.b64 %rd90, %rd2, 1;
add.s64 %rd160, %rd39, %rd90;
add.s64 %rd159, %rd38, %rd90;
shl.b64 %rd18, %rd2, 2;
add.s64 %rd158, %rd37, %rd90;
// %rd165 = column base (steps of 512); %rs2 = 0 is the fallback for masked 16-bit loads.
mov.u64 %rd165, 0;
mov.u16 %rs2, 0;
$L__BB0_3:
.loc 1 52 27
// Column indices for this iteration and their bounds masks (index < 50257).
add.s64 %rd155, %rd2, %rd165;
.loc 1 53 25
add.s64 %rd156, %rd3, %rd165;
setp.lt.u64 %p63, %rd155, 50257;
setp.lt.u64 %p65, %rd156, 50257;
.loc 1 55 35
// Addresses in the param_4 buffer: consecutive runs are 100514 = 2 * 50257 bytes apart.
add.s64 %rd91, %rd158, %rd10;
add.s64 %rd92, %rd161, %rd10;
add.s64 %rd93, %rd91, 100514;
add.s64 %rd94, %rd92, 100514;
add.s64 %rd95, %rd91, 201028;
add.s64 %rd96, %rd92, 201028;
add.s64 %rd97, %rd91, 301542;
add.s64 %rd98, %rd92, 301542;
add.s64 %rd99, %rd91, 402056;
add.s64 %rd100, %rd92, 402056;
add.s64 %rd101, %rd91, 502570;
add.s64 %rd102, %rd92, 502570;
add.s64 %rd103, %rd91, 603084;
add.s64 %rd104, %rd92, 603084;
add.s64 %rd105, %rd91, 703598;
.loc 1 55 53
add.s64 %rd106, %rd92, 703598;
// Sixteen masked 16-bit loads (L1 evict_first hint); out-of-range lanes get 0.
mov.u16 %rs1, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd91 + 0 ];
@!%p63 mov.u16 %rs1, %rs2;
mov.u16 %rs3, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd92 + 0 ];
@!%p65 mov.u16 %rs3, %rs2;
mov.u16 %rs5, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd93 + 0 ];
@!%p63 mov.u16 %rs5, %rs2;
mov.u16 %rs7, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd94 + 0 ];
@!%p65 mov.u16 %rs7, %rs2;
mov.u16 %rs9, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd95 + 0 ];
@!%p63 mov.u16 %rs9, %rs2;
mov.u16 %rs11, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd96 + 0 ];
@!%p65 mov.u16 %rs11, %rs2;
mov.u16 %rs13, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd97 + 0 ];
@!%p63 mov.u16 %rs13, %rs2;
mov.u16 %rs15, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd98 + 0 ];
@!%p65 mov.u16 %rs15, %rs2;
mov.u16 %rs17, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd99 + 0 ];
@!%p63 mov.u16 %rs17, %rs2;
mov.u16 %rs19, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd100 + 0 ];
@!%p65 mov.u16 %rs19, %rs2;
mov.u16 %rs21, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs21 }, [ %rd101 + 0 ];
@!%p63 mov.u16 %rs21, %rs2;
mov.u16 %rs23, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs23 }, [ %rd102 + 0 ];
@!%p65 mov.u16 %rs23, %rs2;
mov.u16 %rs25, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd103 + 0 ];
@!%p63 mov.u16 %rs25, %rs2;
mov.u16 %rs27, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd104 + 0 ];
@!%p65 mov.u16 %rs27, %rs2;
mov.u16 %rs29, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd105 + 0 ];
@!%p63 mov.u16 %rs29, %rs2;
mov.u16 %rs31, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd106 + 0 ];
@!%p65 mov.u16 %rs31, %rs2;
.loc 1 55 105
// Widen the bf16 values to f32 (%f265 .. %f280).
cvt.f32.bf16 %r156, %rs1;
mov.b32 %f265, %r156;
cvt.f32.bf16 %r157, %rs3;
mov.b32 %f266, %r157;
cvt.f32.bf16 %r158, %rs5;
mov.b32 %f267, %r158;
cvt.f32.bf16 %r159, %rs7;
mov.b32 %f268, %r159;
cvt.f32.bf16 %r160, %rs9;
mov.b32 %f269, %r160;
cvt.f32.bf16 %r161, %rs11;
mov.b32 %f270, %r161;
cvt.f32.bf16 %r162, %rs13;
mov.b32 %f271, %r162;
cvt.f32.bf16 %r163, %rs15;
mov.b32 %f272, %r163;
cvt.f32.bf16 %r164, %rs17;
mov.b32 %f273, %r164;
cvt.f32.bf16 %r165, %rs19;
mov.b32 %f274, %r165;
cvt.f32.bf16 %r166, %rs21;
mov.b32 %f275, %r166;
cvt.f32.bf16 %r167, %rs23;
mov.b32 %f276, %r167;
cvt.f32.bf16 %r168, %rs25;
mov.b32 %f277, %r168;
cvt.f32.bf16 %r169, %rs27;
mov.b32 %f278, %r169;
cvt.f32.bf16 %r170, %rs29;
mov.b32 %f279, %r170;
cvt.f32.bf16 %r171, %rs31;
mov.b32 %f280, %r171;
.loc 1 56 35
// Addresses in the param_0 buffer: consecutive runs are 201028 = 4 * 50257 bytes apart.
add.s64 %rd107, %rd162, %rd18;
add.s64 %rd108, %rd162, %rd13;
add.s64 %rd109, %rd107, 201028;
add.s64 %rd110, %rd108, 201028;
add.s64 %rd111, %rd107, 402056;
add.s64 %rd112, %rd108, 402056;
add.s64 %rd113, %rd107, 603084;
add.s64 %rd114, %rd108, 603084;
add.s64 %rd115, %rd107, 804112;
add.s64 %rd116, %rd108, 804112;
add.s64 %rd117, %rd107, 1005140;
add.s64 %rd118, %rd108, 1005140;
add.s64 %rd119, %rd107, 1206168;
add.s64 %rd120, %rd108, 1206168;
add.s64 %rd121, %rd107, 1407196;
.loc 1 56 53
add.s64 %rd122, %rd108, 1407196;
// Sixteen masked 32-bit loads (%f281 .. %f296). The fallback register %r173 holds 0; it
// is assigned in the first column loop, which always executes before this point.
mov.u32 %r172, 0x0;
@%p63 ld.global.L1::evict_first.b32 { %r172 }, [ %rd107 + 0 ];
@!%p63 mov.u32 %r172, %r173;
mov.b32 %f281, %r172;
mov.u32 %r174, 0x0;
@%p65 ld.global.L1::evict_first.b32 { %r174 }, [ %rd108 + 0 ];
@!%p65 mov.u32 %r174, %r173;
mov.b32 %f282, %r174;
mov.u32 %r176, 0x0;
@%p63 ld.global.L1::evict_first.b32 { %r176 }, [ %rd109 + 0 ];
@!%p63 mov.u32 %r176, %r173;
mov.b32 %f283, %r176;
mov.u32 %r178, 0x0;
@%p65 ld.global.L1::evict_first.b32 { %r178 }, [ %rd110 + 0 ];
@!%p65 mov.u32 %r178, %r173;
mov.b32 %f284, %r178;
mov.u32 %r180, 0x0;
@%p63 ld.global.L1::evict_first.b32 { %r180 }, [ %rd111 + 0 ];
@!%p63 mov.u32 %r180, %r173;
mov.b32 %f285, %r180;
mov.u32 %r182, 0x0;
@%p65 ld.global.L1::evict_first.b32 { %r182 }, [ %rd112 + 0 ];
@!%p65 mov.u32 %r182, %r173;
mov.b32 %f286, %r182;
mov.u32 %r184, 0x0;
@%p63 ld.global.L1::evict_first.b32 { %r184 }, [ %rd113 + 0 ];
@!%p63 mov.u32 %r184, %r173;
mov.b32 %f287, %r184;
mov.u32 %r186, 0x0;
@%p65 ld.global.L1::evict_first.b32 { %r186 }, [ %rd114 + 0 ];
@!%p65 mov.u32 %r186, %r173;
mov.b32 %f288, %r186;
mov.u32 %r188, 0x0;
@%p63 ld.global.L1::evict_first.b32 { %r188 }, [ %rd115 + 0 ];
@!%p63 mov.u32 %r188, %r173;
mov.b32 %f289, %r188;
mov.u32 %r190, 0x0;
@%p65 ld.global.L1::evict_first.b32 { %r190 }, [ %rd116 + 0 ];
@!%p65 mov.u32 %r190, %r173;
mov.b32 %f290, %r190;
mov.u32 %r192, 0x0;
@%p63 ld.global.L1::evict_first.b32 { %r192 }, [ %rd117 + 0 ];
@!%p63 mov.u32 %r192, %r173;
mov.b32 %f291, %r192;
mov.u32 %r194, 0x0;
@%p65 ld.global.L1::evict_first.b32 { %r194 }, [ %rd118 + 0 ];
@!%p65 mov.u32 %r194, %r173;
mov.b32 %f292, %r194;
mov.u32 %r196, 0x0;
@%p63 ld.global.L1::evict_first.b32 { %r196 }, [ %rd119 + 0 ];
@!%p63 mov.u32 %r196, %r173;
mov.b32 %f293, %r196;
mov.u32 %r198, 0x0;
@%p65 ld.global.L1::evict_first.b32 { %r198 }, [ %rd120 + 0 ];
@!%p65 mov.u32 %r198, %r173;
mov.b32 %f294, %r198;
mov.u32 %r200, 0x0;
@%p63 ld.global.L1::evict_first.b32 { %r200 }, [ %rd121 + 0 ];
@!%p63 mov.u32 %r200, %r173;
mov.b32 %f295, %r200;
mov.u32 %r202, 0x0;
@%p65 ld.global.L1::evict_first.b32 { %r202 }, [ %rd122 + 0 ];
@!%p65 mov.u32 %r202, %r173;
mov.b32 %f296, %r202;
.loc 1 57 35
// Addresses in the param_5 buffer (bf16, runs 100514 bytes apart).
add.s64 %rd123, %rd159, %rd10;
add.s64 %rd124, %rd163, %rd10;
add.s64 %rd125, %rd123, 100514;
add.s64 %rd126, %rd124, 100514;
add.s64 %rd127, %rd123, 201028;
add.s64 %rd128, %rd124, 201028;
add.s64 %rd129, %rd123, 301542;
add.s64 %rd130, %rd124, 301542;
add.s64 %rd131, %rd123, 402056;
add.s64 %rd132, %rd124, 402056;
add.s64 %rd133, %rd123, 502570;
add.s64 %rd134, %rd124, 502570;
add.s64 %rd135, %rd123, 603084;
add.s64 %rd136, %rd124, 603084;
add.s64 %rd137, %rd123, 703598;
.loc 1 57 53
add.s64 %rd138, %rd124, 703598;
// Sixteen masked 16-bit loads; out-of-range lanes get 0.
mov.u16 %rs49, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs49 }, [ %rd123 + 0 ];
@!%p63 mov.u16 %rs49, %rs2;
mov.u16 %rs51, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs51 }, [ %rd124 + 0 ];
@!%p65 mov.u16 %rs51, %rs2;
mov.u16 %rs53, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs53 }, [ %rd125 + 0 ];
@!%p63 mov.u16 %rs53, %rs2;
mov.u16 %rs55, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs55 }, [ %rd126 + 0 ];
@!%p65 mov.u16 %rs55, %rs2;
mov.u16 %rs57, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs57 }, [ %rd127 + 0 ];
@!%p63 mov.u16 %rs57, %rs2;
mov.u16 %rs59, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs59 }, [ %rd128 + 0 ];
@!%p65 mov.u16 %rs59, %rs2;
mov.u16 %rs61, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs61 }, [ %rd129 + 0 ];
@!%p63 mov.u16 %rs61, %rs2;
mov.u16 %rs63, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs63 }, [ %rd130 + 0 ];
@!%p65 mov.u16 %rs63, %rs2;
mov.u16 %rs65, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs65 }, [ %rd131 + 0 ];
@!%p63 mov.u16 %rs65, %rs2;
mov.u16 %rs67, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs67 }, [ %rd132 + 0 ];
@!%p65 mov.u16 %rs67, %rs2;
mov.u16 %rs69, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs69 }, [ %rd133 + 0 ];
@!%p63 mov.u16 %rs69, %rs2;
mov.u16 %rs71, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs71 }, [ %rd134 + 0 ];
@!%p65 mov.u16 %rs71, %rs2;
mov.u16 %rs73, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs73 }, [ %rd135 + 0 ];
@!%p63 mov.u16 %rs73, %rs2;
mov.u16 %rs75, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs75 }, [ %rd136 + 0 ];
@!%p65 mov.u16 %rs75, %rs2;
mov.u16 %rs77, 0x0;
@%p63 ld.global.L1::evict_first.b16 { %rs77 }, [ %rd137 + 0 ];
@!%p63 mov.u16 %rs77, %rs2;
mov.u16 %rs79, 0x0;
@%p65 ld.global.L1::evict_first.b16 { %rs79 }, [ %rd138 + 0 ];
@!%p65 mov.u16 %rs79, %rs2;
.loc 1 57 105
// Widen to f32 (%f297 .. %f312).
cvt.f32.bf16 %r204, %rs49;
mov.b32 %f297, %r204;
cvt.f32.bf16 %r205, %rs51;
mov.b32 %f298, %r205;
cvt.f32.bf16 %r206, %rs53;
mov.b32 %f299, %r206;
cvt.f32.bf16 %r207, %rs55;
mov.b32 %f300, %r207;
cvt.f32.bf16 %r208, %rs57;
mov.b32 %f301, %r208;
cvt.f32.bf16 %r209, %rs59;
mov.b32 %f302, %r209;
cvt.f32.bf16 %r210, %rs61;
mov.b32 %f303, %r210;
cvt.f32.bf16 %r211, %rs63;
mov.b32 %f304, %r211;
cvt.f32.bf16 %r212, %rs65;
mov.b32 %f305, %r212;
cvt.f32.bf16 %r213, %rs67;
mov.b32 %f306, %r213;
cvt.f32.bf16 %r214, %rs69;
mov.b32 %f307, %r214;
cvt.f32.bf16 %r215, %rs71;
mov.b32 %f308, %r215;
cvt.f32.bf16 %r216, %rs73;
mov.b32 %f309, %r216;
cvt.f32.bf16 %r217, %rs75;
mov.b32 %f310, %r217;
cvt.f32.bf16 %r218, %rs77;
mov.b32 %f311, %r218;
cvt.f32.bf16 %r219, %rs79;
mov.b32 %f312, %r219;
.loc 1 65 23
// exp(b): 0f3FB8AA3B is the f32 bit pattern of about 1.442695 (log2 of e), so the
// base-2 exponential of the product approximates e^b.
mul.f32 %f234, %f297, 0f3FB8AA3B;
ex2.approx.f32 %f233, %f234;
mul.f32 %f236, %f298, 0f3FB8AA3B;
ex2.approx.f32 %f235, %f236;
mul.f32 %f238, %f299, 0f3FB8AA3B;
ex2.approx.f32 %f237, %f238;
mul.f32 %f240, %f300, 0f3FB8AA3B;
ex2.approx.f32 %f239, %f240;
mul.f32 %f242, %f301, 0f3FB8AA3B;
ex2.approx.f32 %f241, %f242;
mul.f32 %f244, %f302, 0f3FB8AA3B;
ex2.approx.f32 %f243, %f244;
mul.f32 %f246, %f303, 0f3FB8AA3B;
ex2.approx.f32 %f245, %f246;
mul.f32 %f248, %f304, 0f3FB8AA3B;
ex2.approx.f32 %f247, %f248;
mul.f32 %f250, %f305, 0f3FB8AA3B;
ex2.approx.f32 %f249, %f250;
mul.f32 %f252, %f306, 0f3FB8AA3B;
ex2.approx.f32 %f251, %f252;
mul.f32 %f254, %f307, 0f3FB8AA3B;
ex2.approx.f32 %f253, %f254;
mul.f32 %f256, %f308, 0f3FB8AA3B;
ex2.approx.f32 %f255, %f256;
mul.f32 %f258, %f309, 0f3FB8AA3B;
ex2.approx.f32 %f257, %f258;
mul.f32 %f260, %f310, 0f3FB8AA3B;
ex2.approx.f32 %f259, %f260;
mul.f32 %f262, %f311, 0f3FB8AA3B;
ex2.approx.f32 %f261, %f262;
mul.f32 %f264, %f312, 0f3FB8AA3B;
ex2.approx.f32 %f263, %f264;
.loc 1 66 24
// total_k * exp(b), using the run totals %f57 .. %f64.
mul.f32 %f313, %f57, %f233;
mul.f32 %f314, %f57, %f235;
mul.f32 %f315, %f58, %f237;
mul.f32 %f316, %f58, %f239;
mul.f32 %f317, %f59, %f241;
mul.f32 %f318, %f59, %f243;
mul.f32 %f319, %f60, %f245;
mul.f32 %f320, %f60, %f247;
mul.f32 %f321, %f61, %f249;
mul.f32 %f322, %f61, %f251;
mul.f32 %f323, %f62, %f253;
mul.f32 %f324, %f62, %f255;
mul.f32 %f325, %f63, %f257;
mul.f32 %f326, %f63, %f259;
mul.f32 %f327, %f64, %f261;
mul.f32 %f328, %f64, %f263;
.loc 1 67 24
// scale_k * x - total_k * exp(b), expressed as a fused multiply-add with the negated
// product as the addend.
neg.f32 %f329, %f313;
fma.rn.f32 %f330, %f1, %f281, %f329;
neg.f32 %f331, %f314;
fma.rn.f32 %f332, %f1, %f282, %f331;
neg.f32 %f333, %f315;
fma.rn.f32 %f334, %f2, %f283, %f333;
neg.f32 %f335, %f316;
fma.rn.f32 %f336, %f2, %f284, %f335;
neg.f32 %f337, %f317;
fma.rn.f32 %f338, %f3, %f285, %f337;
neg.f32 %f339, %f318;
fma.rn.f32 %f340, %f3, %f286, %f339;
neg.f32 %f341, %f319;
fma.rn.f32 %f342, %f4, %f287, %f341;
neg.f32 %f343, %f320;
fma.rn.f32 %f344, %f4, %f288, %f343;
neg.f32 %f345, %f321;
fma.rn.f32 %f346, %f5, %f289, %f345;
neg.f32 %f347, %f322;
fma.rn.f32 %f348, %f5, %f290, %f347;
neg.f32 %f349, %f323;
fma.rn.f32 %f350, %f6, %f291, %f349;
neg.f32 %f351, %f324;
fma.rn.f32 %f352, %f6, %f292, %f351;
neg.f32 %f353, %f325;
fma.rn.f32 %f354, %f7, %f293, %f353;
neg.f32 %f355, %f326;
fma.rn.f32 %f356, %f7, %f294, %f355;
neg.f32 %f357, %f327;
fma.rn.f32 %f358, %f8, %f295, %f357;
neg.f32 %f359, %f328;
fma.rn.f32 %f360, %f8, %f296, %f359;
.loc 1 69 24
// Add the value read from the param_4 buffer.
add.f32 %f361, %f265, %f330;
add.f32 %f362, %f266, %f332;
add.f32 %f363, %f267, %f334;
add.f32 %f364, %f268, %f336;
add.f32 %f365, %f269, %f338;
add.f32 %f366, %f270, %f340;
add.f32 %f367, %f271, %f342;
add.f32 %f368, %f272, %f344;
add.f32 %f369, %f273, %f346;
add.f32 %f370, %f274, %f348;
add.f32 %f371, %f275, %f350;
add.f32 %f372, %f276, %f352;
add.f32 %f373, %f277, %f354;
add.f32 %f374, %f278, %f356;
add.f32 %f375, %f279, %f358;
add.f32 %f376, %f280, %f360;
.loc 1 70 29
// Destination addresses in the param_6 buffer (bf16, runs 100514 bytes apart).
add.s64 %rd139, %rd160, %rd10;
add.s64 %rd140, %rd164, %rd10;
add.s64 %rd141, %rd139, 100514;
add.s64 %rd142, %rd140, 100514;
add.s64 %rd143, %rd139, 201028;
add.s64 %rd144, %rd140, 201028;
add.s64 %rd145, %rd139, 301542;
add.s64 %rd146, %rd140, 301542;
add.s64 %rd147, %rd139, 402056;
add.s64 %rd148, %rd140, 402056;
add.s64 %rd149, %rd139, 502570;
add.s64 %rd150, %rd140, 502570;
add.s64 %rd151, %rd139, 603084;
add.s64 %rd152, %rd140, 603084;
add.s64 %rd153, %rd139, 703598;
.loc 1 70 54
add.s64 %rd154, %rd140, 703598;
// Round the f32 results to bf16 (round-to-nearest-even).
mov.b32 %r220, %f361;
cvt.rn.bf16.f32 %rs97, %r220;
mov.b32 %r221, %f362;
cvt.rn.bf16.f32 %rs98, %r221;
mov.b32 %r222, %f363;
cvt.rn.bf16.f32 %rs99, %r222;
mov.b32 %r223, %f364;
cvt.rn.bf16.f32 %rs100, %r223;
mov.b32 %r224, %f365;
cvt.rn.bf16.f32 %rs101, %r224;
mov.b32 %r225, %f366;
cvt.rn.bf16.f32 %rs102, %r225;
mov.b32 %r226, %f367;
cvt.rn.bf16.f32 %rs103, %r226;
mov.b32 %r227, %f368;
cvt.rn.bf16.f32 %rs104, %r227;
mov.b32 %r228, %f369;
cvt.rn.bf16.f32 %rs105, %r228;
mov.b32 %r229, %f370;
cvt.rn.bf16.f32 %rs106, %r229;
mov.b32 %r230, %f371;
cvt.rn.bf16.f32 %rs107, %r230;
mov.b32 %r231, %f372;
cvt.rn.bf16.f32 %rs108, %r231;
mov.b32 %r232, %f373;
cvt.rn.bf16.f32 %rs109, %r232;
mov.b32 %r233, %f374;
cvt.rn.bf16.f32 %rs110, %r233;
mov.b32 %r234, %f375;
cvt.rn.bf16.f32 %rs111, %r234;
mov.b32 %r235, %f376;
cvt.rn.bf16.f32 %rs112, %r235;
// Masked stores: only in-range columns are written.
@%p63 st.global.b16 [ %rd139 + 0 ], { %rs97 };
@%p65 st.global.b16 [ %rd140 + 0 ], { %rs98 };
@%p63 st.global.b16 [ %rd141 + 0 ], { %rs99 };
@%p65 st.global.b16 [ %rd142 + 0 ], { %rs100 };
@%p63 st.global.b16 [ %rd143 + 0 ], { %rs101 };
@%p65 st.global.b16 [ %rd144 + 0 ], { %rs102 };
@%p63 st.global.b16 [ %rd145 + 0 ], { %rs103 };
@%p65 st.global.b16 [ %rd146 + 0 ], { %rs104 };
@%p63 st.global.b16 [ %rd147 + 0 ], { %rs105 };
@%p65 st.global.b16 [ %rd148 + 0 ], { %rs106 };
@%p63 st.global.b16 [ %rd149 + 0 ], { %rs107 };
@%p65 st.global.b16 [ %rd150 + 0 ], { %rs108 };
@%p63 st.global.b16 [ %rd151 + 0 ], { %rs109 };
@%p65 st.global.b16 [ %rd152 + 0 ], { %rs110 };
@%p63 st.global.b16 [ %rd153 + 0 ], { %rs111 };
@%p65 st.global.b16 [ %rd154 + 0 ], { %rs112 };
.loc 1 51 36
// Advance by 512 columns: bf16 pointers move 1024 bytes, the 4-byte pointer 2048 bytes.
// %r237 is the column base just processed; repeat while it is below 49745, i.e. while
// the next base is still below 50257.
add.s64 %rd165, %rd165, 512;
cvt.u32.u64 %r236, %rd165;
add.s32 %r237, %r236, -512;
add.s64 %rd164, %rd164, 1024;
add.s64 %rd163, %rd163, 1024;
add.s64 %rd162, %rd162, 2048;
add.s64 %rd161, %rd161, 1024;
add.s64 %rd160, %rd160, 1024;
add.s64 %rd159, %rd159, 1024;
add.s64 %rd158, %rd158, 1024;
setp.lt.u32 %p175, %r237, 49745;
@%p175 bra $L__BB0_3;
.loc 1 51 4
ret;
$L__tmp90:
$L__func_end0:
}
// Source-file table for the `.loc <file> <line> <column>` directives in the kernel body:
// index 1 is the generated kernel source, index 2 is Triton's standard.py.
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
// Raw bytes of the debug abbreviation table that .debug_info points at through its
// `.b32 .debug_abbrev` field. Compiler-generated; do not edit by hand.
// NOTE(review): presumably DWARF abbreviation declarations (code, tag, has-children flag,
// then attribute/form pairs ending with 0,0) for codes 1 through 5; the individual
// values have not been decoded here.
.section .debug_abbrev
{
.b8 1
.b8 17
.b8 1
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
.b8 180
.b8 66
.b8 12
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
.b8 2
.b8 46
.b8 0
.b8 135
.b8 64
.b8 8
.b8 3
.b8 8
.b8 58
.b8 11
.b8 59
.b8 11
.b8 63
.b8 12
.b8 32
.b8 11
.b8 0
.b8 0
.b8 3
.b8 46
.b8 1
.b8 17
.b8 1
.b8 18
.b8 1
.b8 64
.b8 10
.b8 49
.b8 19
.b8 0
.b8 0
.b8 4
.b8 29
.b8 1
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
.b8 5
.b8 29
.b8 0
.b8 49
.b8 19
.b8 17
.b8 1
.b8 18
.b8 1
.b8 88
.b8 11
.b8 89
.b8 11
.b8 87
.b8 11
.b8 0
.b8 0
.b8 0
}
// Raw debug-information bytes for the kernel. Compiler-generated; do not edit by hand.
// The string decodings below are plain ASCII readings of the byte values; the roles given
// for the non-string fields are NOTE(review) assumptions based on the usual DWARF layout
// and should be confirmed against the abbreviation table before relying on them.
.section .debug_info
{
// Presumably the unit header: length 278, 16-bit version 2, abbreviation-table
// reference, address size 8; then the first entry's abbreviation code (1).
.b32 278
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
// ASCII "triton" followed by a 0 terminator.
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2
.b8 0
// ASCII "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py" + 0 terminator
// (the same file name as `.file 1`).
.b8 99
.b8 107
.b8 122
.b8 103
.b8 108
.b8 55
.b8 116
.b8 104
.b8 98
.b8 52
.b8 120
.b8 100
.b8 102
.b8 107
.b8 102
.b8 110
.b8 100
.b8 50
.b8 116
.b8 105
.b8 100
.b8 107
.b8 115
.b8 54
.b8 109
.b8 116
.b8 53
.b8 102
.b8 51
.b8 104
.b8 97
.b8 117
.b8 119
.b8 102
.b8 121
.b8 106
.b8 102
.b8 108
.b8 98
.b8 116
.b8 122
.b8 121
.b8 101
.b8 112
.b8 111
.b8 53
.b8 111
.b8 120
.b8 107
.b8 118
.b8 104
.b8 107
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line
// ASCII "/tmp/torchinductor_root/kz" + 0 terminator (directory of `.file 1`).
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 107
.b8 122
.b8 0
.b8 1
// Address range of the kernel body (labels defined in the .entry above).
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 2
// ASCII "triton__0d1d2d3d4d5d6d7de8" + 0 terminator (the kernel name), emitted twice.
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 0
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 0
// Presumably declaration file 1, declaration line 18 (matching `.loc 1 18 0` at the
// kernel entry) and two flag bytes.
.b8 1
.b8 18
.b8 1
.b8 1
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b32 125
// Three entries that each carry a label range inside the reduction code plus a
// (file, line, column) triple: (2, 46, 27), (2, 243, 36) and (2, 46, 27).
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp88
.b8 2
.b8 46
.b8 27
.b8 5
.b32 125
.b64 $L__tmp1
.b64 $L__tmp88
.b8 2
.b8 243
.b8 36
.b8 0
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp89
.b8 2
.b8 46
.b8 27
.b8 0
.b8 0
}
// Public-names table: one entry naming the kernel. Compiler-generated.
.section .debug_pubnames
{
// Section length, computed from the two labels that bracket the payload.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2
.b8 0
.b32 .debug_info
// 282 = 278 + 4, where 278 is the first word of .debug_info.
.b32 282
.b32 125
// ASCII "triton__0d1d2d3d4d5d6d7de8" + 0 terminator.
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 0
// Zero word closing the list.
.b32 0
$L__pubNames_end0:
}
// Public-types table: same header layout as .debug_pubnames, but the list is empty
// (the zero word follows the header directly). Compiler-generated.
.section .debug_pubtypes
{
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 282
.b32 0
$L__pubTypes_end0:
}
// Empty section: no entries are emitted for this module.
.section .debug_loc { }