// 0-hero's picture
// Add files using upload-large-folder tool
// f67f72f verified
//
// Generated by LLVM NVPTX Back-End
//
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2d3d4d5d6d7de8
//
// Kernel triton__0d1d2d3d4d5d6d7de8 -- machine-generated (LLVM NVPTX back-end).
//
// What the instruction stream below does:
//   Thread layout (at most 128 threads per CTA, see .maxntid):
//     col_slot = tid.x & 7          (%r1 / %rd3)  column slot inside an 8-wide group
//     row_slot = (tid.x >> 3) & 15  (%r14 / %rd2) row slot inside the CTA
//     Each thread handles four rows: ctaid.x*64 + row_slot + {0, 16, 32, 48}.
//   Row length is 50257 elements: it is the column bound in both loops, and the
//   byte strides agree with it (201028 = 50257*4 for f32, 100514 = 50257*2 for b16).
//
//   Setup : w[row] = (p1[row] == -1) ? 0.0 : (*p2 / *p3)
//   Loop 1: s[row] = sum over columns of w[row] * p0[row, col]   (8 lanes, then
//           a butterfly shuffle reduction over the 8-lane group)
//   Loop 2: p6[row, col] = bf16( f32(p4[row, col])
//                               + ( w[row] * p0[row, col]
//                                   - s[row] * exp(f32(p5[row, col])) ) )
//
// Parameters as used by this body (all are 64-bit values used as addresses):
//   param_0  base of an f32 array, read in both loops (4-byte elements)
//   param_1  base of a 64-bit integer array, one value per row, tested against -1
//   param_2  address of a single 32-bit value (dividend, treated as f32)
//   param_3  address of a single 32-bit value (divisor, treated as f32)
//   param_4  base of a 16-bit array read as bf16 in loop 2
//   param_5  base of a 16-bit array read as bf16 in loop 2 (exponent input)
//   param_6  base of a 16-bit array written as bf16 in loop 2 (output)
//   param_7, param_8  declared but never loaded in this body
//
// NOTE(review): the originating Python source (.file 1) is not visible here, so
// the tensor names and the meaning of the -1 sentinel are not stated -- confirm
// against the Triton source before relying on any higher-level interpretation.
//
.visible .entry triton__0d1d2d3d4d5d6d7de8(
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
)
.maxntid 128, 1, 1
{
// Virtual register pools (predicates, 16/32/64-bit integers, f32).
.reg .pred %p<49>;
.reg .b16 %rs<33>;
.reg .b32 %r<72>;
.reg .f32 %f<98>;
.reg .b64 %rd<66>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// Fetch the pointer arguments. param_7 and param_8 are never read.
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8_param_6];
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8_param_5];
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8_param_4];
ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7de8_param_0];
ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7de8_param_1];
$L__tmp0:
.loc 1 22 44
mov.u32 %r13, %tid.x;
ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_2];
// %r14 = bits 3..6 of tid.x = row slot (0..15).
bfe.u32 %r14, %r13, 3, 4;
ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7de8_param_3];
.loc 1 24 33
// %r1 = tid.x & 7 = column slot (0..7).
and.b32 %r1, %r13, 7;
.loc 1 21 28
mov.u32 %r6, %ctaid.x;
.loc 1 21 34
cvt.s64.s32 %rd1, %r6;
.loc 1 21 46
// First row of this CTA = ctaid.x * 64.
mul.wide.s32 %rd30, %r6, 64;
cvt.u64.u32 %rd2, %r14;
.loc 1 22 23
// Row index = ctaid.x*64 | row_slot. The low 6 bits of %rd30 are zero, so the
// OR behaves as an addition.
or.b64 %rd31, %rd30, %rd2;
.loc 1 26 30
// Addresses of the four per-row 64-bit values in param_1: 8 bytes per row,
// and +128/+256/+384 bytes correspond to rows +16/+32/+48.
shl.b64 %rd32, %rd31, 3;
add.s64 %rd19, %rd29, %rd32;
add.s64 %rd21, %rd19, 128;
add.s64 %rd23, %rd19, 256;
add.s64 %rd25, %rd19, 384;
// %p1 is constant true, so every load guarded by it always executes.
mov.pred %p1, -1;
.loc 1 26 35
mov.u64 %rd18, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
mov.u64 %rd20, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
mov.u64 %rd22, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd23 + 0 ];
mov.u64 %rd24, 0x0;
@%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd25 + 0 ];
.loc 1 27 19
// Scalar dividend, read from the address in param_2.
mov.u32 %r10, 0x0;
@%p1 ld.global.b32 { %r10 }, [ %rd26 + 0 ];
.loc 1 29 19
// Scalar divisor, read from the address in param_3.
mov.u32 %r11, 0x0;
@%p1 ld.global.b32 { %r11 }, [ %rd27 + 0 ];
.loc 1 38 23
// %p7..%p10: the per-row 64-bit value equals the sentinel -1.
setp.eq.s64 %p7, %rd18, -1;
setp.eq.s64 %p8, %rd20, -1;
setp.eq.s64 %p9, %rd22, -1;
setp.eq.s64 %p10, %rd24, -1;
.loc 1 39 22
// Quotient *param_2 / *param_3, using the approximate div.full.f32 form.
div.full.f32 %r9, %r10, %r11;
mov.b32 %f25, %r9;
.loc 1 41 37
// Per-row weight %f1..%f4: 0.0 for sentinel rows, otherwise the quotient.
selp.f32 %f4, 0f00000000, %f25, %p10;
selp.f32 %f3, 0f00000000, %f25, %p9;
selp.f32 %f2, 0f00000000, %f25, %p8;
selp.f32 %f1, 0f00000000, %f25, %p7;
.loc 1 32 36
// Byte address into param_0 for (row, col_slot):
//   ctaid.x * 12865792 (= 64 rows * 201028 bytes) + row_slot * 201028 + col_slot * 4.
// %rd65 is biased by +9649344 (= 48 * 201028), so it points at row+48; rows
// +0/+16/+32 are reached with negative offsets inside the loops.
mul.wide.s32 %rd33, %r6, 12865792;
mul.wide.u32 %rd34, %r14, 201028;
add.s64 %rd35, %rd33, %rd34;
cvt.u64.u32 %rd36, %r13;
and.b64 %rd3, %rd36, 7;
mul.wide.u32 %rd37, %r1, 4;
add.s64 %rd38, %rd35, %rd37;
add.s64 %rd39, %rd38, %rd28;
add.s64 %rd65, %rd39, 9649344;
// Loop-1 state: four accumulators start at 0.0 and the column base %r70 starts
// at -8 because it is pre-incremented at the loop head. %rd63 is a working
// copy of the pointer, so %rd65 stays intact for loop 2.
mov.f32 %f94, 0f00000000;
mov.b32 %r70, -8;
mov.u64 %rd63, %rd65;
mov.f32 %f95, %f94;
mov.f32 %f96, %f94;
mov.f32 %f97, %f94;
// ---- Loop 1: per-thread partial sums of weight * param_0[row, col] ----
$L__BB0_1:
add.s32 %r70, %r70, 8;
.loc 1 33 27
// Column index of this thread for the current step.
add.s32 %r23, %r70, %r1;
.loc 1 34 25
// %p11: column is inside the 50257-element row.
setp.lt.u32 %p11, %r23, 50257;
.loc 1 36 34
add.s64 %rd40, %rd63, -9649344;
add.s64 %rd41, %rd63, -6432896;
add.s64 %rd42, %rd63, -3216448;
// %r54 = 0 is the value substituted for masked-out loads. It is not
// re-initialised later, and the masked f32 loads in loop 2 read it as well.
mov.b32 %r54, 0;
.loc 1 36 52
// Masked f32 loads for rows +0/+16/+32/+48 (0 when the column is out of range).
mov.u32 %r15, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r15 }, [ %rd40 + 0 ];
@!%p11 mov.u32 %r15, %r54;
mov.u32 %r17, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r17 }, [ %rd41 + 0 ];
@!%p11 mov.u32 %r17, %r54;
mov.u32 %r19, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r19 }, [ %rd42 + 0 ];
@!%p11 mov.u32 %r19, %r54;
mov.u32 %r21, 0x0;
@%p11 ld.global.L1::evict_last.b32 { %r21 }, [ %rd63 + 0 ];
@!%p11 mov.u32 %r21, %r54;
mov.b32 %f26, %r21;
mov.b32 %f27, %r19;
mov.b32 %f28, %r17;
mov.b32 %f29, %r15;
.loc 1 42 23
// weight[row] * value
mul.f32 %f30, %f1, %f29;
mul.f32 %f31, %f2, %f28;
mul.f32 %f32, %f3, %f27;
mul.f32 %f33, %f4, %f26;
.loc 1 45 40
// Out-of-range columns contribute -0.0 (0f80000000), which leaves the running
// sums unchanged.
selp.f32 %f34, %f33, 0f80000000, %p11;
selp.f32 %f35, %f32, 0f80000000, %p11;
selp.f32 %f36, %f31, 0f80000000, %p11;
selp.f32 %f37, %f30, 0f80000000, %p11;
add.f32 %f94, %f94, %f37;
add.f32 %f95, %f95, %f36;
add.f32 %f96, %f96, %f35;
add.f32 %f97, %f97, %f34;
.loc 1 32 36
// Advance by 8 columns * 4 bytes. Continue while the column base is < 50249,
// so the last executed step has base 50256.
add.s64 %rd63, %rd63, 32;
setp.lt.u32 %p19, %r70, 50249;
@%p19 bra $L__BB0_1;
$L__tmp1:
// ---- Butterfly reduction over the 8 lanes that share a row ----
// Each accumulator is combined with XOR lane offsets 4, 2, 1 (member mask -1,
// clamp operand 31). Afterwards every lane of the group holds the full row sum
// in %f13 (row+0), %f14 (row+16), %f15 (row+32) and %f16 (row+48).
.loc 2 243 36
mov.b32 %r25, %f94;
shfl.sync.bfly.b32 %r26, %r25, 4, 31, -1;
mov.b32 %f38, %r26;
$L__tmp2:
.loc 2 233 15
add.f32 %f39, %f94, %f38;
$L__tmp3:
.loc 2 243 36
mov.b32 %r27, %f39;
shfl.sync.bfly.b32 %r28, %r27, 2, 31, -1;
mov.b32 %f40, %r28;
$L__tmp4:
.loc 2 233 15
add.f32 %f41, %f39, %f40;
$L__tmp5:
.loc 2 243 36
mov.b32 %r29, %f41;
shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1;
mov.b32 %f42, %r30;
$L__tmp6:
.loc 2 233 15
add.f32 %f13, %f41, %f42;
$L__tmp7:
.loc 2 243 36
mov.b32 %r31, %f95;
shfl.sync.bfly.b32 %r32, %r31, 4, 31, -1;
mov.b32 %f43, %r32;
$L__tmp8:
.loc 2 233 15
add.f32 %f44, %f95, %f43;
$L__tmp9:
.loc 2 243 36
mov.b32 %r33, %f44;
shfl.sync.bfly.b32 %r34, %r33, 2, 31, -1;
mov.b32 %f45, %r34;
$L__tmp10:
.loc 2 233 15
add.f32 %f46, %f44, %f45;
$L__tmp11:
.loc 2 243 36
mov.b32 %r35, %f46;
shfl.sync.bfly.b32 %r36, %r35, 1, 31, -1;
mov.b32 %f47, %r36;
$L__tmp12:
.loc 2 233 15
add.f32 %f14, %f46, %f47;
$L__tmp13:
.loc 2 243 36
mov.b32 %r37, %f96;
shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
mov.b32 %f48, %r38;
$L__tmp14:
.loc 2 233 15
add.f32 %f49, %f96, %f48;
$L__tmp15:
.loc 2 243 36
mov.b32 %r39, %f49;
shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
mov.b32 %f50, %r40;
$L__tmp16:
.loc 2 233 15
add.f32 %f51, %f49, %f50;
$L__tmp17:
.loc 2 243 36
mov.b32 %r41, %f51;
shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
mov.b32 %f52, %r42;
$L__tmp18:
.loc 2 233 15
add.f32 %f15, %f51, %f52;
$L__tmp19:
.loc 2 243 36
mov.b32 %r43, %f97;
shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1;
mov.b32 %f53, %r44;
$L__tmp20:
.loc 2 233 15
add.f32 %f54, %f97, %f53;
$L__tmp21:
.loc 2 243 36
mov.b32 %r45, %f54;
shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1;
mov.b32 %f55, %r46;
$L__tmp22:
.loc 2 233 15
add.f32 %f56, %f54, %f55;
$L__tmp23:
.loc 2 243 36
mov.b32 %r47, %f56;
shfl.sync.bfly.b32 %r48, %r47, 1, 31, -1;
mov.b32 %f57, %r48;
$L__tmp24:
.loc 2 233 15
add.f32 %f16, %f56, %f57;
$L__tmp25:
.loc 1 51 36
// Loop-2 setup for the 16-bit arrays (2 bytes per element):
//   %rd44 = col_slot * 2
//   %rd64 = ctaid.x * 6432896 (= 64 rows * 100514 bytes) + row_slot * 100514
//   %rd10 / %rd9 / %rd7 = param_4 / param_5 / param_6 + col_slot * 2
// %rs2 = 0 is the value substituted for masked-out 16-bit loads.
shl.b64 %rd44, %rd3, 1;
add.s64 %rd7, %rd17, %rd44;
mul.lo.s64 %rd45, %rd1, 6432896;
mul.lo.s64 %rd46, %rd2, 100514;
add.s64 %rd64, %rd45, %rd46;
add.s64 %rd9, %rd16, %rd44;
add.s64 %rd10, %rd15, %rd44;
mov.b32 %r71, -8;
mov.u16 %rs2, 0;
// ---- Loop 2: element-wise result written to param_6 ----
$L__BB0_3:
add.s32 %r71, %r71, 8;
.loc 1 52 27
add.s32 %r69, %r71, %r1;
.loc 1 53 25
// %p20: column is inside the 50257-element row. It guards every load and
// store in this loop.
setp.lt.u32 %p20, %r69, 50257;
.loc 1 55 35
// param_4 addresses for rows +0/+16/+32/+48. The row stride is 100514 bytes,
// so +16 rows = 1608224, +32 rows = 3216448 and +48 rows = 4824672.
add.s64 %rd47, %rd10, %rd64;
add.s64 %rd48, %rd47, 1608224;
add.s64 %rd49, %rd47, 3216448;
.loc 1 55 53
add.s64 %rd50, %rd47, 4824672;
mov.u16 %rs1, 0x0;
@%p20 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
@!%p20 mov.u16 %rs1, %rs2;
mov.u16 %rs3, 0x0;
@%p20 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd48 + 0 ];
@!%p20 mov.u16 %rs3, %rs2;
mov.u16 %rs5, 0x0;
@%p20 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd49 + 0 ];
@!%p20 mov.u16 %rs5, %rs2;
mov.u16 %rs7, 0x0;
@%p20 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd50 + 0 ];
@!%p20 mov.u16 %rs7, %rs2;
.loc 1 55 105
// Widen the param_4 values from bf16 to f32 (%f66..%f69).
cvt.f32.bf16 %r49, %rs1;
mov.b32 %f66, %r49;
cvt.f32.bf16 %r50, %rs3;
mov.b32 %f67, %r50;
cvt.f32.bf16 %r51, %rs5;
mov.b32 %f68, %r51;
cvt.f32.bf16 %r52, %rs7;
mov.b32 %f69, %r52;
.loc 1 56 35
// Re-read the same param_0 elements as loop 1 (%rd65 still points at row+48).
add.s64 %rd51, %rd65, -9649344;
add.s64 %rd52, %rd65, -6432896;
add.s64 %rd53, %rd65, -3216448;
.loc 1 56 53
// The masked-out value is %r54, the zero written inside loop 1.
mov.u32 %r53, 0x0;
@%p20 ld.global.L1::evict_first.b32 { %r53 }, [ %rd51 + 0 ];
@!%p20 mov.u32 %r53, %r54;
mov.b32 %f70, %r53;
mov.u32 %r55, 0x0;
@%p20 ld.global.L1::evict_first.b32 { %r55 }, [ %rd52 + 0 ];
@!%p20 mov.u32 %r55, %r54;
mov.b32 %f71, %r55;
mov.u32 %r57, 0x0;
@%p20 ld.global.L1::evict_first.b32 { %r57 }, [ %rd53 + 0 ];
@!%p20 mov.u32 %r57, %r54;
mov.b32 %f72, %r57;
mov.u32 %r59, 0x0;
@%p20 ld.global.L1::evict_first.b32 { %r59 }, [ %rd65 + 0 ];
@!%p20 mov.u32 %r59, %r54;
mov.b32 %f73, %r59;
.loc 1 57 35
// param_5 addresses for the four rows, using the same layout as param_4.
add.s64 %rd55, %rd9, %rd64;
add.s64 %rd56, %rd55, 1608224;
add.s64 %rd57, %rd55, 3216448;
.loc 1 57 53
add.s64 %rd58, %rd55, 4824672;
mov.u16 %rs13, 0x0;
@%p20 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd55 + 0 ];
@!%p20 mov.u16 %rs13, %rs2;
mov.u16 %rs15, 0x0;
@%p20 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd56 + 0 ];
@!%p20 mov.u16 %rs15, %rs2;
mov.u16 %rs17, 0x0;
@%p20 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd57 + 0 ];
@!%p20 mov.u16 %rs17, %rs2;
mov.u16 %rs19, 0x0;
@%p20 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd58 + 0 ];
@!%p20 mov.u16 %rs19, %rs2;
.loc 1 57 105
// Widen the param_5 values from bf16 to f32 (%f74..%f77).
cvt.f32.bf16 %r61, %rs13;
mov.b32 %f74, %r61;
cvt.f32.bf16 %r62, %rs15;
mov.b32 %f75, %r62;
cvt.f32.bf16 %r63, %rs17;
mov.b32 %f76, %r63;
cvt.f32.bf16 %r64, %rs19;
mov.b32 %f77, %r64;
.loc 1 65 23
// exp(x) computed as 2^(x * log2(e)): 0f3FB8AA3B is the f32 bit pattern of
// approximately 1.442695, and ex2.approx is the approximate base-2 exponential.
mul.f32 %f59, %f74, 0f3FB8AA3B;
ex2.approx.f32 %f58, %f59;
mul.f32 %f61, %f75, 0f3FB8AA3B;
ex2.approx.f32 %f60, %f61;
mul.f32 %f63, %f76, 0f3FB8AA3B;
ex2.approx.f32 %f62, %f63;
mul.f32 %f65, %f77, 0f3FB8AA3B;
ex2.approx.f32 %f64, %f65;
.loc 1 66 24
// row_sum * exp(param_5 value)
mul.f32 %f78, %f13, %f58;
mul.f32 %f79, %f14, %f60;
mul.f32 %f80, %f15, %f62;
mul.f32 %f81, %f16, %f64;
.loc 1 67 24
// weight * param_0 value - row_sum * exp(...), formed as an fma with the
// negated product as the addend.
neg.f32 %f82, %f78;
fma.rn.f32 %f83, %f1, %f70, %f82;
neg.f32 %f84, %f79;
fma.rn.f32 %f85, %f2, %f71, %f84;
neg.f32 %f86, %f80;
fma.rn.f32 %f87, %f3, %f72, %f86;
neg.f32 %f88, %f81;
fma.rn.f32 %f89, %f4, %f73, %f88;
.loc 1 69 24
// Add the param_4 value.
add.f32 %f90, %f66, %f83;
add.f32 %f91, %f67, %f85;
add.f32 %f92, %f68, %f87;
add.f32 %f93, %f69, %f89;
.loc 1 70 29
// param_6 (output) addresses for the four rows.
add.s64 %rd59, %rd7, %rd64;
add.s64 %rd60, %rd59, 1608224;
add.s64 %rd61, %rd59, 3216448;
.loc 1 70 54
add.s64 %rd62, %rd59, 4824672;
// Round each f32 result to bf16 (round-to-nearest-even) and store it, masked
// by %p20.
mov.b32 %r65, %f90;
cvt.rn.bf16.f32 %rs25, %r65;
mov.b32 %r66, %f91;
cvt.rn.bf16.f32 %rs26, %r66;
mov.b32 %r67, %f92;
cvt.rn.bf16.f32 %rs27, %r67;
mov.b32 %r68, %f93;
cvt.rn.bf16.f32 %rs28, %r68;
@%p20 st.global.b16 [ %rd59 + 0 ], { %rs25 };
@%p20 st.global.b16 [ %rd60 + 0 ], { %rs26 };
@%p20 st.global.b16 [ %rd61 + 0 ], { %rs27 };
@%p20 st.global.b16 [ %rd62 + 0 ], { %rs28 };
.loc 1 51 36
// Advance 8 columns: 32 bytes in the f32 array, 16 bytes in the 16-bit arrays.
// The trip count matches loop 1.
add.s64 %rd65, %rd65, 32;
add.s64 %rd64, %rd64, 16;
setp.lt.u32 %p48, %r71, 50249;
@%p48 bra $L__BB0_3;
.loc 1 51 4
ret;
$L__tmp26:
$L__func_end0:
}
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
// DWARF abbreviation table, emitted as raw bytes. Each entry is: abbreviation
// code, tag, has-children flag, then (attribute, form) pairs terminated by 0, 0.
// The names in the comments are the standard DWARF constants for the byte values.
.section .debug_abbrev
{
// Abbreviation 1: DW_TAG_compile_unit (17), has children (1).
.b8 1
.b8 17
.b8 1
// DW_AT_producer (37), DW_FORM_string (8)
.b8 37
.b8 8
// DW_AT_language (19), DW_FORM_data2 (5)
.b8 19
.b8 5
// DW_AT_name (3), DW_FORM_string (8)
.b8 3
.b8 8
// DW_AT_stmt_list (16), DW_FORM_data4 (6)
.b8 16
.b8 6
// DW_AT_comp_dir (27), DW_FORM_string (8)
.b8 27
.b8 8
// ULEB128 180, 66 = 0x2134 (DW_AT_GNU_pubnames), DW_FORM_flag (12)
.b8 180
.b8 66
.b8 12
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// End of abbreviation 1.
.b8 0
.b8 0
// Abbreviation 2: DW_TAG_subprogram (46), no children (0).
.b8 2
.b8 46
.b8 0
// ULEB128 135, 64 = 0x2007 (DW_AT_MIPS_linkage_name), DW_FORM_string (8)
.b8 135
.b8 64
.b8 8
// DW_AT_name (3), DW_FORM_string (8)
.b8 3
.b8 8
// DW_AT_decl_file (58), DW_FORM_data1 (11)
.b8 58
.b8 11
// DW_AT_decl_line (59), DW_FORM_data1 (11)
.b8 59
.b8 11
// DW_AT_external (63), DW_FORM_flag (12)
.b8 63
.b8 12
// DW_AT_inline (32), DW_FORM_data1 (11)
.b8 32
.b8 11
// End of abbreviation 2.
.b8 0
.b8 0
// Abbreviation 3: DW_TAG_subprogram (46), has children (1).
.b8 3
.b8 46
.b8 1
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_frame_base (64), DW_FORM_block1 (10)
.b8 64
.b8 10
// DW_AT_abstract_origin (49), DW_FORM_ref4 (19)
.b8 49
.b8 19
// End of abbreviation 3.
.b8 0
.b8 0
// Abbreviation 4: DW_TAG_inlined_subroutine (29), no children (0).
.b8 4
.b8 29
.b8 0
// DW_AT_abstract_origin (49), DW_FORM_ref4 (19)
.b8 49
.b8 19
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_call_file (88), DW_FORM_data1 (11)
.b8 88
.b8 11
// DW_AT_call_line (89), DW_FORM_data1 (11)
.b8 89
.b8 11
// DW_AT_call_column (87), DW_FORM_data1 (11)
.b8 87
.b8 11
// End of abbreviation 4.
.b8 0
.b8 0
// Abbreviation 5: DW_TAG_inlined_subroutine (29), has children (1).
// It carries the same attribute list as abbreviation 4.
.b8 5
.b8 29
.b8 1
// DW_AT_abstract_origin (49), DW_FORM_ref4 (19)
.b8 49
.b8 19
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_call_file (88), DW_FORM_data1 (11)
.b8 88
.b8 11
// DW_AT_call_line (89), DW_FORM_data1 (11)
.b8 89
.b8 11
// DW_AT_call_column (87), DW_FORM_data1 (11)
.b8 87
.b8 11
// End of abbreviation 5.
.b8 0
.b8 0
// End of the abbreviation table.
.b8 0
}
// DWARF compilation-unit data, emitted as raw bytes. The attribute order of each
// DIE follows the matching entry in .debug_abbrev. Byte offsets quoted below are
// counted from the start of this section.
.section .debug_info
{
// Unit header: unit_length = 278 (so the section is 278 + 4 = 282 bytes),
// version = 2 (two bytes, little-endian), offset into .debug_abbrev,
// address size = 8.
.b32 278
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// DIE at offset 11, abbreviation 1 (compile unit).
.b8 1
// DW_AT_producer: "triton" (NUL-terminated ASCII)
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// DW_AT_language: 0x0002 (two bytes, little-endian)
.b8 2
.b8 0
// DW_AT_name: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
.b8 99
.b8 107
.b8 122
.b8 103
.b8 108
.b8 55
.b8 116
.b8 104
.b8 98
.b8 52
.b8 120
.b8 100
.b8 102
.b8 107
.b8 102
.b8 110
.b8 100
.b8 50
.b8 116
.b8 105
.b8 100
.b8 107
.b8 115
.b8 54
.b8 109
.b8 116
.b8 53
.b8 102
.b8 51
.b8 104
.b8 97
.b8 117
.b8 119
.b8 102
.b8 121
.b8 106
.b8 102
.b8 108
.b8 98
.b8 116
.b8 122
.b8 121
.b8 101
.b8 112
.b8 111
.b8 53
.b8 111
.b8 120
.b8 107
.b8 118
.b8 104
.b8 107
.b8 46
.b8 112
.b8 121
.b8 0
// DW_AT_stmt_list: offset into .debug_line
.b32 .debug_line
// DW_AT_comp_dir: "/tmp/torchinductor_root/kz"
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 107
.b8 122
.b8 0
// DW_AT_GNU_pubnames flag = 1
.b8 1
// DW_AT_low_pc / DW_AT_high_pc: the whole kernel body
.b64 $L__func_begin0
.b64 $L__func_end0
// DIE at offset 125, abbreviation 2 (abstract subprogram). The ".b32 125"
// references further down point back to this DIE.
.b8 2
// DW_AT_MIPS_linkage_name: "triton__0d1d2d3d4d5d6d7de8"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 0
// DW_AT_name: "triton__0d1d2d3d4d5d6d7de8"
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 0
// DW_AT_decl_file = 1, DW_AT_decl_line = 18, DW_AT_external = 1, DW_AT_inline = 1
.b8 1
.b8 18
.b8 1
.b8 1
// Abbreviation 3 (concrete subprogram, has children): low_pc, high_pc,
// frame_base as a 1-byte block holding 156 (0x9c, DW_OP_call_frame_cfa),
// abstract_origin -> DIE at offset 125.
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b32 125
// Abbreviation 4 (inlined subroutine, no children): origin 125,
// range $L__tmp1..$L__tmp24, call_file = 2, call_line = 46, call_column = 27.
.b8 4
.b32 125
.b64 $L__tmp1
.b64 $L__tmp24
.b8 2
.b8 46
.b8 27
// Abbreviation 5 (inlined subroutine, has children): origin 125,
// range $L__tmp2..$L__tmp25, call_file = 2, call_line = 46, call_column = 27.
.b8 5
.b32 125
.b64 $L__tmp2
.b64 $L__tmp25
.b8 2
.b8 46
.b8 27
// Child, abbreviation 4: origin 125, range $L__tmp2..$L__tmp25,
// call_file = 2, call_line = 243, call_column = 36.
.b8 4
.b32 125
.b64 $L__tmp2
.b64 $L__tmp25
.b8 2
.b8 243
.b8 36
// Three null entries close the children lists of the abbreviation-5 DIE, the
// abbreviation-3 DIE and the compile unit, in that order.
.b8 0
.b8 0
.b8 0
}
// Public-names lookup table for the compilation unit in .debug_info.
.section .debug_pubnames
{
// Length of the table body, computed from the two labels below.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// Version 2 (two bytes, little-endian).
.b8 2
.b8 0
// Offset of the described unit in .debug_info and its total size of 282 bytes
// (unit_length 278 + the 4-byte length field).
.b32 .debug_info
.b32 282
// One entry: DIE offset 125 followed by its NUL-terminated name,
// "triton__0d1d2d3d4d5d6d7de8".
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 51
.b8 100
.b8 52
.b8 100
.b8 53
.b8 100
.b8 54
.b8 100
.b8 55
.b8 100
.b8 101
.b8 56
.b8 0
// A zero offset terminates the entry list.
.b32 0
$L__pubNames_end0:
}
// Public-types lookup table. It holds only the header and the terminator, so no
// type entries are published.
.section .debug_pubtypes
{
// Length of the table body, computed from the two labels below.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// Version 2 (two bytes, little-endian).
.b8 2
.b8 0
// Offset of the described unit in .debug_info and its total size of 282 bytes.
.b32 .debug_info
.b32 282
// A zero offset terminates the (empty) entry list.
.b32 0
$L__pubTypes_end0:
}
// Location-list section; emitted empty, so it holds no location lists.
.section .debug_loc { }