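// NOTE(review): the next three lines are not PTX; they look like text copied
// from a repository web page. Kept verbatim, but commented out so the file assembles.
// 0-hero's picture
// Add files using upload-large-folder tool
// f9d5f95 verified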
//
// Generated by LLVM NVPTX Back-End
//
// Module-level directives: PTX ISA version, target architecture, address width.
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2de
// Shared-memory byte array declared extern with no size; the size is not
// fixed in this file. The kernel below stores to and loads from byte offsets
// 0..2047 of it, so at least 2048 bytes must be provided.
.extern .shared .align 1 .b8 global_smem[];
// Kernel triton__0d1d2de
//
// Each thread block handles 1024 consecutive elements starting at element
// index ctaid.x * 1024: it loads them as 16-bit values from param_0, converts
// each one from bf16 to f32, and stores the 32-bit results to param_1 at the
// same element indices.
//
// Parameters:
//   param_0 (u64) - global-memory address of the 16-bit input elements
//   param_1 (u64) - global-memory address of the 32-bit output elements
//   param_2 (u32) - never loaded in this body; presumably the element count
//                   (TODO confirm against the generating Triton source)
//
// Notes:
//   * .maxntid limits the block to 128 threads; the thread index is masked
//     with 127 before use.
//   * Every global access is predicated on %p1, which is set to constant
//     true, so this body performs no bounds check.
//   * Bytes 0..2047 of global_smem are used to hand elements from the thread
//     that loaded them to the thread that stores them.
.visible .entry triton__0d1d2de(
.param .u64 triton__0d1d2de_param_0,
.param .u64 triton__0d1d2de_param_1,
.param .u32 triton__0d1d2de_param_2
)
.maxntid 128, 1, 1
{
.reg .pred %p<4>;
.reg .b16 %rs<9>;
.reg .b32 %r<37>;
.reg .b64 %rd<13>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// rd4 = input base address, rd5 = output base address.
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
ld.param.u64 %rd5, [triton__0d1d2de_param_1];
$L__tmp0:
.loc 1 21 36
// r23 = tid.x & 127 (thread index t within the block)
// r24 = t * 8  -> first element this thread loads (8 elements per thread)
// r25 = t * 4  -> first element of each 4-element group this thread stores
mov.u32 %r22, %tid.x;
and.b32 %r23, %r22, 127;
shl.b32 %r24, %r23, 3;
shl.b32 %r25, %r23, 2;
.loc 1 20 28
mov.u32 %r1, %ctaid.x;
.loc 1 20 33
// r26 = ctaid.x * 1024 -> first element index owned by this block.
shl.b32 %r26, %r1, 10;
.loc 1 21 23
// OR acts as an add here: r26 has its low 10 bits clear and r24/r25 are
// both below 1024.
// r27 = element index for the load, r28 = element index for the first store.
or.b32 %r27, %r26, %r24;
or.b32 %r28, %r26, %r25;
.loc 1 24 30
// rd1 = input base + r27 * 2 bytes (signed 32 -> 64-bit widening multiply).
mul.wide.s32 %rd6, %r27, 2;
add.s64 %rd1, %rd4, %rd6;
// p1 = true; it guards the global load and both global stores below.
mov.pred %p1, -1;
.loc 1 24 35
// Zero the destinations, then issue one 16-byte vector load: four 32-bit
// registers, each holding two 16-bit elements.
mov.u32 %r2, 0x0;
mov.u32 %r3, 0x0;
mov.u32 %r4, 0x0;
mov.u32 %r5, 0x0;
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
// Move the upper 16-bit element of each loaded word down into bits 0..15.
shr.u32 %r29, %r2, 16;
shr.u32 %r30, %r3, 16;
shr.u32 %r31, %r4, 16;
shr.u32 %r32, %r5, 16;
.loc 1 24 44
// r35 = global_smem + t * 16. The thread writes its eight 16-bit elements to
// shared memory in the same order they had in global memory (low half of
// each word first), so shared memory holds the block's 1024 elements in order.
shl.b32 %r33, %r23, 4;
mov.u32 %r34, global_smem;
add.s32 %r35, %r34, %r33;
st.shared.u16 [%r35], %r2;
st.shared.u16 [%r35+2], %r29;
st.shared.u16 [%r35+4], %r3;
st.shared.u16 [%r35+6], %r30;
st.shared.u16 [%r35+8], %r4;
st.shared.u16 [%r35+10], %r31;
st.shared.u16 [%r35+12], %r5;
st.shared.u16 [%r35+14], %r32;
// Barrier 0: wait for all threads' shared-memory stores before reading back.
bar.sync 0;
// r36 = global_smem + t * 8. Read back two groups of four elements:
//   rs1..rs4 = elements 4t .. 4t+3          (byte offset t*8)
//   rs5..rs8 = elements 512+4t .. 512+4t+3  (byte offset t*8 + 1024)
// After this exchange, adjacent threads hold adjacent 4-element groups.
add.s32 %r36, %r34, %r24;
ld.shared.u16 %rs1, [%r36];
ld.shared.u16 %rs2, [%r36+2];
ld.shared.u16 %rs3, [%r36+4];
ld.shared.u16 %rs4, [%r36+6];
ld.shared.u16 %rs5, [%r36+1024];
ld.shared.u16 %rs6, [%r36+1026];
ld.shared.u16 %rs7, [%r36+1028];
ld.shared.u16 %rs8, [%r36+1030];
// Convert each 16-bit bf16 value to a 32-bit f32 value.
cvt.f32.bf16 %r14, %rs1;
cvt.f32.bf16 %r15, %rs2;
cvt.f32.bf16 %r16, %rs3;
cvt.f32.bf16 %r17, %rs4;
cvt.f32.bf16 %r18, %rs5;
cvt.f32.bf16 %r19, %rs6;
cvt.f32.bf16 %r20, %rs7;
cvt.f32.bf16 %r21, %rs8;
.loc 1 26 25
// rd2 = output base + r28 * 4 bytes (first 4-element group).
mul.wide.s32 %rd7, %r28, 4;
add.s64 %rd2, %rd5, %rd7;
// rd3 = output base + ((sext(r26) | zext(r25)) << 2) + 2048, i.e. 512 f32
// elements past the first group. Because r25 only occupies low bits that are
// clear in r26, this equals rd2 + 2048.
cvt.s64.s32 %rd8, %r26;
cvt.u64.u32 %rd9, %r25;
or.b64 %rd10, %rd8, %rd9;
shl.b64 %rd11, %rd10, 2;
add.s64 %rd12, %rd5, %rd11;
add.s64 %rd3, %rd12, 2048;
.loc 1 26 36
// Two 16-byte vector stores of four f32 values each.
@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
@%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
.loc 1 26 4
ret;
$L__tmp1:
$L__func_end0:
}
.file 1 "/tmp/torchinductor_root/k6/ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py"
// DWARF abbreviation table: two abbreviation declarations used by the DIEs
// in .debug_info, each a list of (attribute, form) pairs ending in 0, 0.
.section .debug_abbrev
{
// Abbreviation code 1: DW_TAG_compile_unit (17), has children (1).
.b8 1
.b8 17
.b8 1
// DW_AT_producer (37), DW_FORM_string (8)
.b8 37
.b8 8
// DW_AT_language (19), DW_FORM_data2 (5)
.b8 19
.b8 5
// DW_AT_name (3), DW_FORM_string (8)
.b8 3
.b8 8
// DW_AT_stmt_list (16), DW_FORM_data4 (6)
.b8 16
.b8 6
// DW_AT_comp_dir (27), DW_FORM_string (8)
.b8 27
.b8 8
// ULEB128 bytes 180, 66 = 0x2134 (DW_AT_GNU_pubnames), DW_FORM_flag (12)
.b8 180
.b8 66
.b8 12
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// End of attribute list for code 1.
.b8 0
.b8 0
// Abbreviation code 2: DW_TAG_subprogram (46), no children (0).
.b8 2
.b8 46
.b8 0
// DW_AT_low_pc (17), DW_FORM_addr (1)
.b8 17
.b8 1
// DW_AT_high_pc (18), DW_FORM_addr (1)
.b8 18
.b8 1
// DW_AT_frame_base (64), DW_FORM_block1 (10)
.b8 64
.b8 10
// ULEB128 bytes 135, 64 = 0x2007 (DW_AT_MIPS_linkage_name), DW_FORM_string (8)
.b8 135
.b8 64
.b8 8
// DW_AT_name (3), DW_FORM_string (8)
.b8 3
.b8 8
// DW_AT_decl_file (58), DW_FORM_data1 (11)
.b8 58
.b8 11
// DW_AT_decl_line (59), DW_FORM_data1 (11)
.b8 59
.b8 11
// DW_AT_external (63), DW_FORM_flag (12)
.b8 63
.b8 12
// End of attribute list for code 2.
.b8 0
.b8 0
// End of the abbreviation table.
.b8 0
}
// DWARF debugging information: one compilation unit containing one
// subprogram DIE for the kernel. Fields follow the (attribute, form) order
// given by the abbreviations in .debug_abbrev.
.section .debug_info
{
// Unit header: length of the unit after this field (176 bytes),
// DWARF version 2 (16-bit little-endian: 2, 0), offset into .debug_abbrev,
// address size 8 bytes.
.b32 176
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// DIE using abbreviation code 1 (compile unit).
.b8 1
// DW_AT_producer: "triton" (NUL-terminated).
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// DW_AT_language: 0x0002 (DW_LANG_C), little-endian.
.b8 2
.b8 0
// DW_AT_name: "ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py"
// (NUL-terminated).
.b8 99
.b8 107
.b8 54
.b8 50
.b8 107
.b8 50
.b8 120
.b8 122
.b8 98
.b8 98
.b8 54
.b8 53
.b8 55
.b8 115
.b8 110
.b8 102
.b8 100
.b8 111
.b8 119
.b8 119
.b8 97
.b8 110
.b8 122
.b8 115
.b8 122
.b8 97
.b8 105
.b8 106
.b8 54
.b8 113
.b8 122
.b8 119
.b8 54
.b8 118
.b8 117
.b8 99
.b8 55
.b8 99
.b8 102
.b8 105
.b8 100
.b8 111
.b8 109
.b8 106
.b8 112
.b8 107
.b8 107
.b8 54
.b8 105
.b8 103
.b8 99
.b8 109
.b8 46
.b8 112
.b8 121
.b8 0
// DW_AT_stmt_list: offset into .debug_line.
.b32 .debug_line
// DW_AT_comp_dir: "/tmp/torchinductor_root/k6" (NUL-terminated).
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 107
.b8 54
.b8 0
// DW_AT_GNU_pubnames flag: 1 (true).
.b8 1
// DW_AT_low_pc / DW_AT_high_pc of the compile unit.
.b64 $L__func_begin0
.b64 $L__func_end0
// DIE using abbreviation code 2 (subprogram); this is at byte offset 125
// from the start of the section, the offset recorded in .debug_pubnames.
.b8 2
// DW_AT_low_pc / DW_AT_high_pc of the subprogram.
.b64 $L__func_begin0
.b64 $L__func_end0
// DW_AT_frame_base: 1-byte block containing 156 (0x9c, DW_OP_call_frame_cfa).
.b8 1
.b8 156
// DW_AT_MIPS_linkage_name: "triton__0d1d2de" (NUL-terminated).
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 101
.b8 0
// DW_AT_name: "triton__0d1d2de" (NUL-terminated).
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 101
.b8 0
// DW_AT_decl_file: 1, DW_AT_decl_line: 18, DW_AT_external: 1 (true).
.b8 1
.b8 18
.b8 1
// End of the compile unit's children.
.b8 0
}
// DWARF public-names lookup table: maps the kernel name to the offset of its
// DIE inside .debug_info.
.section .debug_pubnames
{
// Length of the table contents that follow this field.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// Version 2 (16-bit little-endian: 2, 0).
.b8 2
.b8 0
// Offset of the described unit in .debug_info, and that unit's total size
// (180 bytes = the 176-byte unit length plus its 4-byte length field).
.b32 .debug_info
.b32 180
// Entry: DIE offset 125 within the unit, then the name "triton__0d1d2de"
// (NUL-terminated).
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 101
.b8 0
// Zero offset terminates the entry list.
.b32 0
$L__pubNames_end0:
}
// DWARF public-types lookup table: header only, with no type entries.
.section .debug_pubtypes
{
// Length of the table contents that follow this field.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// Version 2 (16-bit little-endian: 2, 0).
.b8 2
.b8 0
// Offset of the described unit in .debug_info and that unit's total size.
.b32 .debug_info
.b32 180
// Zero offset terminates the (empty) entry list.
.b32 0
$L__pubTypes_end0:
}
// DWARF location-list section; empty in this module.
.section .debug_loc { }