Add files using upload-large-folder tool
Browse files
.triton/dump/11759acf26ac56366b171628132485d6/triton_.ptx
ADDED
@@ -0,0 +1,788 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
// Module-level directives: PTX ISA version, target architecture and pointer width.
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7d8de9de
|
10 |
+
// Externally-sized shared-memory byte array; the kernel below uses its first
// two 32-bit slots as scratch for the cross-warp reductions.
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
// 11-byte global constant; the byte values are the ASCII string "__CUDA_FTZ"
// followed by a NUL terminator. It is not referenced by the code visible here.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
12 |
+
|
13 |
+
// Kernel entry point (at most 64 threads per CTA, see .maxntid below).
//
// What the visible code does, per CTA:
//   * works on 256 consecutive elements starting at element index ctaid.x * 256;
//     each thread owns 4 consecutive elements (index (ctaid.x << 8) | (tid * 4)).
//   * x    = param_1 (f32) + param_2 + param_3 + param_4 (bf16 widened to f32)
//   * mean = sum(x) / 256
//   * var  = sum((x - mean)^2) / 256
//   * rstd = rsqrt(var + 0f3727C5AC)            (constant is about 1e-5)
//   * out  = (x - mean) * rstd * w, where w is read from param_5 at a
//     thread-only (CTA-independent) offset.
// Stores: rstd -> param_0[ctaid.x] and mean -> param_6[ctaid.x] (thread 0 only),
//         out, rounded to bf16 -> param_7 at the same element index as the inputs.
// param_8 and param_9 (u32) are declared but never loaded in this body.
// NOTE(review): this looks like a fused residual-add + layer-norm with a weight
// and no bias; confirm against the Inductor Python source named in `.file 1`.
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
|
21 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
|
22 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
|
23 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
|
24 |
+
)
|
25 |
+
.maxntid 64, 1, 1
|
26 |
+
{
|
27 |
+
// Virtual register declarations (predicates, 16/32-bit, f32 and 64-bit).
.reg .pred %p<31>;
|
28 |
+
.reg .b16 %rs<17>;
|
29 |
+
.reg .b32 %r<103>;
|
30 |
+
.reg .f32 %f<86>;
|
31 |
+
.reg .b64 %rd<21>;
|
32 |
+
.loc 1 18 0
|
33 |
+
$L__func_begin0:
|
34 |
+
.loc 1 18 0
|
35 |
+
|
36 |
+
// Load the eight pointer arguments (interleaved with index math below).
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
|
37 |
+
ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 26 26
|
40 |
+
mov.u32 %r68, %tid.x;
|
41 |
+
// %r69 = lane id within the warp (tid & 31).
and.b32 %r69, %r68, 31;
|
42 |
+
ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
|
43 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
|
44 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
|
45 |
+
// %r70 = tid & 63; %r71 = %r70 * 4 = first of this thread's 4 column indices.
and.b32 %r70, %r68, 63;
|
46 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
|
47 |
+
shl.b32 %r71, %r70, 2;
|
48 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
|
49 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
|
50 |
+
.loc 1 23 28
|
51 |
+
mov.u32 %r1, %ctaid.x;
|
52 |
+
.loc 1 30 40
|
53 |
+
// %r72 = ctaid.x * 256; %r73 = flat element index (%r72 | %r71).
shl.b32 %r72, %r1, 8;
|
54 |
+
.loc 1 30 36
|
55 |
+
or.b32 %r73, %r72, %r71;
|
56 |
+
.loc 1 30 30
|
57 |
+
// %rd1 = param_1 + index * 4 bytes (32-bit elements).
mul.wide.s32 %rd17, %r73, 4;
|
58 |
+
add.s64 %rd1, %rd10, %rd17;
|
59 |
+
// %r6 = 0 is the fallback value for masked-off loads; %p1 is constant true,
// so every "@%p1" load/store below always executes and the "@!%p1" moves never do.
mov.b32 %r6, 0;
|
60 |
+
mov.pred %p1, -1;
|
61 |
+
.loc 1 30 46
|
62 |
+
mov.u32 %r2, 0x0;
|
63 |
+
mov.u32 %r3, 0x0;
|
64 |
+
mov.u32 %r4, 0x0;
|
65 |
+
mov.u32 %r5, 0x0;
|
66 |
+
// 128-bit vector load of four 32-bit values from param_1.
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
67 |
+
@!%p1 mov.u32 %r2, %r6;
|
68 |
+
@!%p1 mov.u32 %r3, %r6;
|
69 |
+
@!%p1 mov.u32 %r4, %r6;
|
70 |
+
@!%p1 mov.u32 %r5, %r6;
|
71 |
+
// Reinterpret elements 2 and 3 as f32 (elements 0 and 1 follow further down).
mov.b32 %f1, %r4;
|
72 |
+
mov.b32 %f2, %r5;
|
73 |
+
.loc 1 31 30
|
74 |
+
// %rd18 = index * 2 bytes: shared byte offset for the 16-bit (bf16) buffers
// param_2, param_3, param_4 and the output param_7.
mul.wide.s32 %rd18, %r73, 2;
|
75 |
+
add.s64 %rd2, %rd11, %rd18;
|
76 |
+
.loc 1 31 46
|
77 |
+
mov.u32 %r10, 0x0;
|
78 |
+
mov.u32 %r11, 0x0;
|
79 |
+
// 64-bit vector load from param_2: four packed 16-bit values.
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
|
80 |
+
@!%p1 mov.u32 %r10, %r6;
|
81 |
+
@!%p1 mov.u32 %r11, %r6;
|
82 |
+
// Unpack: cvt.u16.u32 takes the low half, the {tmp, %rsN} move takes the high half.
cvt.u16.u32 %rs1, %r10;
|
83 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
|
84 |
+
cvt.u16.u32 %rs3, %r11;
|
85 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
|
86 |
+
.loc 1 31 67
|
87 |
+
// Widen the four bf16 values from param_2 to f32 (%f3..%f6).
cvt.f32.bf16 %r14, %rs1;
|
88 |
+
mov.b32 %f3, %r14;
|
89 |
+
cvt.f32.bf16 %r15, %rs2;
|
90 |
+
mov.b32 %f4, %r15;
|
91 |
+
cvt.f32.bf16 %r16, %rs3;
|
92 |
+
mov.b32 %f5, %r16;
|
93 |
+
cvt.f32.bf16 %r17, %rs4;
|
94 |
+
mov.b32 %f6, %r17;
|
95 |
+
.loc 1 32 30
|
96 |
+
// Same load / unpack / widen sequence for param_3 (-> %f7..%f10).
add.s64 %rd3, %rd12, %rd18;
|
97 |
+
.loc 1 32 46
|
98 |
+
mov.u32 %r18, 0x0;
|
99 |
+
mov.u32 %r19, 0x0;
|
100 |
+
@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
|
101 |
+
@!%p1 mov.u32 %r18, %r6;
|
102 |
+
@!%p1 mov.u32 %r19, %r6;
|
103 |
+
cvt.u16.u32 %rs5, %r18;
|
104 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
|
105 |
+
cvt.u16.u32 %rs7, %r19;
|
106 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
|
107 |
+
.loc 1 32 67
|
108 |
+
cvt.f32.bf16 %r22, %rs5;
|
109 |
+
mov.b32 %f7, %r22;
|
110 |
+
cvt.f32.bf16 %r23, %rs6;
|
111 |
+
mov.b32 %f8, %r23;
|
112 |
+
cvt.f32.bf16 %r24, %rs7;
|
113 |
+
mov.b32 %f9, %r24;
|
114 |
+
cvt.f32.bf16 %r25, %rs8;
|
115 |
+
mov.b32 %f10, %r25;
|
116 |
+
.loc 1 33 30
|
117 |
+
// Same load / unpack / widen sequence for param_4 (-> %f11..%f14).
add.s64 %rd4, %rd13, %rd18;
|
118 |
+
.loc 1 33 46
|
119 |
+
mov.u32 %r26, 0x0;
|
120 |
+
mov.u32 %r27, 0x0;
|
121 |
+
@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
|
122 |
+
@!%p1 mov.u32 %r26, %r6;
|
123 |
+
@!%p1 mov.u32 %r27, %r6;
|
124 |
+
cvt.u16.u32 %rs9, %r26;
|
125 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
|
126 |
+
cvt.u16.u32 %rs11, %r27;
|
127 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
|
128 |
+
.loc 1 33 67
|
129 |
+
cvt.f32.bf16 %r30, %rs9;
|
130 |
+
mov.b32 %f11, %r30;
|
131 |
+
cvt.f32.bf16 %r31, %rs10;
|
132 |
+
mov.b32 %f12, %r31;
|
133 |
+
cvt.f32.bf16 %r32, %rs11;
|
134 |
+
mov.b32 %f13, %r32;
|
135 |
+
cvt.f32.bf16 %r33, %rs12;
|
136 |
+
mov.b32 %f14, %r33;
|
137 |
+
.loc 1 34 31
|
138 |
+
// param_5 is indexed by the thread-only column offset %r71 (no ctaid term),
// so every CTA reads the same 256 32-bit values from it.
mul.wide.u32 %rd19, %r71, 4;
|
139 |
+
add.s64 %rd5, %rd14, %rd19;
|
140 |
+
.loc 1 34 36
|
141 |
+
mov.u32 %r34, 0x0;
|
142 |
+
mov.u32 %r35, 0x0;
|
143 |
+
mov.u32 %r36, 0x0;
|
144 |
+
mov.u32 %r37, 0x0;
|
145 |
+
// L1 evict_last hint on this load (the data is re-read by every CTA).
@%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ];
|
146 |
+
@!%p1 mov.u32 %r34, %r6;
|
147 |
+
@!%p1 mov.u32 %r35, %r6;
|
148 |
+
@!%p1 mov.u32 %r36, %r6;
|
149 |
+
@!%p1 mov.u32 %r37, %r6;
|
150 |
+
.loc 1 36 18
|
151 |
+
// Element-wise sum of the four inputs. Elements 2 and 3 first:
// param_2 + param_1, then + param_3.
add.f32 %f15, %f5, %f1;
|
152 |
+
add.f32 %f16, %f6, %f2;
|
153 |
+
.loc 1 38 18
|
154 |
+
add.f32 %f17, %f15, %f9;
|
155 |
+
add.f32 %f18, %f16, %f10;
|
156 |
+
.loc 1 30 46
|
157 |
+
// Elements 0 and 1 of the param_1 vector, reinterpreted as f32.
mov.b32 %f19, %r2;
|
158 |
+
mov.b32 %f20, %r3;
|
159 |
+
.loc 1 36 18
|
160 |
+
add.f32 %f21, %f4, %f20;
|
161 |
+
add.f32 %f22, %f3, %f19;
|
162 |
+
.loc 1 38 18
|
163 |
+
add.f32 %f23, %f22, %f7;
|
164 |
+
add.f32 %f24, %f21, %f8;
|
165 |
+
.loc 1 40 18
|
166 |
+
// Add param_4: x0 = %f26, x1 = %f25, x2 = %f27, x3 = %f28.
add.f32 %f25, %f24, %f12;
|
167 |
+
add.f32 %f26, %f23, %f11;
|
168 |
+
add.f32 %f27, %f17, %f13;
|
169 |
+
add.f32 %f28, %f18, %f14;
|
170 |
+
$L__tmp1:
|
171 |
+
.loc 2 233 15
|
172 |
+
// ---- Reduction 1: sum of x over the 256 elements of this CTA ----
// Per-thread partial sum of its four elements.
add.f32 %f29, %f26, %f25;
|
173 |
+
add.f32 %f30, %f29, %f27;
|
174 |
+
add.f32 %f31, %f30, %f28;
|
175 |
+
$L__tmp2:
|
176 |
+
.loc 2 243 36
|
177 |
+
// Intra-warp butterfly (XOR) shuffle reduction with offsets 16, 8, 4, 2, 1;
// afterwards every lane holds its warp's total in %f41.
mov.b32 %r74, %f31;
|
178 |
+
shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1;
|
179 |
+
mov.b32 %f32, %r75;
|
180 |
+
$L__tmp3:
|
181 |
+
.loc 2 233 15
|
182 |
+
add.f32 %f33, %f31, %f32;
|
183 |
+
$L__tmp4:
|
184 |
+
.loc 2 243 36
|
185 |
+
mov.b32 %r76, %f33;
|
186 |
+
shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1;
|
187 |
+
mov.b32 %f34, %r77;
|
188 |
+
$L__tmp5:
|
189 |
+
.loc 2 233 15
|
190 |
+
add.f32 %f35, %f33, %f34;
|
191 |
+
$L__tmp6:
|
192 |
+
.loc 2 243 36
|
193 |
+
mov.b32 %r78, %f35;
|
194 |
+
shfl.sync.bfly.b32 %r79, %r78, 4, 31, -1;
|
195 |
+
mov.b32 %f36, %r79;
|
196 |
+
$L__tmp7:
|
197 |
+
.loc 2 233 15
|
198 |
+
add.f32 %f37, %f35, %f36;
|
199 |
+
$L__tmp8:
|
200 |
+
.loc 2 243 36
|
201 |
+
mov.b32 %r80, %f37;
|
202 |
+
shfl.sync.bfly.b32 %r81, %r80, 2, 31, -1;
|
203 |
+
mov.b32 %f38, %r81;
|
204 |
+
$L__tmp9:
|
205 |
+
.loc 2 233 15
|
206 |
+
add.f32 %f39, %f37, %f38;
|
207 |
+
$L__tmp10:
|
208 |
+
.loc 2 243 36
|
209 |
+
mov.b32 %r82, %f39;
|
210 |
+
shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
|
211 |
+
mov.b32 %f40, %r83;
|
212 |
+
$L__tmp11:
|
213 |
+
.loc 2 233 15
|
214 |
+
add.f32 %f41, %f39, %f40;
|
215 |
+
$L__tmp12:
|
216 |
+
.loc 2 243 36
|
217 |
+
// %p20: lane 0 of each warp. %r85 = (tid >> 3) & 4, i.e. byte offset 0 for
// tid 0..31 and 4 for tid 32..63, so each warp gets its own 32-bit slot.
setp.eq.s32 %p20, %r69, 0;
|
218 |
+
shr.u32 %r84, %r68, 3;
|
219 |
+
and.b32 %r85, %r84, 4;
|
220 |
+
mov.u32 %r86, global_smem;
|
221 |
+
add.s32 %r42, %r86, %r85;
|
222 |
+
mov.b32 %r43, %f41;
|
223 |
+
@%p20 st.shared.b32 [ %r42 + 0 ], %r43;
|
224 |
+
bar.sync 0;
|
225 |
+
// %p21: threads 0 and 1 each read back one warp total (slot tid * 4).
setp.lt.s32 %p21, %r68, 2;
|
226 |
+
shl.b32 %r87, %r68, 2;
|
227 |
+
add.s32 %r45, %r86, %r87;
|
228 |
+
// Only threads with %p21 set write %r44; the other threads still execute the
// shuffle and add below, but their result is never stored (store is under %p22).
@%p21 ld.shared.b32 %r44, [ %r45 + 0 ];
|
229 |
+
mov.b32 %f42, %r44;
|
230 |
+
shfl.sync.bfly.b32 %r88, %r44, 1, 31, -1;
|
231 |
+
mov.b32 %f43, %r88;
|
232 |
+
$L__tmp13:
|
233 |
+
.loc 2 233 15
|
234 |
+
add.f32 %f44, %f42, %f43;
|
235 |
+
$L__tmp14:
|
236 |
+
.loc 2 243 36
|
237 |
+
// %p22 = (tid < 2) && (tid is even): only thread 0 writes the CTA-wide total
// back to the first shared slot.
and.b32 %r89, %r68, 1;
|
238 |
+
setp.eq.b32 %p29, %r89, 1;
|
239 |
+
not.pred %p30, %p29;
|
240 |
+
and.pred %p22, %p21, %p30;
|
241 |
+
mov.b32 %r47, %f44;
|
242 |
+
@%p22 st.shared.b32 [ %r45 + 0 ], %r47;
|
243 |
+
bar.sync 0;
|
244 |
+
// Every thread reads the CTA-wide sum from the first shared slot.
ld.shared.f32 %f45, [global_smem];
|
245 |
+
$L__tmp15:
|
246 |
+
.loc 3 8 15
|
247 |
+
add.f32 %f46, %f45, 0f00000000;
|
248 |
+
$L__tmp16:
|
249 |
+
.loc 1 48 20
|
250 |
+
mov.b32 %r49, %f46;
|
251 |
+
// 1132462080 == 0x43800000, the f32 bit pattern of 256.0.
mov.b32 %r50, 1132462080;
|
252 |
+
// mean = sum / 256 (kept in %r67 for the final store to param_6).
div.full.f32 %r67, %r49, %r50;
|
253 |
+
mov.b32 %f47, %r67;
|
254 |
+
.loc 1 49 20
|
255 |
+
// Centered values d_i = x_i - mean (%f48 = d0, %f49 = d1, %f50 = d2, %f51 = d3).
sub.f32 %f48, %f26, %f47;
|
256 |
+
sub.f32 %f49, %f25, %f47;
|
257 |
+
sub.f32 %f50, %f27, %f47;
|
258 |
+
sub.f32 %f51, %f28, %f47;
|
259 |
+
.loc 1 50 20
|
260 |
+
mul.f32 %f52, %f49, %f49;
|
261 |
+
$L__tmp17:
|
262 |
+
.loc 2 243 36
|
263 |
+
// Barrier before the shared scratch slots are reused by the second reduction
// (presumably so all threads have finished reading the first sum).
bar.sync 0;
|
264 |
+
$L__tmp18:
|
265 |
+
.loc 2 233 15
|
266 |
+
// ---- Reduction 2: sum of squared deviations ----
// Per-thread partial: d1^2 + d0^2 + d2^2 + d3^2 via fused multiply-add.
fma.rn.f32 %f53, %f48, %f48, %f52;
|
267 |
+
fma.rn.f32 %f54, %f50, %f50, %f53;
|
268 |
+
fma.rn.f32 %f55, %f51, %f51, %f54;
|
269 |
+
$L__tmp19:
|
270 |
+
.loc 2 243 36
|
271 |
+
// Same intra-warp butterfly reduction as above (offsets 16, 8, 4, 2, 1).
mov.b32 %r90, %f55;
|
272 |
+
shfl.sync.bfly.b32 %r91, %r90, 16, 31, -1;
|
273 |
+
mov.b32 %f56, %r91;
|
274 |
+
$L__tmp20:
|
275 |
+
.loc 2 233 15
|
276 |
+
add.f32 %f57, %f55, %f56;
|
277 |
+
$L__tmp21:
|
278 |
+
.loc 2 243 36
|
279 |
+
mov.b32 %r92, %f57;
|
280 |
+
shfl.sync.bfly.b32 %r93, %r92, 8, 31, -1;
|
281 |
+
mov.b32 %f58, %r93;
|
282 |
+
$L__tmp22:
|
283 |
+
.loc 2 233 15
|
284 |
+
add.f32 %f59, %f57, %f58;
|
285 |
+
$L__tmp23:
|
286 |
+
.loc 2 243 36
|
287 |
+
mov.b32 %r94, %f59;
|
288 |
+
shfl.sync.bfly.b32 %r95, %r94, 4, 31, -1;
|
289 |
+
mov.b32 %f60, %r95;
|
290 |
+
$L__tmp24:
|
291 |
+
.loc 2 233 15
|
292 |
+
add.f32 %f61, %f59, %f60;
|
293 |
+
$L__tmp25:
|
294 |
+
.loc 2 243 36
|
295 |
+
mov.b32 %r96, %f61;
|
296 |
+
shfl.sync.bfly.b32 %r97, %r96, 2, 31, -1;
|
297 |
+
mov.b32 %f62, %r97;
|
298 |
+
$L__tmp26:
|
299 |
+
.loc 2 233 15
|
300 |
+
add.f32 %f63, %f61, %f62;
|
301 |
+
$L__tmp27:
|
302 |
+
.loc 2 243 36
|
303 |
+
mov.b32 %r98, %f63;
|
304 |
+
shfl.sync.bfly.b32 %r99, %r98, 1, 31, -1;
|
305 |
+
mov.b32 %f64, %r99;
|
306 |
+
$L__tmp28:
|
307 |
+
.loc 2 233 15
|
308 |
+
add.f32 %f65, %f63, %f64;
|
309 |
+
$L__tmp29:
|
310 |
+
.loc 2 243 36
|
311 |
+
// Cross-warp combine through shared memory, reusing the addresses (%r42, %r45)
// and predicates (%p20, %p21, %p22) computed for the first reduction.
mov.b32 %r52, %f65;
|
312 |
+
@%p20 st.shared.b32 [ %r42 + 0 ], %r52;
|
313 |
+
bar.sync 0;
|
314 |
+
@%p21 ld.shared.b32 %r53, [ %r45 + 0 ];
|
315 |
+
mov.b32 %f66, %r53;
|
316 |
+
shfl.sync.bfly.b32 %r100, %r53, 1, 31, -1;
|
317 |
+
mov.b32 %f67, %r100;
|
318 |
+
$L__tmp30:
|
319 |
+
.loc 2 233 15
|
320 |
+
add.f32 %f68, %f66, %f67;
|
321 |
+
$L__tmp31:
|
322 |
+
.loc 2 243 36
|
323 |
+
mov.b32 %r56, %f68;
|
324 |
+
@%p22 st.shared.b32 [ %r45 + 0 ], %r56;
|
325 |
+
bar.sync 0;
|
326 |
+
ld.shared.f32 %f69, [global_smem];
|
327 |
+
$L__tmp32:
|
328 |
+
.loc 3 8 15
|
329 |
+
add.f32 %f70, %f69, 0f00000000;
|
330 |
+
$L__tmp33:
|
331 |
+
.loc 1 55 20
|
332 |
+
// var = sum_sq / 256 (%r50 still holds the bit pattern of 256.0).
mov.b32 %r58, %f70;
|
333 |
+
div.full.f32 %r57, %r58, %r50;
|
334 |
+
mov.b32 %f71, %r57;
|
335 |
+
.loc 1 57 20
|
336 |
+
// Add epsilon: 0f3727C5AC is approximately 1e-5.
add.f32 %f72, %f71, 0f3727C5AC;
|
337 |
+
.loc 1 58 26
|
338 |
+
// rstd = approximate reciprocal square root (flush-to-zero variant).
rsqrt.approx.ftz.f32 %f73, %f72;
|
339 |
+
.loc 1 34 36
|
340 |
+
// Reinterpret the four param_5 values as f32 (%f77 = w0 ... %f74 = w3).
mov.b32 %f74, %r37;
|
341 |
+
mov.b32 %f75, %r36;
|
342 |
+
mov.b32 %f76, %r35;
|
343 |
+
mov.b32 %f77, %r34;
|
344 |
+
.loc 1 60 20
|
345 |
+
// Normalize: d_i * rstd.
mul.f32 %f78, %f48, %f73;
|
346 |
+
mul.f32 %f79, %f49, %f73;
|
347 |
+
mul.f32 %f80, %f50, %f73;
|
348 |
+
mul.f32 %f81, %f51, %f73;
|
349 |
+
.loc 1 61 20
|
350 |
+
// Scale by the matching param_5 value.
mul.f32 %f82, %f78, %f77;
|
351 |
+
mul.f32 %f83, %f79, %f76;
|
352 |
+
mul.f32 %f84, %f80, %f75;
|
353 |
+
mul.f32 %f85, %f81, %f74;
|
354 |
+
.loc 1 63 4
|
355 |
+
bar.sync 0;
|
356 |
+
.loc 1 64 28
|
357 |
+
// %rd20 = ctaid.x * 4 bytes: one 32-bit slot per CTA in param_0 and param_6.
mul.wide.s32 %rd20, %r1, 4;
|
358 |
+
add.s64 %rd6, %rd9, %rd20;
|
359 |
+
.loc 1 64 40
|
360 |
+
// %p26: thread 0 only. Store rstd to param_0[ctaid.x].
setp.eq.s32 %p26, %r70, 0;
|
361 |
+
mov.b32 %r60, %f73;
|
362 |
+
@%p26 st.global.b32 [ %rd6 + 0 ], { %r60 };
|
363 |
+
.loc 1 65 25
|
364 |
+
// Output address in param_7 uses the same 2-byte element offset as the bf16 inputs.
add.s64 %rd7, %rd16, %rd18;
|
365 |
+
.loc 1 65 48
|
366 |
+
// Round the four results to bf16 (round-to-nearest-even) and pack them
// pairwise into two 32-bit registers for one 64-bit vector store.
mov.b32 %r61, %f82;
|
367 |
+
cvt.rn.bf16.f32 %rs13, %r61;
|
368 |
+
mov.b32 %r62, %f83;
|
369 |
+
cvt.rn.bf16.f32 %rs14, %r62;
|
370 |
+
mov.b32 %r63, %f84;
|
371 |
+
cvt.rn.bf16.f32 %rs15, %r63;
|
372 |
+
mov.b32 %r64, %f85;
|
373 |
+
cvt.rn.bf16.f32 %rs16, %r64;
|
374 |
+
mov.b32 %r101, {%rs13, %rs14};
|
375 |
+
mov.b32 %r102, {%rs15, %rs16};
|
376 |
+
@%p1 st.global.v2.b32 [ %rd7 + 0 ], { %r101, %r102 };
|
377 |
+
.loc 1 66 25
|
378 |
+
add.s64 %rd8, %rd15, %rd20;
|
379 |
+
.loc 1 66 37
|
380 |
+
// Thread 0 stores the mean (%r67) to param_6[ctaid.x].
@%p26 st.global.b32 [ %rd8 + 0 ], { %r67 };
|
381 |
+
.loc 1 66 4
|
382 |
+
ret;
|
383 |
+
$L__tmp34:
|
384 |
+
$L__func_end0:
|
385 |
+
|
386 |
+
}
|
387 |
+
// .globl __nv_rsqrtf
|
388 |
+
// Device function: f32 reciprocal square root.
//   In:  __nv_rsqrtf_param_0 (.b32, read as f32)
//   Out: func_retval0 = rsqrt.approx.ftz.f32 of the input (approximate, flush-to-zero)
// The kernel earlier in this file issues the same instruction inline; no call to
// this function appears in the code visible here.
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
389 |
+
.param .b32 __nv_rsqrtf_param_0
|
390 |
+
)
|
391 |
+
{
|
392 |
+
.reg .f32 %f<3>;
|
393 |
+
$L__func_begin1:
|
394 |
+
|
395 |
+
// Load the argument, compute the approximate rsqrt, write the return value.
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
396 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
397 |
+
st.param.f32 [func_retval0+0], %f2;
|
398 |
+
ret;
|
399 |
+
$L__func_end1:
|
400 |
+
|
401 |
+
}
|
402 |
+
.file 1 "/tmp/torchinductor_root/dx/cdxa5yqgsimvskocpuiz4ajfrjfcwys3opyrdv53xfphj4576qx7.py"
|
403 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
404 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
405 |
+
.section .debug_abbrev
|
406 |
+
{
|
407 |
+
.b8 1
|
408 |
+
.b8 17
|
409 |
+
.b8 1
|
410 |
+
.b8 37
|
411 |
+
.b8 8
|
412 |
+
.b8 19
|
413 |
+
.b8 5
|
414 |
+
.b8 3
|
415 |
+
.b8 8
|
416 |
+
.b8 16
|
417 |
+
.b8 6
|
418 |
+
.b8 27
|
419 |
+
.b8 8
|
420 |
+
.b8 180
|
421 |
+
.b8 66
|
422 |
+
.b8 12
|
423 |
+
.b8 17
|
424 |
+
.b8 1
|
425 |
+
.b8 18
|
426 |
+
.b8 1
|
427 |
+
.b8 0
|
428 |
+
.b8 0
|
429 |
+
.b8 2
|
430 |
+
.b8 46
|
431 |
+
.b8 0
|
432 |
+
.b8 135
|
433 |
+
.b8 64
|
434 |
+
.b8 8
|
435 |
+
.b8 3
|
436 |
+
.b8 8
|
437 |
+
.b8 58
|
438 |
+
.b8 11
|
439 |
+
.b8 59
|
440 |
+
.b8 11
|
441 |
+
.b8 63
|
442 |
+
.b8 12
|
443 |
+
.b8 32
|
444 |
+
.b8 11
|
445 |
+
.b8 0
|
446 |
+
.b8 0
|
447 |
+
.b8 3
|
448 |
+
.b8 46
|
449 |
+
.b8 1
|
450 |
+
.b8 17
|
451 |
+
.b8 1
|
452 |
+
.b8 18
|
453 |
+
.b8 1
|
454 |
+
.b8 64
|
455 |
+
.b8 10
|
456 |
+
.b8 49
|
457 |
+
.b8 19
|
458 |
+
.b8 0
|
459 |
+
.b8 0
|
460 |
+
.b8 4
|
461 |
+
.b8 29
|
462 |
+
.b8 1
|
463 |
+
.b8 49
|
464 |
+
.b8 19
|
465 |
+
.b8 17
|
466 |
+
.b8 1
|
467 |
+
.b8 18
|
468 |
+
.b8 1
|
469 |
+
.b8 88
|
470 |
+
.b8 11
|
471 |
+
.b8 89
|
472 |
+
.b8 11
|
473 |
+
.b8 87
|
474 |
+
.b8 11
|
475 |
+
.b8 0
|
476 |
+
.b8 0
|
477 |
+
.b8 5
|
478 |
+
.b8 29
|
479 |
+
.b8 0
|
480 |
+
.b8 49
|
481 |
+
.b8 19
|
482 |
+
.b8 17
|
483 |
+
.b8 1
|
484 |
+
.b8 18
|
485 |
+
.b8 1
|
486 |
+
.b8 88
|
487 |
+
.b8 11
|
488 |
+
.b8 89
|
489 |
+
.b8 11
|
490 |
+
.b8 87
|
491 |
+
.b8 11
|
492 |
+
.b8 0
|
493 |
+
.b8 0
|
494 |
+
.b8 0
|
495 |
+
}
|
496 |
+
.section .debug_info
|
497 |
+
{
|
498 |
+
.b32 407
|
499 |
+
.b8 2
|
500 |
+
.b8 0
|
501 |
+
.b32 .debug_abbrev
|
502 |
+
.b8 8
|
503 |
+
.b8 1
|
504 |
+
.b8 116
|
505 |
+
.b8 114
|
506 |
+
.b8 105
|
507 |
+
.b8 116
|
508 |
+
.b8 111
|
509 |
+
.b8 110
|
510 |
+
.b8 0
|
511 |
+
.b8 2
|
512 |
+
.b8 0
|
513 |
+
.b8 99
|
514 |
+
.b8 100
|
515 |
+
.b8 120
|
516 |
+
.b8 97
|
517 |
+
.b8 53
|
518 |
+
.b8 121
|
519 |
+
.b8 113
|
520 |
+
.b8 103
|
521 |
+
.b8 115
|
522 |
+
.b8 105
|
523 |
+
.b8 109
|
524 |
+
.b8 118
|
525 |
+
.b8 115
|
526 |
+
.b8 107
|
527 |
+
.b8 111
|
528 |
+
.b8 99
|
529 |
+
.b8 112
|
530 |
+
.b8 117
|
531 |
+
.b8 105
|
532 |
+
.b8 122
|
533 |
+
.b8 52
|
534 |
+
.b8 97
|
535 |
+
.b8 106
|
536 |
+
.b8 102
|
537 |
+
.b8 114
|
538 |
+
.b8 106
|
539 |
+
.b8 102
|
540 |
+
.b8 99
|
541 |
+
.b8 119
|
542 |
+
.b8 121
|
543 |
+
.b8 115
|
544 |
+
.b8 51
|
545 |
+
.b8 111
|
546 |
+
.b8 112
|
547 |
+
.b8 121
|
548 |
+
.b8 114
|
549 |
+
.b8 100
|
550 |
+
.b8 118
|
551 |
+
.b8 53
|
552 |
+
.b8 51
|
553 |
+
.b8 120
|
554 |
+
.b8 102
|
555 |
+
.b8 112
|
556 |
+
.b8 104
|
557 |
+
.b8 106
|
558 |
+
.b8 52
|
559 |
+
.b8 53
|
560 |
+
.b8 55
|
561 |
+
.b8 54
|
562 |
+
.b8 113
|
563 |
+
.b8 120
|
564 |
+
.b8 55
|
565 |
+
.b8 46
|
566 |
+
.b8 112
|
567 |
+
.b8 121
|
568 |
+
.b8 0
|
569 |
+
.b32 .debug_line
|
570 |
+
.b8 47
|
571 |
+
.b8 116
|
572 |
+
.b8 109
|
573 |
+
.b8 112
|
574 |
+
.b8 47
|
575 |
+
.b8 116
|
576 |
+
.b8 111
|
577 |
+
.b8 114
|
578 |
+
.b8 99
|
579 |
+
.b8 104
|
580 |
+
.b8 105
|
581 |
+
.b8 110
|
582 |
+
.b8 100
|
583 |
+
.b8 117
|
584 |
+
.b8 99
|
585 |
+
.b8 116
|
586 |
+
.b8 111
|
587 |
+
.b8 114
|
588 |
+
.b8 95
|
589 |
+
.b8 114
|
590 |
+
.b8 111
|
591 |
+
.b8 111
|
592 |
+
.b8 116
|
593 |
+
.b8 47
|
594 |
+
.b8 100
|
595 |
+
.b8 120
|
596 |
+
.b8 0
|
597 |
+
.b8 1
|
598 |
+
.b64 $L__func_begin0
|
599 |
+
.b64 $L__func_end0
|
600 |
+
.b8 2
|
601 |
+
.b8 116
|
602 |
+
.b8 114
|
603 |
+
.b8 105
|
604 |
+
.b8 116
|
605 |
+
.b8 111
|
606 |
+
.b8 110
|
607 |
+
.b8 95
|
608 |
+
.b8 95
|
609 |
+
.b8 48
|
610 |
+
.b8 100
|
611 |
+
.b8 49
|
612 |
+
.b8 100
|
613 |
+
.b8 50
|
614 |
+
.b8 100
|
615 |
+
.b8 51
|
616 |
+
.b8 100
|
617 |
+
.b8 52
|
618 |
+
.b8 100
|
619 |
+
.b8 53
|
620 |
+
.b8 100
|
621 |
+
.b8 54
|
622 |
+
.b8 100
|
623 |
+
.b8 55
|
624 |
+
.b8 100
|
625 |
+
.b8 56
|
626 |
+
.b8 100
|
627 |
+
.b8 101
|
628 |
+
.b8 57
|
629 |
+
.b8 100
|
630 |
+
.b8 101
|
631 |
+
.b8 0
|
632 |
+
.b8 116
|
633 |
+
.b8 114
|
634 |
+
.b8 105
|
635 |
+
.b8 116
|
636 |
+
.b8 111
|
637 |
+
.b8 110
|
638 |
+
.b8 95
|
639 |
+
.b8 95
|
640 |
+
.b8 48
|
641 |
+
.b8 100
|
642 |
+
.b8 49
|
643 |
+
.b8 100
|
644 |
+
.b8 50
|
645 |
+
.b8 100
|
646 |
+
.b8 51
|
647 |
+
.b8 100
|
648 |
+
.b8 52
|
649 |
+
.b8 100
|
650 |
+
.b8 53
|
651 |
+
.b8 100
|
652 |
+
.b8 54
|
653 |
+
.b8 100
|
654 |
+
.b8 55
|
655 |
+
.b8 100
|
656 |
+
.b8 56
|
657 |
+
.b8 100
|
658 |
+
.b8 101
|
659 |
+
.b8 57
|
660 |
+
.b8 100
|
661 |
+
.b8 101
|
662 |
+
.b8 0
|
663 |
+
.b8 1
|
664 |
+
.b8 18
|
665 |
+
.b8 1
|
666 |
+
.b8 1
|
667 |
+
.b8 3
|
668 |
+
.b64 $L__func_begin0
|
669 |
+
.b64 $L__func_end0
|
670 |
+
.b8 1
|
671 |
+
.b8 156
|
672 |
+
.b32 125
|
673 |
+
.b8 4
|
674 |
+
.b32 125
|
675 |
+
.b64 $L__tmp1
|
676 |
+
.b64 $L__tmp14
|
677 |
+
.b8 2
|
678 |
+
.b8 45
|
679 |
+
.b8 59
|
680 |
+
.b8 5
|
681 |
+
.b32 125
|
682 |
+
.b64 $L__tmp1
|
683 |
+
.b64 $L__tmp14
|
684 |
+
.b8 2
|
685 |
+
.b8 243
|
686 |
+
.b8 36
|
687 |
+
.b8 0
|
688 |
+
.b8 5
|
689 |
+
.b32 125
|
690 |
+
.b64 $L__tmp2
|
691 |
+
.b64 $L__tmp15
|
692 |
+
.b8 2
|
693 |
+
.b8 45
|
694 |
+
.b8 59
|
695 |
+
.b8 5
|
696 |
+
.b32 125
|
697 |
+
.b64 $L__tmp15
|
698 |
+
.b64 $L__tmp16
|
699 |
+
.b8 3
|
700 |
+
.b8 45
|
701 |
+
.b8 45
|
702 |
+
.b8 5
|
703 |
+
.b32 125
|
704 |
+
.b64 $L__tmp17
|
705 |
+
.b64 $L__tmp32
|
706 |
+
.b8 2
|
707 |
+
.b8 53
|
708 |
+
.b8 59
|
709 |
+
.b8 4
|
710 |
+
.b32 125
|
711 |
+
.b64 $L__tmp18
|
712 |
+
.b64 $L__tmp31
|
713 |
+
.b8 2
|
714 |
+
.b8 53
|
715 |
+
.b8 59
|
716 |
+
.b8 5
|
717 |
+
.b32 125
|
718 |
+
.b64 $L__tmp18
|
719 |
+
.b64 $L__tmp31
|
720 |
+
.b8 2
|
721 |
+
.b8 243
|
722 |
+
.b8 36
|
723 |
+
.b8 0
|
724 |
+
.b8 5
|
725 |
+
.b32 125
|
726 |
+
.b64 $L__tmp32
|
727 |
+
.b64 $L__tmp33
|
728 |
+
.b8 3
|
729 |
+
.b8 53
|
730 |
+
.b8 45
|
731 |
+
.b8 0
|
732 |
+
.b8 0
|
733 |
+
}
|
734 |
+
.section .debug_pubnames
|
735 |
+
{
|
736 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
737 |
+
$L__pubNames_start0:
|
738 |
+
.b8 2
|
739 |
+
.b8 0
|
740 |
+
.b32 .debug_info
|
741 |
+
.b32 411
|
742 |
+
.b32 125
|
743 |
+
.b8 116
|
744 |
+
.b8 114
|
745 |
+
.b8 105
|
746 |
+
.b8 116
|
747 |
+
.b8 111
|
748 |
+
.b8 110
|
749 |
+
.b8 95
|
750 |
+
.b8 95
|
751 |
+
.b8 48
|
752 |
+
.b8 100
|
753 |
+
.b8 49
|
754 |
+
.b8 100
|
755 |
+
.b8 50
|
756 |
+
.b8 100
|
757 |
+
.b8 51
|
758 |
+
.b8 100
|
759 |
+
.b8 52
|
760 |
+
.b8 100
|
761 |
+
.b8 53
|
762 |
+
.b8 100
|
763 |
+
.b8 54
|
764 |
+
.b8 100
|
765 |
+
.b8 55
|
766 |
+
.b8 100
|
767 |
+
.b8 56
|
768 |
+
.b8 100
|
769 |
+
.b8 101
|
770 |
+
.b8 57
|
771 |
+
.b8 100
|
772 |
+
.b8 101
|
773 |
+
.b8 0
|
774 |
+
.b32 0
|
775 |
+
$L__pubNames_end0:
|
776 |
+
}
|
777 |
+
.section .debug_pubtypes
|
778 |
+
{
|
779 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
780 |
+
$L__pubTypes_start0:
|
781 |
+
.b8 2
|
782 |
+
.b8 0
|
783 |
+
.b32 .debug_info
|
784 |
+
.b32 411
|
785 |
+
.b32 0
|
786 |
+
$L__pubTypes_end0:
|
787 |
+
}
|
788 |
+
.section .debug_loc { }
|
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.cubin
ADDED
Binary file (4.52 kB). View file
|
|