Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .triton/dump/0359b089f02b5ddabaef8985c60f3daf/triton_.ttgir +21 -0
- .triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ptx +734 -0
- .triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir +184 -0
- .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir +38 -0
- .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir +34 -0
- .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir +19 -0
- .triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ttgir +18 -0
- .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin +0 -0
- .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx +525 -0
- .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir +24 -0
- .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.llir +793 -0
- .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ptx +1517 -0
- .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttgir +92 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir +43 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx +278 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir +18 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir +17 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin +0 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir +296 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir +73 -0
- .triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ptx +1927 -0
- .triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir +860 -0
- .triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir +27 -0
- .triton/dump/4993935f9a0e5939755cfb42600362cf/triton_.llir +54 -0
- .triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin +0 -0
- .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin +0 -0
- .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir +89 -0
- .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin +0 -0
- .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir +55 -0
- .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir +45 -0
- .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir +18 -0
- .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir +48 -0
- .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir +18 -0
- .triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin +0 -0
- .triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir +18 -0
- .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir +28 -0
- .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir +20 -0
- .triton/dump/962d1809855a53123762906133b1d960/triton_.llir +48 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin +0 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx +771 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir +100 -0
- .triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ptx +295 -0
- .triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttir +18 -0
- .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ptx +565 -0
- .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttgir +68 -0
- .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttir +61 -0
- .triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ptx +296 -0
- .triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttir +18 -0
- .triton/dump/a69784da01a97187168f22847465505f/triton_.ttir +71 -0
- .triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.llir +54 -0
.triton/dump/0359b089f02b5ddabaef8985c60f3daf/triton_.ttgir
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<1024xi32, #blocked>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
|
14 |
+
%8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
|
15 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
16 |
+
%10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
17 |
+
%11 = arith.truncf %8 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
|
18 |
+
tt.store %10, %11, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
|
19 |
+
tt.return
|
20 |
+
}
|
21 |
+
}
|
.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ptx
ADDED
@@ -0,0 +1,734 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7de8
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2d3d4d5d6d7de8(
|
12 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
|
21 |
+
)
|
22 |
+
.maxntid 128, 1, 1
|
23 |
+
{
|
24 |
+
.reg .pred %p<49>;
|
25 |
+
.reg .b16 %rs<33>;
|
26 |
+
.reg .b32 %r<72>;
|
27 |
+
.reg .f32 %f<98>;
|
28 |
+
.reg .b64 %rd<66>;
|
29 |
+
.loc 1 18 0
|
30 |
+
$L__func_begin0:
|
31 |
+
.loc 1 18 0
|
32 |
+
|
33 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8_param_6];
|
34 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8_param_5];
|
35 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8_param_4];
|
36 |
+
ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7de8_param_0];
|
37 |
+
ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7de8_param_1];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 22 44
|
40 |
+
mov.u32 %r13, %tid.x;
|
41 |
+
ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_2];
|
42 |
+
bfe.u32 %r14, %r13, 3, 4;
|
43 |
+
ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7de8_param_3];
|
44 |
+
.loc 1 24 33
|
45 |
+
and.b32 %r1, %r13, 7;
|
46 |
+
.loc 1 21 28
|
47 |
+
mov.u32 %r6, %ctaid.x;
|
48 |
+
.loc 1 21 34
|
49 |
+
cvt.s64.s32 %rd1, %r6;
|
50 |
+
.loc 1 21 46
|
51 |
+
mul.wide.s32 %rd30, %r6, 64;
|
52 |
+
cvt.u64.u32 %rd2, %r14;
|
53 |
+
.loc 1 22 23
|
54 |
+
or.b64 %rd31, %rd30, %rd2;
|
55 |
+
.loc 1 26 30
|
56 |
+
shl.b64 %rd32, %rd31, 3;
|
57 |
+
add.s64 %rd19, %rd29, %rd32;
|
58 |
+
add.s64 %rd21, %rd19, 128;
|
59 |
+
add.s64 %rd23, %rd19, 256;
|
60 |
+
add.s64 %rd25, %rd19, 384;
|
61 |
+
mov.pred %p1, -1;
|
62 |
+
.loc 1 26 35
|
63 |
+
mov.u64 %rd18, 0x0;
|
64 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
|
65 |
+
mov.u64 %rd20, 0x0;
|
66 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
|
67 |
+
mov.u64 %rd22, 0x0;
|
68 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd23 + 0 ];
|
69 |
+
mov.u64 %rd24, 0x0;
|
70 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd25 + 0 ];
|
71 |
+
.loc 1 27 19
|
72 |
+
mov.u32 %r10, 0x0;
|
73 |
+
@%p1 ld.global.b32 { %r10 }, [ %rd26 + 0 ];
|
74 |
+
.loc 1 29 19
|
75 |
+
mov.u32 %r11, 0x0;
|
76 |
+
@%p1 ld.global.b32 { %r11 }, [ %rd27 + 0 ];
|
77 |
+
.loc 1 38 23
|
78 |
+
setp.eq.s64 %p7, %rd18, -1;
|
79 |
+
setp.eq.s64 %p8, %rd20, -1;
|
80 |
+
setp.eq.s64 %p9, %rd22, -1;
|
81 |
+
setp.eq.s64 %p10, %rd24, -1;
|
82 |
+
.loc 1 39 22
|
83 |
+
div.full.f32 %r9, %r10, %r11;
|
84 |
+
mov.b32 %f25, %r9;
|
85 |
+
.loc 1 41 37
|
86 |
+
selp.f32 %f4, 0f00000000, %f25, %p10;
|
87 |
+
selp.f32 %f3, 0f00000000, %f25, %p9;
|
88 |
+
selp.f32 %f2, 0f00000000, %f25, %p8;
|
89 |
+
selp.f32 %f1, 0f00000000, %f25, %p7;
|
90 |
+
.loc 1 32 36
|
91 |
+
mul.wide.s32 %rd33, %r6, 12865792;
|
92 |
+
mul.wide.u32 %rd34, %r14, 201028;
|
93 |
+
add.s64 %rd35, %rd33, %rd34;
|
94 |
+
cvt.u64.u32 %rd36, %r13;
|
95 |
+
and.b64 %rd3, %rd36, 7;
|
96 |
+
mul.wide.u32 %rd37, %r1, 4;
|
97 |
+
add.s64 %rd38, %rd35, %rd37;
|
98 |
+
add.s64 %rd39, %rd38, %rd28;
|
99 |
+
add.s64 %rd65, %rd39, 9649344;
|
100 |
+
mov.f32 %f94, 0f00000000;
|
101 |
+
mov.b32 %r70, -8;
|
102 |
+
mov.u64 %rd63, %rd65;
|
103 |
+
mov.f32 %f95, %f94;
|
104 |
+
mov.f32 %f96, %f94;
|
105 |
+
mov.f32 %f97, %f94;
|
106 |
+
$L__BB0_1:
|
107 |
+
add.s32 %r70, %r70, 8;
|
108 |
+
.loc 1 33 27
|
109 |
+
add.s32 %r23, %r70, %r1;
|
110 |
+
.loc 1 34 25
|
111 |
+
setp.lt.u32 %p11, %r23, 50257;
|
112 |
+
.loc 1 36 34
|
113 |
+
add.s64 %rd40, %rd63, -9649344;
|
114 |
+
add.s64 %rd41, %rd63, -6432896;
|
115 |
+
add.s64 %rd42, %rd63, -3216448;
|
116 |
+
mov.b32 %r54, 0;
|
117 |
+
.loc 1 36 52
|
118 |
+
mov.u32 %r15, 0x0;
|
119 |
+
@%p11 ld.global.L1::evict_last.b32 { %r15 }, [ %rd40 + 0 ];
|
120 |
+
@!%p11 mov.u32 %r15, %r54;
|
121 |
+
mov.u32 %r17, 0x0;
|
122 |
+
@%p11 ld.global.L1::evict_last.b32 { %r17 }, [ %rd41 + 0 ];
|
123 |
+
@!%p11 mov.u32 %r17, %r54;
|
124 |
+
mov.u32 %r19, 0x0;
|
125 |
+
@%p11 ld.global.L1::evict_last.b32 { %r19 }, [ %rd42 + 0 ];
|
126 |
+
@!%p11 mov.u32 %r19, %r54;
|
127 |
+
mov.u32 %r21, 0x0;
|
128 |
+
@%p11 ld.global.L1::evict_last.b32 { %r21 }, [ %rd63 + 0 ];
|
129 |
+
@!%p11 mov.u32 %r21, %r54;
|
130 |
+
mov.b32 %f26, %r21;
|
131 |
+
mov.b32 %f27, %r19;
|
132 |
+
mov.b32 %f28, %r17;
|
133 |
+
mov.b32 %f29, %r15;
|
134 |
+
.loc 1 42 23
|
135 |
+
mul.f32 %f30, %f1, %f29;
|
136 |
+
mul.f32 %f31, %f2, %f28;
|
137 |
+
mul.f32 %f32, %f3, %f27;
|
138 |
+
mul.f32 %f33, %f4, %f26;
|
139 |
+
.loc 1 45 40
|
140 |
+
selp.f32 %f34, %f33, 0f80000000, %p11;
|
141 |
+
selp.f32 %f35, %f32, 0f80000000, %p11;
|
142 |
+
selp.f32 %f36, %f31, 0f80000000, %p11;
|
143 |
+
selp.f32 %f37, %f30, 0f80000000, %p11;
|
144 |
+
add.f32 %f94, %f94, %f37;
|
145 |
+
add.f32 %f95, %f95, %f36;
|
146 |
+
add.f32 %f96, %f96, %f35;
|
147 |
+
add.f32 %f97, %f97, %f34;
|
148 |
+
.loc 1 32 36
|
149 |
+
add.s64 %rd63, %rd63, 32;
|
150 |
+
setp.lt.u32 %p19, %r70, 50249;
|
151 |
+
@%p19 bra $L__BB0_1;
|
152 |
+
$L__tmp1:
|
153 |
+
.loc 2 243 36
|
154 |
+
mov.b32 %r25, %f94;
|
155 |
+
shfl.sync.bfly.b32 %r26, %r25, 4, 31, -1;
|
156 |
+
mov.b32 %f38, %r26;
|
157 |
+
$L__tmp2:
|
158 |
+
.loc 2 233 15
|
159 |
+
add.f32 %f39, %f94, %f38;
|
160 |
+
$L__tmp3:
|
161 |
+
.loc 2 243 36
|
162 |
+
mov.b32 %r27, %f39;
|
163 |
+
shfl.sync.bfly.b32 %r28, %r27, 2, 31, -1;
|
164 |
+
mov.b32 %f40, %r28;
|
165 |
+
$L__tmp4:
|
166 |
+
.loc 2 233 15
|
167 |
+
add.f32 %f41, %f39, %f40;
|
168 |
+
$L__tmp5:
|
169 |
+
.loc 2 243 36
|
170 |
+
mov.b32 %r29, %f41;
|
171 |
+
shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1;
|
172 |
+
mov.b32 %f42, %r30;
|
173 |
+
$L__tmp6:
|
174 |
+
.loc 2 233 15
|
175 |
+
add.f32 %f13, %f41, %f42;
|
176 |
+
$L__tmp7:
|
177 |
+
.loc 2 243 36
|
178 |
+
mov.b32 %r31, %f95;
|
179 |
+
shfl.sync.bfly.b32 %r32, %r31, 4, 31, -1;
|
180 |
+
mov.b32 %f43, %r32;
|
181 |
+
$L__tmp8:
|
182 |
+
.loc 2 233 15
|
183 |
+
add.f32 %f44, %f95, %f43;
|
184 |
+
$L__tmp9:
|
185 |
+
.loc 2 243 36
|
186 |
+
mov.b32 %r33, %f44;
|
187 |
+
shfl.sync.bfly.b32 %r34, %r33, 2, 31, -1;
|
188 |
+
mov.b32 %f45, %r34;
|
189 |
+
$L__tmp10:
|
190 |
+
.loc 2 233 15
|
191 |
+
add.f32 %f46, %f44, %f45;
|
192 |
+
$L__tmp11:
|
193 |
+
.loc 2 243 36
|
194 |
+
mov.b32 %r35, %f46;
|
195 |
+
shfl.sync.bfly.b32 %r36, %r35, 1, 31, -1;
|
196 |
+
mov.b32 %f47, %r36;
|
197 |
+
$L__tmp12:
|
198 |
+
.loc 2 233 15
|
199 |
+
add.f32 %f14, %f46, %f47;
|
200 |
+
$L__tmp13:
|
201 |
+
.loc 2 243 36
|
202 |
+
mov.b32 %r37, %f96;
|
203 |
+
shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
|
204 |
+
mov.b32 %f48, %r38;
|
205 |
+
$L__tmp14:
|
206 |
+
.loc 2 233 15
|
207 |
+
add.f32 %f49, %f96, %f48;
|
208 |
+
$L__tmp15:
|
209 |
+
.loc 2 243 36
|
210 |
+
mov.b32 %r39, %f49;
|
211 |
+
shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
|
212 |
+
mov.b32 %f50, %r40;
|
213 |
+
$L__tmp16:
|
214 |
+
.loc 2 233 15
|
215 |
+
add.f32 %f51, %f49, %f50;
|
216 |
+
$L__tmp17:
|
217 |
+
.loc 2 243 36
|
218 |
+
mov.b32 %r41, %f51;
|
219 |
+
shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
|
220 |
+
mov.b32 %f52, %r42;
|
221 |
+
$L__tmp18:
|
222 |
+
.loc 2 233 15
|
223 |
+
add.f32 %f15, %f51, %f52;
|
224 |
+
$L__tmp19:
|
225 |
+
.loc 2 243 36
|
226 |
+
mov.b32 %r43, %f97;
|
227 |
+
shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1;
|
228 |
+
mov.b32 %f53, %r44;
|
229 |
+
$L__tmp20:
|
230 |
+
.loc 2 233 15
|
231 |
+
add.f32 %f54, %f97, %f53;
|
232 |
+
$L__tmp21:
|
233 |
+
.loc 2 243 36
|
234 |
+
mov.b32 %r45, %f54;
|
235 |
+
shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1;
|
236 |
+
mov.b32 %f55, %r46;
|
237 |
+
$L__tmp22:
|
238 |
+
.loc 2 233 15
|
239 |
+
add.f32 %f56, %f54, %f55;
|
240 |
+
$L__tmp23:
|
241 |
+
.loc 2 243 36
|
242 |
+
mov.b32 %r47, %f56;
|
243 |
+
shfl.sync.bfly.b32 %r48, %r47, 1, 31, -1;
|
244 |
+
mov.b32 %f57, %r48;
|
245 |
+
$L__tmp24:
|
246 |
+
.loc 2 233 15
|
247 |
+
add.f32 %f16, %f56, %f57;
|
248 |
+
$L__tmp25:
|
249 |
+
.loc 1 51 36
|
250 |
+
shl.b64 %rd44, %rd3, 1;
|
251 |
+
add.s64 %rd7, %rd17, %rd44;
|
252 |
+
mul.lo.s64 %rd45, %rd1, 6432896;
|
253 |
+
mul.lo.s64 %rd46, %rd2, 100514;
|
254 |
+
add.s64 %rd64, %rd45, %rd46;
|
255 |
+
add.s64 %rd9, %rd16, %rd44;
|
256 |
+
add.s64 %rd10, %rd15, %rd44;
|
257 |
+
mov.b32 %r71, -8;
|
258 |
+
mov.u16 %rs2, 0;
|
259 |
+
$L__BB0_3:
|
260 |
+
add.s32 %r71, %r71, 8;
|
261 |
+
.loc 1 52 27
|
262 |
+
add.s32 %r69, %r71, %r1;
|
263 |
+
.loc 1 53 25
|
264 |
+
setp.lt.u32 %p20, %r69, 50257;
|
265 |
+
.loc 1 55 35
|
266 |
+
add.s64 %rd47, %rd10, %rd64;
|
267 |
+
add.s64 %rd48, %rd47, 1608224;
|
268 |
+
add.s64 %rd49, %rd47, 3216448;
|
269 |
+
.loc 1 55 53
|
270 |
+
add.s64 %rd50, %rd47, 4824672;
|
271 |
+
mov.u16 %rs1, 0x0;
|
272 |
+
@%p20 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
|
273 |
+
@!%p20 mov.u16 %rs1, %rs2;
|
274 |
+
mov.u16 %rs3, 0x0;
|
275 |
+
@%p20 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd48 + 0 ];
|
276 |
+
@!%p20 mov.u16 %rs3, %rs2;
|
277 |
+
mov.u16 %rs5, 0x0;
|
278 |
+
@%p20 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd49 + 0 ];
|
279 |
+
@!%p20 mov.u16 %rs5, %rs2;
|
280 |
+
mov.u16 %rs7, 0x0;
|
281 |
+
@%p20 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd50 + 0 ];
|
282 |
+
@!%p20 mov.u16 %rs7, %rs2;
|
283 |
+
.loc 1 55 105
|
284 |
+
cvt.f32.bf16 %r49, %rs1;
|
285 |
+
mov.b32 %f66, %r49;
|
286 |
+
cvt.f32.bf16 %r50, %rs3;
|
287 |
+
mov.b32 %f67, %r50;
|
288 |
+
cvt.f32.bf16 %r51, %rs5;
|
289 |
+
mov.b32 %f68, %r51;
|
290 |
+
cvt.f32.bf16 %r52, %rs7;
|
291 |
+
mov.b32 %f69, %r52;
|
292 |
+
.loc 1 56 35
|
293 |
+
add.s64 %rd51, %rd65, -9649344;
|
294 |
+
add.s64 %rd52, %rd65, -6432896;
|
295 |
+
add.s64 %rd53, %rd65, -3216448;
|
296 |
+
.loc 1 56 53
|
297 |
+
mov.u32 %r53, 0x0;
|
298 |
+
@%p20 ld.global.L1::evict_first.b32 { %r53 }, [ %rd51 + 0 ];
|
299 |
+
@!%p20 mov.u32 %r53, %r54;
|
300 |
+
mov.b32 %f70, %r53;
|
301 |
+
mov.u32 %r55, 0x0;
|
302 |
+
@%p20 ld.global.L1::evict_first.b32 { %r55 }, [ %rd52 + 0 ];
|
303 |
+
@!%p20 mov.u32 %r55, %r54;
|
304 |
+
mov.b32 %f71, %r55;
|
305 |
+
mov.u32 %r57, 0x0;
|
306 |
+
@%p20 ld.global.L1::evict_first.b32 { %r57 }, [ %rd53 + 0 ];
|
307 |
+
@!%p20 mov.u32 %r57, %r54;
|
308 |
+
mov.b32 %f72, %r57;
|
309 |
+
mov.u32 %r59, 0x0;
|
310 |
+
@%p20 ld.global.L1::evict_first.b32 { %r59 }, [ %rd65 + 0 ];
|
311 |
+
@!%p20 mov.u32 %r59, %r54;
|
312 |
+
mov.b32 %f73, %r59;
|
313 |
+
.loc 1 57 35
|
314 |
+
add.s64 %rd55, %rd9, %rd64;
|
315 |
+
add.s64 %rd56, %rd55, 1608224;
|
316 |
+
add.s64 %rd57, %rd55, 3216448;
|
317 |
+
.loc 1 57 53
|
318 |
+
add.s64 %rd58, %rd55, 4824672;
|
319 |
+
mov.u16 %rs13, 0x0;
|
320 |
+
@%p20 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd55 + 0 ];
|
321 |
+
@!%p20 mov.u16 %rs13, %rs2;
|
322 |
+
mov.u16 %rs15, 0x0;
|
323 |
+
@%p20 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd56 + 0 ];
|
324 |
+
@!%p20 mov.u16 %rs15, %rs2;
|
325 |
+
mov.u16 %rs17, 0x0;
|
326 |
+
@%p20 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd57 + 0 ];
|
327 |
+
@!%p20 mov.u16 %rs17, %rs2;
|
328 |
+
mov.u16 %rs19, 0x0;
|
329 |
+
@%p20 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd58 + 0 ];
|
330 |
+
@!%p20 mov.u16 %rs19, %rs2;
|
331 |
+
.loc 1 57 105
|
332 |
+
cvt.f32.bf16 %r61, %rs13;
|
333 |
+
mov.b32 %f74, %r61;
|
334 |
+
cvt.f32.bf16 %r62, %rs15;
|
335 |
+
mov.b32 %f75, %r62;
|
336 |
+
cvt.f32.bf16 %r63, %rs17;
|
337 |
+
mov.b32 %f76, %r63;
|
338 |
+
cvt.f32.bf16 %r64, %rs19;
|
339 |
+
mov.b32 %f77, %r64;
|
340 |
+
.loc 1 65 23
|
341 |
+
mul.f32 %f59, %f74, 0f3FB8AA3B;
|
342 |
+
ex2.approx.f32 %f58, %f59;
|
343 |
+
mul.f32 %f61, %f75, 0f3FB8AA3B;
|
344 |
+
ex2.approx.f32 %f60, %f61;
|
345 |
+
mul.f32 %f63, %f76, 0f3FB8AA3B;
|
346 |
+
ex2.approx.f32 %f62, %f63;
|
347 |
+
mul.f32 %f65, %f77, 0f3FB8AA3B;
|
348 |
+
ex2.approx.f32 %f64, %f65;
|
349 |
+
.loc 1 66 24
|
350 |
+
mul.f32 %f78, %f13, %f58;
|
351 |
+
mul.f32 %f79, %f14, %f60;
|
352 |
+
mul.f32 %f80, %f15, %f62;
|
353 |
+
mul.f32 %f81, %f16, %f64;
|
354 |
+
.loc 1 67 24
|
355 |
+
neg.f32 %f82, %f78;
|
356 |
+
fma.rn.f32 %f83, %f1, %f70, %f82;
|
357 |
+
neg.f32 %f84, %f79;
|
358 |
+
fma.rn.f32 %f85, %f2, %f71, %f84;
|
359 |
+
neg.f32 %f86, %f80;
|
360 |
+
fma.rn.f32 %f87, %f3, %f72, %f86;
|
361 |
+
neg.f32 %f88, %f81;
|
362 |
+
fma.rn.f32 %f89, %f4, %f73, %f88;
|
363 |
+
.loc 1 69 24
|
364 |
+
add.f32 %f90, %f66, %f83;
|
365 |
+
add.f32 %f91, %f67, %f85;
|
366 |
+
add.f32 %f92, %f68, %f87;
|
367 |
+
add.f32 %f93, %f69, %f89;
|
368 |
+
.loc 1 70 29
|
369 |
+
add.s64 %rd59, %rd7, %rd64;
|
370 |
+
add.s64 %rd60, %rd59, 1608224;
|
371 |
+
add.s64 %rd61, %rd59, 3216448;
|
372 |
+
.loc 1 70 54
|
373 |
+
add.s64 %rd62, %rd59, 4824672;
|
374 |
+
mov.b32 %r65, %f90;
|
375 |
+
cvt.rn.bf16.f32 %rs25, %r65;
|
376 |
+
mov.b32 %r66, %f91;
|
377 |
+
cvt.rn.bf16.f32 %rs26, %r66;
|
378 |
+
mov.b32 %r67, %f92;
|
379 |
+
cvt.rn.bf16.f32 %rs27, %r67;
|
380 |
+
mov.b32 %r68, %f93;
|
381 |
+
cvt.rn.bf16.f32 %rs28, %r68;
|
382 |
+
@%p20 st.global.b16 [ %rd59 + 0 ], { %rs25 };
|
383 |
+
@%p20 st.global.b16 [ %rd60 + 0 ], { %rs26 };
|
384 |
+
@%p20 st.global.b16 [ %rd61 + 0 ], { %rs27 };
|
385 |
+
@%p20 st.global.b16 [ %rd62 + 0 ], { %rs28 };
|
386 |
+
.loc 1 51 36
|
387 |
+
add.s64 %rd65, %rd65, 32;
|
388 |
+
add.s64 %rd64, %rd64, 16;
|
389 |
+
setp.lt.u32 %p48, %r71, 50249;
|
390 |
+
@%p48 bra $L__BB0_3;
|
391 |
+
.loc 1 51 4
|
392 |
+
ret;
|
393 |
+
$L__tmp26:
|
394 |
+
$L__func_end0:
|
395 |
+
|
396 |
+
}
|
397 |
+
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
|
398 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
399 |
+
.section .debug_abbrev
|
400 |
+
{
|
401 |
+
.b8 1
|
402 |
+
.b8 17
|
403 |
+
.b8 1
|
404 |
+
.b8 37
|
405 |
+
.b8 8
|
406 |
+
.b8 19
|
407 |
+
.b8 5
|
408 |
+
.b8 3
|
409 |
+
.b8 8
|
410 |
+
.b8 16
|
411 |
+
.b8 6
|
412 |
+
.b8 27
|
413 |
+
.b8 8
|
414 |
+
.b8 180
|
415 |
+
.b8 66
|
416 |
+
.b8 12
|
417 |
+
.b8 17
|
418 |
+
.b8 1
|
419 |
+
.b8 18
|
420 |
+
.b8 1
|
421 |
+
.b8 0
|
422 |
+
.b8 0
|
423 |
+
.b8 2
|
424 |
+
.b8 46
|
425 |
+
.b8 0
|
426 |
+
.b8 135
|
427 |
+
.b8 64
|
428 |
+
.b8 8
|
429 |
+
.b8 3
|
430 |
+
.b8 8
|
431 |
+
.b8 58
|
432 |
+
.b8 11
|
433 |
+
.b8 59
|
434 |
+
.b8 11
|
435 |
+
.b8 63
|
436 |
+
.b8 12
|
437 |
+
.b8 32
|
438 |
+
.b8 11
|
439 |
+
.b8 0
|
440 |
+
.b8 0
|
441 |
+
.b8 3
|
442 |
+
.b8 46
|
443 |
+
.b8 1
|
444 |
+
.b8 17
|
445 |
+
.b8 1
|
446 |
+
.b8 18
|
447 |
+
.b8 1
|
448 |
+
.b8 64
|
449 |
+
.b8 10
|
450 |
+
.b8 49
|
451 |
+
.b8 19
|
452 |
+
.b8 0
|
453 |
+
.b8 0
|
454 |
+
.b8 4
|
455 |
+
.b8 29
|
456 |
+
.b8 0
|
457 |
+
.b8 49
|
458 |
+
.b8 19
|
459 |
+
.b8 17
|
460 |
+
.b8 1
|
461 |
+
.b8 18
|
462 |
+
.b8 1
|
463 |
+
.b8 88
|
464 |
+
.b8 11
|
465 |
+
.b8 89
|
466 |
+
.b8 11
|
467 |
+
.b8 87
|
468 |
+
.b8 11
|
469 |
+
.b8 0
|
470 |
+
.b8 0
|
471 |
+
.b8 5
|
472 |
+
.b8 29
|
473 |
+
.b8 1
|
474 |
+
.b8 49
|
475 |
+
.b8 19
|
476 |
+
.b8 17
|
477 |
+
.b8 1
|
478 |
+
.b8 18
|
479 |
+
.b8 1
|
480 |
+
.b8 88
|
481 |
+
.b8 11
|
482 |
+
.b8 89
|
483 |
+
.b8 11
|
484 |
+
.b8 87
|
485 |
+
.b8 11
|
486 |
+
.b8 0
|
487 |
+
.b8 0
|
488 |
+
.b8 0
|
489 |
+
}
|
490 |
+
.section .debug_info
|
491 |
+
{
|
492 |
+
.b32 278
|
493 |
+
.b8 2
|
494 |
+
.b8 0
|
495 |
+
.b32 .debug_abbrev
|
496 |
+
.b8 8
|
497 |
+
.b8 1
|
498 |
+
.b8 116
|
499 |
+
.b8 114
|
500 |
+
.b8 105
|
501 |
+
.b8 116
|
502 |
+
.b8 111
|
503 |
+
.b8 110
|
504 |
+
.b8 0
|
505 |
+
.b8 2
|
506 |
+
.b8 0
|
507 |
+
.b8 99
|
508 |
+
.b8 107
|
509 |
+
.b8 122
|
510 |
+
.b8 103
|
511 |
+
.b8 108
|
512 |
+
.b8 55
|
513 |
+
.b8 116
|
514 |
+
.b8 104
|
515 |
+
.b8 98
|
516 |
+
.b8 52
|
517 |
+
.b8 120
|
518 |
+
.b8 100
|
519 |
+
.b8 102
|
520 |
+
.b8 107
|
521 |
+
.b8 102
|
522 |
+
.b8 110
|
523 |
+
.b8 100
|
524 |
+
.b8 50
|
525 |
+
.b8 116
|
526 |
+
.b8 105
|
527 |
+
.b8 100
|
528 |
+
.b8 107
|
529 |
+
.b8 115
|
530 |
+
.b8 54
|
531 |
+
.b8 109
|
532 |
+
.b8 116
|
533 |
+
.b8 53
|
534 |
+
.b8 102
|
535 |
+
.b8 51
|
536 |
+
.b8 104
|
537 |
+
.b8 97
|
538 |
+
.b8 117
|
539 |
+
.b8 119
|
540 |
+
.b8 102
|
541 |
+
.b8 121
|
542 |
+
.b8 106
|
543 |
+
.b8 102
|
544 |
+
.b8 108
|
545 |
+
.b8 98
|
546 |
+
.b8 116
|
547 |
+
.b8 122
|
548 |
+
.b8 121
|
549 |
+
.b8 101
|
550 |
+
.b8 112
|
551 |
+
.b8 111
|
552 |
+
.b8 53
|
553 |
+
.b8 111
|
554 |
+
.b8 120
|
555 |
+
.b8 107
|
556 |
+
.b8 118
|
557 |
+
.b8 104
|
558 |
+
.b8 107
|
559 |
+
.b8 46
|
560 |
+
.b8 112
|
561 |
+
.b8 121
|
562 |
+
.b8 0
|
563 |
+
.b32 .debug_line
|
564 |
+
.b8 47
|
565 |
+
.b8 116
|
566 |
+
.b8 109
|
567 |
+
.b8 112
|
568 |
+
.b8 47
|
569 |
+
.b8 116
|
570 |
+
.b8 111
|
571 |
+
.b8 114
|
572 |
+
.b8 99
|
573 |
+
.b8 104
|
574 |
+
.b8 105
|
575 |
+
.b8 110
|
576 |
+
.b8 100
|
577 |
+
.b8 117
|
578 |
+
.b8 99
|
579 |
+
.b8 116
|
580 |
+
.b8 111
|
581 |
+
.b8 114
|
582 |
+
.b8 95
|
583 |
+
.b8 114
|
584 |
+
.b8 111
|
585 |
+
.b8 111
|
586 |
+
.b8 116
|
587 |
+
.b8 47
|
588 |
+
.b8 107
|
589 |
+
.b8 122
|
590 |
+
.b8 0
|
591 |
+
.b8 1
|
592 |
+
.b64 $L__func_begin0
|
593 |
+
.b64 $L__func_end0
|
594 |
+
.b8 2
|
595 |
+
.b8 116
|
596 |
+
.b8 114
|
597 |
+
.b8 105
|
598 |
+
.b8 116
|
599 |
+
.b8 111
|
600 |
+
.b8 110
|
601 |
+
.b8 95
|
602 |
+
.b8 95
|
603 |
+
.b8 48
|
604 |
+
.b8 100
|
605 |
+
.b8 49
|
606 |
+
.b8 100
|
607 |
+
.b8 50
|
608 |
+
.b8 100
|
609 |
+
.b8 51
|
610 |
+
.b8 100
|
611 |
+
.b8 52
|
612 |
+
.b8 100
|
613 |
+
.b8 53
|
614 |
+
.b8 100
|
615 |
+
.b8 54
|
616 |
+
.b8 100
|
617 |
+
.b8 55
|
618 |
+
.b8 100
|
619 |
+
.b8 101
|
620 |
+
.b8 56
|
621 |
+
.b8 0
|
622 |
+
.b8 116
|
623 |
+
.b8 114
|
624 |
+
.b8 105
|
625 |
+
.b8 116
|
626 |
+
.b8 111
|
627 |
+
.b8 110
|
628 |
+
.b8 95
|
629 |
+
.b8 95
|
630 |
+
.b8 48
|
631 |
+
.b8 100
|
632 |
+
.b8 49
|
633 |
+
.b8 100
|
634 |
+
.b8 50
|
635 |
+
.b8 100
|
636 |
+
.b8 51
|
637 |
+
.b8 100
|
638 |
+
.b8 52
|
639 |
+
.b8 100
|
640 |
+
.b8 53
|
641 |
+
.b8 100
|
642 |
+
.b8 54
|
643 |
+
.b8 100
|
644 |
+
.b8 55
|
645 |
+
.b8 100
|
646 |
+
.b8 101
|
647 |
+
.b8 56
|
648 |
+
.b8 0
|
649 |
+
.b8 1
|
650 |
+
.b8 18
|
651 |
+
.b8 1
|
652 |
+
.b8 1
|
653 |
+
.b8 3
|
654 |
+
.b64 $L__func_begin0
|
655 |
+
.b64 $L__func_end0
|
656 |
+
.b8 1
|
657 |
+
.b8 156
|
658 |
+
.b32 125
|
659 |
+
.b8 4
|
660 |
+
.b32 125
|
661 |
+
.b64 $L__tmp1
|
662 |
+
.b64 $L__tmp24
|
663 |
+
.b8 2
|
664 |
+
.b8 46
|
665 |
+
.b8 27
|
666 |
+
.b8 5
|
667 |
+
.b32 125
|
668 |
+
.b64 $L__tmp2
|
669 |
+
.b64 $L__tmp25
|
670 |
+
.b8 2
|
671 |
+
.b8 46
|
672 |
+
.b8 27
|
673 |
+
.b8 4
|
674 |
+
.b32 125
|
675 |
+
.b64 $L__tmp2
|
676 |
+
.b64 $L__tmp25
|
677 |
+
.b8 2
|
678 |
+
.b8 243
|
679 |
+
.b8 36
|
680 |
+
.b8 0
|
681 |
+
.b8 0
|
682 |
+
.b8 0
|
683 |
+
}
|
684 |
+
.section .debug_pubnames
|
685 |
+
{
|
686 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
687 |
+
$L__pubNames_start0:
|
688 |
+
.b8 2
|
689 |
+
.b8 0
|
690 |
+
.b32 .debug_info
|
691 |
+
.b32 282
|
692 |
+
.b32 125
|
693 |
+
.b8 116
|
694 |
+
.b8 114
|
695 |
+
.b8 105
|
696 |
+
.b8 116
|
697 |
+
.b8 111
|
698 |
+
.b8 110
|
699 |
+
.b8 95
|
700 |
+
.b8 95
|
701 |
+
.b8 48
|
702 |
+
.b8 100
|
703 |
+
.b8 49
|
704 |
+
.b8 100
|
705 |
+
.b8 50
|
706 |
+
.b8 100
|
707 |
+
.b8 51
|
708 |
+
.b8 100
|
709 |
+
.b8 52
|
710 |
+
.b8 100
|
711 |
+
.b8 53
|
712 |
+
.b8 100
|
713 |
+
.b8 54
|
714 |
+
.b8 100
|
715 |
+
.b8 55
|
716 |
+
.b8 100
|
717 |
+
.b8 101
|
718 |
+
.b8 56
|
719 |
+
.b8 0
|
720 |
+
.b32 0
|
721 |
+
$L__pubNames_end0:
|
722 |
+
}
|
723 |
+
.section .debug_pubtypes
|
724 |
+
{
|
725 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
726 |
+
$L__pubTypes_start0:
|
727 |
+
.b8 2
|
728 |
+
.b8 0
|
729 |
+
.b32 .debug_info
|
730 |
+
.b32 282
|
731 |
+
.b32 0
|
732 |
+
$L__pubTypes_end0:
|
733 |
+
}
|
734 |
+
.section .debug_loc { }
|
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp7 < 50257"
|
7 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
8 |
+
|
9 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
10 |
+
|
11 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2) local_unnamed_addr !dbg !7 {
|
12 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
13 |
+
%5 = and i32 %4, 127, !dbg !10
|
14 |
+
%6 = shl nuw nsw i32 %5, 1, !dbg !10
|
15 |
+
%7 = or i32 %6, 1, !dbg !10
|
16 |
+
%8 = or i32 %6, 256, !dbg !10
|
17 |
+
%9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !11
|
18 |
+
%10 = sext i32 %9 to i64, !dbg !12
|
19 |
+
%11 = shl nsw i64 %10, 9, !dbg !13
|
20 |
+
%12 = zext nneg i32 %6 to i64
|
21 |
+
%13 = zext nneg i32 %8 to i64
|
22 |
+
%14 = or i64 %11, %12, !dbg !14
|
23 |
+
%15 = or i64 %11, %13, !dbg !14
|
24 |
+
%16 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !15
|
25 |
+
%17 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !15
|
26 |
+
%18 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %16, i1 true) #2, !dbg !16
|
27 |
+
%19 = extractvalue { i64, i64 } %18, 0, !dbg !16
|
28 |
+
%20 = extractvalue { i64, i64 } %18, 1, !dbg !16
|
29 |
+
%21 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %17, i1 true) #2, !dbg !16
|
30 |
+
%22 = extractvalue { i64, i64 } %21, 0, !dbg !16
|
31 |
+
%23 = extractvalue { i64, i64 } %21, 1, !dbg !16
|
32 |
+
%24 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !17
|
33 |
+
%25 = insertelement <4 x i64> %24, i64 %22, i64 1, !dbg !17
|
34 |
+
%26 = insertelement <4 x i64> %25, i64 %20, i64 2, !dbg !17
|
35 |
+
%27 = insertelement <4 x i64> %26, i64 %19, i64 3, !dbg !17
|
36 |
+
%28 = icmp eq <4 x i64> %27, <i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !17
|
37 |
+
%29 = select <4 x i1> %28, <4 x i64> zeroinitializer, <4 x i64> %27, !dbg !18
|
38 |
+
%30 = add <4 x i64> %29, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !19
|
39 |
+
%31 = icmp slt <4 x i64> %29, zeroinitializer, !dbg !20
|
40 |
+
%32 = select <4 x i1> %31, <4 x i64> %30, <4 x i64> %29, !dbg !21
|
41 |
+
%33 = icmp ult <4 x i64> %32, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
|
42 |
+
%34 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %12, !dbg !22
|
43 |
+
%35 = extractelement <4 x i1> %33, i64 3, !dbg !22
|
44 |
+
%36 = zext i1 %35 to i8, !dbg !22
|
45 |
+
%37 = insertelement <1 x i8> undef, i8 %36, i64 0, !dbg !22
|
46 |
+
store <1 x i8> %37, ptr addrspace(3) %34, align 1, !dbg !22
|
47 |
+
%38 = zext nneg i32 %7 to i64, !dbg !22
|
48 |
+
%39 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %38, !dbg !22
|
49 |
+
%40 = extractelement <4 x i1> %33, i64 2, !dbg !22
|
50 |
+
%41 = zext i1 %40 to i8, !dbg !22
|
51 |
+
%42 = insertelement <1 x i8> undef, i8 %41, i64 0, !dbg !22
|
52 |
+
store <1 x i8> %42, ptr addrspace(3) %39, align 1, !dbg !22
|
53 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
54 |
+
%43 = zext nneg i32 %5 to i64, !dbg !22
|
55 |
+
%44 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %43, !dbg !22
|
56 |
+
%45 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
|
57 |
+
%46 = or i32 %5, 128, !dbg !22
|
58 |
+
%47 = zext nneg i32 %46 to i64, !dbg !22
|
59 |
+
%48 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %47, !dbg !22
|
60 |
+
%49 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
|
61 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
62 |
+
%50 = extractelement <4 x i1> %33, i64 1, !dbg !22
|
63 |
+
%51 = zext i1 %50 to i8, !dbg !22
|
64 |
+
%52 = insertelement <1 x i8> undef, i8 %51, i64 0, !dbg !22
|
65 |
+
store <1 x i8> %52, ptr addrspace(3) %34, align 1, !dbg !22
|
66 |
+
%53 = extractelement <4 x i1> %33, i64 0, !dbg !22
|
67 |
+
%54 = zext i1 %53 to i8, !dbg !22
|
68 |
+
%55 = insertelement <1 x i8> undef, i8 %54, i64 0, !dbg !22
|
69 |
+
store <1 x i8> %55, ptr addrspace(3) %39, align 1, !dbg !22
|
70 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
71 |
+
%56 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
|
72 |
+
%57 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
|
73 |
+
%58 = insertelement <4 x i8> poison, i8 %49, i64 0, !dbg !22
|
74 |
+
%59 = insertelement <4 x i8> %58, i8 %45, i64 1, !dbg !22
|
75 |
+
%60 = insertelement <4 x i8> %59, i8 %56, i64 2, !dbg !22
|
76 |
+
%61 = insertelement <4 x i8> %60, i8 %57, i64 3, !dbg !22
|
77 |
+
%62 = icmp eq <4 x i8> %61, zeroinitializer, !dbg !22
|
78 |
+
%63 = bitcast <4 x i1> %62 to i4, !dbg !23
|
79 |
+
%.not = icmp eq i4 %63, 0, !dbg !23
|
80 |
+
br i1 %.not, label %65, label %64, !dbg !23
|
81 |
+
|
82 |
+
64: ; preds = %3
|
83 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !23
|
84 |
+
br label %65, !dbg !23
|
85 |
+
|
86 |
+
65: ; preds = %64, %3
|
87 |
+
%66 = or i32 %6, 257, !dbg !10
|
88 |
+
%67 = zext nneg i32 %66 to i64
|
89 |
+
%68 = or i64 %11, %67, !dbg !14
|
90 |
+
%69 = or i64 %11, %38, !dbg !14
|
91 |
+
%70 = mul nsw i64 %14, 50257, !dbg !24
|
92 |
+
%71 = mul nsw i64 %69, 50257, !dbg !24
|
93 |
+
%72 = mul nsw i64 %15, 50257, !dbg !24
|
94 |
+
%73 = mul nsw i64 %68, 50257, !dbg !24
|
95 |
+
%74 = extractelement <4 x i64> %32, i64 3, !dbg !25
|
96 |
+
%75 = getelementptr float, ptr addrspace(1) %1, i64 %74, !dbg !25
|
97 |
+
%76 = getelementptr float, ptr addrspace(1) %75, i64 %70, !dbg !25
|
98 |
+
%77 = extractelement <4 x i64> %32, i64 2, !dbg !25
|
99 |
+
%78 = getelementptr float, ptr addrspace(1) %1, i64 %77, !dbg !25
|
100 |
+
%79 = getelementptr float, ptr addrspace(1) %78, i64 %71, !dbg !25
|
101 |
+
%80 = extractelement <4 x i64> %32, i64 1, !dbg !25
|
102 |
+
%81 = getelementptr float, ptr addrspace(1) %1, i64 %80, !dbg !25
|
103 |
+
%82 = getelementptr float, ptr addrspace(1) %81, i64 %72, !dbg !25
|
104 |
+
%83 = extractelement <4 x i64> %32, i64 0, !dbg !25
|
105 |
+
%84 = getelementptr float, ptr addrspace(1) %1, i64 %83, !dbg !25
|
106 |
+
%85 = getelementptr float, ptr addrspace(1) %84, i64 %73, !dbg !25
|
107 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
108 |
+
%86 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %12, !dbg !26
|
109 |
+
%87 = ptrtoint ptr addrspace(1) %76 to i64, !dbg !26
|
110 |
+
%88 = insertelement <1 x i64> undef, i64 %87, i64 0, !dbg !26
|
111 |
+
store <1 x i64> %88, ptr addrspace(3) %86, align 8, !dbg !26
|
112 |
+
%89 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %38, !dbg !26
|
113 |
+
%90 = ptrtoint ptr addrspace(1) %79 to i64, !dbg !26
|
114 |
+
%91 = insertelement <1 x i64> undef, i64 %90, i64 0, !dbg !26
|
115 |
+
store <1 x i64> %91, ptr addrspace(3) %89, align 8, !dbg !26
|
116 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
117 |
+
%92 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %43, !dbg !26
|
118 |
+
%93 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
|
119 |
+
%94 = inttoptr i64 %93 to ptr addrspace(1), !dbg !26
|
120 |
+
%95 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %47, !dbg !26
|
121 |
+
%96 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
|
122 |
+
%97 = inttoptr i64 %96 to ptr addrspace(1), !dbg !26
|
123 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
124 |
+
%98 = ptrtoint ptr addrspace(1) %82 to i64, !dbg !26
|
125 |
+
%99 = insertelement <1 x i64> undef, i64 %98, i64 0, !dbg !26
|
126 |
+
store <1 x i64> %99, ptr addrspace(3) %86, align 8, !dbg !26
|
127 |
+
%100 = ptrtoint ptr addrspace(1) %85 to i64, !dbg !26
|
128 |
+
%101 = insertelement <1 x i64> undef, i64 %100, i64 0, !dbg !26
|
129 |
+
store <1 x i64> %101, ptr addrspace(3) %89, align 8, !dbg !26
|
130 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
131 |
+
%102 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
|
132 |
+
%103 = inttoptr i64 %102 to ptr addrspace(1), !dbg !26
|
133 |
+
%104 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
|
134 |
+
%105 = inttoptr i64 %104 to ptr addrspace(1), !dbg !26
|
135 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %94, i1 true) #2, !dbg !26
|
136 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %97, i1 true) #2, !dbg !26
|
137 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %103, i1 true) #2, !dbg !26
|
138 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %105, i1 true) #2, !dbg !26
|
139 |
+
ret void, !dbg !27
|
140 |
+
}
|
141 |
+
|
142 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
143 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
144 |
+
|
145 |
+
; Function Attrs: convergent nocallback nounwind
|
146 |
+
declare void @llvm.nvvm.barrier0() #1
|
147 |
+
|
148 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
149 |
+
attributes #1 = { convergent nocallback nounwind }
|
150 |
+
attributes #2 = { nounwind }
|
151 |
+
|
152 |
+
!llvm.module.flags = !{!0, !1}
|
153 |
+
!llvm.dbg.cu = !{!2}
|
154 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
155 |
+
!llvm.ident = !{!6}
|
156 |
+
|
157 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
158 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
159 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
160 |
+
!3 = !DIFile(filename: "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py", directory: "/tmp/torchinductor_root/hl")
|
161 |
+
!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
162 |
+
!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
163 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
164 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
165 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
166 |
+
!9 = !{}
|
167 |
+
!10 = !DILocation(line: 21, column: 36, scope: !7)
|
168 |
+
!11 = !DILocation(line: 20, column: 28, scope: !7)
|
169 |
+
!12 = !DILocation(line: 20, column: 34, scope: !7)
|
170 |
+
!13 = !DILocation(line: 20, column: 46, scope: !7)
|
171 |
+
!14 = !DILocation(line: 21, column: 23, scope: !7)
|
172 |
+
!15 = !DILocation(line: 24, column: 30, scope: !7)
|
173 |
+
!16 = !DILocation(line: 24, column: 35, scope: !7)
|
174 |
+
!17 = !DILocation(line: 26, column: 19, scope: !7)
|
175 |
+
!18 = !DILocation(line: 28, column: 32, scope: !7)
|
176 |
+
!19 = !DILocation(line: 29, column: 18, scope: !7)
|
177 |
+
!20 = !DILocation(line: 30, column: 18, scope: !7)
|
178 |
+
!21 = !DILocation(line: 31, column: 32, scope: !7)
|
179 |
+
!22 = !DILocation(line: 32, column: 36, scope: !7)
|
180 |
+
!23 = !DILocation(line: 32, column: 51, scope: !7)
|
181 |
+
!24 = !DILocation(line: 34, column: 39, scope: !7)
|
182 |
+
!25 = !DILocation(line: 34, column: 25, scope: !7)
|
183 |
+
!26 = !DILocation(line: 34, column: 51, scope: !7)
|
184 |
+
!27 = !DILocation(line: 34, column: 4, scope: !7)
|
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<50257> : tensor<512xi64, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<512xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<-1> : tensor<512xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<-1.000000e+00> : tensor<512xf32, #blocked1>
|
9 |
+
%c512_i64 = arith.constant 512 : i64
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = arith.extsi %0 : i32 to i64
|
12 |
+
%2 = arith.muli %1, %c512_i64 : i64
|
13 |
+
%3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
14 |
+
%4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked>
|
15 |
+
%5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked>
|
16 |
+
%6 = arith.addi %5, %4 : tensor<512xi64, #blocked>
|
17 |
+
%7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>, #blocked>
|
18 |
+
%8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>, #blocked>, tensor<512xi64, #blocked>
|
19 |
+
%9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64, #blocked>
|
20 |
+
%10 = arith.cmpi ne, %9, %cst_1 : tensor<512xi64, #blocked>
|
21 |
+
%11 = arith.select %10, %9, %cst_0 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
|
22 |
+
%12 = arith.addi %11, %cst : tensor<512xi64, #blocked>
|
23 |
+
%13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64, #blocked>
|
24 |
+
%14 = arith.select %13, %12, %11 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
|
25 |
+
%15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64, #blocked>
|
26 |
+
%16 = arith.cmpi slt, %14, %cst : tensor<512xi64, #blocked>
|
27 |
+
%17 = arith.andi %15, %16 : tensor<512xi1, #blocked>
|
28 |
+
%18 = triton_gpu.convert_layout %17 : (tensor<512xi1, #blocked>) -> tensor<512xi1, #blocked1>
|
29 |
+
tt.assert %18, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1, #blocked1>
|
30 |
+
%19 = arith.muli %6, %cst : tensor<512xi64, #blocked>
|
31 |
+
%20 = arith.addi %14, %19 : tensor<512xi64, #blocked>
|
32 |
+
%21 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%22 = tt.addptr %21, %20 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi64, #blocked>
|
34 |
+
%23 = triton_gpu.convert_layout %22 : (tensor<512x!tt.ptr<f32, 1>, #blocked>) -> tensor<512x!tt.ptr<f32, 1>, #blocked1>
|
35 |
+
tt.store %23, %cst_2 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked1>
|
36 |
+
tt.return
|
37 |
+
}
|
38 |
+
}
|
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<50257> : tensor<512xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<512xi64>
|
5 |
+
%c512_i64 = arith.constant 512 : i64
|
6 |
+
%cst_1 = arith.constant dense<-1.000000e+00> : tensor<512xf32>
|
7 |
+
%cst_2 = arith.constant dense<-1> : tensor<512xi64>
|
8 |
+
%0 = tt.get_program_id x : i32
|
9 |
+
%1 = arith.extsi %0 : i32 to i64
|
10 |
+
%2 = arith.muli %1, %c512_i64 : i64
|
11 |
+
%3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
12 |
+
%4 = arith.extsi %3 : tensor<512xi32> to tensor<512xi64>
|
13 |
+
%5 = tt.splat %2 : (i64) -> tensor<512xi64>
|
14 |
+
%6 = arith.addi %5, %4 : tensor<512xi64>
|
15 |
+
%7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>>
|
16 |
+
%8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>>, tensor<512xi64>
|
17 |
+
%9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64>
|
18 |
+
%10 = arith.cmpi ne, %9, %cst_2 : tensor<512xi64>
|
19 |
+
%11 = arith.select %10, %9, %cst_0 : tensor<512xi1>, tensor<512xi64>
|
20 |
+
%12 = arith.addi %11, %cst : tensor<512xi64>
|
21 |
+
%13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64>
|
22 |
+
%14 = arith.select %13, %12, %11 : tensor<512xi1>, tensor<512xi64>
|
23 |
+
%15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64>
|
24 |
+
%16 = arith.cmpi slt, %14, %cst : tensor<512xi64>
|
25 |
+
%17 = arith.andi %15, %16 : tensor<512xi1>
|
26 |
+
tt.assert %17, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1>
|
27 |
+
%18 = arith.muli %6, %cst : tensor<512xi64>
|
28 |
+
%19 = arith.addi %14, %18 : tensor<512xi64>
|
29 |
+
%20 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
|
30 |
+
%21 = tt.addptr %20, %19 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi64>
|
31 |
+
tt.store %21, %cst_1 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
|
32 |
+
tt.return
|
33 |
+
}
|
34 |
+
}
|
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%c1024_i32 = arith.constant 1024 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
10 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
|
11 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
|
12 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
|
13 |
+
%8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
14 |
+
%9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
15 |
+
%10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
|
16 |
+
tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
|
17 |
+
tt.return
|
18 |
+
}
|
19 |
+
}
|
.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
|
5 |
+
%c512_i64 = arith.constant 512 : i64
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.extsi %0 : i32 to i64
|
8 |
+
%2 = arith.muli %1, %c512_i64 : i64
|
9 |
+
%3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
10 |
+
%4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked>
|
11 |
+
%5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked>
|
12 |
+
%6 = arith.addi %5, %4 : tensor<512xi64, #blocked>
|
13 |
+
%7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
|
14 |
+
%8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi64, #blocked>
|
15 |
+
tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin
ADDED
Binary file (10.5 kB). View file
|
|
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx
ADDED
@@ -0,0 +1,525 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7de8
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2d3d4d5d6d7de8(
|
12 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
|
21 |
+
)
|
22 |
+
.maxntid 256, 1, 1
|
23 |
+
{
|
24 |
+
.reg .pred %p<16>;
|
25 |
+
.reg .b16 %rs<9>;
|
26 |
+
.reg .b32 %r<31>;
|
27 |
+
.reg .f32 %f<23>;
|
28 |
+
.reg .b64 %rd<51>;
|
29 |
+
.loc 1 18 0
|
30 |
+
$L__func_begin0:
|
31 |
+
.loc 1 18 0
|
32 |
+
|
33 |
+
ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8_param_6];
|
34 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8_param_5];
|
35 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8_param_4];
|
36 |
+
ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7de8_param_0];
|
37 |
+
ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_1];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 22 44
|
40 |
+
mov.u32 %r13, %tid.x;
|
41 |
+
ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7de8_param_2];
|
42 |
+
bfe.u32 %r14, %r13, 2, 6;
|
43 |
+
ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7de8_param_3];
|
44 |
+
.loc 1 24 33
|
45 |
+
and.b32 %r1, %r13, 3;
|
46 |
+
.loc 1 21 28
|
47 |
+
mov.u32 %r6, %ctaid.x;
|
48 |
+
.loc 1 21 34
|
49 |
+
cvt.s64.s32 %rd1, %r6;
|
50 |
+
.loc 1 21 46
|
51 |
+
mul.wide.s32 %rd27, %r6, 64;
|
52 |
+
cvt.u64.u32 %rd2, %r14;
|
53 |
+
.loc 1 22 23
|
54 |
+
or.b64 %rd28, %rd27, %rd2;
|
55 |
+
.loc 1 26 30
|
56 |
+
shl.b64 %rd29, %rd28, 3;
|
57 |
+
add.s64 %rd22, %rd26, %rd29;
|
58 |
+
mov.pred %p1, -1;
|
59 |
+
.loc 1 26 35
|
60 |
+
mov.u64 %rd21, 0x0;
|
61 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd22 + 0 ];
|
62 |
+
.loc 1 27 19
|
63 |
+
mov.u32 %r10, 0x0;
|
64 |
+
@%p1 ld.global.b32 { %r10 }, [ %rd23 + 0 ];
|
65 |
+
.loc 1 29 19
|
66 |
+
mov.u32 %r11, 0x0;
|
67 |
+
@%p1 ld.global.b32 { %r11 }, [ %rd24 + 0 ];
|
68 |
+
.loc 1 38 23
|
69 |
+
setp.eq.s64 %p4, %rd21, -1;
|
70 |
+
.loc 1 39 22
|
71 |
+
div.full.f32 %r9, %r10, %r11;
|
72 |
+
mov.b32 %f6, %r9;
|
73 |
+
.loc 1 41 37
|
74 |
+
selp.f32 %f1, 0f00000000, %f6, %p4;
|
75 |
+
.loc 1 32 36
|
76 |
+
mul.wide.s32 %rd30, %r6, 12865792;
|
77 |
+
mul.wide.u32 %rd31, %r14, 201028;
|
78 |
+
add.s64 %rd32, %rd30, %rd31;
|
79 |
+
cvt.u64.u32 %rd33, %r13;
|
80 |
+
and.b64 %rd3, %rd33, 3;
|
81 |
+
mul.wide.u32 %rd34, %r1, 4;
|
82 |
+
add.s64 %rd35, %rd32, %rd34;
|
83 |
+
add.s64 %rd50, %rd25, %rd35;
|
84 |
+
mov.f32 %f22, 0f00000000;
|
85 |
+
mov.b32 %r29, -4;
|
86 |
+
mov.u64 %rd46, %rd50;
|
87 |
+
$L__BB0_1:
|
88 |
+
add.s32 %r29, %r29, 4;
|
89 |
+
.loc 1 33 27
|
90 |
+
add.s32 %r17, %r29, %r1;
|
91 |
+
.loc 1 34 25
|
92 |
+
setp.lt.u32 %p5, %r17, 50257;
|
93 |
+
mov.b32 %r16, 0;
|
94 |
+
.loc 1 36 52
|
95 |
+
mov.u32 %r15, 0x0;
|
96 |
+
@%p5 ld.global.L1::evict_last.b32 { %r15 }, [ %rd46 + 0 ];
|
97 |
+
@!%p5 mov.u32 %r15, %r16;
|
98 |
+
mov.b32 %f7, %r15;
|
99 |
+
.loc 1 42 23
|
100 |
+
mul.f32 %f8, %f1, %f7;
|
101 |
+
.loc 1 45 40
|
102 |
+
selp.f32 %f9, %f8, 0f80000000, %p5;
|
103 |
+
add.f32 %f22, %f22, %f9;
|
104 |
+
.loc 1 32 36
|
105 |
+
add.s64 %rd46, %rd46, 16;
|
106 |
+
setp.lt.u32 %p7, %r29, 50253;
|
107 |
+
@%p7 bra $L__BB0_1;
|
108 |
+
$L__tmp1:
|
109 |
+
.loc 2 243 36
|
110 |
+
mov.b32 %r19, %f22;
|
111 |
+
shfl.sync.bfly.b32 %r20, %r19, 2, 31, -1;
|
112 |
+
mov.b32 %f10, %r20;
|
113 |
+
$L__tmp2:
|
114 |
+
.loc 2 233 15
|
115 |
+
add.f32 %f11, %f22, %f10;
|
116 |
+
$L__tmp3:
|
117 |
+
.loc 2 243 36
|
118 |
+
mov.b32 %r21, %f11;
|
119 |
+
shfl.sync.bfly.b32 %r22, %r21, 1, 31, -1;
|
120 |
+
mov.b32 %f12, %r22;
|
121 |
+
$L__tmp4:
|
122 |
+
.loc 2 233 15
|
123 |
+
add.f32 %f4, %f11, %f12;
|
124 |
+
$L__tmp5:
|
125 |
+
.loc 1 51 36
|
126 |
+
mul.lo.s64 %rd37, %rd1, 3216448;
|
127 |
+
mul.lo.s64 %rd38, %rd2, 50257;
|
128 |
+
add.s64 %rd39, %rd37, %rd38;
|
129 |
+
add.s64 %rd40, %rd39, %rd3;
|
130 |
+
shl.b64 %rd41, %rd40, 1;
|
131 |
+
add.s64 %rd49, %rd20, %rd41;
|
132 |
+
add.s64 %rd48, %rd19, %rd41;
|
133 |
+
add.s64 %rd47, %rd18, %rd41;
|
134 |
+
mov.b32 %r30, -4;
|
135 |
+
mov.u16 %rs2, 0;
|
136 |
+
$L__BB0_3:
|
137 |
+
add.s32 %r30, %r30, 4;
|
138 |
+
.loc 1 52 27
|
139 |
+
add.s32 %r28, %r30, %r1;
|
140 |
+
.loc 1 53 25
|
141 |
+
setp.lt.u32 %p8, %r28, 50257;
|
142 |
+
.loc 1 55 53
|
143 |
+
mov.u16 %rs1, 0x0;
|
144 |
+
@%p8 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
|
145 |
+
@!%p8 mov.u16 %rs1, %rs2;
|
146 |
+
.loc 1 55 105
|
147 |
+
cvt.f32.bf16 %r23, %rs1;
|
148 |
+
mov.b32 %f15, %r23;
|
149 |
+
.loc 1 56 53
|
150 |
+
mov.u32 %r24, 0x0;
|
151 |
+
@%p8 ld.global.L1::evict_first.b32 { %r24 }, [ %rd50 + 0 ];
|
152 |
+
@!%p8 mov.u32 %r24, %r16;
|
153 |
+
mov.b32 %f16, %r24;
|
154 |
+
.loc 1 57 53
|
155 |
+
mov.u16 %rs4, 0x0;
|
156 |
+
@%p8 ld.global.L1::evict_first.b16 { %rs4 }, [ %rd48 + 0 ];
|
157 |
+
@!%p8 mov.u16 %rs4, %rs2;
|
158 |
+
.loc 1 57 105
|
159 |
+
cvt.f32.bf16 %r26, %rs4;
|
160 |
+
mov.b32 %f17, %r26;
|
161 |
+
.loc 1 65 23
|
162 |
+
mul.f32 %f14, %f17, 0f3FB8AA3B;
|
163 |
+
ex2.approx.f32 %f13, %f14;
|
164 |
+
.loc 1 66 24
|
165 |
+
mul.f32 %f18, %f4, %f13;
|
166 |
+
.loc 1 67 24
|
167 |
+
neg.f32 %f19, %f18;
|
168 |
+
fma.rn.f32 %f20, %f1, %f16, %f19;
|
169 |
+
.loc 1 69 24
|
170 |
+
add.f32 %f21, %f15, %f20;
|
171 |
+
.loc 1 70 54
|
172 |
+
mov.b32 %r27, %f21;
|
173 |
+
cvt.rn.bf16.f32 %rs7, %r27;
|
174 |
+
@%p8 st.global.b16 [ %rd49 + 0 ], { %rs7 };
|
175 |
+
.loc 1 51 36
|
176 |
+
add.s64 %rd50, %rd50, 16;
|
177 |
+
add.s64 %rd49, %rd49, 8;
|
178 |
+
add.s64 %rd48, %rd48, 8;
|
179 |
+
add.s64 %rd47, %rd47, 8;
|
180 |
+
setp.lt.u32 %p15, %r30, 50253;
|
181 |
+
@%p15 bra $L__BB0_3;
|
182 |
+
.loc 1 51 4
|
183 |
+
ret;
|
184 |
+
$L__tmp6:
|
185 |
+
$L__func_end0:
|
186 |
+
|
187 |
+
}
|
188 |
+
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
|
189 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
190 |
+
.section .debug_abbrev
|
191 |
+
{
|
192 |
+
.b8 1
|
193 |
+
.b8 17
|
194 |
+
.b8 1
|
195 |
+
.b8 37
|
196 |
+
.b8 8
|
197 |
+
.b8 19
|
198 |
+
.b8 5
|
199 |
+
.b8 3
|
200 |
+
.b8 8
|
201 |
+
.b8 16
|
202 |
+
.b8 6
|
203 |
+
.b8 27
|
204 |
+
.b8 8
|
205 |
+
.b8 180
|
206 |
+
.b8 66
|
207 |
+
.b8 12
|
208 |
+
.b8 17
|
209 |
+
.b8 1
|
210 |
+
.b8 18
|
211 |
+
.b8 1
|
212 |
+
.b8 0
|
213 |
+
.b8 0
|
214 |
+
.b8 2
|
215 |
+
.b8 46
|
216 |
+
.b8 0
|
217 |
+
.b8 135
|
218 |
+
.b8 64
|
219 |
+
.b8 8
|
220 |
+
.b8 3
|
221 |
+
.b8 8
|
222 |
+
.b8 58
|
223 |
+
.b8 11
|
224 |
+
.b8 59
|
225 |
+
.b8 11
|
226 |
+
.b8 63
|
227 |
+
.b8 12
|
228 |
+
.b8 32
|
229 |
+
.b8 11
|
230 |
+
.b8 0
|
231 |
+
.b8 0
|
232 |
+
.b8 3
|
233 |
+
.b8 46
|
234 |
+
.b8 1
|
235 |
+
.b8 17
|
236 |
+
.b8 1
|
237 |
+
.b8 18
|
238 |
+
.b8 1
|
239 |
+
.b8 64
|
240 |
+
.b8 10
|
241 |
+
.b8 49
|
242 |
+
.b8 19
|
243 |
+
.b8 0
|
244 |
+
.b8 0
|
245 |
+
.b8 4
|
246 |
+
.b8 29
|
247 |
+
.b8 0
|
248 |
+
.b8 49
|
249 |
+
.b8 19
|
250 |
+
.b8 17
|
251 |
+
.b8 1
|
252 |
+
.b8 18
|
253 |
+
.b8 1
|
254 |
+
.b8 88
|
255 |
+
.b8 11
|
256 |
+
.b8 89
|
257 |
+
.b8 11
|
258 |
+
.b8 87
|
259 |
+
.b8 11
|
260 |
+
.b8 0
|
261 |
+
.b8 0
|
262 |
+
.b8 5
|
263 |
+
.b8 29
|
264 |
+
.b8 1
|
265 |
+
.b8 49
|
266 |
+
.b8 19
|
267 |
+
.b8 17
|
268 |
+
.b8 1
|
269 |
+
.b8 18
|
270 |
+
.b8 1
|
271 |
+
.b8 88
|
272 |
+
.b8 11
|
273 |
+
.b8 89
|
274 |
+
.b8 11
|
275 |
+
.b8 87
|
276 |
+
.b8 11
|
277 |
+
.b8 0
|
278 |
+
.b8 0
|
279 |
+
.b8 0
|
280 |
+
}
|
281 |
+
.section .debug_info
|
282 |
+
{
|
283 |
+
.b32 278
|
284 |
+
.b8 2
|
285 |
+
.b8 0
|
286 |
+
.b32 .debug_abbrev
|
287 |
+
.b8 8
|
288 |
+
.b8 1
|
289 |
+
.b8 116
|
290 |
+
.b8 114
|
291 |
+
.b8 105
|
292 |
+
.b8 116
|
293 |
+
.b8 111
|
294 |
+
.b8 110
|
295 |
+
.b8 0
|
296 |
+
.b8 2
|
297 |
+
.b8 0
|
298 |
+
.b8 99
|
299 |
+
.b8 107
|
300 |
+
.b8 122
|
301 |
+
.b8 103
|
302 |
+
.b8 108
|
303 |
+
.b8 55
|
304 |
+
.b8 116
|
305 |
+
.b8 104
|
306 |
+
.b8 98
|
307 |
+
.b8 52
|
308 |
+
.b8 120
|
309 |
+
.b8 100
|
310 |
+
.b8 102
|
311 |
+
.b8 107
|
312 |
+
.b8 102
|
313 |
+
.b8 110
|
314 |
+
.b8 100
|
315 |
+
.b8 50
|
316 |
+
.b8 116
|
317 |
+
.b8 105
|
318 |
+
.b8 100
|
319 |
+
.b8 107
|
320 |
+
.b8 115
|
321 |
+
.b8 54
|
322 |
+
.b8 109
|
323 |
+
.b8 116
|
324 |
+
.b8 53
|
325 |
+
.b8 102
|
326 |
+
.b8 51
|
327 |
+
.b8 104
|
328 |
+
.b8 97
|
329 |
+
.b8 117
|
330 |
+
.b8 119
|
331 |
+
.b8 102
|
332 |
+
.b8 121
|
333 |
+
.b8 106
|
334 |
+
.b8 102
|
335 |
+
.b8 108
|
336 |
+
.b8 98
|
337 |
+
.b8 116
|
338 |
+
.b8 122
|
339 |
+
.b8 121
|
340 |
+
.b8 101
|
341 |
+
.b8 112
|
342 |
+
.b8 111
|
343 |
+
.b8 53
|
344 |
+
.b8 111
|
345 |
+
.b8 120
|
346 |
+
.b8 107
|
347 |
+
.b8 118
|
348 |
+
.b8 104
|
349 |
+
.b8 107
|
350 |
+
.b8 46
|
351 |
+
.b8 112
|
352 |
+
.b8 121
|
353 |
+
.b8 0
|
354 |
+
.b32 .debug_line
|
355 |
+
.b8 47
|
356 |
+
.b8 116
|
357 |
+
.b8 109
|
358 |
+
.b8 112
|
359 |
+
.b8 47
|
360 |
+
.b8 116
|
361 |
+
.b8 111
|
362 |
+
.b8 114
|
363 |
+
.b8 99
|
364 |
+
.b8 104
|
365 |
+
.b8 105
|
366 |
+
.b8 110
|
367 |
+
.b8 100
|
368 |
+
.b8 117
|
369 |
+
.b8 99
|
370 |
+
.b8 116
|
371 |
+
.b8 111
|
372 |
+
.b8 114
|
373 |
+
.b8 95
|
374 |
+
.b8 114
|
375 |
+
.b8 111
|
376 |
+
.b8 111
|
377 |
+
.b8 116
|
378 |
+
.b8 47
|
379 |
+
.b8 107
|
380 |
+
.b8 122
|
381 |
+
.b8 0
|
382 |
+
.b8 1
|
383 |
+
.b64 $L__func_begin0
|
384 |
+
.b64 $L__func_end0
|
385 |
+
.b8 2
|
386 |
+
.b8 116
|
387 |
+
.b8 114
|
388 |
+
.b8 105
|
389 |
+
.b8 116
|
390 |
+
.b8 111
|
391 |
+
.b8 110
|
392 |
+
.b8 95
|
393 |
+
.b8 95
|
394 |
+
.b8 48
|
395 |
+
.b8 100
|
396 |
+
.b8 49
|
397 |
+
.b8 100
|
398 |
+
.b8 50
|
399 |
+
.b8 100
|
400 |
+
.b8 51
|
401 |
+
.b8 100
|
402 |
+
.b8 52
|
403 |
+
.b8 100
|
404 |
+
.b8 53
|
405 |
+
.b8 100
|
406 |
+
.b8 54
|
407 |
+
.b8 100
|
408 |
+
.b8 55
|
409 |
+
.b8 100
|
410 |
+
.b8 101
|
411 |
+
.b8 56
|
412 |
+
.b8 0
|
413 |
+
.b8 116
|
414 |
+
.b8 114
|
415 |
+
.b8 105
|
416 |
+
.b8 116
|
417 |
+
.b8 111
|
418 |
+
.b8 110
|
419 |
+
.b8 95
|
420 |
+
.b8 95
|
421 |
+
.b8 48
|
422 |
+
.b8 100
|
423 |
+
.b8 49
|
424 |
+
.b8 100
|
425 |
+
.b8 50
|
426 |
+
.b8 100
|
427 |
+
.b8 51
|
428 |
+
.b8 100
|
429 |
+
.b8 52
|
430 |
+
.b8 100
|
431 |
+
.b8 53
|
432 |
+
.b8 100
|
433 |
+
.b8 54
|
434 |
+
.b8 100
|
435 |
+
.b8 55
|
436 |
+
.b8 100
|
437 |
+
.b8 101
|
438 |
+
.b8 56
|
439 |
+
.b8 0
|
440 |
+
.b8 1
|
441 |
+
.b8 18
|
442 |
+
.b8 1
|
443 |
+
.b8 1
|
444 |
+
.b8 3
|
445 |
+
.b64 $L__func_begin0
|
446 |
+
.b64 $L__func_end0
|
447 |
+
.b8 1
|
448 |
+
.b8 156
|
449 |
+
.b32 125
|
450 |
+
.b8 4
|
451 |
+
.b32 125
|
452 |
+
.b64 $L__tmp1
|
453 |
+
.b64 $L__tmp4
|
454 |
+
.b8 2
|
455 |
+
.b8 46
|
456 |
+
.b8 27
|
457 |
+
.b8 5
|
458 |
+
.b32 125
|
459 |
+
.b64 $L__tmp2
|
460 |
+
.b64 $L__tmp5
|
461 |
+
.b8 2
|
462 |
+
.b8 46
|
463 |
+
.b8 27
|
464 |
+
.b8 4
|
465 |
+
.b32 125
|
466 |
+
.b64 $L__tmp2
|
467 |
+
.b64 $L__tmp5
|
468 |
+
.b8 2
|
469 |
+
.b8 243
|
470 |
+
.b8 36
|
471 |
+
.b8 0
|
472 |
+
.b8 0
|
473 |
+
.b8 0
|
474 |
+
}
|
475 |
+
.section .debug_pubnames
|
476 |
+
{
|
477 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
478 |
+
$L__pubNames_start0:
|
479 |
+
.b8 2
|
480 |
+
.b8 0
|
481 |
+
.b32 .debug_info
|
482 |
+
.b32 282
|
483 |
+
.b32 125
|
484 |
+
.b8 116
|
485 |
+
.b8 114
|
486 |
+
.b8 105
|
487 |
+
.b8 116
|
488 |
+
.b8 111
|
489 |
+
.b8 110
|
490 |
+
.b8 95
|
491 |
+
.b8 95
|
492 |
+
.b8 48
|
493 |
+
.b8 100
|
494 |
+
.b8 49
|
495 |
+
.b8 100
|
496 |
+
.b8 50
|
497 |
+
.b8 100
|
498 |
+
.b8 51
|
499 |
+
.b8 100
|
500 |
+
.b8 52
|
501 |
+
.b8 100
|
502 |
+
.b8 53
|
503 |
+
.b8 100
|
504 |
+
.b8 54
|
505 |
+
.b8 100
|
506 |
+
.b8 55
|
507 |
+
.b8 100
|
508 |
+
.b8 101
|
509 |
+
.b8 56
|
510 |
+
.b8 0
|
511 |
+
.b32 0
|
512 |
+
$L__pubNames_end0:
|
513 |
+
}
|
514 |
+
.section .debug_pubtypes
|
515 |
+
{
|
516 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
517 |
+
$L__pubTypes_start0:
|
518 |
+
.b8 2
|
519 |
+
.b8 0
|
520 |
+
.b32 .debug_info
|
521 |
+
.b32 282
|
522 |
+
.b32 0
|
523 |
+
$L__pubTypes_end0:
|
524 |
+
}
|
525 |
+
.section .debug_loc { }
|
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
9 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
|
10 |
+
%4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
11 |
+
%5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
|
12 |
+
%6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
|
13 |
+
%7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
|
14 |
+
%8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
15 |
+
%9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
16 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
17 |
+
%11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
|
18 |
+
%12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
|
19 |
+
%13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
|
20 |
+
%14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
|
21 |
+
tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
|
22 |
+
tt.return
|
23 |
+
}
|
24 |
+
}
|
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.llir
ADDED
@@ -0,0 +1,793 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
|
7 |
+
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%11 = lshr i32 %10, 5, !dbg !8
|
9 |
+
%urem = and i32 %10, 255, !dbg !9
|
10 |
+
%12 = or i32 %urem, 256, !dbg !9
|
11 |
+
%13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
12 |
+
%14 = sext i32 %13 to i64, !dbg !11
|
13 |
+
%15 = shl nsw i64 %14, 3, !dbg !12
|
14 |
+
%16 = or i64 %15, 1, !dbg !13
|
15 |
+
%17 = or i64 %15, 2, !dbg !13
|
16 |
+
%18 = or i64 %15, 3, !dbg !13
|
17 |
+
%19 = or i64 %15, 4, !dbg !13
|
18 |
+
%20 = or i64 %15, 5, !dbg !13
|
19 |
+
%21 = or i64 %15, 6, !dbg !13
|
20 |
+
%22 = or i64 %15, 7, !dbg !13
|
21 |
+
%23 = insertelement <2 x i32> poison, i32 %urem, i64 0
|
22 |
+
%24 = insertelement <2 x i32> %23, i32 %12, i64 1
|
23 |
+
%25 = zext nneg <2 x i32> %24 to <2 x i64>
|
24 |
+
%26 = getelementptr i64, ptr addrspace(1) %1, i64 %15, !dbg !14
|
25 |
+
%27 = getelementptr i64, ptr addrspace(1) %1, i64 %16, !dbg !14
|
26 |
+
%28 = getelementptr i64, ptr addrspace(1) %1, i64 %17, !dbg !14
|
27 |
+
%29 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14
|
28 |
+
%30 = getelementptr i64, ptr addrspace(1) %1, i64 %19, !dbg !14
|
29 |
+
%31 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !14
|
30 |
+
%32 = getelementptr i64, ptr addrspace(1) %1, i64 %21, !dbg !14
|
31 |
+
%33 = getelementptr i64, ptr addrspace(1) %1, i64 %22, !dbg !14
|
32 |
+
%34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #3, !dbg !15
|
33 |
+
%35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #3, !dbg !15
|
34 |
+
%36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #3, !dbg !15
|
35 |
+
%37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %29, i1 true) #3, !dbg !15
|
36 |
+
%38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !15
|
37 |
+
%39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %31, i1 true) #3, !dbg !15
|
38 |
+
%40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %32, i1 true) #3, !dbg !15
|
39 |
+
%41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %33, i1 true) #3, !dbg !15
|
40 |
+
%42 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !16
|
41 |
+
%43 = bitcast i32 %42 to float, !dbg !16
|
42 |
+
%44 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !17
|
43 |
+
%45 = bitcast i32 %44 to float, !dbg !17
|
44 |
+
%46 = mul nsw i64 %14, 402056, !dbg !18
|
45 |
+
%47 = mul nsw i64 %16, 50257, !dbg !18
|
46 |
+
%48 = mul nsw i64 %17, 50257, !dbg !18
|
47 |
+
%49 = mul nsw i64 %18, 50257, !dbg !18
|
48 |
+
%50 = mul nsw i64 %19, 50257, !dbg !18
|
49 |
+
%51 = mul nsw i64 %20, 50257, !dbg !18
|
50 |
+
%52 = mul nsw i64 %21, 50257, !dbg !18
|
51 |
+
%53 = mul nsw i64 %22, 50257, !dbg !18
|
52 |
+
%54 = insertelement <8 x i64> poison, i64 %34, i64 0, !dbg !19
|
53 |
+
%55 = insertelement <8 x i64> %54, i64 %35, i64 1, !dbg !19
|
54 |
+
%56 = insertelement <8 x i64> %55, i64 %36, i64 2, !dbg !19
|
55 |
+
%57 = insertelement <8 x i64> %56, i64 %37, i64 3, !dbg !19
|
56 |
+
%58 = insertelement <8 x i64> %57, i64 %38, i64 4, !dbg !19
|
57 |
+
%59 = insertelement <8 x i64> %58, i64 %39, i64 5, !dbg !19
|
58 |
+
%60 = insertelement <8 x i64> %59, i64 %40, i64 6, !dbg !19
|
59 |
+
%61 = insertelement <8 x i64> %60, i64 %41, i64 7, !dbg !19
|
60 |
+
%62 = icmp eq <8 x i64> %61, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !19
|
61 |
+
%63 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %43, float %45) #3, !dbg !20
|
62 |
+
%64 = insertelement <8 x float> poison, float %63, i64 0, !dbg !21
|
63 |
+
%65 = shufflevector <8 x float> %64, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !21
|
64 |
+
%66 = select <8 x i1> %62, <8 x float> zeroinitializer, <8 x float> %65, !dbg !21
|
65 |
+
%67 = shufflevector <8 x float> %66, <8 x float> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>, !dbg !21
|
66 |
+
br label %68, !dbg !22
|
67 |
+
|
68 |
+
68: ; preds = %9, %68
|
69 |
+
%69 = phi i32 [ 0, %9 ], [ %135, %68 ]
|
70 |
+
%70 = phi <16 x float> [ zeroinitializer, %9 ], [ %134, %68 ]
|
71 |
+
%71 = zext nneg i32 %69 to i64, !dbg !23
|
72 |
+
%72 = insertelement <2 x i64> poison, i64 %71, i64 0, !dbg !23
|
73 |
+
%73 = shufflevector <2 x i64> %72, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !23
|
74 |
+
%74 = or <2 x i64> %73, %25, !dbg !23
|
75 |
+
%75 = icmp ult <2 x i64> %74, <i64 50257, i64 50257>, !dbg !24
|
76 |
+
%76 = shufflevector <2 x i1> %75, <2 x i1> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, !dbg !24
|
77 |
+
%77 = extractelement <2 x i64> %74, i64 0, !dbg !25
|
78 |
+
%78 = getelementptr float, ptr addrspace(1) %0, i64 %77, !dbg !25
|
79 |
+
%79 = getelementptr float, ptr addrspace(1) %78, i64 %46, !dbg !25
|
80 |
+
%80 = extractelement <2 x i64> %74, i64 1, !dbg !25
|
81 |
+
%81 = getelementptr float, ptr addrspace(1) %0, i64 %80, !dbg !25
|
82 |
+
%82 = getelementptr float, ptr addrspace(1) %81, i64 %46, !dbg !25
|
83 |
+
%83 = getelementptr float, ptr addrspace(1) %78, i64 %47, !dbg !25
|
84 |
+
%84 = getelementptr float, ptr addrspace(1) %81, i64 %47, !dbg !25
|
85 |
+
%85 = getelementptr float, ptr addrspace(1) %78, i64 %48, !dbg !25
|
86 |
+
%86 = getelementptr float, ptr addrspace(1) %81, i64 %48, !dbg !25
|
87 |
+
%87 = getelementptr float, ptr addrspace(1) %78, i64 %49, !dbg !25
|
88 |
+
%88 = getelementptr float, ptr addrspace(1) %81, i64 %49, !dbg !25
|
89 |
+
%89 = getelementptr float, ptr addrspace(1) %78, i64 %50, !dbg !25
|
90 |
+
%90 = getelementptr float, ptr addrspace(1) %81, i64 %50, !dbg !25
|
91 |
+
%91 = getelementptr float, ptr addrspace(1) %78, i64 %51, !dbg !25
|
92 |
+
%92 = getelementptr float, ptr addrspace(1) %81, i64 %51, !dbg !25
|
93 |
+
%93 = getelementptr float, ptr addrspace(1) %78, i64 %52, !dbg !25
|
94 |
+
%94 = getelementptr float, ptr addrspace(1) %81, i64 %52, !dbg !25
|
95 |
+
%95 = getelementptr float, ptr addrspace(1) %78, i64 %53, !dbg !25
|
96 |
+
%96 = getelementptr float, ptr addrspace(1) %81, i64 %53, !dbg !25
|
97 |
+
%97 = extractelement <2 x i1> %75, i64 0, !dbg !26
|
98 |
+
%98 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %79, i1 %97, i32 0, i1 %97) #3, !dbg !26
|
99 |
+
%99 = extractelement <2 x i1> %75, i64 1, !dbg !26
|
100 |
+
%100 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 %99, i32 0, i1 %99) #3, !dbg !26
|
101 |
+
%101 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %83, i1 %97, i32 0, i1 %97) #3, !dbg !26
|
102 |
+
%102 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %84, i1 %99, i32 0, i1 %99) #3, !dbg !26
|
103 |
+
%103 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %85, i1 %97, i32 0, i1 %97) #3, !dbg !26
|
104 |
+
%104 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %86, i1 %99, i32 0, i1 %99) #3, !dbg !26
|
105 |
+
%105 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %87, i1 %97, i32 0, i1 %97) #3, !dbg !26
|
106 |
+
%106 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %88, i1 %99, i32 0, i1 %99) #3, !dbg !26
|
107 |
+
%107 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %89, i1 %97, i32 0, i1 %97) #3, !dbg !26
|
108 |
+
%108 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %90, i1 %99, i32 0, i1 %99) #3, !dbg !26
|
109 |
+
%109 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 %97, i32 0, i1 %97) #3, !dbg !26
|
110 |
+
%110 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %92, i1 %99, i32 0, i1 %99) #3, !dbg !26
|
111 |
+
%111 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %93, i1 %97, i32 0, i1 %97) #3, !dbg !26
|
112 |
+
%112 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %94, i1 %99, i32 0, i1 %99) #3, !dbg !26
|
113 |
+
%113 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %95, i1 %97, i32 0, i1 %97) #3, !dbg !26
|
114 |
+
%114 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %96, i1 %99, i32 0, i1 %99) #3, !dbg !26
|
115 |
+
%115 = insertelement <16 x i32> poison, i32 %98, i64 0, !dbg !26
|
116 |
+
%116 = insertelement <16 x i32> %115, i32 %100, i64 1, !dbg !26
|
117 |
+
%117 = insertelement <16 x i32> %116, i32 %101, i64 2, !dbg !26
|
118 |
+
%118 = insertelement <16 x i32> %117, i32 %102, i64 3, !dbg !26
|
119 |
+
%119 = insertelement <16 x i32> %118, i32 %103, i64 4, !dbg !26
|
120 |
+
%120 = insertelement <16 x i32> %119, i32 %104, i64 5, !dbg !26
|
121 |
+
%121 = insertelement <16 x i32> %120, i32 %105, i64 6, !dbg !26
|
122 |
+
%122 = insertelement <16 x i32> %121, i32 %106, i64 7, !dbg !26
|
123 |
+
%123 = insertelement <16 x i32> %122, i32 %107, i64 8, !dbg !26
|
124 |
+
%124 = insertelement <16 x i32> %123, i32 %108, i64 9, !dbg !26
|
125 |
+
%125 = insertelement <16 x i32> %124, i32 %109, i64 10, !dbg !26
|
126 |
+
%126 = insertelement <16 x i32> %125, i32 %110, i64 11, !dbg !26
|
127 |
+
%127 = insertelement <16 x i32> %126, i32 %111, i64 12, !dbg !26
|
128 |
+
%128 = insertelement <16 x i32> %127, i32 %112, i64 13, !dbg !26
|
129 |
+
%129 = insertelement <16 x i32> %128, i32 %113, i64 14, !dbg !26
|
130 |
+
%130 = insertelement <16 x i32> %129, i32 %114, i64 15, !dbg !26
|
131 |
+
%131 = bitcast <16 x i32> %130 to <16 x float>, !dbg !26
|
132 |
+
%132 = fmul <16 x float> %67, %131, !dbg !27
|
133 |
+
%133 = select <16 x i1> %76, <16 x float> %132, <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !28
|
134 |
+
%134 = fadd <16 x float> %70, %133, !dbg !28
|
135 |
+
%135 = add nuw nsw i32 %69, 512, !dbg !22
|
136 |
+
%136 = icmp ult i32 %69, 49745, !dbg !22
|
137 |
+
br i1 %136, label %68, label %137, !dbg !22
|
138 |
+
|
139 |
+
137: ; preds = %68
|
140 |
+
%138 = and i32 %10, 31, !dbg !8
|
141 |
+
%139 = and i32 %11, 7, !dbg !9
|
142 |
+
%shift = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
|
143 |
+
%140 = fadd <16 x float> %134, %shift, !dbg !29
|
144 |
+
%141 = extractelement <16 x float> %140, i64 0, !dbg !29
|
145 |
+
%shift54 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
|
146 |
+
%142 = fadd <16 x float> %134, %shift54, !dbg !29
|
147 |
+
%143 = extractelement <16 x float> %142, i64 2, !dbg !29
|
148 |
+
%shift55 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
|
149 |
+
%144 = fadd <16 x float> %134, %shift55, !dbg !29
|
150 |
+
%145 = extractelement <16 x float> %144, i64 4, !dbg !29
|
151 |
+
%shift56 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
|
152 |
+
%146 = fadd <16 x float> %134, %shift56, !dbg !29
|
153 |
+
%147 = extractelement <16 x float> %146, i64 6, !dbg !29
|
154 |
+
%shift57 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
|
155 |
+
%148 = fadd <16 x float> %134, %shift57, !dbg !29
|
156 |
+
%149 = extractelement <16 x float> %148, i64 8, !dbg !29
|
157 |
+
%shift58 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
|
158 |
+
%150 = fadd <16 x float> %134, %shift58, !dbg !29
|
159 |
+
%151 = extractelement <16 x float> %150, i64 10, !dbg !29
|
160 |
+
%shift59 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 13, i32 poison, i32 poison, i32 poison>, !dbg !29
|
161 |
+
%152 = fadd <16 x float> %134, %shift59, !dbg !29
|
162 |
+
%153 = extractelement <16 x float> %152, i64 12, !dbg !29
|
163 |
+
%shift60 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 15, i32 poison>, !dbg !29
|
164 |
+
%154 = fadd <16 x float> %134, %shift60, !dbg !29
|
165 |
+
%155 = extractelement <16 x float> %154, i64 14, !dbg !29
|
166 |
+
%156 = bitcast float %141 to i32, !dbg !35
|
167 |
+
%157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 16, i32 31), !dbg !35
|
168 |
+
%158 = bitcast i32 %157 to float, !dbg !35
|
169 |
+
%159 = fadd float %141, %158, !dbg !29
|
170 |
+
%160 = bitcast float %159 to i32, !dbg !35
|
171 |
+
%161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 8, i32 31), !dbg !35
|
172 |
+
%162 = bitcast i32 %161 to float, !dbg !35
|
173 |
+
%163 = fadd float %159, %162, !dbg !29
|
174 |
+
%164 = bitcast float %163 to i32, !dbg !35
|
175 |
+
%165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 4, i32 31), !dbg !35
|
176 |
+
%166 = bitcast i32 %165 to float, !dbg !35
|
177 |
+
%167 = fadd float %163, %166, !dbg !29
|
178 |
+
%168 = bitcast float %167 to i32, !dbg !35
|
179 |
+
%169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 2, i32 31), !dbg !35
|
180 |
+
%170 = bitcast i32 %169 to float, !dbg !35
|
181 |
+
%171 = fadd float %167, %170, !dbg !29
|
182 |
+
%172 = bitcast float %171 to i32, !dbg !35
|
183 |
+
%173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 1, i32 31), !dbg !35
|
184 |
+
%174 = bitcast i32 %173 to float, !dbg !35
|
185 |
+
%175 = fadd float %171, %174, !dbg !29
|
186 |
+
%176 = bitcast float %143 to i32, !dbg !35
|
187 |
+
%177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 16, i32 31), !dbg !35
|
188 |
+
%178 = bitcast i32 %177 to float, !dbg !35
|
189 |
+
%179 = fadd float %143, %178, !dbg !29
|
190 |
+
%180 = bitcast float %179 to i32, !dbg !35
|
191 |
+
%181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !35
|
192 |
+
%182 = bitcast i32 %181 to float, !dbg !35
|
193 |
+
%183 = fadd float %179, %182, !dbg !29
|
194 |
+
%184 = bitcast float %183 to i32, !dbg !35
|
195 |
+
%185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 4, i32 31), !dbg !35
|
196 |
+
%186 = bitcast i32 %185 to float, !dbg !35
|
197 |
+
%187 = fadd float %183, %186, !dbg !29
|
198 |
+
%188 = bitcast float %187 to i32, !dbg !35
|
199 |
+
%189 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 2, i32 31), !dbg !35
|
200 |
+
%190 = bitcast i32 %189 to float, !dbg !35
|
201 |
+
%191 = fadd float %187, %190, !dbg !29
|
202 |
+
%192 = bitcast float %191 to i32, !dbg !35
|
203 |
+
%193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !35
|
204 |
+
%194 = bitcast i32 %193 to float, !dbg !35
|
205 |
+
%195 = fadd float %191, %194, !dbg !29
|
206 |
+
%196 = bitcast float %145 to i32, !dbg !35
|
207 |
+
%197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 16, i32 31), !dbg !35
|
208 |
+
%198 = bitcast i32 %197 to float, !dbg !35
|
209 |
+
%199 = fadd float %145, %198, !dbg !29
|
210 |
+
%200 = bitcast float %199 to i32, !dbg !35
|
211 |
+
%201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 8, i32 31), !dbg !35
|
212 |
+
%202 = bitcast i32 %201 to float, !dbg !35
|
213 |
+
%203 = fadd float %199, %202, !dbg !29
|
214 |
+
%204 = bitcast float %203 to i32, !dbg !35
|
215 |
+
%205 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %204, i32 4, i32 31), !dbg !35
|
216 |
+
%206 = bitcast i32 %205 to float, !dbg !35
|
217 |
+
%207 = fadd float %203, %206, !dbg !29
|
218 |
+
%208 = bitcast float %207 to i32, !dbg !35
|
219 |
+
%209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 2, i32 31), !dbg !35
|
220 |
+
%210 = bitcast i32 %209 to float, !dbg !35
|
221 |
+
%211 = fadd float %207, %210, !dbg !29
|
222 |
+
%212 = bitcast float %211 to i32, !dbg !35
|
223 |
+
%213 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 1, i32 31), !dbg !35
|
224 |
+
%214 = bitcast i32 %213 to float, !dbg !35
|
225 |
+
%215 = fadd float %211, %214, !dbg !29
|
226 |
+
%216 = bitcast float %147 to i32, !dbg !35
|
227 |
+
%217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 16, i32 31), !dbg !35
|
228 |
+
%218 = bitcast i32 %217 to float, !dbg !35
|
229 |
+
%219 = fadd float %147, %218, !dbg !29
|
230 |
+
%220 = bitcast float %219 to i32, !dbg !35
|
231 |
+
%221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 8, i32 31), !dbg !35
|
232 |
+
%222 = bitcast i32 %221 to float, !dbg !35
|
233 |
+
%223 = fadd float %219, %222, !dbg !29
|
234 |
+
%224 = bitcast float %223 to i32, !dbg !35
|
235 |
+
%225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 4, i32 31), !dbg !35
|
236 |
+
%226 = bitcast i32 %225 to float, !dbg !35
|
237 |
+
%227 = fadd float %223, %226, !dbg !29
|
238 |
+
%228 = bitcast float %227 to i32, !dbg !35
|
239 |
+
%229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 2, i32 31), !dbg !35
|
240 |
+
%230 = bitcast i32 %229 to float, !dbg !35
|
241 |
+
%231 = fadd float %227, %230, !dbg !29
|
242 |
+
%232 = bitcast float %231 to i32, !dbg !35
|
243 |
+
%233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !35
|
244 |
+
%234 = bitcast i32 %233 to float, !dbg !35
|
245 |
+
%235 = fadd float %231, %234, !dbg !29
|
246 |
+
%236 = bitcast float %149 to i32, !dbg !35
|
247 |
+
%237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 16, i32 31), !dbg !35
|
248 |
+
%238 = bitcast i32 %237 to float, !dbg !35
|
249 |
+
%239 = fadd float %149, %238, !dbg !29
|
250 |
+
%240 = bitcast float %239 to i32, !dbg !35
|
251 |
+
%241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !35
|
252 |
+
%242 = bitcast i32 %241 to float, !dbg !35
|
253 |
+
%243 = fadd float %239, %242, !dbg !29
|
254 |
+
%244 = bitcast float %243 to i32, !dbg !35
|
255 |
+
%245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 4, i32 31), !dbg !35
|
256 |
+
%246 = bitcast i32 %245 to float, !dbg !35
|
257 |
+
%247 = fadd float %243, %246, !dbg !29
|
258 |
+
%248 = bitcast float %247 to i32, !dbg !35
|
259 |
+
%249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 2, i32 31), !dbg !35
|
260 |
+
%250 = bitcast i32 %249 to float, !dbg !35
|
261 |
+
%251 = fadd float %247, %250, !dbg !29
|
262 |
+
%252 = bitcast float %251 to i32, !dbg !35
|
263 |
+
%253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 1, i32 31), !dbg !35
|
264 |
+
%254 = bitcast i32 %253 to float, !dbg !35
|
265 |
+
%255 = fadd float %251, %254, !dbg !29
|
266 |
+
%256 = bitcast float %151 to i32, !dbg !35
|
267 |
+
%257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 16, i32 31), !dbg !35
|
268 |
+
%258 = bitcast i32 %257 to float, !dbg !35
|
269 |
+
%259 = fadd float %151, %258, !dbg !29
|
270 |
+
%260 = bitcast float %259 to i32, !dbg !35
|
271 |
+
%261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 8, i32 31), !dbg !35
|
272 |
+
%262 = bitcast i32 %261 to float, !dbg !35
|
273 |
+
%263 = fadd float %259, %262, !dbg !29
|
274 |
+
%264 = bitcast float %263 to i32, !dbg !35
|
275 |
+
%265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 4, i32 31), !dbg !35
|
276 |
+
%266 = bitcast i32 %265 to float, !dbg !35
|
277 |
+
%267 = fadd float %263, %266, !dbg !29
|
278 |
+
%268 = bitcast float %267 to i32, !dbg !35
|
279 |
+
%269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 2, i32 31), !dbg !35
|
280 |
+
%270 = bitcast i32 %269 to float, !dbg !35
|
281 |
+
%271 = fadd float %267, %270, !dbg !29
|
282 |
+
%272 = bitcast float %271 to i32, !dbg !35
|
283 |
+
%273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !35
|
284 |
+
%274 = bitcast i32 %273 to float, !dbg !35
|
285 |
+
%275 = fadd float %271, %274, !dbg !29
|
286 |
+
%276 = bitcast float %153 to i32, !dbg !35
|
287 |
+
%277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 16, i32 31), !dbg !35
|
288 |
+
%278 = bitcast i32 %277 to float, !dbg !35
|
289 |
+
%279 = fadd float %153, %278, !dbg !29
|
290 |
+
%280 = bitcast float %279 to i32, !dbg !35
|
291 |
+
%281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 8, i32 31), !dbg !35
|
292 |
+
%282 = bitcast i32 %281 to float, !dbg !35
|
293 |
+
%283 = fadd float %279, %282, !dbg !29
|
294 |
+
%284 = bitcast float %283 to i32, !dbg !35
|
295 |
+
%285 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 4, i32 31), !dbg !35
|
296 |
+
%286 = bitcast i32 %285 to float, !dbg !35
|
297 |
+
%287 = fadd float %283, %286, !dbg !29
|
298 |
+
%288 = bitcast float %287 to i32, !dbg !35
|
299 |
+
%289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 2, i32 31), !dbg !35
|
300 |
+
%290 = bitcast i32 %289 to float, !dbg !35
|
301 |
+
%291 = fadd float %287, %290, !dbg !29
|
302 |
+
%292 = bitcast float %291 to i32, !dbg !35
|
303 |
+
%293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 1, i32 31), !dbg !35
|
304 |
+
%294 = bitcast i32 %293 to float, !dbg !35
|
305 |
+
%295 = fadd float %291, %294, !dbg !29
|
306 |
+
%296 = bitcast float %155 to i32, !dbg !35
|
307 |
+
%297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 16, i32 31), !dbg !35
|
308 |
+
%298 = bitcast i32 %297 to float, !dbg !35
|
309 |
+
%299 = fadd float %155, %298, !dbg !29
|
310 |
+
%300 = bitcast float %299 to i32, !dbg !35
|
311 |
+
%301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %300, i32 8, i32 31), !dbg !35
|
312 |
+
%302 = bitcast i32 %301 to float, !dbg !35
|
313 |
+
%303 = fadd float %299, %302, !dbg !29
|
314 |
+
%304 = bitcast float %303 to i32, !dbg !35
|
315 |
+
%305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 4, i32 31), !dbg !35
|
316 |
+
%306 = bitcast i32 %305 to float, !dbg !35
|
317 |
+
%307 = fadd float %303, %306, !dbg !29
|
318 |
+
%308 = bitcast float %307 to i32, !dbg !35
|
319 |
+
%309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !35
|
320 |
+
%310 = bitcast i32 %309 to float, !dbg !35
|
321 |
+
%311 = fadd float %307, %310, !dbg !29
|
322 |
+
%312 = bitcast float %311 to i32, !dbg !35
|
323 |
+
%313 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 1, i32 31), !dbg !35
|
324 |
+
%314 = bitcast i32 %313 to float, !dbg !35
|
325 |
+
%315 = fadd float %311, %314, !dbg !29
|
326 |
+
%316 = icmp eq i32 %138, 0, !dbg !35
|
327 |
+
%317 = zext nneg i32 %139 to i64, !dbg !35
|
328 |
+
%318 = getelementptr float, ptr addrspace(3) @global_smem, i64 %317, !dbg !35
|
329 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %318, float %175, i1 %316) #3, !dbg !35
|
330 |
+
%319 = or i32 %139, 8, !dbg !35
|
331 |
+
%320 = zext nneg i32 %319 to i64, !dbg !35
|
332 |
+
%321 = getelementptr float, ptr addrspace(3) @global_smem, i64 %320, !dbg !35
|
333 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %321, float %195, i1 %316) #3, !dbg !35
|
334 |
+
%322 = or i32 %139, 16, !dbg !35
|
335 |
+
%323 = zext nneg i32 %322 to i64, !dbg !35
|
336 |
+
%324 = getelementptr float, ptr addrspace(3) @global_smem, i64 %323, !dbg !35
|
337 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %324, float %215, i1 %316) #3, !dbg !35
|
338 |
+
%325 = or i32 %139, 24, !dbg !35
|
339 |
+
%326 = zext nneg i32 %325 to i64, !dbg !35
|
340 |
+
%327 = getelementptr float, ptr addrspace(3) @global_smem, i64 %326, !dbg !35
|
341 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %327, float %235, i1 %316) #3, !dbg !35
|
342 |
+
%328 = or i32 %139, 32, !dbg !35
|
343 |
+
%329 = zext nneg i32 %328 to i64, !dbg !35
|
344 |
+
%330 = getelementptr float, ptr addrspace(3) @global_smem, i64 %329, !dbg !35
|
345 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %330, float %255, i1 %316) #3, !dbg !35
|
346 |
+
%331 = or i32 %139, 40, !dbg !35
|
347 |
+
%332 = zext nneg i32 %331 to i64, !dbg !35
|
348 |
+
%333 = getelementptr float, ptr addrspace(3) @global_smem, i64 %332, !dbg !35
|
349 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %333, float %275, i1 %316) #3, !dbg !35
|
350 |
+
%334 = or i32 %139, 48, !dbg !35
|
351 |
+
%335 = zext nneg i32 %334 to i64, !dbg !35
|
352 |
+
%336 = getelementptr float, ptr addrspace(3) @global_smem, i64 %335, !dbg !35
|
353 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %336, float %295, i1 %316) #3, !dbg !35
|
354 |
+
%337 = or i32 %139, 56, !dbg !35
|
355 |
+
%338 = zext nneg i32 %337 to i64, !dbg !35
|
356 |
+
%339 = getelementptr float, ptr addrspace(3) @global_smem, i64 %338, !dbg !35
|
357 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %339, float %315, i1 %316) #3, !dbg !35
|
358 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !35
|
359 |
+
%340 = icmp slt i32 %10, 64, !dbg !35
|
360 |
+
%341 = sext i32 %10 to i64, !dbg !35
|
361 |
+
%342 = getelementptr float, ptr addrspace(3) @global_smem, i64 %341, !dbg !35
|
362 |
+
%343 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %342, i1 %340) #3, !dbg !35
|
363 |
+
%344 = bitcast float %343 to i32, !dbg !35
|
364 |
+
%345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %344, i32 4, i32 31), !dbg !35
|
365 |
+
%346 = bitcast i32 %345 to float, !dbg !35
|
366 |
+
%347 = fadd float %343, %346, !dbg !29
|
367 |
+
%348 = bitcast float %347 to i32, !dbg !35
|
368 |
+
%349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !35
|
369 |
+
%350 = bitcast i32 %349 to float, !dbg !35
|
370 |
+
%351 = fadd float %347, %350, !dbg !29
|
371 |
+
%352 = bitcast float %351 to i32, !dbg !35
|
372 |
+
%353 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %352, i32 1, i32 31), !dbg !35
|
373 |
+
%354 = bitcast i32 %353 to float, !dbg !35
|
374 |
+
%355 = fadd float %351, %354, !dbg !29
|
375 |
+
%356 = and i32 %10, 7, !dbg !35
|
376 |
+
%357 = icmp eq i32 %356, 0, !dbg !35
|
377 |
+
%358 = and i1 %340, %357, !dbg !35
|
378 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %342, float %355, i1 %358) #3, !dbg !35
|
379 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !35
|
380 |
+
%359 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !35
|
381 |
+
%360 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !35
|
382 |
+
%361 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 64), align 4, !dbg !35
|
383 |
+
%362 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 96), align 4, !dbg !35
|
384 |
+
%363 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 128), align 4, !dbg !35
|
385 |
+
%364 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 160), align 4, !dbg !35
|
386 |
+
%365 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 192), align 4, !dbg !35
|
387 |
+
%366 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 224), align 4, !dbg !35
|
388 |
+
%367 = extractelement <2 x i64> %25, i64 0, !dbg !37
|
389 |
+
%368 = extractelement <2 x i64> %25, i64 1, !dbg !37
|
390 |
+
%369 = extractelement <8 x float> %66, i64 0, !dbg !38
|
391 |
+
%370 = extractelement <8 x float> %66, i64 1, !dbg !38
|
392 |
+
%371 = extractelement <8 x float> %66, i64 2, !dbg !38
|
393 |
+
%372 = extractelement <8 x float> %66, i64 3, !dbg !38
|
394 |
+
%373 = extractelement <8 x float> %66, i64 4, !dbg !38
|
395 |
+
%374 = extractelement <8 x float> %66, i64 5, !dbg !38
|
396 |
+
%375 = extractelement <8 x float> %66, i64 6, !dbg !38
|
397 |
+
%376 = extractelement <8 x float> %66, i64 7, !dbg !38
|
398 |
+
br label %377, !dbg !39
|
399 |
+
|
400 |
+
377: ; preds = %137, %377
|
401 |
+
%378 = phi i32 [ 0, %137 ], [ %672, %377 ]
|
402 |
+
%379 = zext nneg i32 %378 to i64, !dbg !37
|
403 |
+
%380 = or i64 %367, %379, !dbg !37
|
404 |
+
%381 = or i64 %368, %379, !dbg !37
|
405 |
+
%382 = icmp ult i64 %380, 50257, !dbg !40
|
406 |
+
%383 = icmp ult i64 %381, 50257, !dbg !40
|
407 |
+
%384 = add nsw i64 %380, %46, !dbg !41
|
408 |
+
%385 = add nsw i64 %381, %46, !dbg !41
|
409 |
+
%386 = add nsw i64 %380, %47, !dbg !41
|
410 |
+
%387 = add nsw i64 %381, %47, !dbg !41
|
411 |
+
%388 = add nsw i64 %380, %48, !dbg !41
|
412 |
+
%389 = add nsw i64 %381, %48, !dbg !41
|
413 |
+
%390 = add nsw i64 %380, %49, !dbg !41
|
414 |
+
%391 = add nsw i64 %381, %49, !dbg !41
|
415 |
+
%392 = add nsw i64 %380, %50, !dbg !41
|
416 |
+
%393 = add nsw i64 %381, %50, !dbg !41
|
417 |
+
%394 = add nsw i64 %380, %51, !dbg !41
|
418 |
+
%395 = add nsw i64 %381, %51, !dbg !41
|
419 |
+
%396 = add nsw i64 %380, %52, !dbg !41
|
420 |
+
%397 = add nsw i64 %381, %52, !dbg !41
|
421 |
+
%398 = add nsw i64 %380, %53, !dbg !41
|
422 |
+
%399 = add nsw i64 %381, %53, !dbg !41
|
423 |
+
%400 = getelementptr i16, ptr addrspace(1) %4, i64 %384, !dbg !42
|
424 |
+
%401 = getelementptr i16, ptr addrspace(1) %4, i64 %385, !dbg !42
|
425 |
+
%402 = getelementptr i16, ptr addrspace(1) %4, i64 %386, !dbg !42
|
426 |
+
%403 = getelementptr i16, ptr addrspace(1) %4, i64 %387, !dbg !42
|
427 |
+
%404 = getelementptr i16, ptr addrspace(1) %4, i64 %388, !dbg !42
|
428 |
+
%405 = getelementptr i16, ptr addrspace(1) %4, i64 %389, !dbg !42
|
429 |
+
%406 = getelementptr i16, ptr addrspace(1) %4, i64 %390, !dbg !42
|
430 |
+
%407 = getelementptr i16, ptr addrspace(1) %4, i64 %391, !dbg !42
|
431 |
+
%408 = getelementptr i16, ptr addrspace(1) %4, i64 %392, !dbg !42
|
432 |
+
%409 = getelementptr i16, ptr addrspace(1) %4, i64 %393, !dbg !42
|
433 |
+
%410 = getelementptr i16, ptr addrspace(1) %4, i64 %394, !dbg !42
|
434 |
+
%411 = getelementptr i16, ptr addrspace(1) %4, i64 %395, !dbg !42
|
435 |
+
%412 = getelementptr i16, ptr addrspace(1) %4, i64 %396, !dbg !42
|
436 |
+
%413 = getelementptr i16, ptr addrspace(1) %4, i64 %397, !dbg !42
|
437 |
+
%414 = getelementptr i16, ptr addrspace(1) %4, i64 %398, !dbg !42
|
438 |
+
%415 = getelementptr i16, ptr addrspace(1) %4, i64 %399, !dbg !42
|
439 |
+
%416 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %400, i1 %382, i16 0, i1 %382) #3, !dbg !43
|
440 |
+
%417 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %401, i1 %383, i16 0, i1 %383) #3, !dbg !43
|
441 |
+
%418 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %402, i1 %382, i16 0, i1 %382) #3, !dbg !43
|
442 |
+
%419 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %403, i1 %383, i16 0, i1 %383) #3, !dbg !43
|
443 |
+
%420 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %404, i1 %382, i16 0, i1 %382) #3, !dbg !43
|
444 |
+
%421 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %405, i1 %383, i16 0, i1 %383) #3, !dbg !43
|
445 |
+
%422 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %406, i1 %382, i16 0, i1 %382) #3, !dbg !43
|
446 |
+
%423 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %407, i1 %383, i16 0, i1 %383) #3, !dbg !43
|
447 |
+
%424 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %408, i1 %382, i16 0, i1 %382) #3, !dbg !43
|
448 |
+
%425 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %409, i1 %383, i16 0, i1 %383) #3, !dbg !43
|
449 |
+
%426 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %410, i1 %382, i16 0, i1 %382) #3, !dbg !43
|
450 |
+
%427 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %411, i1 %383, i16 0, i1 %383) #3, !dbg !43
|
451 |
+
%428 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %412, i1 %382, i16 0, i1 %382) #3, !dbg !43
|
452 |
+
%429 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %413, i1 %383, i16 0, i1 %383) #3, !dbg !43
|
453 |
+
%430 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %414, i1 %382, i16 0, i1 %382) #3, !dbg !43
|
454 |
+
%431 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %415, i1 %383, i16 0, i1 %383) #3, !dbg !43
|
455 |
+
%432 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %416) #3, !dbg !44
|
456 |
+
%433 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %417) #3, !dbg !44
|
457 |
+
%434 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %418) #3, !dbg !44
|
458 |
+
%435 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %419) #3, !dbg !44
|
459 |
+
%436 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %420) #3, !dbg !44
|
460 |
+
%437 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %421) #3, !dbg !44
|
461 |
+
%438 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %422) #3, !dbg !44
|
462 |
+
%439 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %423) #3, !dbg !44
|
463 |
+
%440 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %424) #3, !dbg !44
|
464 |
+
%441 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %425) #3, !dbg !44
|
465 |
+
%442 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %426) #3, !dbg !44
|
466 |
+
%443 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %427) #3, !dbg !44
|
467 |
+
%444 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %428) #3, !dbg !44
|
468 |
+
%445 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %429) #3, !dbg !44
|
469 |
+
%446 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %430) #3, !dbg !44
|
470 |
+
%447 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %431) #3, !dbg !44
|
471 |
+
%448 = getelementptr float, ptr addrspace(1) %0, i64 %384, !dbg !45
|
472 |
+
%449 = getelementptr float, ptr addrspace(1) %0, i64 %385, !dbg !45
|
473 |
+
%450 = getelementptr float, ptr addrspace(1) %0, i64 %386, !dbg !45
|
474 |
+
%451 = getelementptr float, ptr addrspace(1) %0, i64 %387, !dbg !45
|
475 |
+
%452 = getelementptr float, ptr addrspace(1) %0, i64 %388, !dbg !45
|
476 |
+
%453 = getelementptr float, ptr addrspace(1) %0, i64 %389, !dbg !45
|
477 |
+
%454 = getelementptr float, ptr addrspace(1) %0, i64 %390, !dbg !45
|
478 |
+
%455 = getelementptr float, ptr addrspace(1) %0, i64 %391, !dbg !45
|
479 |
+
%456 = getelementptr float, ptr addrspace(1) %0, i64 %392, !dbg !45
|
480 |
+
%457 = getelementptr float, ptr addrspace(1) %0, i64 %393, !dbg !45
|
481 |
+
%458 = getelementptr float, ptr addrspace(1) %0, i64 %394, !dbg !45
|
482 |
+
%459 = getelementptr float, ptr addrspace(1) %0, i64 %395, !dbg !45
|
483 |
+
%460 = getelementptr float, ptr addrspace(1) %0, i64 %396, !dbg !45
|
484 |
+
%461 = getelementptr float, ptr addrspace(1) %0, i64 %397, !dbg !45
|
485 |
+
%462 = getelementptr float, ptr addrspace(1) %0, i64 %398, !dbg !45
|
486 |
+
%463 = getelementptr float, ptr addrspace(1) %0, i64 %399, !dbg !45
|
487 |
+
%464 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %448, i1 %382, i32 0, i1 %382) #3, !dbg !46
|
488 |
+
%465 = bitcast i32 %464 to float, !dbg !46
|
489 |
+
%466 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %449, i1 %383, i32 0, i1 %383) #3, !dbg !46
|
490 |
+
%467 = bitcast i32 %466 to float, !dbg !46
|
491 |
+
%468 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %450, i1 %382, i32 0, i1 %382) #3, !dbg !46
|
492 |
+
%469 = bitcast i32 %468 to float, !dbg !46
|
493 |
+
%470 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %451, i1 %383, i32 0, i1 %383) #3, !dbg !46
|
494 |
+
%471 = bitcast i32 %470 to float, !dbg !46
|
495 |
+
%472 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %452, i1 %382, i32 0, i1 %382) #3, !dbg !46
|
496 |
+
%473 = bitcast i32 %472 to float, !dbg !46
|
497 |
+
%474 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %453, i1 %383, i32 0, i1 %383) #3, !dbg !46
|
498 |
+
%475 = bitcast i32 %474 to float, !dbg !46
|
499 |
+
%476 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %454, i1 %382, i32 0, i1 %382) #3, !dbg !46
|
500 |
+
%477 = bitcast i32 %476 to float, !dbg !46
|
501 |
+
%478 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %455, i1 %383, i32 0, i1 %383) #3, !dbg !46
|
502 |
+
%479 = bitcast i32 %478 to float, !dbg !46
|
503 |
+
%480 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %456, i1 %382, i32 0, i1 %382) #3, !dbg !46
|
504 |
+
%481 = bitcast i32 %480 to float, !dbg !46
|
505 |
+
%482 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %457, i1 %383, i32 0, i1 %383) #3, !dbg !46
|
506 |
+
%483 = bitcast i32 %482 to float, !dbg !46
|
507 |
+
%484 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %458, i1 %382, i32 0, i1 %382) #3, !dbg !46
|
508 |
+
%485 = bitcast i32 %484 to float, !dbg !46
|
509 |
+
%486 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %459, i1 %383, i32 0, i1 %383) #3, !dbg !46
|
510 |
+
%487 = bitcast i32 %486 to float, !dbg !46
|
511 |
+
%488 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %460, i1 %382, i32 0, i1 %382) #3, !dbg !46
|
512 |
+
%489 = bitcast i32 %488 to float, !dbg !46
|
513 |
+
%490 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %461, i1 %383, i32 0, i1 %383) #3, !dbg !46
|
514 |
+
%491 = bitcast i32 %490 to float, !dbg !46
|
515 |
+
%492 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %462, i1 %382, i32 0, i1 %382) #3, !dbg !46
|
516 |
+
%493 = bitcast i32 %492 to float, !dbg !46
|
517 |
+
%494 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %463, i1 %383, i32 0, i1 %383) #3, !dbg !46
|
518 |
+
%495 = bitcast i32 %494 to float, !dbg !46
|
519 |
+
%496 = getelementptr i16, ptr addrspace(1) %5, i64 %384, !dbg !47
|
520 |
+
%497 = getelementptr i16, ptr addrspace(1) %5, i64 %385, !dbg !47
|
521 |
+
%498 = getelementptr i16, ptr addrspace(1) %5, i64 %386, !dbg !47
|
522 |
+
%499 = getelementptr i16, ptr addrspace(1) %5, i64 %387, !dbg !47
|
523 |
+
%500 = getelementptr i16, ptr addrspace(1) %5, i64 %388, !dbg !47
|
524 |
+
%501 = getelementptr i16, ptr addrspace(1) %5, i64 %389, !dbg !47
|
525 |
+
%502 = getelementptr i16, ptr addrspace(1) %5, i64 %390, !dbg !47
|
526 |
+
%503 = getelementptr i16, ptr addrspace(1) %5, i64 %391, !dbg !47
|
527 |
+
%504 = getelementptr i16, ptr addrspace(1) %5, i64 %392, !dbg !47
|
528 |
+
%505 = getelementptr i16, ptr addrspace(1) %5, i64 %393, !dbg !47
|
529 |
+
%506 = getelementptr i16, ptr addrspace(1) %5, i64 %394, !dbg !47
|
530 |
+
%507 = getelementptr i16, ptr addrspace(1) %5, i64 %395, !dbg !47
|
531 |
+
%508 = getelementptr i16, ptr addrspace(1) %5, i64 %396, !dbg !47
|
532 |
+
%509 = getelementptr i16, ptr addrspace(1) %5, i64 %397, !dbg !47
|
533 |
+
%510 = getelementptr i16, ptr addrspace(1) %5, i64 %398, !dbg !47
|
534 |
+
%511 = getelementptr i16, ptr addrspace(1) %5, i64 %399, !dbg !47
|
535 |
+
%512 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %496, i1 %382, i16 0, i1 %382) #3, !dbg !48
|
536 |
+
%513 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %497, i1 %383, i16 0, i1 %383) #3, !dbg !48
|
537 |
+
%514 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %498, i1 %382, i16 0, i1 %382) #3, !dbg !48
|
538 |
+
%515 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %499, i1 %383, i16 0, i1 %383) #3, !dbg !48
|
539 |
+
%516 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %500, i1 %382, i16 0, i1 %382) #3, !dbg !48
|
540 |
+
%517 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %501, i1 %383, i16 0, i1 %383) #3, !dbg !48
|
541 |
+
%518 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %502, i1 %382, i16 0, i1 %382) #3, !dbg !48
|
542 |
+
%519 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %503, i1 %383, i16 0, i1 %383) #3, !dbg !48
|
543 |
+
%520 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %504, i1 %382, i16 0, i1 %382) #3, !dbg !48
|
544 |
+
%521 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %505, i1 %383, i16 0, i1 %383) #3, !dbg !48
|
545 |
+
%522 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %506, i1 %382, i16 0, i1 %382) #3, !dbg !48
|
546 |
+
%523 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %507, i1 %383, i16 0, i1 %383) #3, !dbg !48
|
547 |
+
%524 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %508, i1 %382, i16 0, i1 %382) #3, !dbg !48
|
548 |
+
%525 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %509, i1 %383, i16 0, i1 %383) #3, !dbg !48
|
549 |
+
%526 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %510, i1 %382, i16 0, i1 %382) #3, !dbg !48
|
550 |
+
%527 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %511, i1 %383, i16 0, i1 %383) #3, !dbg !48
|
551 |
+
%528 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %512) #3, !dbg !49
|
552 |
+
%529 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %513) #3, !dbg !49
|
553 |
+
%530 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %514) #3, !dbg !49
|
554 |
+
%531 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %515) #3, !dbg !49
|
555 |
+
%532 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %516) #3, !dbg !49
|
556 |
+
%533 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %517) #3, !dbg !49
|
557 |
+
%534 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %518) #3, !dbg !49
|
558 |
+
%535 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %519) #3, !dbg !49
|
559 |
+
%536 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %520) #3, !dbg !49
|
560 |
+
%537 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %521) #3, !dbg !49
|
561 |
+
%538 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %522) #3, !dbg !49
|
562 |
+
%539 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %523) #3, !dbg !49
|
563 |
+
%540 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %524) #3, !dbg !49
|
564 |
+
%541 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %525) #3, !dbg !49
|
565 |
+
%542 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %526) #3, !dbg !49
|
566 |
+
%543 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %527) #3, !dbg !49
|
567 |
+
%544 = fmul float %369, %465, !dbg !38
|
568 |
+
%545 = fmul float %369, %467, !dbg !38
|
569 |
+
%546 = fmul float %370, %469, !dbg !38
|
570 |
+
%547 = fmul float %370, %471, !dbg !38
|
571 |
+
%548 = fmul float %371, %473, !dbg !38
|
572 |
+
%549 = fmul float %371, %475, !dbg !38
|
573 |
+
%550 = fmul float %372, %477, !dbg !38
|
574 |
+
%551 = fmul float %372, %479, !dbg !38
|
575 |
+
%552 = fmul float %373, %481, !dbg !38
|
576 |
+
%553 = fmul float %373, %483, !dbg !38
|
577 |
+
%554 = fmul float %374, %485, !dbg !38
|
578 |
+
%555 = fmul float %374, %487, !dbg !38
|
579 |
+
%556 = fmul float %375, %489, !dbg !38
|
580 |
+
%557 = fmul float %375, %491, !dbg !38
|
581 |
+
%558 = fmul float %376, %493, !dbg !38
|
582 |
+
%559 = fmul float %376, %495, !dbg !38
|
583 |
+
%560 = fmul float %528, 0x3FF7154760000000, !dbg !50
|
584 |
+
%561 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %560) #3, !dbg !50
|
585 |
+
%562 = fmul float %529, 0x3FF7154760000000, !dbg !50
|
586 |
+
%563 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %562) #3, !dbg !50
|
587 |
+
%564 = fmul float %530, 0x3FF7154760000000, !dbg !50
|
588 |
+
%565 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %564) #3, !dbg !50
|
589 |
+
%566 = fmul float %531, 0x3FF7154760000000, !dbg !50
|
590 |
+
%567 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %566) #3, !dbg !50
|
591 |
+
%568 = fmul float %532, 0x3FF7154760000000, !dbg !50
|
592 |
+
%569 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %568) #3, !dbg !50
|
593 |
+
%570 = fmul float %533, 0x3FF7154760000000, !dbg !50
|
594 |
+
%571 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %570) #3, !dbg !50
|
595 |
+
%572 = fmul float %534, 0x3FF7154760000000, !dbg !50
|
596 |
+
%573 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %572) #3, !dbg !50
|
597 |
+
%574 = fmul float %535, 0x3FF7154760000000, !dbg !50
|
598 |
+
%575 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %574) #3, !dbg !50
|
599 |
+
%576 = fmul float %536, 0x3FF7154760000000, !dbg !50
|
600 |
+
%577 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %576) #3, !dbg !50
|
601 |
+
%578 = fmul float %537, 0x3FF7154760000000, !dbg !50
|
602 |
+
%579 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %578) #3, !dbg !50
|
603 |
+
%580 = fmul float %538, 0x3FF7154760000000, !dbg !50
|
604 |
+
%581 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %580) #3, !dbg !50
|
605 |
+
%582 = fmul float %539, 0x3FF7154760000000, !dbg !50
|
606 |
+
%583 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %582) #3, !dbg !50
|
607 |
+
%584 = fmul float %540, 0x3FF7154760000000, !dbg !50
|
608 |
+
%585 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %584) #3, !dbg !50
|
609 |
+
%586 = fmul float %541, 0x3FF7154760000000, !dbg !50
|
610 |
+
%587 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %586) #3, !dbg !50
|
611 |
+
%588 = fmul float %542, 0x3FF7154760000000, !dbg !50
|
612 |
+
%589 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %588) #3, !dbg !50
|
613 |
+
%590 = fmul float %543, 0x3FF7154760000000, !dbg !50
|
614 |
+
%591 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %590) #3, !dbg !50
|
615 |
+
%592 = fmul float %359, %561, !dbg !51
|
616 |
+
%593 = fmul float %359, %563, !dbg !51
|
617 |
+
%594 = fmul float %360, %565, !dbg !51
|
618 |
+
%595 = fmul float %360, %567, !dbg !51
|
619 |
+
%596 = fmul float %361, %569, !dbg !51
|
620 |
+
%597 = fmul float %361, %571, !dbg !51
|
621 |
+
%598 = fmul float %362, %573, !dbg !51
|
622 |
+
%599 = fmul float %362, %575, !dbg !51
|
623 |
+
%600 = fmul float %363, %577, !dbg !51
|
624 |
+
%601 = fmul float %363, %579, !dbg !51
|
625 |
+
%602 = fmul float %364, %581, !dbg !51
|
626 |
+
%603 = fmul float %364, %583, !dbg !51
|
627 |
+
%604 = fmul float %365, %585, !dbg !51
|
628 |
+
%605 = fmul float %365, %587, !dbg !51
|
629 |
+
%606 = fmul float %366, %589, !dbg !51
|
630 |
+
%607 = fmul float %366, %591, !dbg !51
|
631 |
+
%608 = fsub float %544, %592, !dbg !52
|
632 |
+
%609 = fsub float %545, %593, !dbg !52
|
633 |
+
%610 = fsub float %546, %594, !dbg !52
|
634 |
+
%611 = fsub float %547, %595, !dbg !52
|
635 |
+
%612 = fsub float %548, %596, !dbg !52
|
636 |
+
%613 = fsub float %549, %597, !dbg !52
|
637 |
+
%614 = fsub float %550, %598, !dbg !52
|
638 |
+
%615 = fsub float %551, %599, !dbg !52
|
639 |
+
%616 = fsub float %552, %600, !dbg !52
|
640 |
+
%617 = fsub float %553, %601, !dbg !52
|
641 |
+
%618 = fsub float %554, %602, !dbg !52
|
642 |
+
%619 = fsub float %555, %603, !dbg !52
|
643 |
+
%620 = fsub float %556, %604, !dbg !52
|
644 |
+
%621 = fsub float %557, %605, !dbg !52
|
645 |
+
%622 = fsub float %558, %606, !dbg !52
|
646 |
+
%623 = fsub float %559, %607, !dbg !52
|
647 |
+
%624 = fadd float %432, %608, !dbg !53
|
648 |
+
%625 = fadd float %433, %609, !dbg !53
|
649 |
+
%626 = fadd float %434, %610, !dbg !53
|
650 |
+
%627 = fadd float %435, %611, !dbg !53
|
651 |
+
%628 = fadd float %436, %612, !dbg !53
|
652 |
+
%629 = fadd float %437, %613, !dbg !53
|
653 |
+
%630 = fadd float %438, %614, !dbg !53
|
654 |
+
%631 = fadd float %439, %615, !dbg !53
|
655 |
+
%632 = fadd float %440, %616, !dbg !53
|
656 |
+
%633 = fadd float %441, %617, !dbg !53
|
657 |
+
%634 = fadd float %442, %618, !dbg !53
|
658 |
+
%635 = fadd float %443, %619, !dbg !53
|
659 |
+
%636 = fadd float %444, %620, !dbg !53
|
660 |
+
%637 = fadd float %445, %621, !dbg !53
|
661 |
+
%638 = fadd float %446, %622, !dbg !53
|
662 |
+
%639 = fadd float %447, %623, !dbg !53
|
663 |
+
%640 = getelementptr i16, ptr addrspace(1) %6, i64 %384, !dbg !54
|
664 |
+
%641 = getelementptr i16, ptr addrspace(1) %6, i64 %385, !dbg !54
|
665 |
+
%642 = getelementptr i16, ptr addrspace(1) %6, i64 %386, !dbg !54
|
666 |
+
%643 = getelementptr i16, ptr addrspace(1) %6, i64 %387, !dbg !54
|
667 |
+
%644 = getelementptr i16, ptr addrspace(1) %6, i64 %388, !dbg !54
|
668 |
+
%645 = getelementptr i16, ptr addrspace(1) %6, i64 %389, !dbg !54
|
669 |
+
%646 = getelementptr i16, ptr addrspace(1) %6, i64 %390, !dbg !54
|
670 |
+
%647 = getelementptr i16, ptr addrspace(1) %6, i64 %391, !dbg !54
|
671 |
+
%648 = getelementptr i16, ptr addrspace(1) %6, i64 %392, !dbg !54
|
672 |
+
%649 = getelementptr i16, ptr addrspace(1) %6, i64 %393, !dbg !54
|
673 |
+
%650 = getelementptr i16, ptr addrspace(1) %6, i64 %394, !dbg !54
|
674 |
+
%651 = getelementptr i16, ptr addrspace(1) %6, i64 %395, !dbg !54
|
675 |
+
%652 = getelementptr i16, ptr addrspace(1) %6, i64 %396, !dbg !54
|
676 |
+
%653 = getelementptr i16, ptr addrspace(1) %6, i64 %397, !dbg !54
|
677 |
+
%654 = getelementptr i16, ptr addrspace(1) %6, i64 %398, !dbg !54
|
678 |
+
%655 = getelementptr i16, ptr addrspace(1) %6, i64 %399, !dbg !54
|
679 |
+
%656 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %624) #3, !dbg !55
|
680 |
+
%657 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %625) #3, !dbg !55
|
681 |
+
%658 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %626) #3, !dbg !55
|
682 |
+
%659 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %627) #3, !dbg !55
|
683 |
+
%660 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %628) #3, !dbg !55
|
684 |
+
%661 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %629) #3, !dbg !55
|
685 |
+
%662 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %630) #3, !dbg !55
|
686 |
+
%663 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %631) #3, !dbg !55
|
687 |
+
%664 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %632) #3, !dbg !55
|
688 |
+
%665 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %633) #3, !dbg !55
|
689 |
+
%666 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %634) #3, !dbg !55
|
690 |
+
%667 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %635) #3, !dbg !55
|
691 |
+
%668 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %636) #3, !dbg !55
|
692 |
+
%669 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %637) #3, !dbg !55
|
693 |
+
%670 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %638) #3, !dbg !55
|
694 |
+
%671 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %639) #3, !dbg !55
|
695 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %656, ptr addrspace(1) %640, i1 %382) #3, !dbg !55
|
696 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %657, ptr addrspace(1) %641, i1 %383) #3, !dbg !55
|
697 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %658, ptr addrspace(1) %642, i1 %382) #3, !dbg !55
|
698 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %659, ptr addrspace(1) %643, i1 %383) #3, !dbg !55
|
699 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %660, ptr addrspace(1) %644, i1 %382) #3, !dbg !55
|
700 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %661, ptr addrspace(1) %645, i1 %383) #3, !dbg !55
|
701 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %662, ptr addrspace(1) %646, i1 %382) #3, !dbg !55
|
702 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %663, ptr addrspace(1) %647, i1 %383) #3, !dbg !55
|
703 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %664, ptr addrspace(1) %648, i1 %382) #3, !dbg !55
|
704 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %665, ptr addrspace(1) %649, i1 %383) #3, !dbg !55
|
705 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %666, ptr addrspace(1) %650, i1 %382) #3, !dbg !55
|
706 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %667, ptr addrspace(1) %651, i1 %383) #3, !dbg !55
|
707 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %668, ptr addrspace(1) %652, i1 %382) #3, !dbg !55
|
708 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %669, ptr addrspace(1) %653, i1 %383) #3, !dbg !55
|
709 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %670, ptr addrspace(1) %654, i1 %382) #3, !dbg !55
|
710 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %671, ptr addrspace(1) %655, i1 %383) #3, !dbg !55
|
711 |
+
%672 = add nuw nsw i32 %378, 512, !dbg !39
|
712 |
+
%673 = icmp ult i32 %378, 49745, !dbg !39
|
713 |
+
br i1 %673, label %377, label %674, !dbg !39
|
714 |
+
|
715 |
+
674: ; preds = %377
|
716 |
+
ret void, !dbg !56
|
717 |
+
}
|
718 |
+
|
719 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
720 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
721 |
+
|
722 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
723 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
724 |
+
|
725 |
+
; Function Attrs: convergent nocallback nounwind
|
726 |
+
declare void @llvm.nvvm.barrier0() #2
|
727 |
+
|
728 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
729 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
730 |
+
attributes #2 = { convergent nocallback nounwind }
|
731 |
+
attributes #3 = { nounwind }
|
732 |
+
|
733 |
+
!llvm.module.flags = !{!0}
|
734 |
+
!llvm.dbg.cu = !{!1}
|
735 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
736 |
+
|
737 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
738 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
739 |
+
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
|
740 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
|
741 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
|
742 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
743 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
744 |
+
!7 = !{}
|
745 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
746 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
747 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
748 |
+
!11 = !DILocation(line: 21, column: 34, scope: !5)
|
749 |
+
!12 = !DILocation(line: 21, column: 46, scope: !5)
|
750 |
+
!13 = !DILocation(line: 22, column: 23, scope: !5)
|
751 |
+
!14 = !DILocation(line: 26, column: 30, scope: !5)
|
752 |
+
!15 = !DILocation(line: 26, column: 35, scope: !5)
|
753 |
+
!16 = !DILocation(line: 27, column: 19, scope: !5)
|
754 |
+
!17 = !DILocation(line: 29, column: 19, scope: !5)
|
755 |
+
!18 = !DILocation(line: 36, column: 46, scope: !5)
|
756 |
+
!19 = !DILocation(line: 38, column: 23, scope: !5)
|
757 |
+
!20 = !DILocation(line: 39, column: 22, scope: !5)
|
758 |
+
!21 = !DILocation(line: 41, column: 37, scope: !5)
|
759 |
+
!22 = !DILocation(line: 32, column: 36, scope: !5)
|
760 |
+
!23 = !DILocation(line: 33, column: 27, scope: !5)
|
761 |
+
!24 = !DILocation(line: 34, column: 25, scope: !5)
|
762 |
+
!25 = !DILocation(line: 36, column: 34, scope: !5)
|
763 |
+
!26 = !DILocation(line: 36, column: 52, scope: !5)
|
764 |
+
!27 = !DILocation(line: 42, column: 23, scope: !5)
|
765 |
+
!28 = !DILocation(line: 45, column: 40, scope: !5)
|
766 |
+
!29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !33)
|
767 |
+
!30 = distinct !DILexicalBlockFile(scope: !32, file: !31, discriminator: 0)
|
768 |
+
!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
769 |
+
!32 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
|
770 |
+
!33 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !34)
|
771 |
+
!34 = !DILocation(line: 46, column: 27, scope: !30)
|
772 |
+
!35 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !36)
|
773 |
+
!36 = !DILocation(line: 46, column: 27, scope: !32)
|
774 |
+
!37 = !DILocation(line: 52, column: 27, scope: !5)
|
775 |
+
!38 = !DILocation(line: 63, column: 24, scope: !5)
|
776 |
+
!39 = !DILocation(line: 51, column: 36, scope: !5)
|
777 |
+
!40 = !DILocation(line: 53, column: 25, scope: !5)
|
778 |
+
!41 = !DILocation(line: 55, column: 41, scope: !5)
|
779 |
+
!42 = !DILocation(line: 55, column: 35, scope: !5)
|
780 |
+
!43 = !DILocation(line: 55, column: 53, scope: !5)
|
781 |
+
!44 = !DILocation(line: 55, column: 105, scope: !5)
|
782 |
+
!45 = !DILocation(line: 56, column: 35, scope: !5)
|
783 |
+
!46 = !DILocation(line: 56, column: 53, scope: !5)
|
784 |
+
!47 = !DILocation(line: 57, column: 35, scope: !5)
|
785 |
+
!48 = !DILocation(line: 57, column: 53, scope: !5)
|
786 |
+
!49 = !DILocation(line: 57, column: 105, scope: !5)
|
787 |
+
!50 = !DILocation(line: 65, column: 23, scope: !5)
|
788 |
+
!51 = !DILocation(line: 66, column: 24, scope: !5)
|
789 |
+
!52 = !DILocation(line: 67, column: 24, scope: !5)
|
790 |
+
!53 = !DILocation(line: 69, column: 24, scope: !5)
|
791 |
+
!54 = !DILocation(line: 70, column: 29, scope: !5)
|
792 |
+
!55 = !DILocation(line: 70, column: 54, scope: !5)
|
793 |
+
!56 = !DILocation(line: 51, column: 4, scope: !5)
|
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ptx
ADDED
@@ -0,0 +1,1517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7de8
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7de8(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
|
21 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
|
22 |
+
)
|
23 |
+
.maxntid 256, 1, 1
|
24 |
+
{
|
25 |
+
.reg .pred %p<176>;
|
26 |
+
.reg .b16 %rs<129>;
|
27 |
+
.reg .b32 %r<238>;
|
28 |
+
.reg .f32 %f<393>;
|
29 |
+
.reg .b64 %rd<166>;
|
30 |
+
.loc 1 18 0
|
31 |
+
$L__func_begin0:
|
32 |
+
.loc 1 18 0
|
33 |
+
|
34 |
+
ld.param.u64 %rd39, [triton__0d1d2d3d4d5d6d7de8_param_6];
|
35 |
+
ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6d7de8_param_5];
|
36 |
+
ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6d7de8_param_4];
|
37 |
+
ld.param.u64 %rd36, [triton__0d1d2d3d4d5d6d7de8_param_0];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 22 44
|
40 |
+
mov.u32 %r1, %tid.x;
|
41 |
+
ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6d7de8_param_1];
|
42 |
+
shr.u32 %r2, %r1, 5;
|
43 |
+
ld.param.u64 %rd56, [triton__0d1d2d3d4d5d6d7de8_param_2];
|
44 |
+
.loc 1 24 33
|
45 |
+
and.b32 %r9, %r1, 255;
|
46 |
+
ld.param.u64 %rd57, [triton__0d1d2d3d4d5d6d7de8_param_3];
|
47 |
+
or.b32 %r10, %r9, 256;
|
48 |
+
.loc 1 21 28
|
49 |
+
mov.u32 %r3, %ctaid.x;
|
50 |
+
.loc 1 21 34
|
51 |
+
cvt.s64.s32 %rd1, %r3;
|
52 |
+
.loc 1 21 46
|
53 |
+
mul.wide.s32 %rd60, %r3, 8;
|
54 |
+
.loc 1 22 23
|
55 |
+
or.b64 %rd61, %rd60, 1;
|
56 |
+
cvt.u64.u32 %rd2, %r9;
|
57 |
+
cvt.u64.u32 %rd3, %r10;
|
58 |
+
.loc 1 26 30
|
59 |
+
shl.b64 %rd62, %rd60, 3;
|
60 |
+
add.s64 %rd41, %rd59, %rd62;
|
61 |
+
add.s64 %rd43, %rd41, 8;
|
62 |
+
add.s64 %rd45, %rd41, 16;
|
63 |
+
add.s64 %rd47, %rd41, 24;
|
64 |
+
add.s64 %rd49, %rd41, 32;
|
65 |
+
add.s64 %rd51, %rd41, 40;
|
66 |
+
add.s64 %rd53, %rd41, 48;
|
67 |
+
add.s64 %rd55, %rd41, 56;
|
68 |
+
mov.pred %p1, -1;
|
69 |
+
.loc 1 26 35
|
70 |
+
mov.u64 %rd40, 0x0;
|
71 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd41 + 0 ];
|
72 |
+
mov.u64 %rd42, 0x0;
|
73 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd43 + 0 ];
|
74 |
+
mov.u64 %rd44, 0x0;
|
75 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd45 + 0 ];
|
76 |
+
mov.u64 %rd46, 0x0;
|
77 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
|
78 |
+
mov.u64 %rd48, 0x0;
|
79 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd48 }, [ %rd49 + 0 ];
|
80 |
+
mov.u64 %rd50, 0x0;
|
81 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd50 }, [ %rd51 + 0 ];
|
82 |
+
mov.u64 %rd52, 0x0;
|
83 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd52 }, [ %rd53 + 0 ];
|
84 |
+
mov.u64 %rd54, 0x0;
|
85 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd54 }, [ %rd55 + 0 ];
|
86 |
+
.loc 1 27 19
|
87 |
+
mov.u32 %r7, 0x0;
|
88 |
+
@%p1 ld.global.b32 { %r7 }, [ %rd56 + 0 ];
|
89 |
+
.loc 1 29 19
|
90 |
+
mov.u32 %r8, 0x0;
|
91 |
+
@%p1 ld.global.b32 { %r8 }, [ %rd57 + 0 ];
|
92 |
+
.loc 1 36 46
|
93 |
+
mul.wide.s32 %rd4, %r3, 402056;
|
94 |
+
mul.lo.s64 %rd5, %rd61, 50257;
|
95 |
+
.loc 1 38 23
|
96 |
+
setp.eq.s64 %p11, %rd40, -1;
|
97 |
+
setp.eq.s64 %p12, %rd42, -1;
|
98 |
+
setp.eq.s64 %p13, %rd44, -1;
|
99 |
+
setp.eq.s64 %p14, %rd46, -1;
|
100 |
+
setp.eq.s64 %p15, %rd48, -1;
|
101 |
+
setp.eq.s64 %p16, %rd50, -1;
|
102 |
+
setp.eq.s64 %p17, %rd52, -1;
|
103 |
+
setp.eq.s64 %p18, %rd54, -1;
|
104 |
+
.loc 1 39 22
|
105 |
+
div.full.f32 %r6, %r7, %r8;
|
106 |
+
mov.b32 %f89, %r6;
|
107 |
+
.loc 1 41 37
|
108 |
+
selp.f32 %f8, 0f00000000, %f89, %p18;
|
109 |
+
selp.f32 %f7, 0f00000000, %f89, %p17;
|
110 |
+
selp.f32 %f6, 0f00000000, %f89, %p16;
|
111 |
+
selp.f32 %f5, 0f00000000, %f89, %p15;
|
112 |
+
selp.f32 %f4, 0f00000000, %f89, %p14;
|
113 |
+
selp.f32 %f3, 0f00000000, %f89, %p13;
|
114 |
+
selp.f32 %f2, 0f00000000, %f89, %p12;
|
115 |
+
selp.f32 %f1, 0f00000000, %f89, %p11;
|
116 |
+
mov.f32 %f377, 0f00000000;
|
117 |
+
mov.u64 %rd157, 0;
|
118 |
+
shl.b64 %rd83, %rd4, 2;
|
119 |
+
shl.b64 %rd86, %rd5, 2;
|
120 |
+
mov.f32 %f378, %f377;
|
121 |
+
mov.f32 %f379, %f377;
|
122 |
+
mov.f32 %f380, %f377;
|
123 |
+
mov.f32 %f381, %f377;
|
124 |
+
mov.f32 %f382, %f377;
|
125 |
+
mov.f32 %f383, %f377;
|
126 |
+
mov.f32 %f384, %f377;
|
127 |
+
mov.f32 %f385, %f377;
|
128 |
+
mov.f32 %f386, %f377;
|
129 |
+
mov.f32 %f387, %f377;
|
130 |
+
mov.f32 %f388, %f377;
|
131 |
+
mov.f32 %f389, %f377;
|
132 |
+
mov.f32 %f390, %f377;
|
133 |
+
mov.f32 %f391, %f377;
|
134 |
+
mov.f32 %f392, %f377;
|
135 |
+
$L__BB0_1:
|
136 |
+
.loc 1 33 27
|
137 |
+
or.b64 %rd79, %rd157, %rd2;
|
138 |
+
or.b64 %rd80, %rd157, %rd3;
|
139 |
+
.loc 1 34 25
|
140 |
+
setp.lt.u64 %p22, %rd80, 50257;
|
141 |
+
setp.lt.u64 %p20, %rd79, 50257;
|
142 |
+
.loc 1 36 34
|
143 |
+
shl.b64 %rd81, %rd79, 2;
|
144 |
+
add.s64 %rd82, %rd36, %rd81;
|
145 |
+
add.s64 %rd63, %rd82, %rd83;
|
146 |
+
shl.b64 %rd84, %rd80, 2;
|
147 |
+
add.s64 %rd85, %rd36, %rd84;
|
148 |
+
add.s64 %rd64, %rd85, %rd83;
|
149 |
+
add.s64 %rd65, %rd82, %rd86;
|
150 |
+
add.s64 %rd66, %rd85, %rd86;
|
151 |
+
add.s64 %rd67, %rd65, 201028;
|
152 |
+
add.s64 %rd68, %rd66, 201028;
|
153 |
+
add.s64 %rd69, %rd65, 402056;
|
154 |
+
add.s64 %rd70, %rd66, 402056;
|
155 |
+
add.s64 %rd71, %rd65, 603084;
|
156 |
+
add.s64 %rd72, %rd66, 603084;
|
157 |
+
add.s64 %rd73, %rd65, 804112;
|
158 |
+
add.s64 %rd74, %rd66, 804112;
|
159 |
+
add.s64 %rd75, %rd65, 1005140;
|
160 |
+
add.s64 %rd76, %rd66, 1005140;
|
161 |
+
add.s64 %rd77, %rd65, 1206168;
|
162 |
+
add.s64 %rd78, %rd66, 1206168;
|
163 |
+
mov.b32 %r173, 0;
|
164 |
+
.loc 1 36 52
|
165 |
+
mov.u32 %r11, 0x0;
|
166 |
+
@%p20 ld.global.L1::evict_last.b32 { %r11 }, [ %rd63 + 0 ];
|
167 |
+
@!%p20 mov.u32 %r11, %r173;
|
168 |
+
mov.u32 %r13, 0x0;
|
169 |
+
@%p22 ld.global.L1::evict_last.b32 { %r13 }, [ %rd64 + 0 ];
|
170 |
+
@!%p22 mov.u32 %r13, %r173;
|
171 |
+
mov.u32 %r15, 0x0;
|
172 |
+
@%p20 ld.global.L1::evict_last.b32 { %r15 }, [ %rd65 + 0 ];
|
173 |
+
@!%p20 mov.u32 %r15, %r173;
|
174 |
+
mov.u32 %r17, 0x0;
|
175 |
+
@%p22 ld.global.L1::evict_last.b32 { %r17 }, [ %rd66 + 0 ];
|
176 |
+
@!%p22 mov.u32 %r17, %r173;
|
177 |
+
mov.u32 %r19, 0x0;
|
178 |
+
@%p20 ld.global.L1::evict_last.b32 { %r19 }, [ %rd67 + 0 ];
|
179 |
+
@!%p20 mov.u32 %r19, %r173;
|
180 |
+
mov.u32 %r21, 0x0;
|
181 |
+
@%p22 ld.global.L1::evict_last.b32 { %r21 }, [ %rd68 + 0 ];
|
182 |
+
@!%p22 mov.u32 %r21, %r173;
|
183 |
+
mov.u32 %r23, 0x0;
|
184 |
+
@%p20 ld.global.L1::evict_last.b32 { %r23 }, [ %rd69 + 0 ];
|
185 |
+
@!%p20 mov.u32 %r23, %r173;
|
186 |
+
mov.u32 %r25, 0x0;
|
187 |
+
@%p22 ld.global.L1::evict_last.b32 { %r25 }, [ %rd70 + 0 ];
|
188 |
+
@!%p22 mov.u32 %r25, %r173;
|
189 |
+
mov.u32 %r27, 0x0;
|
190 |
+
@%p20 ld.global.L1::evict_last.b32 { %r27 }, [ %rd71 + 0 ];
|
191 |
+
@!%p20 mov.u32 %r27, %r173;
|
192 |
+
mov.u32 %r29, 0x0;
|
193 |
+
@%p22 ld.global.L1::evict_last.b32 { %r29 }, [ %rd72 + 0 ];
|
194 |
+
@!%p22 mov.u32 %r29, %r173;
|
195 |
+
mov.u32 %r31, 0x0;
|
196 |
+
@%p20 ld.global.L1::evict_last.b32 { %r31 }, [ %rd73 + 0 ];
|
197 |
+
@!%p20 mov.u32 %r31, %r173;
|
198 |
+
mov.u32 %r33, 0x0;
|
199 |
+
@%p22 ld.global.L1::evict_last.b32 { %r33 }, [ %rd74 + 0 ];
|
200 |
+
@!%p22 mov.u32 %r33, %r173;
|
201 |
+
mov.u32 %r35, 0x0;
|
202 |
+
@%p20 ld.global.L1::evict_last.b32 { %r35 }, [ %rd75 + 0 ];
|
203 |
+
@!%p20 mov.u32 %r35, %r173;
|
204 |
+
mov.u32 %r37, 0x0;
|
205 |
+
@%p22 ld.global.L1::evict_last.b32 { %r37 }, [ %rd76 + 0 ];
|
206 |
+
@!%p22 mov.u32 %r37, %r173;
|
207 |
+
mov.u32 %r39, 0x0;
|
208 |
+
@%p20 ld.global.L1::evict_last.b32 { %r39 }, [ %rd77 + 0 ];
|
209 |
+
@!%p20 mov.u32 %r39, %r173;
|
210 |
+
mov.u32 %r41, 0x0;
|
211 |
+
@%p22 ld.global.L1::evict_last.b32 { %r41 }, [ %rd78 + 0 ];
|
212 |
+
@!%p22 mov.u32 %r41, %r173;
|
213 |
+
mov.b32 %f90, %r41;
|
214 |
+
mov.b32 %f91, %r39;
|
215 |
+
mov.b32 %f92, %r37;
|
216 |
+
mov.b32 %f93, %r35;
|
217 |
+
mov.b32 %f94, %r33;
|
218 |
+
mov.b32 %f95, %r31;
|
219 |
+
mov.b32 %f96, %r29;
|
220 |
+
mov.b32 %f97, %r27;
|
221 |
+
mov.b32 %f98, %r25;
|
222 |
+
mov.b32 %f99, %r23;
|
223 |
+
mov.b32 %f100, %r21;
|
224 |
+
mov.b32 %f101, %r19;
|
225 |
+
mov.b32 %f102, %r17;
|
226 |
+
mov.b32 %f103, %r15;
|
227 |
+
mov.b32 %f104, %r13;
|
228 |
+
mov.b32 %f105, %r11;
|
229 |
+
.loc 1 42 23
|
230 |
+
mul.f32 %f106, %f1, %f105;
|
231 |
+
mul.f32 %f107, %f1, %f104;
|
232 |
+
mul.f32 %f108, %f2, %f103;
|
233 |
+
mul.f32 %f109, %f2, %f102;
|
234 |
+
mul.f32 %f110, %f3, %f101;
|
235 |
+
mul.f32 %f111, %f3, %f100;
|
236 |
+
mul.f32 %f112, %f4, %f99;
|
237 |
+
mul.f32 %f113, %f4, %f98;
|
238 |
+
mul.f32 %f114, %f5, %f97;
|
239 |
+
mul.f32 %f115, %f5, %f96;
|
240 |
+
mul.f32 %f116, %f6, %f95;
|
241 |
+
mul.f32 %f117, %f6, %f94;
|
242 |
+
mul.f32 %f118, %f7, %f93;
|
243 |
+
mul.f32 %f119, %f7, %f92;
|
244 |
+
mul.f32 %f120, %f8, %f91;
|
245 |
+
mul.f32 %f121, %f8, %f90;
|
246 |
+
.loc 1 45 40
|
247 |
+
selp.f32 %f122, %f121, 0f80000000, %p22;
|
248 |
+
selp.f32 %f123, %f120, 0f80000000, %p20;
|
249 |
+
selp.f32 %f124, %f119, 0f80000000, %p22;
|
250 |
+
selp.f32 %f125, %f118, 0f80000000, %p20;
|
251 |
+
selp.f32 %f126, %f117, 0f80000000, %p22;
|
252 |
+
selp.f32 %f127, %f116, 0f80000000, %p20;
|
253 |
+
selp.f32 %f128, %f115, 0f80000000, %p22;
|
254 |
+
selp.f32 %f129, %f114, 0f80000000, %p20;
|
255 |
+
selp.f32 %f130, %f113, 0f80000000, %p22;
|
256 |
+
selp.f32 %f131, %f112, 0f80000000, %p20;
|
257 |
+
selp.f32 %f132, %f111, 0f80000000, %p22;
|
258 |
+
selp.f32 %f133, %f110, 0f80000000, %p20;
|
259 |
+
selp.f32 %f134, %f109, 0f80000000, %p22;
|
260 |
+
selp.f32 %f135, %f108, 0f80000000, %p20;
|
261 |
+
selp.f32 %f136, %f107, 0f80000000, %p22;
|
262 |
+
selp.f32 %f137, %f106, 0f80000000, %p20;
|
263 |
+
add.f32 %f377, %f377, %f137;
|
264 |
+
add.f32 %f378, %f378, %f136;
|
265 |
+
add.f32 %f379, %f379, %f135;
|
266 |
+
add.f32 %f380, %f380, %f134;
|
267 |
+
add.f32 %f381, %f381, %f133;
|
268 |
+
add.f32 %f382, %f382, %f132;
|
269 |
+
add.f32 %f383, %f383, %f131;
|
270 |
+
add.f32 %f384, %f384, %f130;
|
271 |
+
add.f32 %f385, %f385, %f129;
|
272 |
+
add.f32 %f386, %f386, %f128;
|
273 |
+
add.f32 %f387, %f387, %f127;
|
274 |
+
add.f32 %f388, %f388, %f126;
|
275 |
+
add.f32 %f389, %f389, %f125;
|
276 |
+
add.f32 %f390, %f390, %f124;
|
277 |
+
add.f32 %f391, %f391, %f123;
|
278 |
+
add.f32 %f392, %f392, %f122;
|
279 |
+
.loc 1 32 36
|
280 |
+
add.s64 %rd157, %rd157, 512;
|
281 |
+
cvt.u32.u64 %r43, %rd157;
|
282 |
+
add.s32 %r44, %r43, -512;
|
283 |
+
setp.lt.u32 %p51, %r44, 49745;
|
284 |
+
@%p51 bra $L__BB0_1;
|
285 |
+
.loc 1 22 44
|
286 |
+
and.b32 %r65, %r1, 31;
|
287 |
+
.loc 1 24 33
|
288 |
+
and.b32 %r66, %r2, 7;
|
289 |
+
$L__tmp1:
|
290 |
+
.loc 2 233 15
|
291 |
+
add.f32 %f138, %f377, %f378;
|
292 |
+
add.f32 %f139, %f379, %f380;
|
293 |
+
add.f32 %f140, %f381, %f382;
|
294 |
+
add.f32 %f141, %f383, %f384;
|
295 |
+
add.f32 %f142, %f385, %f386;
|
296 |
+
add.f32 %f143, %f387, %f388;
|
297 |
+
add.f32 %f144, %f389, %f390;
|
298 |
+
add.f32 %f145, %f391, %f392;
|
299 |
+
$L__tmp2:
|
300 |
+
.loc 2 243 36
|
301 |
+
mov.b32 %r67, %f138;
|
302 |
+
shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1;
|
303 |
+
mov.b32 %f146, %r68;
|
304 |
+
$L__tmp3:
|
305 |
+
.loc 2 233 15
|
306 |
+
add.f32 %f147, %f138, %f146;
|
307 |
+
$L__tmp4:
|
308 |
+
.loc 2 243 36
|
309 |
+
mov.b32 %r69, %f147;
|
310 |
+
shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1;
|
311 |
+
mov.b32 %f148, %r70;
|
312 |
+
$L__tmp5:
|
313 |
+
.loc 2 233 15
|
314 |
+
add.f32 %f149, %f147, %f148;
|
315 |
+
$L__tmp6:
|
316 |
+
.loc 2 243 36
|
317 |
+
mov.b32 %r71, %f149;
|
318 |
+
shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1;
|
319 |
+
mov.b32 %f150, %r72;
|
320 |
+
$L__tmp7:
|
321 |
+
.loc 2 233 15
|
322 |
+
add.f32 %f151, %f149, %f150;
|
323 |
+
$L__tmp8:
|
324 |
+
.loc 2 243 36
|
325 |
+
mov.b32 %r73, %f151;
|
326 |
+
shfl.sync.bfly.b32 %r74, %r73, 2, 31, -1;
|
327 |
+
mov.b32 %f152, %r74;
|
328 |
+
$L__tmp9:
|
329 |
+
.loc 2 233 15
|
330 |
+
add.f32 %f153, %f151, %f152;
|
331 |
+
$L__tmp10:
|
332 |
+
.loc 2 243 36
|
333 |
+
mov.b32 %r75, %f153;
|
334 |
+
shfl.sync.bfly.b32 %r76, %r75, 1, 31, -1;
|
335 |
+
mov.b32 %f154, %r76;
|
336 |
+
$L__tmp11:
|
337 |
+
.loc 2 233 15
|
338 |
+
add.f32 %f155, %f153, %f154;
|
339 |
+
$L__tmp12:
|
340 |
+
.loc 2 243 36
|
341 |
+
mov.b32 %r77, %f139;
|
342 |
+
shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1;
|
343 |
+
mov.b32 %f156, %r78;
|
344 |
+
$L__tmp13:
|
345 |
+
.loc 2 233 15
|
346 |
+
add.f32 %f157, %f139, %f156;
|
347 |
+
$L__tmp14:
|
348 |
+
.loc 2 243 36
|
349 |
+
mov.b32 %r79, %f157;
|
350 |
+
shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1;
|
351 |
+
mov.b32 %f158, %r80;
|
352 |
+
$L__tmp15:
|
353 |
+
.loc 2 233 15
|
354 |
+
add.f32 %f159, %f157, %f158;
|
355 |
+
$L__tmp16:
|
356 |
+
.loc 2 243 36
|
357 |
+
mov.b32 %r81, %f159;
|
358 |
+
shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1;
|
359 |
+
mov.b32 %f160, %r82;
|
360 |
+
$L__tmp17:
|
361 |
+
.loc 2 233 15
|
362 |
+
add.f32 %f161, %f159, %f160;
|
363 |
+
$L__tmp18:
|
364 |
+
.loc 2 243 36
|
365 |
+
mov.b32 %r83, %f161;
|
366 |
+
shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1;
|
367 |
+
mov.b32 %f162, %r84;
|
368 |
+
$L__tmp19:
|
369 |
+
.loc 2 233 15
|
370 |
+
add.f32 %f163, %f161, %f162;
|
371 |
+
$L__tmp20:
|
372 |
+
.loc 2 243 36
|
373 |
+
mov.b32 %r85, %f163;
|
374 |
+
shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1;
|
375 |
+
mov.b32 %f164, %r86;
|
376 |
+
$L__tmp21:
|
377 |
+
.loc 2 233 15
|
378 |
+
add.f32 %f165, %f163, %f164;
|
379 |
+
$L__tmp22:
|
380 |
+
.loc 2 243 36
|
381 |
+
mov.b32 %r87, %f140;
|
382 |
+
shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1;
|
383 |
+
mov.b32 %f166, %r88;
|
384 |
+
$L__tmp23:
|
385 |
+
.loc 2 233 15
|
386 |
+
add.f32 %f167, %f140, %f166;
|
387 |
+
$L__tmp24:
|
388 |
+
.loc 2 243 36
|
389 |
+
mov.b32 %r89, %f167;
|
390 |
+
shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1;
|
391 |
+
mov.b32 %f168, %r90;
|
392 |
+
$L__tmp25:
|
393 |
+
.loc 2 233 15
|
394 |
+
add.f32 %f169, %f167, %f168;
|
395 |
+
$L__tmp26:
|
396 |
+
.loc 2 243 36
|
397 |
+
mov.b32 %r91, %f169;
|
398 |
+
shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1;
|
399 |
+
mov.b32 %f170, %r92;
|
400 |
+
$L__tmp27:
|
401 |
+
.loc 2 233 15
|
402 |
+
add.f32 %f171, %f169, %f170;
|
403 |
+
$L__tmp28:
|
404 |
+
.loc 2 243 36
|
405 |
+
mov.b32 %r93, %f171;
|
406 |
+
shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
|
407 |
+
mov.b32 %f172, %r94;
|
408 |
+
$L__tmp29:
|
409 |
+
.loc 2 233 15
|
410 |
+
add.f32 %f173, %f171, %f172;
|
411 |
+
$L__tmp30:
|
412 |
+
.loc 2 243 36
|
413 |
+
mov.b32 %r95, %f173;
|
414 |
+
shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
|
415 |
+
mov.b32 %f174, %r96;
|
416 |
+
$L__tmp31:
|
417 |
+
.loc 2 233 15
|
418 |
+
add.f32 %f175, %f173, %f174;
|
419 |
+
$L__tmp32:
|
420 |
+
.loc 2 243 36
|
421 |
+
mov.b32 %r97, %f141;
|
422 |
+
shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
|
423 |
+
mov.b32 %f176, %r98;
|
424 |
+
$L__tmp33:
|
425 |
+
.loc 2 233 15
|
426 |
+
add.f32 %f177, %f141, %f176;
|
427 |
+
$L__tmp34:
|
428 |
+
.loc 2 243 36
|
429 |
+
mov.b32 %r99, %f177;
|
430 |
+
shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
|
431 |
+
mov.b32 %f178, %r100;
|
432 |
+
$L__tmp35:
|
433 |
+
.loc 2 233 15
|
434 |
+
add.f32 %f179, %f177, %f178;
|
435 |
+
$L__tmp36:
|
436 |
+
.loc 2 243 36
|
437 |
+
mov.b32 %r101, %f179;
|
438 |
+
shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
|
439 |
+
mov.b32 %f180, %r102;
|
440 |
+
$L__tmp37:
|
441 |
+
.loc 2 233 15
|
442 |
+
add.f32 %f181, %f179, %f180;
|
443 |
+
$L__tmp38:
|
444 |
+
.loc 2 243 36
|
445 |
+
mov.b32 %r103, %f181;
|
446 |
+
shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
|
447 |
+
mov.b32 %f182, %r104;
|
448 |
+
$L__tmp39:
|
449 |
+
.loc 2 233 15
|
450 |
+
add.f32 %f183, %f181, %f182;
|
451 |
+
$L__tmp40:
|
452 |
+
.loc 2 243 36
|
453 |
+
mov.b32 %r105, %f183;
|
454 |
+
shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
|
455 |
+
mov.b32 %f184, %r106;
|
456 |
+
$L__tmp41:
|
457 |
+
.loc 2 233 15
|
458 |
+
add.f32 %f185, %f183, %f184;
|
459 |
+
$L__tmp42:
|
460 |
+
.loc 2 243 36
|
461 |
+
mov.b32 %r107, %f142;
|
462 |
+
shfl.sync.bfly.b32 %r108, %r107, 16, 31, -1;
|
463 |
+
mov.b32 %f186, %r108;
|
464 |
+
$L__tmp43:
|
465 |
+
.loc 2 233 15
|
466 |
+
add.f32 %f187, %f142, %f186;
|
467 |
+
$L__tmp44:
|
468 |
+
.loc 2 243 36
|
469 |
+
mov.b32 %r109, %f187;
|
470 |
+
shfl.sync.bfly.b32 %r110, %r109, 8, 31, -1;
|
471 |
+
mov.b32 %f188, %r110;
|
472 |
+
$L__tmp45:
|
473 |
+
.loc 2 233 15
|
474 |
+
add.f32 %f189, %f187, %f188;
|
475 |
+
$L__tmp46:
|
476 |
+
.loc 2 243 36
|
477 |
+
mov.b32 %r111, %f189;
|
478 |
+
shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1;
|
479 |
+
mov.b32 %f190, %r112;
|
480 |
+
$L__tmp47:
|
481 |
+
.loc 2 233 15
|
482 |
+
add.f32 %f191, %f189, %f190;
|
483 |
+
$L__tmp48:
|
484 |
+
.loc 2 243 36
|
485 |
+
mov.b32 %r113, %f191;
|
486 |
+
shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1;
|
487 |
+
mov.b32 %f192, %r114;
|
488 |
+
$L__tmp49:
|
489 |
+
.loc 2 233 15
|
490 |
+
add.f32 %f193, %f191, %f192;
|
491 |
+
$L__tmp50:
|
492 |
+
.loc 2 243 36
|
493 |
+
mov.b32 %r115, %f193;
|
494 |
+
shfl.sync.bfly.b32 %r116, %r115, 1, 31, -1;
|
495 |
+
mov.b32 %f194, %r116;
|
496 |
+
$L__tmp51:
|
497 |
+
.loc 2 233 15
|
498 |
+
add.f32 %f195, %f193, %f194;
|
499 |
+
$L__tmp52:
|
500 |
+
.loc 2 243 36
|
501 |
+
mov.b32 %r117, %f143;
|
502 |
+
shfl.sync.bfly.b32 %r118, %r117, 16, 31, -1;
|
503 |
+
mov.b32 %f196, %r118;
|
504 |
+
$L__tmp53:
|
505 |
+
.loc 2 233 15
|
506 |
+
add.f32 %f197, %f143, %f196;
|
507 |
+
$L__tmp54:
|
508 |
+
.loc 2 243 36
|
509 |
+
mov.b32 %r119, %f197;
|
510 |
+
shfl.sync.bfly.b32 %r120, %r119, 8, 31, -1;
|
511 |
+
mov.b32 %f198, %r120;
|
512 |
+
$L__tmp55:
|
513 |
+
.loc 2 233 15
|
514 |
+
add.f32 %f199, %f197, %f198;
|
515 |
+
$L__tmp56:
|
516 |
+
.loc 2 243 36
|
517 |
+
mov.b32 %r121, %f199;
|
518 |
+
shfl.sync.bfly.b32 %r122, %r121, 4, 31, -1;
|
519 |
+
mov.b32 %f200, %r122;
|
520 |
+
$L__tmp57:
|
521 |
+
.loc 2 233 15
|
522 |
+
add.f32 %f201, %f199, %f200;
|
523 |
+
$L__tmp58:
|
524 |
+
.loc 2 243 36
|
525 |
+
mov.b32 %r123, %f201;
|
526 |
+
shfl.sync.bfly.b32 %r124, %r123, 2, 31, -1;
|
527 |
+
mov.b32 %f202, %r124;
|
528 |
+
$L__tmp59:
|
529 |
+
.loc 2 233 15
|
530 |
+
add.f32 %f203, %f201, %f202;
|
531 |
+
$L__tmp60:
|
532 |
+
.loc 2 243 36
|
533 |
+
mov.b32 %r125, %f203;
|
534 |
+
shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1;
|
535 |
+
mov.b32 %f204, %r126;
|
536 |
+
$L__tmp61:
|
537 |
+
.loc 2 233 15
|
538 |
+
add.f32 %f205, %f203, %f204;
|
539 |
+
$L__tmp62:
|
540 |
+
.loc 2 243 36
|
541 |
+
mov.b32 %r127, %f144;
|
542 |
+
shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1;
|
543 |
+
mov.b32 %f206, %r128;
|
544 |
+
$L__tmp63:
|
545 |
+
.loc 2 233 15
|
546 |
+
add.f32 %f207, %f144, %f206;
|
547 |
+
$L__tmp64:
|
548 |
+
.loc 2 243 36
|
549 |
+
mov.b32 %r129, %f207;
|
550 |
+
shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1;
|
551 |
+
mov.b32 %f208, %r130;
|
552 |
+
$L__tmp65:
|
553 |
+
.loc 2 233 15
|
554 |
+
add.f32 %f209, %f207, %f208;
|
555 |
+
$L__tmp66:
|
556 |
+
.loc 2 243 36
|
557 |
+
mov.b32 %r131, %f209;
|
558 |
+
shfl.sync.bfly.b32 %r132, %r131, 4, 31, -1;
|
559 |
+
mov.b32 %f210, %r132;
|
560 |
+
$L__tmp67:
|
561 |
+
.loc 2 233 15
|
562 |
+
add.f32 %f211, %f209, %f210;
|
563 |
+
$L__tmp68:
|
564 |
+
.loc 2 243 36
|
565 |
+
mov.b32 %r133, %f211;
|
566 |
+
shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1;
|
567 |
+
mov.b32 %f212, %r134;
|
568 |
+
$L__tmp69:
|
569 |
+
.loc 2 233 15
|
570 |
+
add.f32 %f213, %f211, %f212;
|
571 |
+
$L__tmp70:
|
572 |
+
.loc 2 243 36
|
573 |
+
mov.b32 %r135, %f213;
|
574 |
+
shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1;
|
575 |
+
mov.b32 %f214, %r136;
|
576 |
+
$L__tmp71:
|
577 |
+
.loc 2 233 15
|
578 |
+
add.f32 %f215, %f213, %f214;
|
579 |
+
$L__tmp72:
|
580 |
+
.loc 2 243 36
|
581 |
+
mov.b32 %r137, %f145;
|
582 |
+
shfl.sync.bfly.b32 %r138, %r137, 16, 31, -1;
|
583 |
+
mov.b32 %f216, %r138;
|
584 |
+
$L__tmp73:
|
585 |
+
.loc 2 233 15
|
586 |
+
add.f32 %f217, %f145, %f216;
|
587 |
+
$L__tmp74:
|
588 |
+
.loc 2 243 36
|
589 |
+
mov.b32 %r139, %f217;
|
590 |
+
shfl.sync.bfly.b32 %r140, %r139, 8, 31, -1;
|
591 |
+
mov.b32 %f218, %r140;
|
592 |
+
$L__tmp75:
|
593 |
+
.loc 2 233 15
|
594 |
+
add.f32 %f219, %f217, %f218;
|
595 |
+
$L__tmp76:
|
596 |
+
.loc 2 243 36
|
597 |
+
mov.b32 %r141, %f219;
|
598 |
+
shfl.sync.bfly.b32 %r142, %r141, 4, 31, -1;
|
599 |
+
mov.b32 %f220, %r142;
|
600 |
+
$L__tmp77:
|
601 |
+
.loc 2 233 15
|
602 |
+
add.f32 %f221, %f219, %f220;
|
603 |
+
$L__tmp78:
|
604 |
+
.loc 2 243 36
|
605 |
+
mov.b32 %r143, %f221;
|
606 |
+
shfl.sync.bfly.b32 %r144, %r143, 2, 31, -1;
|
607 |
+
mov.b32 %f222, %r144;
|
608 |
+
$L__tmp79:
|
609 |
+
.loc 2 233 15
|
610 |
+
add.f32 %f223, %f221, %f222;
|
611 |
+
$L__tmp80:
|
612 |
+
.loc 2 243 36
|
613 |
+
mov.b32 %r145, %f223;
|
614 |
+
shfl.sync.bfly.b32 %r146, %r145, 1, 31, -1;
|
615 |
+
mov.b32 %f224, %r146;
|
616 |
+
$L__tmp81:
|
617 |
+
.loc 2 233 15
|
618 |
+
add.f32 %f225, %f223, %f224;
|
619 |
+
$L__tmp82:
|
620 |
+
.loc 2 243 36
|
621 |
+
setp.eq.s32 %p52, %r65, 0;
|
622 |
+
shl.b32 %r147, %r66, 2;
|
623 |
+
mov.u32 %r148, global_smem;
|
624 |
+
add.s32 %r45, %r148, %r147;
|
625 |
+
mov.b32 %r46, %f155;
|
626 |
+
@%p52 st.shared.b32 [ %r45 + 0 ], %r46;
|
627 |
+
add.s32 %r47, %r45, 32;
|
628 |
+
mov.b32 %r48, %f165;
|
629 |
+
@%p52 st.shared.b32 [ %r47 + 0 ], %r48;
|
630 |
+
add.s32 %r49, %r45, 64;
|
631 |
+
mov.b32 %r50, %f175;
|
632 |
+
@%p52 st.shared.b32 [ %r49 + 0 ], %r50;
|
633 |
+
add.s32 %r51, %r45, 96;
|
634 |
+
mov.b32 %r52, %f185;
|
635 |
+
@%p52 st.shared.b32 [ %r51 + 0 ], %r52;
|
636 |
+
add.s32 %r53, %r45, 128;
|
637 |
+
mov.b32 %r54, %f195;
|
638 |
+
@%p52 st.shared.b32 [ %r53 + 0 ], %r54;
|
639 |
+
add.s32 %r55, %r45, 160;
|
640 |
+
mov.b32 %r56, %f205;
|
641 |
+
@%p52 st.shared.b32 [ %r55 + 0 ], %r56;
|
642 |
+
add.s32 %r57, %r45, 192;
|
643 |
+
mov.b32 %r58, %f215;
|
644 |
+
@%p52 st.shared.b32 [ %r57 + 0 ], %r58;
|
645 |
+
add.s32 %r59, %r45, 224;
|
646 |
+
mov.b32 %r60, %f225;
|
647 |
+
@%p52 st.shared.b32 [ %r59 + 0 ], %r60;
|
648 |
+
bar.sync 0;
|
649 |
+
setp.lt.s32 %p60, %r1, 64;
|
650 |
+
shl.b32 %r149, %r1, 2;
|
651 |
+
add.s32 %r62, %r148, %r149;
|
652 |
+
@%p60 ld.shared.b32 %r61, [ %r62 + 0 ];
|
653 |
+
mov.b32 %f226, %r61;
|
654 |
+
shfl.sync.bfly.b32 %r150, %r61, 4, 31, -1;
|
655 |
+
mov.b32 %f227, %r150;
|
656 |
+
$L__tmp83:
|
657 |
+
.loc 2 233 15
|
658 |
+
add.f32 %f228, %f226, %f227;
|
659 |
+
$L__tmp84:
|
660 |
+
.loc 2 243 36
|
661 |
+
mov.b32 %r151, %f228;
|
662 |
+
shfl.sync.bfly.b32 %r152, %r151, 2, 31, -1;
|
663 |
+
mov.b32 %f229, %r152;
|
664 |
+
$L__tmp85:
|
665 |
+
.loc 2 233 15
|
666 |
+
add.f32 %f230, %f228, %f229;
|
667 |
+
$L__tmp86:
|
668 |
+
.loc 2 243 36
|
669 |
+
mov.b32 %r153, %f230;
|
670 |
+
shfl.sync.bfly.b32 %r154, %r153, 1, 31, -1;
|
671 |
+
mov.b32 %f231, %r154;
|
672 |
+
$L__tmp87:
|
673 |
+
.loc 2 233 15
|
674 |
+
add.f32 %f232, %f230, %f231;
|
675 |
+
$L__tmp88:
|
676 |
+
.loc 2 243 36
|
677 |
+
and.b32 %r155, %r1, 7;
|
678 |
+
setp.eq.s32 %p62, %r155, 0;
|
679 |
+
and.pred %p61, %p60, %p62;
|
680 |
+
mov.b32 %r64, %f232;
|
681 |
+
@%p61 st.shared.b32 [ %r62 + 0 ], %r64;
|
682 |
+
bar.sync 0;
|
683 |
+
ld.shared.f32 %f57, [global_smem];
|
684 |
+
ld.shared.f32 %f58, [global_smem+32];
|
685 |
+
ld.shared.f32 %f59, [global_smem+64];
|
686 |
+
ld.shared.f32 %f60, [global_smem+96];
|
687 |
+
ld.shared.f32 %f61, [global_smem+128];
|
688 |
+
ld.shared.f32 %f62, [global_smem+160];
|
689 |
+
ld.shared.f32 %f63, [global_smem+192];
|
690 |
+
ld.shared.f32 %f64, [global_smem+224];
|
691 |
+
$L__tmp89:
|
692 |
+
.loc 1 51 36
|
693 |
+
mul.lo.s64 %rd10, %rd1, 804112;
|
694 |
+
shl.b64 %rd88, %rd3, 1;
|
695 |
+
add.s64 %rd164, %rd39, %rd88;
|
696 |
+
add.s64 %rd163, %rd38, %rd88;
|
697 |
+
shl.b64 %rd13, %rd3, 2;
|
698 |
+
mul.lo.s64 %rd89, %rd1, 1608224;
|
699 |
+
add.s64 %rd162, %rd36, %rd89;
|
700 |
+
add.s64 %rd161, %rd37, %rd88;
|
701 |
+
shl.b64 %rd90, %rd2, 1;
|
702 |
+
add.s64 %rd160, %rd39, %rd90;
|
703 |
+
add.s64 %rd159, %rd38, %rd90;
|
704 |
+
shl.b64 %rd18, %rd2, 2;
|
705 |
+
add.s64 %rd158, %rd37, %rd90;
|
706 |
+
mov.u64 %rd165, 0;
|
707 |
+
mov.u16 %rs2, 0;
|
708 |
+
$L__BB0_3:
|
709 |
+
.loc 1 52 27
|
710 |
+
add.s64 %rd155, %rd2, %rd165;
|
711 |
+
.loc 1 53 25
|
712 |
+
add.s64 %rd156, %rd3, %rd165;
|
713 |
+
setp.lt.u64 %p63, %rd155, 50257;
|
714 |
+
setp.lt.u64 %p65, %rd156, 50257;
|
715 |
+
.loc 1 55 35
|
716 |
+
add.s64 %rd91, %rd158, %rd10;
|
717 |
+
add.s64 %rd92, %rd161, %rd10;
|
718 |
+
add.s64 %rd93, %rd91, 100514;
|
719 |
+
add.s64 %rd94, %rd92, 100514;
|
720 |
+
add.s64 %rd95, %rd91, 201028;
|
721 |
+
add.s64 %rd96, %rd92, 201028;
|
722 |
+
add.s64 %rd97, %rd91, 301542;
|
723 |
+
add.s64 %rd98, %rd92, 301542;
|
724 |
+
add.s64 %rd99, %rd91, 402056;
|
725 |
+
add.s64 %rd100, %rd92, 402056;
|
726 |
+
add.s64 %rd101, %rd91, 502570;
|
727 |
+
add.s64 %rd102, %rd92, 502570;
|
728 |
+
add.s64 %rd103, %rd91, 603084;
|
729 |
+
add.s64 %rd104, %rd92, 603084;
|
730 |
+
add.s64 %rd105, %rd91, 703598;
|
731 |
+
.loc 1 55 53
|
732 |
+
add.s64 %rd106, %rd92, 703598;
|
733 |
+
mov.u16 %rs1, 0x0;
|
734 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd91 + 0 ];
|
735 |
+
@!%p63 mov.u16 %rs1, %rs2;
|
736 |
+
mov.u16 %rs3, 0x0;
|
737 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd92 + 0 ];
|
738 |
+
@!%p65 mov.u16 %rs3, %rs2;
|
739 |
+
mov.u16 %rs5, 0x0;
|
740 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd93 + 0 ];
|
741 |
+
@!%p63 mov.u16 %rs5, %rs2;
|
742 |
+
mov.u16 %rs7, 0x0;
|
743 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd94 + 0 ];
|
744 |
+
@!%p65 mov.u16 %rs7, %rs2;
|
745 |
+
mov.u16 %rs9, 0x0;
|
746 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd95 + 0 ];
|
747 |
+
@!%p63 mov.u16 %rs9, %rs2;
|
748 |
+
mov.u16 %rs11, 0x0;
|
749 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd96 + 0 ];
|
750 |
+
@!%p65 mov.u16 %rs11, %rs2;
|
751 |
+
mov.u16 %rs13, 0x0;
|
752 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd97 + 0 ];
|
753 |
+
@!%p63 mov.u16 %rs13, %rs2;
|
754 |
+
mov.u16 %rs15, 0x0;
|
755 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd98 + 0 ];
|
756 |
+
@!%p65 mov.u16 %rs15, %rs2;
|
757 |
+
mov.u16 %rs17, 0x0;
|
758 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd99 + 0 ];
|
759 |
+
@!%p63 mov.u16 %rs17, %rs2;
|
760 |
+
mov.u16 %rs19, 0x0;
|
761 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd100 + 0 ];
|
762 |
+
@!%p65 mov.u16 %rs19, %rs2;
|
763 |
+
mov.u16 %rs21, 0x0;
|
764 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs21 }, [ %rd101 + 0 ];
|
765 |
+
@!%p63 mov.u16 %rs21, %rs2;
|
766 |
+
mov.u16 %rs23, 0x0;
|
767 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs23 }, [ %rd102 + 0 ];
|
768 |
+
@!%p65 mov.u16 %rs23, %rs2;
|
769 |
+
mov.u16 %rs25, 0x0;
|
770 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd103 + 0 ];
|
771 |
+
@!%p63 mov.u16 %rs25, %rs2;
|
772 |
+
mov.u16 %rs27, 0x0;
|
773 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd104 + 0 ];
|
774 |
+
@!%p65 mov.u16 %rs27, %rs2;
|
775 |
+
mov.u16 %rs29, 0x0;
|
776 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd105 + 0 ];
|
777 |
+
@!%p63 mov.u16 %rs29, %rs2;
|
778 |
+
mov.u16 %rs31, 0x0;
|
779 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd106 + 0 ];
|
780 |
+
@!%p65 mov.u16 %rs31, %rs2;
|
781 |
+
.loc 1 55 105
|
782 |
+
cvt.f32.bf16 %r156, %rs1;
|
783 |
+
mov.b32 %f265, %r156;
|
784 |
+
cvt.f32.bf16 %r157, %rs3;
|
785 |
+
mov.b32 %f266, %r157;
|
786 |
+
cvt.f32.bf16 %r158, %rs5;
|
787 |
+
mov.b32 %f267, %r158;
|
788 |
+
cvt.f32.bf16 %r159, %rs7;
|
789 |
+
mov.b32 %f268, %r159;
|
790 |
+
cvt.f32.bf16 %r160, %rs9;
|
791 |
+
mov.b32 %f269, %r160;
|
792 |
+
cvt.f32.bf16 %r161, %rs11;
|
793 |
+
mov.b32 %f270, %r161;
|
794 |
+
cvt.f32.bf16 %r162, %rs13;
|
795 |
+
mov.b32 %f271, %r162;
|
796 |
+
cvt.f32.bf16 %r163, %rs15;
|
797 |
+
mov.b32 %f272, %r163;
|
798 |
+
cvt.f32.bf16 %r164, %rs17;
|
799 |
+
mov.b32 %f273, %r164;
|
800 |
+
cvt.f32.bf16 %r165, %rs19;
|
801 |
+
mov.b32 %f274, %r165;
|
802 |
+
cvt.f32.bf16 %r166, %rs21;
|
803 |
+
mov.b32 %f275, %r166;
|
804 |
+
cvt.f32.bf16 %r167, %rs23;
|
805 |
+
mov.b32 %f276, %r167;
|
806 |
+
cvt.f32.bf16 %r168, %rs25;
|
807 |
+
mov.b32 %f277, %r168;
|
808 |
+
cvt.f32.bf16 %r169, %rs27;
|
809 |
+
mov.b32 %f278, %r169;
|
810 |
+
cvt.f32.bf16 %r170, %rs29;
|
811 |
+
mov.b32 %f279, %r170;
|
812 |
+
cvt.f32.bf16 %r171, %rs31;
|
813 |
+
mov.b32 %f280, %r171;
|
814 |
+
.loc 1 56 35
|
815 |
+
add.s64 %rd107, %rd162, %rd18;
|
816 |
+
add.s64 %rd108, %rd162, %rd13;
|
817 |
+
add.s64 %rd109, %rd107, 201028;
|
818 |
+
add.s64 %rd110, %rd108, 201028;
|
819 |
+
add.s64 %rd111, %rd107, 402056;
|
820 |
+
add.s64 %rd112, %rd108, 402056;
|
821 |
+
add.s64 %rd113, %rd107, 603084;
|
822 |
+
add.s64 %rd114, %rd108, 603084;
|
823 |
+
add.s64 %rd115, %rd107, 804112;
|
824 |
+
add.s64 %rd116, %rd108, 804112;
|
825 |
+
add.s64 %rd117, %rd107, 1005140;
|
826 |
+
add.s64 %rd118, %rd108, 1005140;
|
827 |
+
add.s64 %rd119, %rd107, 1206168;
|
828 |
+
add.s64 %rd120, %rd108, 1206168;
|
829 |
+
add.s64 %rd121, %rd107, 1407196;
|
830 |
+
.loc 1 56 53
|
831 |
+
add.s64 %rd122, %rd108, 1407196;
|
832 |
+
mov.u32 %r172, 0x0;
|
833 |
+
@%p63 ld.global.L1::evict_first.b32 { %r172 }, [ %rd107 + 0 ];
|
834 |
+
@!%p63 mov.u32 %r172, %r173;
|
835 |
+
mov.b32 %f281, %r172;
|
836 |
+
mov.u32 %r174, 0x0;
|
837 |
+
@%p65 ld.global.L1::evict_first.b32 { %r174 }, [ %rd108 + 0 ];
|
838 |
+
@!%p65 mov.u32 %r174, %r173;
|
839 |
+
mov.b32 %f282, %r174;
|
840 |
+
mov.u32 %r176, 0x0;
|
841 |
+
@%p63 ld.global.L1::evict_first.b32 { %r176 }, [ %rd109 + 0 ];
|
842 |
+
@!%p63 mov.u32 %r176, %r173;
|
843 |
+
mov.b32 %f283, %r176;
|
844 |
+
mov.u32 %r178, 0x0;
|
845 |
+
@%p65 ld.global.L1::evict_first.b32 { %r178 }, [ %rd110 + 0 ];
|
846 |
+
@!%p65 mov.u32 %r178, %r173;
|
847 |
+
mov.b32 %f284, %r178;
|
848 |
+
mov.u32 %r180, 0x0;
|
849 |
+
@%p63 ld.global.L1::evict_first.b32 { %r180 }, [ %rd111 + 0 ];
|
850 |
+
@!%p63 mov.u32 %r180, %r173;
|
851 |
+
mov.b32 %f285, %r180;
|
852 |
+
mov.u32 %r182, 0x0;
|
853 |
+
@%p65 ld.global.L1::evict_first.b32 { %r182 }, [ %rd112 + 0 ];
|
854 |
+
@!%p65 mov.u32 %r182, %r173;
|
855 |
+
mov.b32 %f286, %r182;
|
856 |
+
mov.u32 %r184, 0x0;
|
857 |
+
@%p63 ld.global.L1::evict_first.b32 { %r184 }, [ %rd113 + 0 ];
|
858 |
+
@!%p63 mov.u32 %r184, %r173;
|
859 |
+
mov.b32 %f287, %r184;
|
860 |
+
mov.u32 %r186, 0x0;
|
861 |
+
@%p65 ld.global.L1::evict_first.b32 { %r186 }, [ %rd114 + 0 ];
|
862 |
+
@!%p65 mov.u32 %r186, %r173;
|
863 |
+
mov.b32 %f288, %r186;
|
864 |
+
mov.u32 %r188, 0x0;
|
865 |
+
@%p63 ld.global.L1::evict_first.b32 { %r188 }, [ %rd115 + 0 ];
|
866 |
+
@!%p63 mov.u32 %r188, %r173;
|
867 |
+
mov.b32 %f289, %r188;
|
868 |
+
mov.u32 %r190, 0x0;
|
869 |
+
@%p65 ld.global.L1::evict_first.b32 { %r190 }, [ %rd116 + 0 ];
|
870 |
+
@!%p65 mov.u32 %r190, %r173;
|
871 |
+
mov.b32 %f290, %r190;
|
872 |
+
mov.u32 %r192, 0x0;
|
873 |
+
@%p63 ld.global.L1::evict_first.b32 { %r192 }, [ %rd117 + 0 ];
|
874 |
+
@!%p63 mov.u32 %r192, %r173;
|
875 |
+
mov.b32 %f291, %r192;
|
876 |
+
mov.u32 %r194, 0x0;
|
877 |
+
@%p65 ld.global.L1::evict_first.b32 { %r194 }, [ %rd118 + 0 ];
|
878 |
+
@!%p65 mov.u32 %r194, %r173;
|
879 |
+
mov.b32 %f292, %r194;
|
880 |
+
mov.u32 %r196, 0x0;
|
881 |
+
@%p63 ld.global.L1::evict_first.b32 { %r196 }, [ %rd119 + 0 ];
|
882 |
+
@!%p63 mov.u32 %r196, %r173;
|
883 |
+
mov.b32 %f293, %r196;
|
884 |
+
mov.u32 %r198, 0x0;
|
885 |
+
@%p65 ld.global.L1::evict_first.b32 { %r198 }, [ %rd120 + 0 ];
|
886 |
+
@!%p65 mov.u32 %r198, %r173;
|
887 |
+
mov.b32 %f294, %r198;
|
888 |
+
mov.u32 %r200, 0x0;
|
889 |
+
@%p63 ld.global.L1::evict_first.b32 { %r200 }, [ %rd121 + 0 ];
|
890 |
+
@!%p63 mov.u32 %r200, %r173;
|
891 |
+
mov.b32 %f295, %r200;
|
892 |
+
mov.u32 %r202, 0x0;
|
893 |
+
@%p65 ld.global.L1::evict_first.b32 { %r202 }, [ %rd122 + 0 ];
|
894 |
+
@!%p65 mov.u32 %r202, %r173;
|
895 |
+
mov.b32 %f296, %r202;
|
896 |
+
.loc 1 57 35
|
897 |
+
add.s64 %rd123, %rd159, %rd10;
|
898 |
+
add.s64 %rd124, %rd163, %rd10;
|
899 |
+
add.s64 %rd125, %rd123, 100514;
|
900 |
+
add.s64 %rd126, %rd124, 100514;
|
901 |
+
add.s64 %rd127, %rd123, 201028;
|
902 |
+
add.s64 %rd128, %rd124, 201028;
|
903 |
+
add.s64 %rd129, %rd123, 301542;
|
904 |
+
add.s64 %rd130, %rd124, 301542;
|
905 |
+
add.s64 %rd131, %rd123, 402056;
|
906 |
+
add.s64 %rd132, %rd124, 402056;
|
907 |
+
add.s64 %rd133, %rd123, 502570;
|
908 |
+
add.s64 %rd134, %rd124, 502570;
|
909 |
+
add.s64 %rd135, %rd123, 603084;
|
910 |
+
add.s64 %rd136, %rd124, 603084;
|
911 |
+
add.s64 %rd137, %rd123, 703598;
|
912 |
+
.loc 1 57 53
|
913 |
+
add.s64 %rd138, %rd124, 703598;
|
914 |
+
mov.u16 %rs49, 0x0;
|
915 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs49 }, [ %rd123 + 0 ];
|
916 |
+
@!%p63 mov.u16 %rs49, %rs2;
|
917 |
+
mov.u16 %rs51, 0x0;
|
918 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs51 }, [ %rd124 + 0 ];
|
919 |
+
@!%p65 mov.u16 %rs51, %rs2;
|
920 |
+
mov.u16 %rs53, 0x0;
|
921 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs53 }, [ %rd125 + 0 ];
|
922 |
+
@!%p63 mov.u16 %rs53, %rs2;
|
923 |
+
mov.u16 %rs55, 0x0;
|
924 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs55 }, [ %rd126 + 0 ];
|
925 |
+
@!%p65 mov.u16 %rs55, %rs2;
|
926 |
+
mov.u16 %rs57, 0x0;
|
927 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs57 }, [ %rd127 + 0 ];
|
928 |
+
@!%p63 mov.u16 %rs57, %rs2;
|
929 |
+
mov.u16 %rs59, 0x0;
|
930 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs59 }, [ %rd128 + 0 ];
|
931 |
+
@!%p65 mov.u16 %rs59, %rs2;
|
932 |
+
mov.u16 %rs61, 0x0;
|
933 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs61 }, [ %rd129 + 0 ];
|
934 |
+
@!%p63 mov.u16 %rs61, %rs2;
|
935 |
+
mov.u16 %rs63, 0x0;
|
936 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs63 }, [ %rd130 + 0 ];
|
937 |
+
@!%p65 mov.u16 %rs63, %rs2;
|
938 |
+
mov.u16 %rs65, 0x0;
|
939 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs65 }, [ %rd131 + 0 ];
|
940 |
+
@!%p63 mov.u16 %rs65, %rs2;
|
941 |
+
mov.u16 %rs67, 0x0;
|
942 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs67 }, [ %rd132 + 0 ];
|
943 |
+
@!%p65 mov.u16 %rs67, %rs2;
|
944 |
+
mov.u16 %rs69, 0x0;
|
945 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs69 }, [ %rd133 + 0 ];
|
946 |
+
@!%p63 mov.u16 %rs69, %rs2;
|
947 |
+
mov.u16 %rs71, 0x0;
|
948 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs71 }, [ %rd134 + 0 ];
|
949 |
+
@!%p65 mov.u16 %rs71, %rs2;
|
950 |
+
mov.u16 %rs73, 0x0;
|
951 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs73 }, [ %rd135 + 0 ];
|
952 |
+
@!%p63 mov.u16 %rs73, %rs2;
|
953 |
+
mov.u16 %rs75, 0x0;
|
954 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs75 }, [ %rd136 + 0 ];
|
955 |
+
@!%p65 mov.u16 %rs75, %rs2;
|
956 |
+
mov.u16 %rs77, 0x0;
|
957 |
+
@%p63 ld.global.L1::evict_first.b16 { %rs77 }, [ %rd137 + 0 ];
|
958 |
+
@!%p63 mov.u16 %rs77, %rs2;
|
959 |
+
mov.u16 %rs79, 0x0;
|
960 |
+
@%p65 ld.global.L1::evict_first.b16 { %rs79 }, [ %rd138 + 0 ];
|
961 |
+
@!%p65 mov.u16 %rs79, %rs2;
|
962 |
+
.loc 1 57 105
|
963 |
+
cvt.f32.bf16 %r204, %rs49;
|
964 |
+
mov.b32 %f297, %r204;
|
965 |
+
cvt.f32.bf16 %r205, %rs51;
|
966 |
+
mov.b32 %f298, %r205;
|
967 |
+
cvt.f32.bf16 %r206, %rs53;
|
968 |
+
mov.b32 %f299, %r206;
|
969 |
+
cvt.f32.bf16 %r207, %rs55;
|
970 |
+
mov.b32 %f300, %r207;
|
971 |
+
cvt.f32.bf16 %r208, %rs57;
|
972 |
+
mov.b32 %f301, %r208;
|
973 |
+
cvt.f32.bf16 %r209, %rs59;
|
974 |
+
mov.b32 %f302, %r209;
|
975 |
+
cvt.f32.bf16 %r210, %rs61;
|
976 |
+
mov.b32 %f303, %r210;
|
977 |
+
cvt.f32.bf16 %r211, %rs63;
|
978 |
+
mov.b32 %f304, %r211;
|
979 |
+
cvt.f32.bf16 %r212, %rs65;
|
980 |
+
mov.b32 %f305, %r212;
|
981 |
+
cvt.f32.bf16 %r213, %rs67;
|
982 |
+
mov.b32 %f306, %r213;
|
983 |
+
cvt.f32.bf16 %r214, %rs69;
|
984 |
+
mov.b32 %f307, %r214;
|
985 |
+
cvt.f32.bf16 %r215, %rs71;
|
986 |
+
mov.b32 %f308, %r215;
|
987 |
+
cvt.f32.bf16 %r216, %rs73;
|
988 |
+
mov.b32 %f309, %r216;
|
989 |
+
cvt.f32.bf16 %r217, %rs75;
|
990 |
+
mov.b32 %f310, %r217;
|
991 |
+
cvt.f32.bf16 %r218, %rs77;
|
992 |
+
mov.b32 %f311, %r218;
|
993 |
+
cvt.f32.bf16 %r219, %rs79;
|
994 |
+
mov.b32 %f312, %r219;
|
995 |
+
.loc 1 65 23
|
996 |
+
mul.f32 %f234, %f297, 0f3FB8AA3B;
|
997 |
+
ex2.approx.f32 %f233, %f234;
|
998 |
+
mul.f32 %f236, %f298, 0f3FB8AA3B;
|
999 |
+
ex2.approx.f32 %f235, %f236;
|
1000 |
+
mul.f32 %f238, %f299, 0f3FB8AA3B;
|
1001 |
+
ex2.approx.f32 %f237, %f238;
|
1002 |
+
mul.f32 %f240, %f300, 0f3FB8AA3B;
|
1003 |
+
ex2.approx.f32 %f239, %f240;
|
1004 |
+
mul.f32 %f242, %f301, 0f3FB8AA3B;
|
1005 |
+
ex2.approx.f32 %f241, %f242;
|
1006 |
+
mul.f32 %f244, %f302, 0f3FB8AA3B;
|
1007 |
+
ex2.approx.f32 %f243, %f244;
|
1008 |
+
mul.f32 %f246, %f303, 0f3FB8AA3B;
|
1009 |
+
ex2.approx.f32 %f245, %f246;
|
1010 |
+
mul.f32 %f248, %f304, 0f3FB8AA3B;
|
1011 |
+
ex2.approx.f32 %f247, %f248;
|
1012 |
+
mul.f32 %f250, %f305, 0f3FB8AA3B;
|
1013 |
+
ex2.approx.f32 %f249, %f250;
|
1014 |
+
mul.f32 %f252, %f306, 0f3FB8AA3B;
|
1015 |
+
ex2.approx.f32 %f251, %f252;
|
1016 |
+
mul.f32 %f254, %f307, 0f3FB8AA3B;
|
1017 |
+
ex2.approx.f32 %f253, %f254;
|
1018 |
+
mul.f32 %f256, %f308, 0f3FB8AA3B;
|
1019 |
+
ex2.approx.f32 %f255, %f256;
|
1020 |
+
mul.f32 %f258, %f309, 0f3FB8AA3B;
|
1021 |
+
ex2.approx.f32 %f257, %f258;
|
1022 |
+
mul.f32 %f260, %f310, 0f3FB8AA3B;
|
1023 |
+
ex2.approx.f32 %f259, %f260;
|
1024 |
+
mul.f32 %f262, %f311, 0f3FB8AA3B;
|
1025 |
+
ex2.approx.f32 %f261, %f262;
|
1026 |
+
mul.f32 %f264, %f312, 0f3FB8AA3B;
|
1027 |
+
ex2.approx.f32 %f263, %f264;
|
1028 |
+
.loc 1 66 24
|
1029 |
+
mul.f32 %f313, %f57, %f233;
|
1030 |
+
mul.f32 %f314, %f57, %f235;
|
1031 |
+
mul.f32 %f315, %f58, %f237;
|
1032 |
+
mul.f32 %f316, %f58, %f239;
|
1033 |
+
mul.f32 %f317, %f59, %f241;
|
1034 |
+
mul.f32 %f318, %f59, %f243;
|
1035 |
+
mul.f32 %f319, %f60, %f245;
|
1036 |
+
mul.f32 %f320, %f60, %f247;
|
1037 |
+
mul.f32 %f321, %f61, %f249;
|
1038 |
+
mul.f32 %f322, %f61, %f251;
|
1039 |
+
mul.f32 %f323, %f62, %f253;
|
1040 |
+
mul.f32 %f324, %f62, %f255;
|
1041 |
+
mul.f32 %f325, %f63, %f257;
|
1042 |
+
mul.f32 %f326, %f63, %f259;
|
1043 |
+
mul.f32 %f327, %f64, %f261;
|
1044 |
+
mul.f32 %f328, %f64, %f263;
|
1045 |
+
.loc 1 67 24
|
1046 |
+
neg.f32 %f329, %f313;
|
1047 |
+
fma.rn.f32 %f330, %f1, %f281, %f329;
|
1048 |
+
neg.f32 %f331, %f314;
|
1049 |
+
fma.rn.f32 %f332, %f1, %f282, %f331;
|
1050 |
+
neg.f32 %f333, %f315;
|
1051 |
+
fma.rn.f32 %f334, %f2, %f283, %f333;
|
1052 |
+
neg.f32 %f335, %f316;
|
1053 |
+
fma.rn.f32 %f336, %f2, %f284, %f335;
|
1054 |
+
neg.f32 %f337, %f317;
|
1055 |
+
fma.rn.f32 %f338, %f3, %f285, %f337;
|
1056 |
+
neg.f32 %f339, %f318;
|
1057 |
+
fma.rn.f32 %f340, %f3, %f286, %f339;
|
1058 |
+
neg.f32 %f341, %f319;
|
1059 |
+
fma.rn.f32 %f342, %f4, %f287, %f341;
|
1060 |
+
neg.f32 %f343, %f320;
|
1061 |
+
fma.rn.f32 %f344, %f4, %f288, %f343;
|
1062 |
+
neg.f32 %f345, %f321;
|
1063 |
+
fma.rn.f32 %f346, %f5, %f289, %f345;
|
1064 |
+
neg.f32 %f347, %f322;
|
1065 |
+
fma.rn.f32 %f348, %f5, %f290, %f347;
|
1066 |
+
neg.f32 %f349, %f323;
|
1067 |
+
fma.rn.f32 %f350, %f6, %f291, %f349;
|
1068 |
+
neg.f32 %f351, %f324;
|
1069 |
+
fma.rn.f32 %f352, %f6, %f292, %f351;
|
1070 |
+
neg.f32 %f353, %f325;
|
1071 |
+
fma.rn.f32 %f354, %f7, %f293, %f353;
|
1072 |
+
neg.f32 %f355, %f326;
|
1073 |
+
fma.rn.f32 %f356, %f7, %f294, %f355;
|
1074 |
+
neg.f32 %f357, %f327;
|
1075 |
+
fma.rn.f32 %f358, %f8, %f295, %f357;
|
1076 |
+
neg.f32 %f359, %f328;
|
1077 |
+
fma.rn.f32 %f360, %f8, %f296, %f359;
|
1078 |
+
.loc 1 69 24
|
1079 |
+
add.f32 %f361, %f265, %f330;
|
1080 |
+
add.f32 %f362, %f266, %f332;
|
1081 |
+
add.f32 %f363, %f267, %f334;
|
1082 |
+
add.f32 %f364, %f268, %f336;
|
1083 |
+
add.f32 %f365, %f269, %f338;
|
1084 |
+
add.f32 %f366, %f270, %f340;
|
1085 |
+
add.f32 %f367, %f271, %f342;
|
1086 |
+
add.f32 %f368, %f272, %f344;
|
1087 |
+
add.f32 %f369, %f273, %f346;
|
1088 |
+
add.f32 %f370, %f274, %f348;
|
1089 |
+
add.f32 %f371, %f275, %f350;
|
1090 |
+
add.f32 %f372, %f276, %f352;
|
1091 |
+
add.f32 %f373, %f277, %f354;
|
1092 |
+
add.f32 %f374, %f278, %f356;
|
1093 |
+
add.f32 %f375, %f279, %f358;
|
1094 |
+
add.f32 %f376, %f280, %f360;
|
1095 |
+
.loc 1 70 29
|
1096 |
+
add.s64 %rd139, %rd160, %rd10;
|
1097 |
+
add.s64 %rd140, %rd164, %rd10;
|
1098 |
+
add.s64 %rd141, %rd139, 100514;
|
1099 |
+
add.s64 %rd142, %rd140, 100514;
|
1100 |
+
add.s64 %rd143, %rd139, 201028;
|
1101 |
+
add.s64 %rd144, %rd140, 201028;
|
1102 |
+
add.s64 %rd145, %rd139, 301542;
|
1103 |
+
add.s64 %rd146, %rd140, 301542;
|
1104 |
+
add.s64 %rd147, %rd139, 402056;
|
1105 |
+
add.s64 %rd148, %rd140, 402056;
|
1106 |
+
add.s64 %rd149, %rd139, 502570;
|
1107 |
+
add.s64 %rd150, %rd140, 502570;
|
1108 |
+
add.s64 %rd151, %rd139, 603084;
|
1109 |
+
add.s64 %rd152, %rd140, 603084;
|
1110 |
+
add.s64 %rd153, %rd139, 703598;
|
1111 |
+
.loc 1 70 54
|
1112 |
+
add.s64 %rd154, %rd140, 703598;
|
1113 |
+
mov.b32 %r220, %f361;
|
1114 |
+
cvt.rn.bf16.f32 %rs97, %r220;
|
1115 |
+
mov.b32 %r221, %f362;
|
1116 |
+
cvt.rn.bf16.f32 %rs98, %r221;
|
1117 |
+
mov.b32 %r222, %f363;
|
1118 |
+
cvt.rn.bf16.f32 %rs99, %r222;
|
1119 |
+
mov.b32 %r223, %f364;
|
1120 |
+
cvt.rn.bf16.f32 %rs100, %r223;
|
1121 |
+
mov.b32 %r224, %f365;
|
1122 |
+
cvt.rn.bf16.f32 %rs101, %r224;
|
1123 |
+
mov.b32 %r225, %f366;
|
1124 |
+
cvt.rn.bf16.f32 %rs102, %r225;
|
1125 |
+
mov.b32 %r226, %f367;
|
1126 |
+
cvt.rn.bf16.f32 %rs103, %r226;
|
1127 |
+
mov.b32 %r227, %f368;
|
1128 |
+
cvt.rn.bf16.f32 %rs104, %r227;
|
1129 |
+
mov.b32 %r228, %f369;
|
1130 |
+
cvt.rn.bf16.f32 %rs105, %r228;
|
1131 |
+
mov.b32 %r229, %f370;
|
1132 |
+
cvt.rn.bf16.f32 %rs106, %r229;
|
1133 |
+
mov.b32 %r230, %f371;
|
1134 |
+
cvt.rn.bf16.f32 %rs107, %r230;
|
1135 |
+
mov.b32 %r231, %f372;
|
1136 |
+
cvt.rn.bf16.f32 %rs108, %r231;
|
1137 |
+
mov.b32 %r232, %f373;
|
1138 |
+
cvt.rn.bf16.f32 %rs109, %r232;
|
1139 |
+
mov.b32 %r233, %f374;
|
1140 |
+
cvt.rn.bf16.f32 %rs110, %r233;
|
1141 |
+
mov.b32 %r234, %f375;
|
1142 |
+
cvt.rn.bf16.f32 %rs111, %r234;
|
1143 |
+
mov.b32 %r235, %f376;
|
1144 |
+
cvt.rn.bf16.f32 %rs112, %r235;
|
1145 |
+
@%p63 st.global.b16 [ %rd139 + 0 ], { %rs97 };
|
1146 |
+
@%p65 st.global.b16 [ %rd140 + 0 ], { %rs98 };
|
1147 |
+
@%p63 st.global.b16 [ %rd141 + 0 ], { %rs99 };
|
1148 |
+
@%p65 st.global.b16 [ %rd142 + 0 ], { %rs100 };
|
1149 |
+
@%p63 st.global.b16 [ %rd143 + 0 ], { %rs101 };
|
1150 |
+
@%p65 st.global.b16 [ %rd144 + 0 ], { %rs102 };
|
1151 |
+
@%p63 st.global.b16 [ %rd145 + 0 ], { %rs103 };
|
1152 |
+
@%p65 st.global.b16 [ %rd146 + 0 ], { %rs104 };
|
1153 |
+
@%p63 st.global.b16 [ %rd147 + 0 ], { %rs105 };
|
1154 |
+
@%p65 st.global.b16 [ %rd148 + 0 ], { %rs106 };
|
1155 |
+
@%p63 st.global.b16 [ %rd149 + 0 ], { %rs107 };
|
1156 |
+
@%p65 st.global.b16 [ %rd150 + 0 ], { %rs108 };
|
1157 |
+
@%p63 st.global.b16 [ %rd151 + 0 ], { %rs109 };
|
1158 |
+
@%p65 st.global.b16 [ %rd152 + 0 ], { %rs110 };
|
1159 |
+
@%p63 st.global.b16 [ %rd153 + 0 ], { %rs111 };
|
1160 |
+
@%p65 st.global.b16 [ %rd154 + 0 ], { %rs112 };
|
1161 |
+
.loc 1 51 36
|
1162 |
+
add.s64 %rd165, %rd165, 512;
|
1163 |
+
cvt.u32.u64 %r236, %rd165;
|
1164 |
+
add.s32 %r237, %r236, -512;
|
1165 |
+
add.s64 %rd164, %rd164, 1024;
|
1166 |
+
add.s64 %rd163, %rd163, 1024;
|
1167 |
+
add.s64 %rd162, %rd162, 2048;
|
1168 |
+
add.s64 %rd161, %rd161, 1024;
|
1169 |
+
add.s64 %rd160, %rd160, 1024;
|
1170 |
+
add.s64 %rd159, %rd159, 1024;
|
1171 |
+
add.s64 %rd158, %rd158, 1024;
|
1172 |
+
setp.lt.u32 %p175, %r237, 49745;
|
1173 |
+
@%p175 bra $L__BB0_3;
|
1174 |
+
.loc 1 51 4
|
1175 |
+
ret;
|
1176 |
+
$L__tmp90:
|
1177 |
+
$L__func_end0:
|
1178 |
+
|
1179 |
+
}
|
1180 |
+
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
|
1181 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
1182 |
+
.section .debug_abbrev
|
1183 |
+
{
|
1184 |
+
.b8 1
|
1185 |
+
.b8 17
|
1186 |
+
.b8 1
|
1187 |
+
.b8 37
|
1188 |
+
.b8 8
|
1189 |
+
.b8 19
|
1190 |
+
.b8 5
|
1191 |
+
.b8 3
|
1192 |
+
.b8 8
|
1193 |
+
.b8 16
|
1194 |
+
.b8 6
|
1195 |
+
.b8 27
|
1196 |
+
.b8 8
|
1197 |
+
.b8 180
|
1198 |
+
.b8 66
|
1199 |
+
.b8 12
|
1200 |
+
.b8 17
|
1201 |
+
.b8 1
|
1202 |
+
.b8 18
|
1203 |
+
.b8 1
|
1204 |
+
.b8 0
|
1205 |
+
.b8 0
|
1206 |
+
.b8 2
|
1207 |
+
.b8 46
|
1208 |
+
.b8 0
|
1209 |
+
.b8 135
|
1210 |
+
.b8 64
|
1211 |
+
.b8 8
|
1212 |
+
.b8 3
|
1213 |
+
.b8 8
|
1214 |
+
.b8 58
|
1215 |
+
.b8 11
|
1216 |
+
.b8 59
|
1217 |
+
.b8 11
|
1218 |
+
.b8 63
|
1219 |
+
.b8 12
|
1220 |
+
.b8 32
|
1221 |
+
.b8 11
|
1222 |
+
.b8 0
|
1223 |
+
.b8 0
|
1224 |
+
.b8 3
|
1225 |
+
.b8 46
|
1226 |
+
.b8 1
|
1227 |
+
.b8 17
|
1228 |
+
.b8 1
|
1229 |
+
.b8 18
|
1230 |
+
.b8 1
|
1231 |
+
.b8 64
|
1232 |
+
.b8 10
|
1233 |
+
.b8 49
|
1234 |
+
.b8 19
|
1235 |
+
.b8 0
|
1236 |
+
.b8 0
|
1237 |
+
.b8 4
|
1238 |
+
.b8 29
|
1239 |
+
.b8 1
|
1240 |
+
.b8 49
|
1241 |
+
.b8 19
|
1242 |
+
.b8 17
|
1243 |
+
.b8 1
|
1244 |
+
.b8 18
|
1245 |
+
.b8 1
|
1246 |
+
.b8 88
|
1247 |
+
.b8 11
|
1248 |
+
.b8 89
|
1249 |
+
.b8 11
|
1250 |
+
.b8 87
|
1251 |
+
.b8 11
|
1252 |
+
.b8 0
|
1253 |
+
.b8 0
|
1254 |
+
.b8 5
|
1255 |
+
.b8 29
|
1256 |
+
.b8 0
|
1257 |
+
.b8 49
|
1258 |
+
.b8 19
|
1259 |
+
.b8 17
|
1260 |
+
.b8 1
|
1261 |
+
.b8 18
|
1262 |
+
.b8 1
|
1263 |
+
.b8 88
|
1264 |
+
.b8 11
|
1265 |
+
.b8 89
|
1266 |
+
.b8 11
|
1267 |
+
.b8 87
|
1268 |
+
.b8 11
|
1269 |
+
.b8 0
|
1270 |
+
.b8 0
|
1271 |
+
.b8 0
|
1272 |
+
}
|
1273 |
+
.section .debug_info
|
1274 |
+
{
|
1275 |
+
.b32 278
|
1276 |
+
.b8 2
|
1277 |
+
.b8 0
|
1278 |
+
.b32 .debug_abbrev
|
1279 |
+
.b8 8
|
1280 |
+
.b8 1
|
1281 |
+
.b8 116
|
1282 |
+
.b8 114
|
1283 |
+
.b8 105
|
1284 |
+
.b8 116
|
1285 |
+
.b8 111
|
1286 |
+
.b8 110
|
1287 |
+
.b8 0
|
1288 |
+
.b8 2
|
1289 |
+
.b8 0
|
1290 |
+
.b8 99
|
1291 |
+
.b8 107
|
1292 |
+
.b8 122
|
1293 |
+
.b8 103
|
1294 |
+
.b8 108
|
1295 |
+
.b8 55
|
1296 |
+
.b8 116
|
1297 |
+
.b8 104
|
1298 |
+
.b8 98
|
1299 |
+
.b8 52
|
1300 |
+
.b8 120
|
1301 |
+
.b8 100
|
1302 |
+
.b8 102
|
1303 |
+
.b8 107
|
1304 |
+
.b8 102
|
1305 |
+
.b8 110
|
1306 |
+
.b8 100
|
1307 |
+
.b8 50
|
1308 |
+
.b8 116
|
1309 |
+
.b8 105
|
1310 |
+
.b8 100
|
1311 |
+
.b8 107
|
1312 |
+
.b8 115
|
1313 |
+
.b8 54
|
1314 |
+
.b8 109
|
1315 |
+
.b8 116
|
1316 |
+
.b8 53
|
1317 |
+
.b8 102
|
1318 |
+
.b8 51
|
1319 |
+
.b8 104
|
1320 |
+
.b8 97
|
1321 |
+
.b8 117
|
1322 |
+
.b8 119
|
1323 |
+
.b8 102
|
1324 |
+
.b8 121
|
1325 |
+
.b8 106
|
1326 |
+
.b8 102
|
1327 |
+
.b8 108
|
1328 |
+
.b8 98
|
1329 |
+
.b8 116
|
1330 |
+
.b8 122
|
1331 |
+
.b8 121
|
1332 |
+
.b8 101
|
1333 |
+
.b8 112
|
1334 |
+
.b8 111
|
1335 |
+
.b8 53
|
1336 |
+
.b8 111
|
1337 |
+
.b8 120
|
1338 |
+
.b8 107
|
1339 |
+
.b8 118
|
1340 |
+
.b8 104
|
1341 |
+
.b8 107
|
1342 |
+
.b8 46
|
1343 |
+
.b8 112
|
1344 |
+
.b8 121
|
1345 |
+
.b8 0
|
1346 |
+
.b32 .debug_line
|
1347 |
+
.b8 47
|
1348 |
+
.b8 116
|
1349 |
+
.b8 109
|
1350 |
+
.b8 112
|
1351 |
+
.b8 47
|
1352 |
+
.b8 116
|
1353 |
+
.b8 111
|
1354 |
+
.b8 114
|
1355 |
+
.b8 99
|
1356 |
+
.b8 104
|
1357 |
+
.b8 105
|
1358 |
+
.b8 110
|
1359 |
+
.b8 100
|
1360 |
+
.b8 117
|
1361 |
+
.b8 99
|
1362 |
+
.b8 116
|
1363 |
+
.b8 111
|
1364 |
+
.b8 114
|
1365 |
+
.b8 95
|
1366 |
+
.b8 114
|
1367 |
+
.b8 111
|
1368 |
+
.b8 111
|
1369 |
+
.b8 116
|
1370 |
+
.b8 47
|
1371 |
+
.b8 107
|
1372 |
+
.b8 122
|
1373 |
+
.b8 0
|
1374 |
+
.b8 1
|
1375 |
+
.b64 $L__func_begin0
|
1376 |
+
.b64 $L__func_end0
|
1377 |
+
.b8 2
|
1378 |
+
.b8 116
|
1379 |
+
.b8 114
|
1380 |
+
.b8 105
|
1381 |
+
.b8 116
|
1382 |
+
.b8 111
|
1383 |
+
.b8 110
|
1384 |
+
.b8 95
|
1385 |
+
.b8 95
|
1386 |
+
.b8 48
|
1387 |
+
.b8 100
|
1388 |
+
.b8 49
|
1389 |
+
.b8 100
|
1390 |
+
.b8 50
|
1391 |
+
.b8 100
|
1392 |
+
.b8 51
|
1393 |
+
.b8 100
|
1394 |
+
.b8 52
|
1395 |
+
.b8 100
|
1396 |
+
.b8 53
|
1397 |
+
.b8 100
|
1398 |
+
.b8 54
|
1399 |
+
.b8 100
|
1400 |
+
.b8 55
|
1401 |
+
.b8 100
|
1402 |
+
.b8 101
|
1403 |
+
.b8 56
|
1404 |
+
.b8 0
|
1405 |
+
.b8 116
|
1406 |
+
.b8 114
|
1407 |
+
.b8 105
|
1408 |
+
.b8 116
|
1409 |
+
.b8 111
|
1410 |
+
.b8 110
|
1411 |
+
.b8 95
|
1412 |
+
.b8 95
|
1413 |
+
.b8 48
|
1414 |
+
.b8 100
|
1415 |
+
.b8 49
|
1416 |
+
.b8 100
|
1417 |
+
.b8 50
|
1418 |
+
.b8 100
|
1419 |
+
.b8 51
|
1420 |
+
.b8 100
|
1421 |
+
.b8 52
|
1422 |
+
.b8 100
|
1423 |
+
.b8 53
|
1424 |
+
.b8 100
|
1425 |
+
.b8 54
|
1426 |
+
.b8 100
|
1427 |
+
.b8 55
|
1428 |
+
.b8 100
|
1429 |
+
.b8 101
|
1430 |
+
.b8 56
|
1431 |
+
.b8 0
|
1432 |
+
.b8 1
|
1433 |
+
.b8 18
|
1434 |
+
.b8 1
|
1435 |
+
.b8 1
|
1436 |
+
.b8 3
|
1437 |
+
.b64 $L__func_begin0
|
1438 |
+
.b64 $L__func_end0
|
1439 |
+
.b8 1
|
1440 |
+
.b8 156
|
1441 |
+
.b32 125
|
1442 |
+
.b8 4
|
1443 |
+
.b32 125
|
1444 |
+
.b64 $L__tmp1
|
1445 |
+
.b64 $L__tmp88
|
1446 |
+
.b8 2
|
1447 |
+
.b8 46
|
1448 |
+
.b8 27
|
1449 |
+
.b8 5
|
1450 |
+
.b32 125
|
1451 |
+
.b64 $L__tmp1
|
1452 |
+
.b64 $L__tmp88
|
1453 |
+
.b8 2
|
1454 |
+
.b8 243
|
1455 |
+
.b8 36
|
1456 |
+
.b8 0
|
1457 |
+
.b8 5
|
1458 |
+
.b32 125
|
1459 |
+
.b64 $L__tmp2
|
1460 |
+
.b64 $L__tmp89
|
1461 |
+
.b8 2
|
1462 |
+
.b8 46
|
1463 |
+
.b8 27
|
1464 |
+
.b8 0
|
1465 |
+
.b8 0
|
1466 |
+
}
|
1467 |
+
.section .debug_pubnames
|
1468 |
+
{
|
1469 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1470 |
+
$L__pubNames_start0:
|
1471 |
+
.b8 2
|
1472 |
+
.b8 0
|
1473 |
+
.b32 .debug_info
|
1474 |
+
.b32 282
|
1475 |
+
.b32 125
|
1476 |
+
.b8 116
|
1477 |
+
.b8 114
|
1478 |
+
.b8 105
|
1479 |
+
.b8 116
|
1480 |
+
.b8 111
|
1481 |
+
.b8 110
|
1482 |
+
.b8 95
|
1483 |
+
.b8 95
|
1484 |
+
.b8 48
|
1485 |
+
.b8 100
|
1486 |
+
.b8 49
|
1487 |
+
.b8 100
|
1488 |
+
.b8 50
|
1489 |
+
.b8 100
|
1490 |
+
.b8 51
|
1491 |
+
.b8 100
|
1492 |
+
.b8 52
|
1493 |
+
.b8 100
|
1494 |
+
.b8 53
|
1495 |
+
.b8 100
|
1496 |
+
.b8 54
|
1497 |
+
.b8 100
|
1498 |
+
.b8 55
|
1499 |
+
.b8 100
|
1500 |
+
.b8 101
|
1501 |
+
.b8 56
|
1502 |
+
.b8 0
|
1503 |
+
.b32 0
|
1504 |
+
$L__pubNames_end0:
|
1505 |
+
}
|
1506 |
+
.section .debug_pubtypes
|
1507 |
+
{
|
1508 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1509 |
+
$L__pubTypes_start0:
|
1510 |
+
.b8 2
|
1511 |
+
.b8 0
|
1512 |
+
.b32 .debug_info
|
1513 |
+
.b32 282
|
1514 |
+
.b32 0
|
1515 |
+
$L__pubTypes_end0:
|
1516 |
+
}
|
1517 |
+
.section .debug_loc { }
|
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttgir
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<8x1xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<50257> : tensor<8x1xi64, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<-1> : tensor<8x1xi64, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<8x512xf32, #blocked>
|
8 |
+
%c8_i64 = arith.constant 8 : i64
|
9 |
+
%cst_3 = arith.constant dense<50257> : tensor<1x512xi64, #blocked>
|
10 |
+
%c0_i32 = arith.constant 0 : i32
|
11 |
+
%c512_i32 = arith.constant 512 : i32
|
12 |
+
%c50257_i32 = arith.constant 50257 : i32
|
13 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<8x512xbf16, #blocked>
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.extsi %0 : i32 to i64
|
16 |
+
%2 = arith.muli %1, %c8_i64 : i64
|
17 |
+
%3 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
18 |
+
%4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<8x1xi32, #blocked>
|
19 |
+
%5 = arith.extsi %4 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked>
|
20 |
+
%6 = tt.splat %2 : (i64) -> tensor<8x1xi64, #blocked>
|
21 |
+
%7 = arith.addi %6, %5 : tensor<8x1xi64, #blocked>
|
22 |
+
%8 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
23 |
+
%9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x512xi32, #blocked>
|
24 |
+
%10 = arith.extsi %9 : tensor<1x512xi32, #blocked> to tensor<1x512xi64, #blocked>
|
25 |
+
%11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<8x1x!tt.ptr<i64, 1>, #blocked>
|
26 |
+
%12 = tt.addptr %11, %7 : tensor<8x1x!tt.ptr<i64, 1>, #blocked>, tensor<8x1xi64, #blocked>
|
27 |
+
%13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x1xi64, #blocked>
|
28 |
+
%14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
|
29 |
+
%15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
30 |
+
%16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
|
31 |
+
%17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
32 |
+
%18 = arith.muli %7, %cst_0 : tensor<8x1xi64, #blocked>
|
33 |
+
%19 = tt.broadcast %18 : (tensor<8x1xi64, #blocked>) -> tensor<8x512xi64, #blocked>
|
34 |
+
%20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<8x512x!tt.ptr<f32, 1>, #blocked>
|
35 |
+
%21 = arith.cmpi ne, %13, %cst_1 : tensor<8x1xi64, #blocked>
|
36 |
+
%22 = arith.divf %15, %17 : f32
|
37 |
+
%23 = tt.splat %22 : (f32) -> tensor<8x1xf32, #blocked>
|
38 |
+
%24 = arith.select %21, %23, %cst : tensor<8x1xi1, #blocked>, tensor<8x1xf32, #blocked>
|
39 |
+
%25 = tt.broadcast %24 : (tensor<8x1xf32, #blocked>) -> tensor<8x512xf32, #blocked>
|
40 |
+
%26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 iter_args(%arg10 = %cst_2) -> (tensor<8x512xf32, #blocked>) : i32 {
|
41 |
+
%33 = arith.extsi %arg9 : i32 to i64
|
42 |
+
%34 = tt.splat %33 : (i64) -> tensor<1x512xi64, #blocked>
|
43 |
+
%35 = arith.addi %34, %10 : tensor<1x512xi64, #blocked>
|
44 |
+
%36 = arith.cmpi slt, %35, %cst_3 : tensor<1x512xi64, #blocked>
|
45 |
+
%37 = tt.broadcast %35 : (tensor<1x512xi64, #blocked>) -> tensor<8x512xi64, #blocked>
|
46 |
+
%38 = arith.addi %37, %19 : tensor<8x512xi64, #blocked>
|
47 |
+
%39 = tt.addptr %20, %38 : tensor<8x512x!tt.ptr<f32, 1>, #blocked>, tensor<8x512xi64, #blocked>
|
48 |
+
%40 = tt.broadcast %36 : (tensor<1x512xi1, #blocked>) -> tensor<8x512xi1, #blocked>
|
49 |
+
%41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x512xf32, #blocked>
|
50 |
+
%42 = arith.mulf %41, %25 : tensor<8x512xf32, #blocked>
|
51 |
+
%43 = arith.addf %arg10, %42 : tensor<8x512xf32, #blocked>
|
52 |
+
%44 = arith.select %40, %43, %arg10 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked>
|
53 |
+
scf.yield %44 : tensor<8x512xf32, #blocked>
|
54 |
+
}
|
55 |
+
%27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
|
56 |
+
^bb0(%arg9: f32, %arg10: f32):
|
57 |
+
%33 = arith.addf %arg9, %arg10 : f32
|
58 |
+
tt.reduce.return %33 : f32
|
59 |
+
}) : (tensor<8x512xf32, #blocked>) -> tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
60 |
+
%28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<8x1xf32, #blocked>
|
61 |
+
%29 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
|
62 |
+
%30 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
|
63 |
+
%31 = tt.broadcast %28 : (tensor<8x1xf32, #blocked>) -> tensor<8x512xf32, #blocked>
|
64 |
+
%32 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
|
65 |
+
scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 : i32 {
|
66 |
+
%33 = arith.extsi %arg9 : i32 to i64
|
67 |
+
%34 = tt.splat %33 : (i64) -> tensor<1x512xi64, #blocked>
|
68 |
+
%35 = arith.addi %34, %10 : tensor<1x512xi64, #blocked>
|
69 |
+
%36 = arith.cmpi slt, %35, %cst_3 : tensor<1x512xi64, #blocked>
|
70 |
+
%37 = tt.broadcast %35 : (tensor<1x512xi64, #blocked>) -> tensor<8x512xi64, #blocked>
|
71 |
+
%38 = arith.addi %37, %19 : tensor<8x512xi64, #blocked>
|
72 |
+
%39 = tt.addptr %29, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
|
73 |
+
%40 = tt.broadcast %36 : (tensor<1x512xi1, #blocked>) -> tensor<8x512xi1, #blocked>
|
74 |
+
%41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16, #blocked>
|
75 |
+
%42 = arith.extf %41 : tensor<8x512xbf16, #blocked> to tensor<8x512xf32, #blocked>
|
76 |
+
%43 = tt.addptr %20, %38 : tensor<8x512x!tt.ptr<f32, 1>, #blocked>, tensor<8x512xi64, #blocked>
|
77 |
+
%44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xf32, #blocked>
|
78 |
+
%45 = tt.addptr %30, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
|
79 |
+
%46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16, #blocked>
|
80 |
+
%47 = arith.extf %46 : tensor<8x512xbf16, #blocked> to tensor<8x512xf32, #blocked>
|
81 |
+
%48 = arith.mulf %44, %25 : tensor<8x512xf32, #blocked>
|
82 |
+
%49 = math.exp %47 : tensor<8x512xf32, #blocked>
|
83 |
+
%50 = arith.mulf %49, %31 : tensor<8x512xf32, #blocked>
|
84 |
+
%51 = arith.subf %48, %50 : tensor<8x512xf32, #blocked>
|
85 |
+
%52 = arith.addf %42, %51 : tensor<8x512xf32, #blocked>
|
86 |
+
%53 = tt.addptr %32, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
|
87 |
+
%54 = arith.truncf %52 : tensor<8x512xf32, #blocked> to tensor<8x512xbf16, #blocked>
|
88 |
+
tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<8x512xbf16, #blocked>
|
89 |
+
}
|
90 |
+
tt.return
|
91 |
+
}
|
92 |
+
}
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = shl i32 %3, 1, !dbg !8
|
7 |
+
%5 = and i32 %4, 510, !dbg !8
|
8 |
+
%6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%7 = shl i32 %6, 9, !dbg !10
|
10 |
+
%8 = or i32 %7, %5, !dbg !11
|
11 |
+
%9 = icmp slt i32 %8, 12865792, !dbg !12
|
12 |
+
%10 = sext i32 %8 to i64, !dbg !13
|
13 |
+
%11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !13
|
14 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %11, i1 %9) #1, !dbg !14
|
15 |
+
ret void, !dbg !15
|
16 |
+
}
|
17 |
+
|
18 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
19 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
20 |
+
|
21 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
22 |
+
attributes #1 = { nounwind }
|
23 |
+
|
24 |
+
!llvm.module.flags = !{!0}
|
25 |
+
!llvm.dbg.cu = !{!1}
|
26 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
27 |
+
|
28 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
29 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
30 |
+
!2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y")
|
31 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
32 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 256}
|
33 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
34 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
35 |
+
!7 = !{}
|
36 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
37 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
38 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
39 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
40 |
+
!12 = !DILocation(line: 22, column: 21, scope: !5)
|
41 |
+
!13 = !DILocation(line: 25, column: 25, scope: !5)
|
42 |
+
!14 = !DILocation(line: 25, column: 36, scope: !5)
|
43 |
+
!15 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1de(
|
12 |
+
.param .u64 triton__0d1de_param_0,
|
13 |
+
.param .u32 triton__0d1de_param_1
|
14 |
+
)
|
15 |
+
.maxntid 256, 1, 1
|
16 |
+
{
|
17 |
+
.reg .pred %p<2>;
|
18 |
+
.reg .b32 %r<9>;
|
19 |
+
.reg .b64 %rd<4>;
|
20 |
+
.loc 1 18 0
|
21 |
+
$L__func_begin0:
|
22 |
+
.loc 1 18 0
|
23 |
+
|
24 |
+
ld.param.u64 %rd2, [triton__0d1de_param_0];
|
25 |
+
$L__tmp0:
|
26 |
+
.loc 1 21 36
|
27 |
+
mov.u32 %r4, %tid.x;
|
28 |
+
shl.b32 %r5, %r4, 1;
|
29 |
+
and.b32 %r6, %r5, 510;
|
30 |
+
.loc 1 20 28
|
31 |
+
mov.u32 %r1, %ctaid.x;
|
32 |
+
.loc 1 20 33
|
33 |
+
shl.b32 %r7, %r1, 9;
|
34 |
+
.loc 1 21 23
|
35 |
+
or.b32 %r8, %r7, %r6;
|
36 |
+
.loc 1 22 21
|
37 |
+
setp.lt.s32 %p1, %r8, 12865792;
|
38 |
+
.loc 1 25 25
|
39 |
+
mul.wide.s32 %rd3, %r8, 4;
|
40 |
+
add.s64 %rd1, %rd2, %rd3;
|
41 |
+
mov.b32 %r2, 0;
|
42 |
+
.loc 1 25 36
|
43 |
+
@%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 };
|
44 |
+
.loc 1 25 4
|
45 |
+
ret;
|
46 |
+
$L__tmp1:
|
47 |
+
$L__func_end0:
|
48 |
+
|
49 |
+
}
|
50 |
+
.file 1 "/tmp/torchinductor_root/4y/c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py"
|
51 |
+
.section .debug_abbrev
|
52 |
+
{
|
53 |
+
.b8 1
|
54 |
+
.b8 17
|
55 |
+
.b8 1
|
56 |
+
.b8 37
|
57 |
+
.b8 8
|
58 |
+
.b8 19
|
59 |
+
.b8 5
|
60 |
+
.b8 3
|
61 |
+
.b8 8
|
62 |
+
.b8 16
|
63 |
+
.b8 6
|
64 |
+
.b8 27
|
65 |
+
.b8 8
|
66 |
+
.b8 180
|
67 |
+
.b8 66
|
68 |
+
.b8 12
|
69 |
+
.b8 17
|
70 |
+
.b8 1
|
71 |
+
.b8 18
|
72 |
+
.b8 1
|
73 |
+
.b8 0
|
74 |
+
.b8 0
|
75 |
+
.b8 2
|
76 |
+
.b8 46
|
77 |
+
.b8 0
|
78 |
+
.b8 17
|
79 |
+
.b8 1
|
80 |
+
.b8 18
|
81 |
+
.b8 1
|
82 |
+
.b8 64
|
83 |
+
.b8 10
|
84 |
+
.b8 135
|
85 |
+
.b8 64
|
86 |
+
.b8 8
|
87 |
+
.b8 3
|
88 |
+
.b8 8
|
89 |
+
.b8 58
|
90 |
+
.b8 11
|
91 |
+
.b8 59
|
92 |
+
.b8 11
|
93 |
+
.b8 63
|
94 |
+
.b8 12
|
95 |
+
.b8 0
|
96 |
+
.b8 0
|
97 |
+
.b8 0
|
98 |
+
}
|
99 |
+
.section .debug_info
|
100 |
+
{
|
101 |
+
.b32 172
|
102 |
+
.b8 2
|
103 |
+
.b8 0
|
104 |
+
.b32 .debug_abbrev
|
105 |
+
.b8 8
|
106 |
+
.b8 1
|
107 |
+
.b8 116
|
108 |
+
.b8 114
|
109 |
+
.b8 105
|
110 |
+
.b8 116
|
111 |
+
.b8 111
|
112 |
+
.b8 110
|
113 |
+
.b8 0
|
114 |
+
.b8 2
|
115 |
+
.b8 0
|
116 |
+
.b8 99
|
117 |
+
.b8 52
|
118 |
+
.b8 121
|
119 |
+
.b8 115
|
120 |
+
.b8 101
|
121 |
+
.b8 108
|
122 |
+
.b8 100
|
123 |
+
.b8 119
|
124 |
+
.b8 109
|
125 |
+
.b8 117
|
126 |
+
.b8 51
|
127 |
+
.b8 116
|
128 |
+
.b8 111
|
129 |
+
.b8 53
|
130 |
+
.b8 50
|
131 |
+
.b8 112
|
132 |
+
.b8 98
|
133 |
+
.b8 104
|
134 |
+
.b8 50
|
135 |
+
.b8 109
|
136 |
+
.b8 100
|
137 |
+
.b8 50
|
138 |
+
.b8 111
|
139 |
+
.b8 101
|
140 |
+
.b8 117
|
141 |
+
.b8 102
|
142 |
+
.b8 114
|
143 |
+
.b8 113
|
144 |
+
.b8 51
|
145 |
+
.b8 102
|
146 |
+
.b8 99
|
147 |
+
.b8 100
|
148 |
+
.b8 109
|
149 |
+
.b8 97
|
150 |
+
.b8 112
|
151 |
+
.b8 107
|
152 |
+
.b8 116
|
153 |
+
.b8 52
|
154 |
+
.b8 110
|
155 |
+
.b8 120
|
156 |
+
.b8 100
|
157 |
+
.b8 122
|
158 |
+
.b8 109
|
159 |
+
.b8 121
|
160 |
+
.b8 113
|
161 |
+
.b8 116
|
162 |
+
.b8 103
|
163 |
+
.b8 100
|
164 |
+
.b8 50
|
165 |
+
.b8 121
|
166 |
+
.b8 115
|
167 |
+
.b8 112
|
168 |
+
.b8 46
|
169 |
+
.b8 112
|
170 |
+
.b8 121
|
171 |
+
.b8 0
|
172 |
+
.b32 .debug_line
|
173 |
+
.b8 47
|
174 |
+
.b8 116
|
175 |
+
.b8 109
|
176 |
+
.b8 112
|
177 |
+
.b8 47
|
178 |
+
.b8 116
|
179 |
+
.b8 111
|
180 |
+
.b8 114
|
181 |
+
.b8 99
|
182 |
+
.b8 104
|
183 |
+
.b8 105
|
184 |
+
.b8 110
|
185 |
+
.b8 100
|
186 |
+
.b8 117
|
187 |
+
.b8 99
|
188 |
+
.b8 116
|
189 |
+
.b8 111
|
190 |
+
.b8 114
|
191 |
+
.b8 95
|
192 |
+
.b8 114
|
193 |
+
.b8 111
|
194 |
+
.b8 111
|
195 |
+
.b8 116
|
196 |
+
.b8 47
|
197 |
+
.b8 52
|
198 |
+
.b8 121
|
199 |
+
.b8 0
|
200 |
+
.b8 1
|
201 |
+
.b64 $L__func_begin0
|
202 |
+
.b64 $L__func_end0
|
203 |
+
.b8 2
|
204 |
+
.b64 $L__func_begin0
|
205 |
+
.b64 $L__func_end0
|
206 |
+
.b8 1
|
207 |
+
.b8 156
|
208 |
+
.b8 116
|
209 |
+
.b8 114
|
210 |
+
.b8 105
|
211 |
+
.b8 116
|
212 |
+
.b8 111
|
213 |
+
.b8 110
|
214 |
+
.b8 95
|
215 |
+
.b8 95
|
216 |
+
.b8 48
|
217 |
+
.b8 100
|
218 |
+
.b8 49
|
219 |
+
.b8 100
|
220 |
+
.b8 101
|
221 |
+
.b8 0
|
222 |
+
.b8 116
|
223 |
+
.b8 114
|
224 |
+
.b8 105
|
225 |
+
.b8 116
|
226 |
+
.b8 111
|
227 |
+
.b8 110
|
228 |
+
.b8 95
|
229 |
+
.b8 95
|
230 |
+
.b8 48
|
231 |
+
.b8 100
|
232 |
+
.b8 49
|
233 |
+
.b8 100
|
234 |
+
.b8 101
|
235 |
+
.b8 0
|
236 |
+
.b8 1
|
237 |
+
.b8 18
|
238 |
+
.b8 1
|
239 |
+
.b8 0
|
240 |
+
}
|
241 |
+
.section .debug_pubnames
|
242 |
+
{
|
243 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
244 |
+
$L__pubNames_start0:
|
245 |
+
.b8 2
|
246 |
+
.b8 0
|
247 |
+
.b32 .debug_info
|
248 |
+
.b32 176
|
249 |
+
.b32 125
|
250 |
+
.b8 116
|
251 |
+
.b8 114
|
252 |
+
.b8 105
|
253 |
+
.b8 116
|
254 |
+
.b8 111
|
255 |
+
.b8 110
|
256 |
+
.b8 95
|
257 |
+
.b8 95
|
258 |
+
.b8 48
|
259 |
+
.b8 100
|
260 |
+
.b8 49
|
261 |
+
.b8 100
|
262 |
+
.b8 101
|
263 |
+
.b8 0
|
264 |
+
.b32 0
|
265 |
+
$L__pubNames_end0:
|
266 |
+
}
|
267 |
+
.section .debug_pubtypes
|
268 |
+
{
|
269 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
270 |
+
$L__pubTypes_start0:
|
271 |
+
.b8 2
|
272 |
+
.b8 0
|
273 |
+
.b32 .debug_info
|
274 |
+
.b32 176
|
275 |
+
.b32 0
|
276 |
+
$L__pubTypes_end0:
|
277 |
+
}
|
278 |
+
.section .debug_loc { }
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<12865792> : tensor<512xi32, #blocked>
|
5 |
+
%c512_i32 = arith.constant 512 : i32
|
6 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
|
7 |
+
%0 = tt.get_program_id x : i32
|
8 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
9 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
10 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
|
11 |
+
%4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
|
12 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked>
|
13 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
|
14 |
+
%7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
|
15 |
+
tt.store %7, %cst_0, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<512xf32>
|
4 |
+
%cst_0 = arith.constant dense<12865792> : tensor<512xi32>
|
5 |
+
%c512_i32 = arith.constant 512 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<512xi32>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst_0 : tensor<512xi32>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
|
14 |
+
tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
|
15 |
+
tt.return
|
16 |
+
}
|
17 |
+
}
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin
ADDED
Binary file (15 kB). View file
|
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6d7de8de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8) local_unnamed_addr !dbg !5 {
|
7 |
+
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%11 = and i32 %10, 31, !dbg !8
|
9 |
+
%12 = lshr i32 %10, 5, !dbg !8
|
10 |
+
%13 = and i32 %12, 1, !dbg !8
|
11 |
+
%urem = shl i32 %10, 2, !dbg !8
|
12 |
+
%14 = and i32 %urem, 252, !dbg !8
|
13 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
|
14 |
+
%16 = shl i32 %15, 8, !dbg !10
|
15 |
+
%17 = or i32 %16, %14, !dbg !11
|
16 |
+
%18 = sext i32 %17 to i64, !dbg !12
|
17 |
+
%19 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !12
|
18 |
+
%20 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %19, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
|
19 |
+
%21 = extractvalue { i32, i32 } %20, 0, !dbg !13
|
20 |
+
%22 = extractvalue { i32, i32 } %20, 1, !dbg !13
|
21 |
+
%23 = trunc i32 %21 to i16, !dbg !13
|
22 |
+
%extelt.offset = lshr i32 %21, 16, !dbg !13
|
23 |
+
%24 = trunc i32 %extelt.offset to i16, !dbg !13
|
24 |
+
%25 = trunc i32 %22 to i16, !dbg !13
|
25 |
+
%extelt.offset1 = lshr i32 %22, 16, !dbg !13
|
26 |
+
%26 = trunc i32 %extelt.offset1 to i16, !dbg !13
|
27 |
+
%27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
|
28 |
+
%28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
|
29 |
+
%29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
|
30 |
+
%30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #3, !dbg !14
|
31 |
+
%31 = zext nneg i32 %14 to i64, !dbg !15
|
32 |
+
%32 = getelementptr float, ptr addrspace(1) %2, i64 %31, !dbg !15
|
33 |
+
%33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
|
34 |
+
%34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !16
|
35 |
+
%35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !16
|
36 |
+
%36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !16
|
37 |
+
%37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !16
|
38 |
+
%38 = bitcast i32 %34 to float, !dbg !16
|
39 |
+
%39 = bitcast i32 %35 to float, !dbg !16
|
40 |
+
%40 = bitcast i32 %36 to float, !dbg !16
|
41 |
+
%41 = bitcast i32 %37 to float, !dbg !16
|
42 |
+
%42 = getelementptr float, ptr addrspace(1) %3, i64 %18, !dbg !17
|
43 |
+
%43 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
|
44 |
+
%44 = extractvalue { i32, i32, i32, i32 } %43, 0, !dbg !18
|
45 |
+
%45 = extractvalue { i32, i32, i32, i32 } %43, 1, !dbg !18
|
46 |
+
%46 = extractvalue { i32, i32, i32, i32 } %43, 2, !dbg !18
|
47 |
+
%47 = extractvalue { i32, i32, i32, i32 } %43, 3, !dbg !18
|
48 |
+
%48 = bitcast i32 %44 to float, !dbg !18
|
49 |
+
%49 = bitcast i32 %45 to float, !dbg !18
|
50 |
+
%50 = bitcast i32 %46 to float, !dbg !18
|
51 |
+
%51 = bitcast i32 %47 to float, !dbg !18
|
52 |
+
%52 = sext i32 %15 to i64, !dbg !19
|
53 |
+
%53 = getelementptr float, ptr addrspace(1) %4, i64 %52, !dbg !19
|
54 |
+
%54 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
|
55 |
+
%55 = bitcast i32 %54 to float, !dbg !20
|
56 |
+
%56 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
|
57 |
+
%57 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
|
58 |
+
%58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
|
59 |
+
%59 = getelementptr float, ptr addrspace(1) %5, i64 %52, !dbg !21
|
60 |
+
%60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
|
61 |
+
%61 = bitcast i32 %60 to float, !dbg !22
|
62 |
+
%62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
|
63 |
+
%63 = bitcast i32 %62 to float, !dbg !22
|
64 |
+
%64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
|
65 |
+
%65 = bitcast i32 %64 to float, !dbg !22
|
66 |
+
%66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
|
67 |
+
%67 = bitcast i32 %66 to float, !dbg !22
|
68 |
+
%68 = getelementptr float, ptr addrspace(1) %0, i64 %18, !dbg !23
|
69 |
+
%69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %68, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24
|
70 |
+
%70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !24
|
71 |
+
%71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !24
|
72 |
+
%72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !24
|
73 |
+
%73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !24
|
74 |
+
%74 = bitcast i32 %70 to float, !dbg !24
|
75 |
+
%75 = bitcast i32 %71 to float, !dbg !24
|
76 |
+
%76 = bitcast i32 %72 to float, !dbg !24
|
77 |
+
%77 = bitcast i32 %73 to float, !dbg !24
|
78 |
+
%78 = fmul float %27, %38, !dbg !25
|
79 |
+
%79 = fmul float %28, %39, !dbg !25
|
80 |
+
%80 = fmul float %29, %40, !dbg !25
|
81 |
+
%81 = fmul float %30, %41, !dbg !25
|
82 |
+
%82 = fadd float %78, %79, !dbg !26
|
83 |
+
%83 = fadd float %80, %82, !dbg !26
|
84 |
+
%84 = fadd float %81, %83, !dbg !26
|
85 |
+
%85 = bitcast float %84 to i32, !dbg !32
|
86 |
+
%86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !32
|
87 |
+
%87 = bitcast i32 %86 to float, !dbg !32
|
88 |
+
%88 = fadd float %84, %87, !dbg !26
|
89 |
+
%89 = bitcast float %88 to i32, !dbg !32
|
90 |
+
%90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !32
|
91 |
+
%91 = bitcast i32 %90 to float, !dbg !32
|
92 |
+
%92 = fadd float %88, %91, !dbg !26
|
93 |
+
%93 = bitcast float %92 to i32, !dbg !32
|
94 |
+
%94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !32
|
95 |
+
%95 = bitcast i32 %94 to float, !dbg !32
|
96 |
+
%96 = fadd float %92, %95, !dbg !26
|
97 |
+
%97 = bitcast float %96 to i32, !dbg !32
|
98 |
+
%98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !32
|
99 |
+
%99 = bitcast i32 %98 to float, !dbg !32
|
100 |
+
%100 = fadd float %96, %99, !dbg !26
|
101 |
+
%101 = bitcast float %100 to i32, !dbg !32
|
102 |
+
%102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !32
|
103 |
+
%103 = bitcast i32 %102 to float, !dbg !32
|
104 |
+
%104 = fadd float %100, %103, !dbg !26
|
105 |
+
%105 = icmp eq i32 %11, 0, !dbg !32
|
106 |
+
%106 = zext nneg i32 %13 to i64, !dbg !32
|
107 |
+
%107 = getelementptr float, ptr addrspace(3) @global_smem, i64 %106, !dbg !32
|
108 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %104, i1 %105) #3, !dbg !32
|
109 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
110 |
+
%108 = icmp slt i32 %10, 2, !dbg !32
|
111 |
+
%109 = sext i32 %10 to i64, !dbg !32
|
112 |
+
%110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !32
|
113 |
+
%111 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !32
|
114 |
+
%112 = bitcast float %111 to i32, !dbg !32
|
115 |
+
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !32
|
116 |
+
%114 = bitcast i32 %113 to float, !dbg !32
|
117 |
+
%115 = fadd float %111, %114, !dbg !26
|
118 |
+
%116 = and i32 %10, 1, !dbg !32
|
119 |
+
%117 = icmp eq i32 %116, 0, !dbg !32
|
120 |
+
%118 = and i1 %108, %117, !dbg !32
|
121 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %115, i1 %118) #3, !dbg !32
|
122 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
123 |
+
%119 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
|
124 |
+
%120 = fadd float %119, 0.000000e+00, !dbg !34
|
125 |
+
%121 = fsub float %48, %55, !dbg !38
|
126 |
+
%122 = fsub float %49, %55, !dbg !38
|
127 |
+
%123 = fsub float %50, %55, !dbg !38
|
128 |
+
%124 = fsub float %51, %55, !dbg !38
|
129 |
+
%125 = fmul float %121, %61, !dbg !39
|
130 |
+
%126 = fmul float %122, %61, !dbg !39
|
131 |
+
%127 = fmul float %123, %61, !dbg !39
|
132 |
+
%128 = fmul float %124, %61, !dbg !39
|
133 |
+
%129 = fmul float %78, %125, !dbg !40
|
134 |
+
%130 = fmul float %79, %126, !dbg !40
|
135 |
+
%131 = fmul float %80, %127, !dbg !40
|
136 |
+
%132 = fmul float %81, %128, !dbg !40
|
137 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
138 |
+
%133 = fadd float %129, %130, !dbg !43
|
139 |
+
%134 = fadd float %131, %133, !dbg !43
|
140 |
+
%135 = fadd float %132, %134, !dbg !43
|
141 |
+
%136 = bitcast float %135 to i32, !dbg !41
|
142 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !41
|
143 |
+
%138 = bitcast i32 %137 to float, !dbg !41
|
144 |
+
%139 = fadd float %135, %138, !dbg !43
|
145 |
+
%140 = bitcast float %139 to i32, !dbg !41
|
146 |
+
%141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !41
|
147 |
+
%142 = bitcast i32 %141 to float, !dbg !41
|
148 |
+
%143 = fadd float %139, %142, !dbg !43
|
149 |
+
%144 = bitcast float %143 to i32, !dbg !41
|
150 |
+
%145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !41
|
151 |
+
%146 = bitcast i32 %145 to float, !dbg !41
|
152 |
+
%147 = fadd float %143, %146, !dbg !43
|
153 |
+
%148 = bitcast float %147 to i32, !dbg !41
|
154 |
+
%149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !41
|
155 |
+
%150 = bitcast i32 %149 to float, !dbg !41
|
156 |
+
%151 = fadd float %147, %150, !dbg !43
|
157 |
+
%152 = bitcast float %151 to i32, !dbg !41
|
158 |
+
%153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !41
|
159 |
+
%154 = bitcast i32 %153 to float, !dbg !41
|
160 |
+
%155 = fadd float %151, %154, !dbg !43
|
161 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %155, i1 %105) #3, !dbg !41
|
162 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
163 |
+
%156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !41
|
164 |
+
%157 = bitcast float %156 to i32, !dbg !41
|
165 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !41
|
166 |
+
%159 = bitcast i32 %158 to float, !dbg !41
|
167 |
+
%160 = fadd float %156, %159, !dbg !43
|
168 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %160, i1 %118) #3, !dbg !41
|
169 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
170 |
+
%161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
|
171 |
+
%162 = fadd float %161, 0.000000e+00, !dbg !46
|
172 |
+
%163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %61, float 2.560000e+02) #3, !dbg !48
|
173 |
+
%164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %63, float 2.560000e+02) #3, !dbg !48
|
174 |
+
%165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %65, float 2.560000e+02) #3, !dbg !48
|
175 |
+
%166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %67, float 2.560000e+02) #3, !dbg !48
|
176 |
+
%167 = fmul float %78, 2.560000e+02, !dbg !49
|
177 |
+
%168 = fmul float %79, 2.560000e+02, !dbg !49
|
178 |
+
%169 = fmul float %80, 2.560000e+02, !dbg !49
|
179 |
+
%170 = fmul float %81, 2.560000e+02, !dbg !49
|
180 |
+
%171 = fsub float %167, %120, !dbg !50
|
181 |
+
%172 = fsub float %168, %120, !dbg !50
|
182 |
+
%173 = fsub float %169, %120, !dbg !50
|
183 |
+
%174 = fsub float %170, %120, !dbg !50
|
184 |
+
%175 = fmul float %125, %162, !dbg !51
|
185 |
+
%176 = fmul float %126, %162, !dbg !51
|
186 |
+
%177 = fmul float %127, %162, !dbg !51
|
187 |
+
%178 = fmul float %128, %162, !dbg !51
|
188 |
+
%179 = fsub float %171, %175, !dbg !52
|
189 |
+
%180 = fsub float %172, %176, !dbg !52
|
190 |
+
%181 = fsub float %173, %177, !dbg !52
|
191 |
+
%182 = fsub float %174, %178, !dbg !52
|
192 |
+
%183 = fmul float %163, %179, !dbg !53
|
193 |
+
%184 = fmul float %163, %180, !dbg !53
|
194 |
+
%185 = fmul float %163, %181, !dbg !53
|
195 |
+
%186 = fmul float %163, %182, !dbg !53
|
196 |
+
%187 = fadd float %183, %74, !dbg !54
|
197 |
+
%188 = fadd float %184, %75, !dbg !54
|
198 |
+
%189 = fadd float %185, %76, !dbg !54
|
199 |
+
%190 = fadd float %186, %77, !dbg !54
|
200 |
+
%191 = bitcast float %187 to i32, !dbg !55
|
201 |
+
%192 = bitcast float %188 to i32, !dbg !55
|
202 |
+
%193 = bitcast float %189 to i32, !dbg !55
|
203 |
+
%194 = bitcast float %190 to i32, !dbg !55
|
204 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %191, i32 %192, i32 %193, i32 %194, ptr addrspace(1) %68, i1 true) #3, !dbg !55
|
205 |
+
%195 = getelementptr i16, ptr addrspace(1) %6, i64 %18, !dbg !56
|
206 |
+
%196 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %187) #3, !dbg !57
|
207 |
+
%197 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %188) #3, !dbg !57
|
208 |
+
%198 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %189) #3, !dbg !57
|
209 |
+
%199 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %190) #3, !dbg !57
|
210 |
+
%200 = insertelement <2 x i16> undef, i16 %196, i64 0, !dbg !57
|
211 |
+
%201 = insertelement <2 x i16> %200, i16 %197, i64 1, !dbg !57
|
212 |
+
%202 = bitcast <2 x i16> %201 to i32, !dbg !57
|
213 |
+
%203 = insertelement <2 x i16> undef, i16 %198, i64 0, !dbg !57
|
214 |
+
%204 = insertelement <2 x i16> %203, i16 %199, i64 1, !dbg !57
|
215 |
+
%205 = bitcast <2 x i16> %204 to i32, !dbg !57
|
216 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %202, i32 %205, ptr addrspace(1) %195, i1 true) #3, !dbg !57
|
217 |
+
ret void, !dbg !58
|
218 |
+
}
|
219 |
+
|
220 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
221 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
222 |
+
|
223 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
224 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
225 |
+
|
226 |
+
; Function Attrs: convergent nocallback nounwind
|
227 |
+
declare void @llvm.nvvm.barrier0() #2
|
228 |
+
|
229 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
230 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
231 |
+
attributes #2 = { convergent nocallback nounwind }
|
232 |
+
attributes #3 = { nounwind }
|
233 |
+
|
234 |
+
!llvm.module.flags = !{!0}
|
235 |
+
!llvm.dbg.cu = !{!1}
|
236 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
237 |
+
|
238 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
239 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
240 |
+
!2 = !DIFile(filename: "csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py", directory: "/tmp/torchinductor_root/sn")
|
241 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"kernel", i32 1}
|
242 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"maxntidx", i32 64}
|
243 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8de", linkageName: "triton__0d1d2d3d4d5d6d7de8de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
244 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
245 |
+
!7 = !{}
|
246 |
+
!8 = !DILocation(line: 26, column: 26, scope: !5)
|
247 |
+
!9 = !DILocation(line: 23, column: 28, scope: !5)
|
248 |
+
!10 = !DILocation(line: 30, column: 40, scope: !5)
|
249 |
+
!11 = !DILocation(line: 30, column: 36, scope: !5)
|
250 |
+
!12 = !DILocation(line: 30, column: 30, scope: !5)
|
251 |
+
!13 = !DILocation(line: 30, column: 46, scope: !5)
|
252 |
+
!14 = !DILocation(line: 30, column: 67, scope: !5)
|
253 |
+
!15 = !DILocation(line: 31, column: 30, scope: !5)
|
254 |
+
!16 = !DILocation(line: 31, column: 35, scope: !5)
|
255 |
+
!17 = !DILocation(line: 32, column: 30, scope: !5)
|
256 |
+
!18 = !DILocation(line: 32, column: 46, scope: !5)
|
257 |
+
!19 = !DILocation(line: 33, column: 30, scope: !5)
|
258 |
+
!20 = !DILocation(line: 33, column: 35, scope: !5)
|
259 |
+
!21 = !DILocation(line: 34, column: 31, scope: !5)
|
260 |
+
!22 = !DILocation(line: 34, column: 36, scope: !5)
|
261 |
+
!23 = !DILocation(line: 35, column: 35, scope: !5)
|
262 |
+
!24 = !DILocation(line: 35, column: 51, scope: !5)
|
263 |
+
!25 = !DILocation(line: 37, column: 18, scope: !5)
|
264 |
+
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
|
265 |
+
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
|
266 |
+
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
267 |
+
!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
|
268 |
+
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
|
269 |
+
!31 = !DILocation(line: 40, column: 57, scope: !27)
|
270 |
+
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
|
271 |
+
!33 = !DILocation(line: 40, column: 57, scope: !29)
|
272 |
+
!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
|
273 |
+
!35 = distinct !DILexicalBlockFile(scope: !5, file: !36, discriminator: 0)
|
274 |
+
!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
275 |
+
!37 = !DILocation(line: 40, column: 44, scope: !35)
|
276 |
+
!38 = !DILocation(line: 41, column: 19, scope: !5)
|
277 |
+
!39 = !DILocation(line: 42, column: 20, scope: !5)
|
278 |
+
!40 = !DILocation(line: 43, column: 19, scope: !5)
|
279 |
+
!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
|
280 |
+
!42 = !DILocation(line: 46, column: 59, scope: !29)
|
281 |
+
!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
|
282 |
+
!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
|
283 |
+
!45 = !DILocation(line: 46, column: 59, scope: !27)
|
284 |
+
!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
|
285 |
+
!47 = !DILocation(line: 46, column: 45, scope: !35)
|
286 |
+
!48 = !DILocation(line: 48, column: 20, scope: !5)
|
287 |
+
!49 = !DILocation(line: 49, column: 19, scope: !5)
|
288 |
+
!50 = !DILocation(line: 50, column: 20, scope: !5)
|
289 |
+
!51 = !DILocation(line: 51, column: 20, scope: !5)
|
290 |
+
!52 = !DILocation(line: 52, column: 20, scope: !5)
|
291 |
+
!53 = !DILocation(line: 53, column: 20, scope: !5)
|
292 |
+
!54 = !DILocation(line: 54, column: 20, scope: !5)
|
293 |
+
!55 = !DILocation(line: 56, column: 51, scope: !5)
|
294 |
+
!56 = !DILocation(line: 57, column: 25, scope: !5)
|
295 |
+
!57 = !DILocation(line: 57, column: 48, scope: !5)
|
296 |
+
!58 = !DILocation(line: 57, column: 4, scope: !5)
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant 0.000000e+00 : f32
|
7 |
+
%c256_i32 = arith.constant 256 : i32
|
8 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
20 |
+
%9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
21 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
22 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
27 |
+
%16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
28 |
+
%17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
29 |
+
%18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
30 |
+
%19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
31 |
+
%20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
32 |
+
%21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
33 |
+
%22 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
34 |
+
%23 = tt.addptr %22, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
35 |
+
%24 = tt.load %23, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
36 |
+
%25 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
|
37 |
+
%26 = arith.select %2, %25, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
38 |
+
%27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({
|
39 |
+
^bb0(%arg9: f32, %arg10: f32):
|
40 |
+
%50 = arith.addf %arg9, %arg10 : f32
|
41 |
+
tt.reduce.return %50 : f32
|
42 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
43 |
+
%28 = arith.addf %27, %cst_1 : f32
|
44 |
+
%29 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
45 |
+
%30 = arith.subf %15, %29 : tensor<256xf32, #blocked>
|
46 |
+
%31 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
47 |
+
%32 = arith.mulf %30, %31 : tensor<256xf32, #blocked>
|
48 |
+
%33 = arith.mulf %25, %32 : tensor<256xf32, #blocked>
|
49 |
+
%34 = arith.select %2, %33, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
50 |
+
%35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
|
51 |
+
^bb0(%arg9: f32, %arg10: f32):
|
52 |
+
%50 = arith.addf %arg9, %arg10 : f32
|
53 |
+
tt.reduce.return %50 : f32
|
54 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
55 |
+
%36 = arith.addf %35, %cst_1 : f32
|
56 |
+
%37 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked>
|
57 |
+
%38 = arith.mulf %25, %cst_3 : tensor<256xf32, #blocked>
|
58 |
+
%39 = tt.splat %28 : (f32) -> tensor<256xf32, #blocked>
|
59 |
+
%40 = arith.subf %38, %39 : tensor<256xf32, #blocked>
|
60 |
+
%41 = tt.splat %36 : (f32) -> tensor<256xf32, #blocked>
|
61 |
+
%42 = arith.mulf %32, %41 : tensor<256xf32, #blocked>
|
62 |
+
%43 = arith.subf %40, %42 : tensor<256xf32, #blocked>
|
63 |
+
%44 = tt.broadcast %37 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
64 |
+
%45 = arith.mulf %44, %43 : tensor<256xf32, #blocked>
|
65 |
+
%46 = arith.addf %24, %45 : tensor<256xf32, #blocked>
|
66 |
+
tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
67 |
+
%47 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
68 |
+
%48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
69 |
+
%49 = arith.truncf %46 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
70 |
+
tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
71 |
+
tt.return
|
72 |
+
}
|
73 |
+
}
|
.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ptx
ADDED
@@ -0,0 +1,1927 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7de8
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7de8(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
|
21 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
|
22 |
+
)
|
23 |
+
.maxntid 256, 1, 1
|
24 |
+
{
|
25 |
+
.reg .pred %p<201>;
|
26 |
+
.reg .b16 %rs<129>;
|
27 |
+
.reg .b32 %r<399>;
|
28 |
+
.reg .f32 %f<469>;
|
29 |
+
.reg .b64 %rd<150>;
|
30 |
+
.loc 1 18 0
|
31 |
+
$L__func_begin0:
|
32 |
+
.loc 1 18 0
|
33 |
+
|
34 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8_param_6];
|
35 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8_param_5];
|
36 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8_param_4];
|
37 |
+
ld.param.u64 %rd52, [triton__0d1d2d3d4d5d6d7de8_param_0];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 22 44
|
40 |
+
mov.u32 %r1, %tid.x;
|
41 |
+
ld.param.u64 %rd53, [triton__0d1d2d3d4d5d6d7de8_param_1];
|
42 |
+
and.b32 %r2, %r1, 31;
|
43 |
+
ld.param.u64 %rd50, [triton__0d1d2d3d4d5d6d7de8_param_2];
|
44 |
+
ld.param.u64 %rd51, [triton__0d1d2d3d4d5d6d7de8_param_3];
|
45 |
+
bfe.u32 %r30, %r1, 6, 2;
|
46 |
+
or.b32 %r3, %r30, 4;
|
47 |
+
or.b32 %r4, %r30, 8;
|
48 |
+
or.b32 %r5, %r30, 12;
|
49 |
+
or.b32 %r6, %r30, 16;
|
50 |
+
or.b32 %r7, %r30, 20;
|
51 |
+
or.b32 %r8, %r30, 24;
|
52 |
+
or.b32 %r9, %r30, 28;
|
53 |
+
or.b32 %r10, %r30, 32;
|
54 |
+
or.b32 %r11, %r30, 36;
|
55 |
+
or.b32 %r12, %r30, 40;
|
56 |
+
or.b32 %r13, %r30, 44;
|
57 |
+
or.b32 %r14, %r30, 48;
|
58 |
+
or.b32 %r15, %r30, 52;
|
59 |
+
or.b32 %r16, %r30, 56;
|
60 |
+
or.b32 %r17, %r30, 60;
|
61 |
+
.loc 1 24 33
|
62 |
+
and.b32 %r18, %r1, 63;
|
63 |
+
.loc 1 21 28
|
64 |
+
mov.u32 %r23, %ctaid.x;
|
65 |
+
.loc 1 21 34
|
66 |
+
cvt.s64.s32 %rd1, %r23;
|
67 |
+
.loc 1 21 46
|
68 |
+
mul.wide.s32 %rd54, %r23, 64;
|
69 |
+
cvt.u64.u32 %rd2, %r30;
|
70 |
+
.loc 1 22 23
|
71 |
+
or.b64 %rd55, %rd54, %rd2;
|
72 |
+
.loc 1 26 30
|
73 |
+
shl.b64 %rd56, %rd55, 3;
|
74 |
+
add.s64 %rd19, %rd53, %rd56;
|
75 |
+
add.s64 %rd21, %rd19, 32;
|
76 |
+
add.s64 %rd23, %rd19, 64;
|
77 |
+
add.s64 %rd25, %rd19, 96;
|
78 |
+
add.s64 %rd27, %rd19, 128;
|
79 |
+
add.s64 %rd29, %rd19, 160;
|
80 |
+
add.s64 %rd31, %rd19, 192;
|
81 |
+
add.s64 %rd33, %rd19, 224;
|
82 |
+
add.s64 %rd35, %rd19, 256;
|
83 |
+
add.s64 %rd37, %rd19, 288;
|
84 |
+
add.s64 %rd39, %rd19, 320;
|
85 |
+
add.s64 %rd41, %rd19, 352;
|
86 |
+
add.s64 %rd43, %rd19, 384;
|
87 |
+
add.s64 %rd45, %rd19, 416;
|
88 |
+
add.s64 %rd47, %rd19, 448;
|
89 |
+
add.s64 %rd49, %rd19, 480;
|
90 |
+
mov.pred %p1, -1;
|
91 |
+
.loc 1 26 35
|
92 |
+
mov.u64 %rd18, 0x0;
|
93 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
|
94 |
+
mov.u64 %rd20, 0x0;
|
95 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
|
96 |
+
mov.u64 %rd22, 0x0;
|
97 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd23 + 0 ];
|
98 |
+
mov.u64 %rd24, 0x0;
|
99 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd25 + 0 ];
|
100 |
+
mov.u64 %rd26, 0x0;
|
101 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd27 + 0 ];
|
102 |
+
mov.u64 %rd28, 0x0;
|
103 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd29 + 0 ];
|
104 |
+
mov.u64 %rd30, 0x0;
|
105 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd30 }, [ %rd31 + 0 ];
|
106 |
+
mov.u64 %rd32, 0x0;
|
107 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd32 }, [ %rd33 + 0 ];
|
108 |
+
mov.u64 %rd34, 0x0;
|
109 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd34 }, [ %rd35 + 0 ];
|
110 |
+
mov.u64 %rd36, 0x0;
|
111 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd36 }, [ %rd37 + 0 ];
|
112 |
+
mov.u64 %rd38, 0x0;
|
113 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd38 }, [ %rd39 + 0 ];
|
114 |
+
mov.u64 %rd40, 0x0;
|
115 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd41 + 0 ];
|
116 |
+
mov.u64 %rd42, 0x0;
|
117 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd43 + 0 ];
|
118 |
+
mov.u64 %rd44, 0x0;
|
119 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd45 + 0 ];
|
120 |
+
mov.u64 %rd46, 0x0;
|
121 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
|
122 |
+
mov.u64 %rd48, 0x0;
|
123 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd48 }, [ %rd49 + 0 ];
|
124 |
+
.loc 1 27 19
|
125 |
+
mov.u32 %r27, 0x0;
|
126 |
+
@%p1 ld.global.b32 { %r27 }, [ %rd50 + 0 ];
|
127 |
+
.loc 1 29 19
|
128 |
+
mov.u32 %r28, 0x0;
|
129 |
+
@%p1 ld.global.b32 { %r28 }, [ %rd51 + 0 ];
|
130 |
+
.loc 1 38 23
|
131 |
+
setp.eq.s64 %p19, %rd18, -1;
|
132 |
+
setp.eq.s64 %p20, %rd20, -1;
|
133 |
+
setp.eq.s64 %p21, %rd22, -1;
|
134 |
+
setp.eq.s64 %p22, %rd24, -1;
|
135 |
+
setp.eq.s64 %p23, %rd26, -1;
|
136 |
+
setp.eq.s64 %p24, %rd28, -1;
|
137 |
+
setp.eq.s64 %p25, %rd30, -1;
|
138 |
+
setp.eq.s64 %p26, %rd32, -1;
|
139 |
+
setp.eq.s64 %p27, %rd34, -1;
|
140 |
+
setp.eq.s64 %p28, %rd36, -1;
|
141 |
+
setp.eq.s64 %p29, %rd38, -1;
|
142 |
+
setp.eq.s64 %p30, %rd40, -1;
|
143 |
+
setp.eq.s64 %p31, %rd42, -1;
|
144 |
+
setp.eq.s64 %p32, %rd44, -1;
|
145 |
+
setp.eq.s64 %p33, %rd46, -1;
|
146 |
+
setp.eq.s64 %p34, %rd48, -1;
|
147 |
+
.loc 1 39 22
|
148 |
+
div.full.f32 %r26, %r27, %r28;
|
149 |
+
mov.b32 %f97, %r26;
|
150 |
+
.loc 1 41 37
|
151 |
+
selp.f32 %f16, 0f00000000, %f97, %p34;
|
152 |
+
selp.f32 %f15, 0f00000000, %f97, %p33;
|
153 |
+
selp.f32 %f14, 0f00000000, %f97, %p32;
|
154 |
+
selp.f32 %f13, 0f00000000, %f97, %p31;
|
155 |
+
selp.f32 %f12, 0f00000000, %f97, %p30;
|
156 |
+
selp.f32 %f11, 0f00000000, %f97, %p29;
|
157 |
+
selp.f32 %f10, 0f00000000, %f97, %p28;
|
158 |
+
selp.f32 %f9, 0f00000000, %f97, %p27;
|
159 |
+
selp.f32 %f8, 0f00000000, %f97, %p26;
|
160 |
+
selp.f32 %f7, 0f00000000, %f97, %p25;
|
161 |
+
selp.f32 %f6, 0f00000000, %f97, %p24;
|
162 |
+
selp.f32 %f5, 0f00000000, %f97, %p23;
|
163 |
+
selp.f32 %f4, 0f00000000, %f97, %p22;
|
164 |
+
selp.f32 %f3, 0f00000000, %f97, %p21;
|
165 |
+
selp.f32 %f2, 0f00000000, %f97, %p20;
|
166 |
+
selp.f32 %f1, 0f00000000, %f97, %p19;
|
167 |
+
.loc 1 32 36
|
168 |
+
mul.wide.s32 %rd57, %r23, 12865792;
|
169 |
+
mul.wide.u32 %rd58, %r30, 201028;
|
170 |
+
add.s64 %rd59, %rd57, %rd58;
|
171 |
+
cvt.u64.u32 %rd60, %r1;
|
172 |
+
and.b64 %rd3, %rd60, 63;
|
173 |
+
mul.wide.u32 %rd61, %r18, 4;
|
174 |
+
add.s64 %rd62, %rd59, %rd61;
|
175 |
+
add.s64 %rd63, %rd62, %rd52;
|
176 |
+
add.s64 %rd149, %rd63, 12061680;
|
177 |
+
mov.f32 %f453, 0f00000000;
|
178 |
+
mov.b32 %r397, -64;
|
179 |
+
mov.u64 %rd147, %rd149;
|
180 |
+
mov.f32 %f454, %f453;
|
181 |
+
mov.f32 %f455, %f453;
|
182 |
+
mov.f32 %f456, %f453;
|
183 |
+
mov.f32 %f457, %f453;
|
184 |
+
mov.f32 %f458, %f453;
|
185 |
+
mov.f32 %f459, %f453;
|
186 |
+
mov.f32 %f460, %f453;
|
187 |
+
mov.f32 %f461, %f453;
|
188 |
+
mov.f32 %f462, %f453;
|
189 |
+
mov.f32 %f463, %f453;
|
190 |
+
mov.f32 %f464, %f453;
|
191 |
+
mov.f32 %f465, %f453;
|
192 |
+
mov.f32 %f466, %f453;
|
193 |
+
mov.f32 %f467, %f453;
|
194 |
+
mov.f32 %f468, %f453;
|
195 |
+
$L__BB0_1:
|
196 |
+
add.s32 %r397, %r397, 64;
|
197 |
+
.loc 1 33 27
|
198 |
+
add.s32 %r63, %r397, %r18;
|
199 |
+
.loc 1 34 25
|
200 |
+
setp.lt.u32 %p35, %r63, 50257;
|
201 |
+
.loc 1 36 34
|
202 |
+
add.s64 %rd64, %rd147, -12061680;
|
203 |
+
add.s64 %rd65, %rd147, -11257568;
|
204 |
+
add.s64 %rd66, %rd147, -10453456;
|
205 |
+
add.s64 %rd67, %rd147, -9649344;
|
206 |
+
add.s64 %rd68, %rd147, -8845232;
|
207 |
+
add.s64 %rd69, %rd147, -8041120;
|
208 |
+
add.s64 %rd70, %rd147, -7237008;
|
209 |
+
add.s64 %rd71, %rd147, -6432896;
|
210 |
+
add.s64 %rd72, %rd147, -5628784;
|
211 |
+
add.s64 %rd73, %rd147, -4824672;
|
212 |
+
add.s64 %rd74, %rd147, -4020560;
|
213 |
+
add.s64 %rd75, %rd147, -3216448;
|
214 |
+
add.s64 %rd76, %rd147, -2412336;
|
215 |
+
add.s64 %rd77, %rd147, -1608224;
|
216 |
+
add.s64 %rd78, %rd147, -804112;
|
217 |
+
mov.b32 %r333, 0;
|
218 |
+
.loc 1 36 52
|
219 |
+
mov.u32 %r31, 0x0;
|
220 |
+
@%p35 ld.global.L1::evict_last.b32 { %r31 }, [ %rd64 + 0 ];
|
221 |
+
@!%p35 mov.u32 %r31, %r333;
|
222 |
+
mov.u32 %r33, 0x0;
|
223 |
+
@%p35 ld.global.L1::evict_last.b32 { %r33 }, [ %rd65 + 0 ];
|
224 |
+
@!%p35 mov.u32 %r33, %r333;
|
225 |
+
mov.u32 %r35, 0x0;
|
226 |
+
@%p35 ld.global.L1::evict_last.b32 { %r35 }, [ %rd66 + 0 ];
|
227 |
+
@!%p35 mov.u32 %r35, %r333;
|
228 |
+
mov.u32 %r37, 0x0;
|
229 |
+
@%p35 ld.global.L1::evict_last.b32 { %r37 }, [ %rd67 + 0 ];
|
230 |
+
@!%p35 mov.u32 %r37, %r333;
|
231 |
+
mov.u32 %r39, 0x0;
|
232 |
+
@%p35 ld.global.L1::evict_last.b32 { %r39 }, [ %rd68 + 0 ];
|
233 |
+
@!%p35 mov.u32 %r39, %r333;
|
234 |
+
mov.u32 %r41, 0x0;
|
235 |
+
@%p35 ld.global.L1::evict_last.b32 { %r41 }, [ %rd69 + 0 ];
|
236 |
+
@!%p35 mov.u32 %r41, %r333;
|
237 |
+
mov.u32 %r43, 0x0;
|
238 |
+
@%p35 ld.global.L1::evict_last.b32 { %r43 }, [ %rd70 + 0 ];
|
239 |
+
@!%p35 mov.u32 %r43, %r333;
|
240 |
+
mov.u32 %r45, 0x0;
|
241 |
+
@%p35 ld.global.L1::evict_last.b32 { %r45 }, [ %rd71 + 0 ];
|
242 |
+
@!%p35 mov.u32 %r45, %r333;
|
243 |
+
mov.u32 %r47, 0x0;
|
244 |
+
@%p35 ld.global.L1::evict_last.b32 { %r47 }, [ %rd72 + 0 ];
|
245 |
+
@!%p35 mov.u32 %r47, %r333;
|
246 |
+
mov.u32 %r49, 0x0;
|
247 |
+
@%p35 ld.global.L1::evict_last.b32 { %r49 }, [ %rd73 + 0 ];
|
248 |
+
@!%p35 mov.u32 %r49, %r333;
|
249 |
+
mov.u32 %r51, 0x0;
|
250 |
+
@%p35 ld.global.L1::evict_last.b32 { %r51 }, [ %rd74 + 0 ];
|
251 |
+
@!%p35 mov.u32 %r51, %r333;
|
252 |
+
mov.u32 %r53, 0x0;
|
253 |
+
@%p35 ld.global.L1::evict_last.b32 { %r53 }, [ %rd75 + 0 ];
|
254 |
+
@!%p35 mov.u32 %r53, %r333;
|
255 |
+
mov.u32 %r55, 0x0;
|
256 |
+
@%p35 ld.global.L1::evict_last.b32 { %r55 }, [ %rd76 + 0 ];
|
257 |
+
@!%p35 mov.u32 %r55, %r333;
|
258 |
+
mov.u32 %r57, 0x0;
|
259 |
+
@%p35 ld.global.L1::evict_last.b32 { %r57 }, [ %rd77 + 0 ];
|
260 |
+
@!%p35 mov.u32 %r57, %r333;
|
261 |
+
mov.u32 %r59, 0x0;
|
262 |
+
@%p35 ld.global.L1::evict_last.b32 { %r59 }, [ %rd78 + 0 ];
|
263 |
+
@!%p35 mov.u32 %r59, %r333;
|
264 |
+
mov.u32 %r61, 0x0;
|
265 |
+
@%p35 ld.global.L1::evict_last.b32 { %r61 }, [ %rd147 + 0 ];
|
266 |
+
@!%p35 mov.u32 %r61, %r333;
|
267 |
+
mov.b32 %f98, %r61;
|
268 |
+
mov.b32 %f99, %r59;
|
269 |
+
mov.b32 %f100, %r57;
|
270 |
+
mov.b32 %f101, %r55;
|
271 |
+
mov.b32 %f102, %r53;
|
272 |
+
mov.b32 %f103, %r51;
|
273 |
+
mov.b32 %f104, %r49;
|
274 |
+
mov.b32 %f105, %r47;
|
275 |
+
mov.b32 %f106, %r45;
|
276 |
+
mov.b32 %f107, %r43;
|
277 |
+
mov.b32 %f108, %r41;
|
278 |
+
mov.b32 %f109, %r39;
|
279 |
+
mov.b32 %f110, %r37;
|
280 |
+
mov.b32 %f111, %r35;
|
281 |
+
mov.b32 %f112, %r33;
|
282 |
+
mov.b32 %f113, %r31;
|
283 |
+
.loc 1 42 23
|
284 |
+
mul.f32 %f114, %f1, %f113;
|
285 |
+
mul.f32 %f115, %f2, %f112;
|
286 |
+
mul.f32 %f116, %f3, %f111;
|
287 |
+
mul.f32 %f117, %f4, %f110;
|
288 |
+
mul.f32 %f118, %f5, %f109;
|
289 |
+
mul.f32 %f119, %f6, %f108;
|
290 |
+
mul.f32 %f120, %f7, %f107;
|
291 |
+
mul.f32 %f121, %f8, %f106;
|
292 |
+
mul.f32 %f122, %f9, %f105;
|
293 |
+
mul.f32 %f123, %f10, %f104;
|
294 |
+
mul.f32 %f124, %f11, %f103;
|
295 |
+
mul.f32 %f125, %f12, %f102;
|
296 |
+
mul.f32 %f126, %f13, %f101;
|
297 |
+
mul.f32 %f127, %f14, %f100;
|
298 |
+
mul.f32 %f128, %f15, %f99;
|
299 |
+
mul.f32 %f129, %f16, %f98;
|
300 |
+
.loc 1 45 40
|
301 |
+
selp.f32 %f130, %f129, 0f80000000, %p35;
|
302 |
+
selp.f32 %f131, %f128, 0f80000000, %p35;
|
303 |
+
selp.f32 %f132, %f127, 0f80000000, %p35;
|
304 |
+
selp.f32 %f133, %f126, 0f80000000, %p35;
|
305 |
+
selp.f32 %f134, %f125, 0f80000000, %p35;
|
306 |
+
selp.f32 %f135, %f124, 0f80000000, %p35;
|
307 |
+
selp.f32 %f136, %f123, 0f80000000, %p35;
|
308 |
+
selp.f32 %f137, %f122, 0f80000000, %p35;
|
309 |
+
selp.f32 %f138, %f121, 0f80000000, %p35;
|
310 |
+
selp.f32 %f139, %f120, 0f80000000, %p35;
|
311 |
+
selp.f32 %f140, %f119, 0f80000000, %p35;
|
312 |
+
selp.f32 %f141, %f118, 0f80000000, %p35;
|
313 |
+
selp.f32 %f142, %f117, 0f80000000, %p35;
|
314 |
+
selp.f32 %f143, %f116, 0f80000000, %p35;
|
315 |
+
selp.f32 %f144, %f115, 0f80000000, %p35;
|
316 |
+
selp.f32 %f145, %f114, 0f80000000, %p35;
|
317 |
+
add.f32 %f453, %f453, %f145;
|
318 |
+
add.f32 %f454, %f454, %f144;
|
319 |
+
add.f32 %f455, %f455, %f143;
|
320 |
+
add.f32 %f456, %f456, %f142;
|
321 |
+
add.f32 %f457, %f457, %f141;
|
322 |
+
add.f32 %f458, %f458, %f140;
|
323 |
+
add.f32 %f459, %f459, %f139;
|
324 |
+
add.f32 %f460, %f460, %f138;
|
325 |
+
add.f32 %f461, %f461, %f137;
|
326 |
+
add.f32 %f462, %f462, %f136;
|
327 |
+
add.f32 %f463, %f463, %f135;
|
328 |
+
add.f32 %f464, %f464, %f134;
|
329 |
+
add.f32 %f465, %f465, %f133;
|
330 |
+
add.f32 %f466, %f466, %f132;
|
331 |
+
add.f32 %f467, %f467, %f131;
|
332 |
+
add.f32 %f468, %f468, %f130;
|
333 |
+
.loc 1 32 36
|
334 |
+
add.s64 %rd147, %rd147, 256;
|
335 |
+
setp.lt.u32 %p67, %r397, 50193;
|
336 |
+
@%p67 bra $L__BB0_1;
|
337 |
+
.loc 1 0 36
|
338 |
+
cvt.u32.u64 %r101, %rd2;
|
339 |
+
$L__tmp1:
|
340 |
+
.loc 2 243 36
|
341 |
+
mov.b32 %r102, %f453;
|
342 |
+
shfl.sync.bfly.b32 %r103, %r102, 16, 31, -1;
|
343 |
+
mov.b32 %f146, %r103;
|
344 |
+
$L__tmp2:
|
345 |
+
.loc 2 233 15
|
346 |
+
add.f32 %f147, %f453, %f146;
|
347 |
+
$L__tmp3:
|
348 |
+
.loc 2 243 36
|
349 |
+
mov.b32 %r104, %f147;
|
350 |
+
shfl.sync.bfly.b32 %r105, %r104, 8, 31, -1;
|
351 |
+
mov.b32 %f148, %r105;
|
352 |
+
$L__tmp4:
|
353 |
+
.loc 2 233 15
|
354 |
+
add.f32 %f149, %f147, %f148;
|
355 |
+
$L__tmp5:
|
356 |
+
.loc 2 243 36
|
357 |
+
mov.b32 %r106, %f149;
|
358 |
+
shfl.sync.bfly.b32 %r107, %r106, 4, 31, -1;
|
359 |
+
mov.b32 %f150, %r107;
|
360 |
+
$L__tmp6:
|
361 |
+
.loc 2 233 15
|
362 |
+
add.f32 %f151, %f149, %f150;
|
363 |
+
$L__tmp7:
|
364 |
+
.loc 2 243 36
|
365 |
+
mov.b32 %r108, %f151;
|
366 |
+
shfl.sync.bfly.b32 %r109, %r108, 2, 31, -1;
|
367 |
+
mov.b32 %f152, %r109;
|
368 |
+
$L__tmp8:
|
369 |
+
.loc 2 233 15
|
370 |
+
add.f32 %f153, %f151, %f152;
|
371 |
+
$L__tmp9:
|
372 |
+
.loc 2 243 36
|
373 |
+
mov.b32 %r110, %f153;
|
374 |
+
shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1;
|
375 |
+
mov.b32 %f154, %r111;
|
376 |
+
$L__tmp10:
|
377 |
+
.loc 2 233 15
|
378 |
+
add.f32 %f155, %f153, %f154;
|
379 |
+
$L__tmp11:
|
380 |
+
.loc 2 243 36
|
381 |
+
mov.b32 %r112, %f454;
|
382 |
+
shfl.sync.bfly.b32 %r113, %r112, 16, 31, -1;
|
383 |
+
mov.b32 %f156, %r113;
|
384 |
+
$L__tmp12:
|
385 |
+
.loc 2 233 15
|
386 |
+
add.f32 %f157, %f454, %f156;
|
387 |
+
$L__tmp13:
|
388 |
+
.loc 2 243 36
|
389 |
+
mov.b32 %r114, %f157;
|
390 |
+
shfl.sync.bfly.b32 %r115, %r114, 8, 31, -1;
|
391 |
+
mov.b32 %f158, %r115;
|
392 |
+
$L__tmp14:
|
393 |
+
.loc 2 233 15
|
394 |
+
add.f32 %f159, %f157, %f158;
|
395 |
+
$L__tmp15:
|
396 |
+
.loc 2 243 36
|
397 |
+
mov.b32 %r116, %f159;
|
398 |
+
shfl.sync.bfly.b32 %r117, %r116, 4, 31, -1;
|
399 |
+
mov.b32 %f160, %r117;
|
400 |
+
$L__tmp16:
|
401 |
+
.loc 2 233 15
|
402 |
+
add.f32 %f161, %f159, %f160;
|
403 |
+
$L__tmp17:
|
404 |
+
.loc 2 243 36
|
405 |
+
mov.b32 %r118, %f161;
|
406 |
+
shfl.sync.bfly.b32 %r119, %r118, 2, 31, -1;
|
407 |
+
mov.b32 %f162, %r119;
|
408 |
+
$L__tmp18:
|
409 |
+
.loc 2 233 15
|
410 |
+
add.f32 %f163, %f161, %f162;
|
411 |
+
$L__tmp19:
|
412 |
+
.loc 2 243 36
|
413 |
+
mov.b32 %r120, %f163;
|
414 |
+
shfl.sync.bfly.b32 %r121, %r120, 1, 31, -1;
|
415 |
+
mov.b32 %f164, %r121;
|
416 |
+
$L__tmp20:
|
417 |
+
.loc 2 233 15
|
418 |
+
add.f32 %f165, %f163, %f164;
|
419 |
+
$L__tmp21:
|
420 |
+
.loc 2 243 36
|
421 |
+
mov.b32 %r122, %f455;
|
422 |
+
shfl.sync.bfly.b32 %r123, %r122, 16, 31, -1;
|
423 |
+
mov.b32 %f166, %r123;
|
424 |
+
$L__tmp22:
|
425 |
+
.loc 2 233 15
|
426 |
+
add.f32 %f167, %f455, %f166;
|
427 |
+
$L__tmp23:
|
428 |
+
.loc 2 243 36
|
429 |
+
mov.b32 %r124, %f167;
|
430 |
+
shfl.sync.bfly.b32 %r125, %r124, 8, 31, -1;
|
431 |
+
mov.b32 %f168, %r125;
|
432 |
+
$L__tmp24:
|
433 |
+
.loc 2 233 15
|
434 |
+
add.f32 %f169, %f167, %f168;
|
435 |
+
$L__tmp25:
|
436 |
+
.loc 2 243 36
|
437 |
+
mov.b32 %r126, %f169;
|
438 |
+
shfl.sync.bfly.b32 %r127, %r126, 4, 31, -1;
|
439 |
+
mov.b32 %f170, %r127;
|
440 |
+
$L__tmp26:
|
441 |
+
.loc 2 233 15
|
442 |
+
add.f32 %f171, %f169, %f170;
|
443 |
+
$L__tmp27:
|
444 |
+
.loc 2 243 36
|
445 |
+
mov.b32 %r128, %f171;
|
446 |
+
shfl.sync.bfly.b32 %r129, %r128, 2, 31, -1;
|
447 |
+
mov.b32 %f172, %r129;
|
448 |
+
$L__tmp28:
|
449 |
+
.loc 2 233 15
|
450 |
+
add.f32 %f173, %f171, %f172;
|
451 |
+
$L__tmp29:
|
452 |
+
.loc 2 243 36
|
453 |
+
mov.b32 %r130, %f173;
|
454 |
+
shfl.sync.bfly.b32 %r131, %r130, 1, 31, -1;
|
455 |
+
mov.b32 %f174, %r131;
|
456 |
+
$L__tmp30:
|
457 |
+
.loc 2 233 15
|
458 |
+
add.f32 %f175, %f173, %f174;
|
459 |
+
$L__tmp31:
|
460 |
+
.loc 2 243 36
|
461 |
+
mov.b32 %r132, %f456;
|
462 |
+
shfl.sync.bfly.b32 %r133, %r132, 16, 31, -1;
|
463 |
+
mov.b32 %f176, %r133;
|
464 |
+
$L__tmp32:
|
465 |
+
.loc 2 233 15
|
466 |
+
add.f32 %f177, %f456, %f176;
|
467 |
+
$L__tmp33:
|
468 |
+
.loc 2 243 36
|
469 |
+
mov.b32 %r134, %f177;
|
470 |
+
shfl.sync.bfly.b32 %r135, %r134, 8, 31, -1;
|
471 |
+
mov.b32 %f178, %r135;
|
472 |
+
$L__tmp34:
|
473 |
+
.loc 2 233 15
|
474 |
+
add.f32 %f179, %f177, %f178;
|
475 |
+
$L__tmp35:
|
476 |
+
.loc 2 243 36
|
477 |
+
mov.b32 %r136, %f179;
|
478 |
+
shfl.sync.bfly.b32 %r137, %r136, 4, 31, -1;
|
479 |
+
mov.b32 %f180, %r137;
|
480 |
+
$L__tmp36:
|
481 |
+
.loc 2 233 15
|
482 |
+
add.f32 %f181, %f179, %f180;
|
483 |
+
$L__tmp37:
|
484 |
+
.loc 2 243 36
|
485 |
+
mov.b32 %r138, %f181;
|
486 |
+
shfl.sync.bfly.b32 %r139, %r138, 2, 31, -1;
|
487 |
+
mov.b32 %f182, %r139;
|
488 |
+
$L__tmp38:
|
489 |
+
.loc 2 233 15
|
490 |
+
add.f32 %f183, %f181, %f182;
|
491 |
+
$L__tmp39:
|
492 |
+
.loc 2 243 36
|
493 |
+
mov.b32 %r140, %f183;
|
494 |
+
shfl.sync.bfly.b32 %r141, %r140, 1, 31, -1;
|
495 |
+
mov.b32 %f184, %r141;
|
496 |
+
$L__tmp40:
|
497 |
+
.loc 2 233 15
|
498 |
+
add.f32 %f185, %f183, %f184;
|
499 |
+
$L__tmp41:
|
500 |
+
.loc 2 243 36
|
501 |
+
mov.b32 %r142, %f457;
|
502 |
+
shfl.sync.bfly.b32 %r143, %r142, 16, 31, -1;
|
503 |
+
mov.b32 %f186, %r143;
|
504 |
+
$L__tmp42:
|
505 |
+
.loc 2 233 15
|
506 |
+
add.f32 %f187, %f457, %f186;
|
507 |
+
$L__tmp43:
|
508 |
+
.loc 2 243 36
|
509 |
+
mov.b32 %r144, %f187;
|
510 |
+
shfl.sync.bfly.b32 %r145, %r144, 8, 31, -1;
|
511 |
+
mov.b32 %f188, %r145;
|
512 |
+
$L__tmp44:
|
513 |
+
.loc 2 233 15
|
514 |
+
add.f32 %f189, %f187, %f188;
|
515 |
+
$L__tmp45:
|
516 |
+
.loc 2 243 36
|
517 |
+
mov.b32 %r146, %f189;
|
518 |
+
shfl.sync.bfly.b32 %r147, %r146, 4, 31, -1;
|
519 |
+
mov.b32 %f190, %r147;
|
520 |
+
$L__tmp46:
|
521 |
+
.loc 2 233 15
|
522 |
+
add.f32 %f191, %f189, %f190;
|
523 |
+
$L__tmp47:
|
524 |
+
.loc 2 243 36
|
525 |
+
mov.b32 %r148, %f191;
|
526 |
+
shfl.sync.bfly.b32 %r149, %r148, 2, 31, -1;
|
527 |
+
mov.b32 %f192, %r149;
|
528 |
+
$L__tmp48:
|
529 |
+
.loc 2 233 15
|
530 |
+
add.f32 %f193, %f191, %f192;
|
531 |
+
$L__tmp49:
|
532 |
+
.loc 2 243 36
|
533 |
+
mov.b32 %r150, %f193;
|
534 |
+
shfl.sync.bfly.b32 %r151, %r150, 1, 31, -1;
|
535 |
+
mov.b32 %f194, %r151;
|
536 |
+
$L__tmp50:
|
537 |
+
.loc 2 233 15
|
538 |
+
add.f32 %f195, %f193, %f194;
|
539 |
+
$L__tmp51:
|
540 |
+
.loc 2 243 36
|
541 |
+
mov.b32 %r152, %f458;
|
542 |
+
shfl.sync.bfly.b32 %r153, %r152, 16, 31, -1;
|
543 |
+
mov.b32 %f196, %r153;
|
544 |
+
$L__tmp52:
|
545 |
+
.loc 2 233 15
|
546 |
+
add.f32 %f197, %f458, %f196;
|
547 |
+
$L__tmp53:
|
548 |
+
.loc 2 243 36
|
549 |
+
mov.b32 %r154, %f197;
|
550 |
+
shfl.sync.bfly.b32 %r155, %r154, 8, 31, -1;
|
551 |
+
mov.b32 %f198, %r155;
|
552 |
+
$L__tmp54:
|
553 |
+
.loc 2 233 15
|
554 |
+
add.f32 %f199, %f197, %f198;
|
555 |
+
$L__tmp55:
|
556 |
+
.loc 2 243 36
|
557 |
+
mov.b32 %r156, %f199;
|
558 |
+
shfl.sync.bfly.b32 %r157, %r156, 4, 31, -1;
|
559 |
+
mov.b32 %f200, %r157;
|
560 |
+
$L__tmp56:
|
561 |
+
.loc 2 233 15
|
562 |
+
add.f32 %f201, %f199, %f200;
|
563 |
+
$L__tmp57:
|
564 |
+
.loc 2 243 36
|
565 |
+
mov.b32 %r158, %f201;
|
566 |
+
shfl.sync.bfly.b32 %r159, %r158, 2, 31, -1;
|
567 |
+
mov.b32 %f202, %r159;
|
568 |
+
$L__tmp58:
|
569 |
+
.loc 2 233 15
|
570 |
+
add.f32 %f203, %f201, %f202;
|
571 |
+
$L__tmp59:
|
572 |
+
.loc 2 243 36
|
573 |
+
mov.b32 %r160, %f203;
|
574 |
+
shfl.sync.bfly.b32 %r161, %r160, 1, 31, -1;
|
575 |
+
mov.b32 %f204, %r161;
|
576 |
+
$L__tmp60:
|
577 |
+
.loc 2 233 15
|
578 |
+
add.f32 %f205, %f203, %f204;
|
579 |
+
$L__tmp61:
|
580 |
+
.loc 2 243 36
|
581 |
+
mov.b32 %r162, %f459;
|
582 |
+
shfl.sync.bfly.b32 %r163, %r162, 16, 31, -1;
|
583 |
+
mov.b32 %f206, %r163;
|
584 |
+
$L__tmp62:
|
585 |
+
.loc 2 233 15
|
586 |
+
add.f32 %f207, %f459, %f206;
|
587 |
+
$L__tmp63:
|
588 |
+
.loc 2 243 36
|
589 |
+
mov.b32 %r164, %f207;
|
590 |
+
shfl.sync.bfly.b32 %r165, %r164, 8, 31, -1;
|
591 |
+
mov.b32 %f208, %r165;
|
592 |
+
$L__tmp64:
|
593 |
+
.loc 2 233 15
|
594 |
+
add.f32 %f209, %f207, %f208;
|
595 |
+
$L__tmp65:
|
596 |
+
.loc 2 243 36
|
597 |
+
mov.b32 %r166, %f209;
|
598 |
+
shfl.sync.bfly.b32 %r167, %r166, 4, 31, -1;
|
599 |
+
mov.b32 %f210, %r167;
|
600 |
+
$L__tmp66:
|
601 |
+
.loc 2 233 15
|
602 |
+
add.f32 %f211, %f209, %f210;
|
603 |
+
$L__tmp67:
|
604 |
+
.loc 2 243 36
|
605 |
+
mov.b32 %r168, %f211;
|
606 |
+
shfl.sync.bfly.b32 %r169, %r168, 2, 31, -1;
|
607 |
+
mov.b32 %f212, %r169;
|
608 |
+
$L__tmp68:
|
609 |
+
.loc 2 233 15
|
610 |
+
add.f32 %f213, %f211, %f212;
|
611 |
+
$L__tmp69:
|
612 |
+
.loc 2 243 36
|
613 |
+
mov.b32 %r170, %f213;
|
614 |
+
shfl.sync.bfly.b32 %r171, %r170, 1, 31, -1;
|
615 |
+
mov.b32 %f214, %r171;
|
616 |
+
$L__tmp70:
|
617 |
+
.loc 2 233 15
|
618 |
+
add.f32 %f215, %f213, %f214;
|
619 |
+
$L__tmp71:
|
620 |
+
.loc 2 243 36
|
621 |
+
mov.b32 %r172, %f460;
|
622 |
+
shfl.sync.bfly.b32 %r173, %r172, 16, 31, -1;
|
623 |
+
mov.b32 %f216, %r173;
|
624 |
+
$L__tmp72:
|
625 |
+
.loc 2 233 15
|
626 |
+
add.f32 %f217, %f460, %f216;
|
627 |
+
$L__tmp73:
|
628 |
+
.loc 2 243 36
|
629 |
+
mov.b32 %r174, %f217;
|
630 |
+
shfl.sync.bfly.b32 %r175, %r174, 8, 31, -1;
|
631 |
+
mov.b32 %f218, %r175;
|
632 |
+
$L__tmp74:
|
633 |
+
.loc 2 233 15
|
634 |
+
add.f32 %f219, %f217, %f218;
|
635 |
+
$L__tmp75:
|
636 |
+
.loc 2 243 36
|
637 |
+
mov.b32 %r176, %f219;
|
638 |
+
shfl.sync.bfly.b32 %r177, %r176, 4, 31, -1;
|
639 |
+
mov.b32 %f220, %r177;
|
640 |
+
$L__tmp76:
|
641 |
+
.loc 2 233 15
|
642 |
+
add.f32 %f221, %f219, %f220;
|
643 |
+
$L__tmp77:
|
644 |
+
.loc 2 243 36
|
645 |
+
mov.b32 %r178, %f221;
|
646 |
+
shfl.sync.bfly.b32 %r179, %r178, 2, 31, -1;
|
647 |
+
mov.b32 %f222, %r179;
|
648 |
+
$L__tmp78:
|
649 |
+
.loc 2 233 15
|
650 |
+
add.f32 %f223, %f221, %f222;
|
651 |
+
$L__tmp79:
|
652 |
+
.loc 2 243 36
|
653 |
+
mov.b32 %r180, %f223;
|
654 |
+
shfl.sync.bfly.b32 %r181, %r180, 1, 31, -1;
|
655 |
+
mov.b32 %f224, %r181;
|
656 |
+
$L__tmp80:
|
657 |
+
.loc 2 233 15
|
658 |
+
add.f32 %f225, %f223, %f224;
|
659 |
+
$L__tmp81:
|
660 |
+
.loc 2 243 36
|
661 |
+
mov.b32 %r182, %f461;
|
662 |
+
shfl.sync.bfly.b32 %r183, %r182, 16, 31, -1;
|
663 |
+
mov.b32 %f226, %r183;
|
664 |
+
$L__tmp82:
|
665 |
+
.loc 2 233 15
|
666 |
+
add.f32 %f227, %f461, %f226;
|
667 |
+
$L__tmp83:
|
668 |
+
.loc 2 243 36
|
669 |
+
mov.b32 %r184, %f227;
|
670 |
+
shfl.sync.bfly.b32 %r185, %r184, 8, 31, -1;
|
671 |
+
mov.b32 %f228, %r185;
|
672 |
+
$L__tmp84:
|
673 |
+
.loc 2 233 15
|
674 |
+
add.f32 %f229, %f227, %f228;
|
675 |
+
$L__tmp85:
|
676 |
+
.loc 2 243 36
|
677 |
+
mov.b32 %r186, %f229;
|
678 |
+
shfl.sync.bfly.b32 %r187, %r186, 4, 31, -1;
|
679 |
+
mov.b32 %f230, %r187;
|
680 |
+
$L__tmp86:
|
681 |
+
.loc 2 233 15
|
682 |
+
add.f32 %f231, %f229, %f230;
|
683 |
+
$L__tmp87:
|
684 |
+
.loc 2 243 36
|
685 |
+
mov.b32 %r188, %f231;
|
686 |
+
shfl.sync.bfly.b32 %r189, %r188, 2, 31, -1;
|
687 |
+
mov.b32 %f232, %r189;
|
688 |
+
$L__tmp88:
|
689 |
+
.loc 2 233 15
|
690 |
+
add.f32 %f233, %f231, %f232;
|
691 |
+
$L__tmp89:
|
692 |
+
.loc 2 243 36
|
693 |
+
mov.b32 %r190, %f233;
|
694 |
+
shfl.sync.bfly.b32 %r191, %r190, 1, 31, -1;
|
695 |
+
mov.b32 %f234, %r191;
|
696 |
+
$L__tmp90:
|
697 |
+
.loc 2 233 15
|
698 |
+
add.f32 %f235, %f233, %f234;
|
699 |
+
$L__tmp91:
|
700 |
+
.loc 2 243 36
|
701 |
+
mov.b32 %r192, %f462;
|
702 |
+
shfl.sync.bfly.b32 %r193, %r192, 16, 31, -1;
|
703 |
+
mov.b32 %f236, %r193;
|
704 |
+
$L__tmp92:
|
705 |
+
.loc 2 233 15
|
706 |
+
add.f32 %f237, %f462, %f236;
|
707 |
+
$L__tmp93:
|
708 |
+
.loc 2 243 36
|
709 |
+
mov.b32 %r194, %f237;
|
710 |
+
shfl.sync.bfly.b32 %r195, %r194, 8, 31, -1;
|
711 |
+
mov.b32 %f238, %r195;
|
712 |
+
$L__tmp94:
|
713 |
+
.loc 2 233 15
|
714 |
+
add.f32 %f239, %f237, %f238;
|
715 |
+
$L__tmp95:
|
716 |
+
.loc 2 243 36
|
717 |
+
mov.b32 %r196, %f239;
|
718 |
+
shfl.sync.bfly.b32 %r197, %r196, 4, 31, -1;
|
719 |
+
mov.b32 %f240, %r197;
|
720 |
+
$L__tmp96:
|
721 |
+
.loc 2 233 15
|
722 |
+
add.f32 %f241, %f239, %f240;
|
723 |
+
$L__tmp97:
|
724 |
+
.loc 2 243 36
|
725 |
+
mov.b32 %r198, %f241;
|
726 |
+
shfl.sync.bfly.b32 %r199, %r198, 2, 31, -1;
|
727 |
+
mov.b32 %f242, %r199;
|
728 |
+
$L__tmp98:
|
729 |
+
.loc 2 233 15
|
730 |
+
add.f32 %f243, %f241, %f242;
|
731 |
+
$L__tmp99:
|
732 |
+
.loc 2 243 36
|
733 |
+
mov.b32 %r200, %f243;
|
734 |
+
shfl.sync.bfly.b32 %r201, %r200, 1, 31, -1;
|
735 |
+
mov.b32 %f244, %r201;
|
736 |
+
$L__tmp100:
|
737 |
+
.loc 2 233 15
|
738 |
+
add.f32 %f245, %f243, %f244;
|
739 |
+
$L__tmp101:
|
740 |
+
.loc 2 243 36
|
741 |
+
mov.b32 %r202, %f463;
|
742 |
+
shfl.sync.bfly.b32 %r203, %r202, 16, 31, -1;
|
743 |
+
mov.b32 %f246, %r203;
|
744 |
+
$L__tmp102:
|
745 |
+
.loc 2 233 15
|
746 |
+
add.f32 %f247, %f463, %f246;
|
747 |
+
$L__tmp103:
|
748 |
+
.loc 2 243 36
|
749 |
+
mov.b32 %r204, %f247;
|
750 |
+
shfl.sync.bfly.b32 %r205, %r204, 8, 31, -1;
|
751 |
+
mov.b32 %f248, %r205;
|
752 |
+
$L__tmp104:
|
753 |
+
.loc 2 233 15
|
754 |
+
add.f32 %f249, %f247, %f248;
|
755 |
+
$L__tmp105:
|
756 |
+
.loc 2 243 36
|
757 |
+
mov.b32 %r206, %f249;
|
758 |
+
shfl.sync.bfly.b32 %r207, %r206, 4, 31, -1;
|
759 |
+
mov.b32 %f250, %r207;
|
760 |
+
$L__tmp106:
|
761 |
+
.loc 2 233 15
|
762 |
+
add.f32 %f251, %f249, %f250;
|
763 |
+
$L__tmp107:
|
764 |
+
.loc 2 243 36
|
765 |
+
mov.b32 %r208, %f251;
|
766 |
+
shfl.sync.bfly.b32 %r209, %r208, 2, 31, -1;
|
767 |
+
mov.b32 %f252, %r209;
|
768 |
+
$L__tmp108:
|
769 |
+
.loc 2 233 15
|
770 |
+
add.f32 %f253, %f251, %f252;
|
771 |
+
$L__tmp109:
|
772 |
+
.loc 2 243 36
|
773 |
+
mov.b32 %r210, %f253;
|
774 |
+
shfl.sync.bfly.b32 %r211, %r210, 1, 31, -1;
|
775 |
+
mov.b32 %f254, %r211;
|
776 |
+
$L__tmp110:
|
777 |
+
.loc 2 233 15
|
778 |
+
add.f32 %f255, %f253, %f254;
|
779 |
+
$L__tmp111:
|
780 |
+
.loc 2 243 36
|
781 |
+
mov.b32 %r212, %f464;
|
782 |
+
shfl.sync.bfly.b32 %r213, %r212, 16, 31, -1;
|
783 |
+
mov.b32 %f256, %r213;
|
784 |
+
$L__tmp112:
|
785 |
+
.loc 2 233 15
|
786 |
+
add.f32 %f257, %f464, %f256;
|
787 |
+
$L__tmp113:
|
788 |
+
.loc 2 243 36
|
789 |
+
mov.b32 %r214, %f257;
|
790 |
+
shfl.sync.bfly.b32 %r215, %r214, 8, 31, -1;
|
791 |
+
mov.b32 %f258, %r215;
|
792 |
+
$L__tmp114:
|
793 |
+
.loc 2 233 15
|
794 |
+
add.f32 %f259, %f257, %f258;
|
795 |
+
$L__tmp115:
|
796 |
+
.loc 2 243 36
|
797 |
+
mov.b32 %r216, %f259;
|
798 |
+
shfl.sync.bfly.b32 %r217, %r216, 4, 31, -1;
|
799 |
+
mov.b32 %f260, %r217;
|
800 |
+
$L__tmp116:
|
801 |
+
.loc 2 233 15
|
802 |
+
add.f32 %f261, %f259, %f260;
|
803 |
+
$L__tmp117:
|
804 |
+
.loc 2 243 36
|
805 |
+
mov.b32 %r218, %f261;
|
806 |
+
shfl.sync.bfly.b32 %r219, %r218, 2, 31, -1;
|
807 |
+
mov.b32 %f262, %r219;
|
808 |
+
$L__tmp118:
|
809 |
+
.loc 2 233 15
|
810 |
+
add.f32 %f263, %f261, %f262;
|
811 |
+
$L__tmp119:
|
812 |
+
.loc 2 243 36
|
813 |
+
mov.b32 %r220, %f263;
|
814 |
+
shfl.sync.bfly.b32 %r221, %r220, 1, 31, -1;
|
815 |
+
mov.b32 %f264, %r221;
|
816 |
+
$L__tmp120:
|
817 |
+
.loc 2 233 15
|
818 |
+
add.f32 %f265, %f263, %f264;
|
819 |
+
$L__tmp121:
|
820 |
+
.loc 2 243 36
|
821 |
+
mov.b32 %r222, %f465;
|
822 |
+
shfl.sync.bfly.b32 %r223, %r222, 16, 31, -1;
|
823 |
+
mov.b32 %f266, %r223;
|
824 |
+
$L__tmp122:
|
825 |
+
.loc 2 233 15
|
826 |
+
add.f32 %f267, %f465, %f266;
|
827 |
+
$L__tmp123:
|
828 |
+
.loc 2 243 36
|
829 |
+
mov.b32 %r224, %f267;
|
830 |
+
shfl.sync.bfly.b32 %r225, %r224, 8, 31, -1;
|
831 |
+
mov.b32 %f268, %r225;
|
832 |
+
$L__tmp124:
|
833 |
+
.loc 2 233 15
|
834 |
+
add.f32 %f269, %f267, %f268;
|
835 |
+
$L__tmp125:
|
836 |
+
.loc 2 243 36
|
837 |
+
mov.b32 %r226, %f269;
|
838 |
+
shfl.sync.bfly.b32 %r227, %r226, 4, 31, -1;
|
839 |
+
mov.b32 %f270, %r227;
|
840 |
+
$L__tmp126:
|
841 |
+
.loc 2 233 15
|
842 |
+
add.f32 %f271, %f269, %f270;
|
843 |
+
$L__tmp127:
|
844 |
+
.loc 2 243 36
|
845 |
+
mov.b32 %r228, %f271;
|
846 |
+
shfl.sync.bfly.b32 %r229, %r228, 2, 31, -1;
|
847 |
+
mov.b32 %f272, %r229;
|
848 |
+
$L__tmp128:
|
849 |
+
.loc 2 233 15
|
850 |
+
add.f32 %f273, %f271, %f272;
|
851 |
+
$L__tmp129:
|
852 |
+
.loc 2 243 36
|
853 |
+
mov.b32 %r230, %f273;
|
854 |
+
shfl.sync.bfly.b32 %r231, %r230, 1, 31, -1;
|
855 |
+
mov.b32 %f274, %r231;
|
856 |
+
$L__tmp130:
|
857 |
+
.loc 2 233 15
|
858 |
+
add.f32 %f275, %f273, %f274;
|
859 |
+
$L__tmp131:
|
860 |
+
.loc 2 243 36
|
861 |
+
mov.b32 %r232, %f466;
|
862 |
+
shfl.sync.bfly.b32 %r233, %r232, 16, 31, -1;
|
863 |
+
mov.b32 %f276, %r233;
|
864 |
+
$L__tmp132:
|
865 |
+
.loc 2 233 15
|
866 |
+
add.f32 %f277, %f466, %f276;
|
867 |
+
$L__tmp133:
|
868 |
+
.loc 2 243 36
|
869 |
+
mov.b32 %r234, %f277;
|
870 |
+
shfl.sync.bfly.b32 %r235, %r234, 8, 31, -1;
|
871 |
+
mov.b32 %f278, %r235;
|
872 |
+
$L__tmp134:
|
873 |
+
.loc 2 233 15
|
874 |
+
add.f32 %f279, %f277, %f278;
|
875 |
+
$L__tmp135:
|
876 |
+
.loc 2 243 36
|
877 |
+
mov.b32 %r236, %f279;
|
878 |
+
shfl.sync.bfly.b32 %r237, %r236, 4, 31, -1;
|
879 |
+
mov.b32 %f280, %r237;
|
880 |
+
$L__tmp136:
|
881 |
+
.loc 2 233 15
|
882 |
+
add.f32 %f281, %f279, %f280;
|
883 |
+
$L__tmp137:
|
884 |
+
.loc 2 243 36
|
885 |
+
mov.b32 %r238, %f281;
|
886 |
+
shfl.sync.bfly.b32 %r239, %r238, 2, 31, -1;
|
887 |
+
mov.b32 %f282, %r239;
|
888 |
+
$L__tmp138:
|
889 |
+
.loc 2 233 15
|
890 |
+
add.f32 %f283, %f281, %f282;
|
891 |
+
$L__tmp139:
|
892 |
+
.loc 2 243 36
|
893 |
+
mov.b32 %r240, %f283;
|
894 |
+
shfl.sync.bfly.b32 %r241, %r240, 1, 31, -1;
|
895 |
+
mov.b32 %f284, %r241;
|
896 |
+
$L__tmp140:
|
897 |
+
.loc 2 233 15
|
898 |
+
add.f32 %f285, %f283, %f284;
|
899 |
+
$L__tmp141:
|
900 |
+
.loc 2 243 36
|
901 |
+
mov.b32 %r242, %f467;
|
902 |
+
shfl.sync.bfly.b32 %r243, %r242, 16, 31, -1;
|
903 |
+
mov.b32 %f286, %r243;
|
904 |
+
$L__tmp142:
|
905 |
+
.loc 2 233 15
|
906 |
+
add.f32 %f287, %f467, %f286;
|
907 |
+
$L__tmp143:
|
908 |
+
.loc 2 243 36
|
909 |
+
mov.b32 %r244, %f287;
|
910 |
+
shfl.sync.bfly.b32 %r245, %r244, 8, 31, -1;
|
911 |
+
mov.b32 %f288, %r245;
|
912 |
+
$L__tmp144:
|
913 |
+
.loc 2 233 15
|
914 |
+
add.f32 %f289, %f287, %f288;
|
915 |
+
$L__tmp145:
|
916 |
+
.loc 2 243 36
|
917 |
+
mov.b32 %r246, %f289;
|
918 |
+
shfl.sync.bfly.b32 %r247, %r246, 4, 31, -1;
|
919 |
+
mov.b32 %f290, %r247;
|
920 |
+
$L__tmp146:
|
921 |
+
.loc 2 233 15
|
922 |
+
add.f32 %f291, %f289, %f290;
|
923 |
+
$L__tmp147:
|
924 |
+
.loc 2 243 36
|
925 |
+
mov.b32 %r248, %f291;
|
926 |
+
shfl.sync.bfly.b32 %r249, %r248, 2, 31, -1;
|
927 |
+
mov.b32 %f292, %r249;
|
928 |
+
$L__tmp148:
|
929 |
+
.loc 2 233 15
|
930 |
+
add.f32 %f293, %f291, %f292;
|
931 |
+
$L__tmp149:
|
932 |
+
.loc 2 243 36
|
933 |
+
mov.b32 %r250, %f293;
|
934 |
+
shfl.sync.bfly.b32 %r251, %r250, 1, 31, -1;
|
935 |
+
mov.b32 %f294, %r251;
|
936 |
+
$L__tmp150:
|
937 |
+
.loc 2 233 15
|
938 |
+
add.f32 %f295, %f293, %f294;
|
939 |
+
$L__tmp151:
|
940 |
+
.loc 2 243 36
|
941 |
+
mov.b32 %r252, %f468;
|
942 |
+
shfl.sync.bfly.b32 %r253, %r252, 16, 31, -1;
|
943 |
+
mov.b32 %f296, %r253;
|
944 |
+
$L__tmp152:
|
945 |
+
.loc 2 233 15
|
946 |
+
add.f32 %f297, %f468, %f296;
|
947 |
+
$L__tmp153:
|
948 |
+
.loc 2 243 36
|
949 |
+
mov.b32 %r254, %f297;
|
950 |
+
shfl.sync.bfly.b32 %r255, %r254, 8, 31, -1;
|
951 |
+
mov.b32 %f298, %r255;
|
952 |
+
$L__tmp154:
|
953 |
+
.loc 2 233 15
|
954 |
+
add.f32 %f299, %f297, %f298;
|
955 |
+
$L__tmp155:
|
956 |
+
.loc 2 243 36
|
957 |
+
mov.b32 %r256, %f299;
|
958 |
+
shfl.sync.bfly.b32 %r257, %r256, 4, 31, -1;
|
959 |
+
mov.b32 %f300, %r257;
|
960 |
+
$L__tmp156:
|
961 |
+
.loc 2 233 15
|
962 |
+
add.f32 %f301, %f299, %f300;
|
963 |
+
$L__tmp157:
|
964 |
+
.loc 2 243 36
|
965 |
+
mov.b32 %r258, %f301;
|
966 |
+
shfl.sync.bfly.b32 %r259, %r258, 2, 31, -1;
|
967 |
+
mov.b32 %f302, %r259;
|
968 |
+
$L__tmp158:
|
969 |
+
.loc 2 233 15
|
970 |
+
add.f32 %f303, %f301, %f302;
|
971 |
+
$L__tmp159:
|
972 |
+
.loc 2 243 36
|
973 |
+
mov.b32 %r260, %f303;
|
974 |
+
shfl.sync.bfly.b32 %r261, %r260, 1, 31, -1;
|
975 |
+
mov.b32 %f304, %r261;
|
976 |
+
$L__tmp160:
|
977 |
+
.loc 2 233 15
|
978 |
+
add.f32 %f305, %f303, %f304;
|
979 |
+
$L__tmp161:
|
980 |
+
.loc 2 243 36
|
981 |
+
setp.eq.s32 %p68, %r2, 0;
|
982 |
+
shr.u32 %r262, %r1, 3;
|
983 |
+
and.b32 %r263, %r262, 4;
|
984 |
+
shl.b32 %r264, %r101, 3;
|
985 |
+
or.b32 %r265, %r264, %r263;
|
986 |
+
mov.u32 %r266, global_smem;
|
987 |
+
add.s32 %r64, %r266, %r265;
|
988 |
+
mov.b32 %r65, %f155;
|
989 |
+
@%p68 st.shared.b32 [ %r64 + 0 ], %r65;
|
990 |
+
shl.b32 %r267, %r3, 3;
|
991 |
+
or.b32 %r268, %r267, %r263;
|
992 |
+
add.s32 %r66, %r266, %r268;
|
993 |
+
mov.b32 %r67, %f165;
|
994 |
+
@%p68 st.shared.b32 [ %r66 + 0 ], %r67;
|
995 |
+
shl.b32 %r269, %r4, 3;
|
996 |
+
or.b32 %r270, %r269, %r263;
|
997 |
+
add.s32 %r68, %r266, %r270;
|
998 |
+
mov.b32 %r69, %f175;
|
999 |
+
@%p68 st.shared.b32 [ %r68 + 0 ], %r69;
|
1000 |
+
shl.b32 %r271, %r5, 3;
|
1001 |
+
or.b32 %r272, %r271, %r263;
|
1002 |
+
add.s32 %r70, %r266, %r272;
|
1003 |
+
mov.b32 %r71, %f185;
|
1004 |
+
@%p68 st.shared.b32 [ %r70 + 0 ], %r71;
|
1005 |
+
shl.b32 %r273, %r6, 3;
|
1006 |
+
or.b32 %r274, %r273, %r263;
|
1007 |
+
add.s32 %r72, %r266, %r274;
|
1008 |
+
mov.b32 %r73, %f195;
|
1009 |
+
@%p68 st.shared.b32 [ %r72 + 0 ], %r73;
|
1010 |
+
shl.b32 %r275, %r7, 3;
|
1011 |
+
or.b32 %r276, %r275, %r263;
|
1012 |
+
add.s32 %r74, %r266, %r276;
|
1013 |
+
mov.b32 %r75, %f205;
|
1014 |
+
@%p68 st.shared.b32 [ %r74 + 0 ], %r75;
|
1015 |
+
shl.b32 %r277, %r8, 3;
|
1016 |
+
or.b32 %r278, %r277, %r263;
|
1017 |
+
add.s32 %r76, %r266, %r278;
|
1018 |
+
mov.b32 %r77, %f215;
|
1019 |
+
@%p68 st.shared.b32 [ %r76 + 0 ], %r77;
|
1020 |
+
shl.b32 %r279, %r9, 3;
|
1021 |
+
or.b32 %r280, %r279, %r263;
|
1022 |
+
add.s32 %r78, %r266, %r280;
|
1023 |
+
mov.b32 %r79, %f225;
|
1024 |
+
@%p68 st.shared.b32 [ %r78 + 0 ], %r79;
|
1025 |
+
shl.b32 %r281, %r10, 3;
|
1026 |
+
or.b32 %r282, %r281, %r263;
|
1027 |
+
add.s32 %r80, %r266, %r282;
|
1028 |
+
mov.b32 %r81, %f235;
|
1029 |
+
@%p68 st.shared.b32 [ %r80 + 0 ], %r81;
|
1030 |
+
shl.b32 %r283, %r11, 3;
|
1031 |
+
or.b32 %r284, %r283, %r263;
|
1032 |
+
add.s32 %r82, %r266, %r284;
|
1033 |
+
mov.b32 %r83, %f245;
|
1034 |
+
@%p68 st.shared.b32 [ %r82 + 0 ], %r83;
|
1035 |
+
shl.b32 %r285, %r12, 3;
|
1036 |
+
or.b32 %r286, %r285, %r263;
|
1037 |
+
add.s32 %r84, %r266, %r286;
|
1038 |
+
mov.b32 %r85, %f255;
|
1039 |
+
@%p68 st.shared.b32 [ %r84 + 0 ], %r85;
|
1040 |
+
shl.b32 %r287, %r13, 3;
|
1041 |
+
or.b32 %r288, %r287, %r263;
|
1042 |
+
add.s32 %r86, %r266, %r288;
|
1043 |
+
mov.b32 %r87, %f265;
|
1044 |
+
@%p68 st.shared.b32 [ %r86 + 0 ], %r87;
|
1045 |
+
shl.b32 %r289, %r14, 3;
|
1046 |
+
or.b32 %r290, %r289, %r263;
|
1047 |
+
add.s32 %r88, %r266, %r290;
|
1048 |
+
mov.b32 %r89, %f275;
|
1049 |
+
@%p68 st.shared.b32 [ %r88 + 0 ], %r89;
|
1050 |
+
shl.b32 %r291, %r15, 3;
|
1051 |
+
or.b32 %r292, %r291, %r263;
|
1052 |
+
add.s32 %r90, %r266, %r292;
|
1053 |
+
mov.b32 %r91, %f285;
|
1054 |
+
@%p68 st.shared.b32 [ %r90 + 0 ], %r91;
|
1055 |
+
shl.b32 %r293, %r16, 3;
|
1056 |
+
or.b32 %r294, %r293, %r263;
|
1057 |
+
add.s32 %r92, %r266, %r294;
|
1058 |
+
mov.b32 %r93, %f295;
|
1059 |
+
@%p68 st.shared.b32 [ %r92 + 0 ], %r93;
|
1060 |
+
shl.b32 %r295, %r17, 3;
|
1061 |
+
or.b32 %r296, %r295, %r263;
|
1062 |
+
add.s32 %r94, %r266, %r296;
|
1063 |
+
mov.b32 %r95, %f305;
|
1064 |
+
@%p68 st.shared.b32 [ %r94 + 0 ], %r95;
|
1065 |
+
bar.sync 0;
|
1066 |
+
setp.lt.s32 %p84, %r1, 128;
|
1067 |
+
shl.b32 %r297, %r1, 2;
|
1068 |
+
add.s32 %r97, %r266, %r297;
|
1069 |
+
@%p84 ld.shared.b32 %r96, [ %r97 + 0 ];
|
1070 |
+
mov.b32 %f306, %r96;
|
1071 |
+
shfl.sync.bfly.b32 %r298, %r96, 1, 31, -1;
|
1072 |
+
mov.b32 %f307, %r298;
|
1073 |
+
$L__tmp162:
|
1074 |
+
.loc 2 233 15
|
1075 |
+
add.f32 %f308, %f306, %f307;
|
1076 |
+
$L__tmp163:
|
1077 |
+
.loc 2 243 36
|
1078 |
+
and.b32 %r299, %r1, 1;
|
1079 |
+
setp.eq.b32 %p86, %r299, 1;
|
1080 |
+
not.pred %p87, %p86;
|
1081 |
+
and.pred %p85, %p84, %p87;
|
1082 |
+
mov.b32 %r99, %f308;
|
1083 |
+
@%p85 st.shared.b32 [ %r97 + 0 ], %r99;
|
1084 |
+
bar.sync 0;
|
1085 |
+
add.s32 %r300, %r266, %r264;
|
1086 |
+
ld.shared.f32 %f49, [%r300];
|
1087 |
+
add.s32 %r301, %r266, %r267;
|
1088 |
+
ld.shared.f32 %f50, [%r301];
|
1089 |
+
add.s32 %r302, %r266, %r269;
|
1090 |
+
ld.shared.f32 %f51, [%r302];
|
1091 |
+
add.s32 %r303, %r266, %r271;
|
1092 |
+
ld.shared.f32 %f52, [%r303];
|
1093 |
+
add.s32 %r304, %r266, %r273;
|
1094 |
+
ld.shared.f32 %f53, [%r304];
|
1095 |
+
add.s32 %r305, %r266, %r275;
|
1096 |
+
ld.shared.f32 %f54, [%r305];
|
1097 |
+
add.s32 %r306, %r266, %r277;
|
1098 |
+
ld.shared.f32 %f55, [%r306];
|
1099 |
+
add.s32 %r307, %r266, %r279;
|
1100 |
+
ld.shared.f32 %f56, [%r307];
|
1101 |
+
add.s32 %r308, %r266, %r281;
|
1102 |
+
ld.shared.f32 %f57, [%r308];
|
1103 |
+
add.s32 %r309, %r266, %r283;
|
1104 |
+
ld.shared.f32 %f58, [%r309];
|
1105 |
+
add.s32 %r310, %r266, %r285;
|
1106 |
+
ld.shared.f32 %f59, [%r310];
|
1107 |
+
add.s32 %r311, %r266, %r287;
|
1108 |
+
ld.shared.f32 %f60, [%r311];
|
1109 |
+
add.s32 %r312, %r266, %r289;
|
1110 |
+
ld.shared.f32 %f61, [%r312];
|
1111 |
+
add.s32 %r313, %r266, %r291;
|
1112 |
+
ld.shared.f32 %f62, [%r313];
|
1113 |
+
add.s32 %r314, %r266, %r293;
|
1114 |
+
ld.shared.f32 %f63, [%r314];
|
1115 |
+
add.s32 %r315, %r266, %r295;
|
1116 |
+
ld.shared.f32 %f64, [%r315];
|
1117 |
+
$L__tmp164:
|
1118 |
+
.loc 1 51 36
|
1119 |
+
shl.b64 %rd80, %rd3, 1;
|
1120 |
+
add.s64 %rd7, %rd17, %rd80;
|
1121 |
+
mul.lo.s64 %rd81, %rd1, 6432896;
|
1122 |
+
mul.lo.s64 %rd82, %rd2, 100514;
|
1123 |
+
add.s64 %rd148, %rd81, %rd82;
|
1124 |
+
add.s64 %rd9, %rd16, %rd80;
|
1125 |
+
add.s64 %rd10, %rd15, %rd80;
|
1126 |
+
mov.b32 %r398, -64;
|
1127 |
+
mov.u16 %rs2, 0;
|
1128 |
+
$L__BB0_3:
|
1129 |
+
add.s32 %r398, %r398, 64;
|
1130 |
+
.loc 1 52 27
|
1131 |
+
add.s32 %r396, %r398, %r18;
|
1132 |
+
.loc 1 53 25
|
1133 |
+
setp.lt.u32 %p88, %r396, 50257;
|
1134 |
+
.loc 1 55 35
|
1135 |
+
add.s64 %rd83, %rd10, %rd148;
|
1136 |
+
add.s64 %rd84, %rd83, 402056;
|
1137 |
+
add.s64 %rd85, %rd83, 804112;
|
1138 |
+
add.s64 %rd86, %rd83, 1206168;
|
1139 |
+
add.s64 %rd87, %rd83, 1608224;
|
1140 |
+
add.s64 %rd88, %rd83, 2010280;
|
1141 |
+
add.s64 %rd89, %rd83, 2412336;
|
1142 |
+
add.s64 %rd90, %rd83, 2814392;
|
1143 |
+
add.s64 %rd91, %rd83, 3216448;
|
1144 |
+
add.s64 %rd92, %rd83, 3618504;
|
1145 |
+
add.s64 %rd93, %rd83, 4020560;
|
1146 |
+
add.s64 %rd94, %rd83, 4422616;
|
1147 |
+
add.s64 %rd95, %rd83, 4824672;
|
1148 |
+
add.s64 %rd96, %rd83, 5226728;
|
1149 |
+
add.s64 %rd97, %rd83, 5628784;
|
1150 |
+
.loc 1 55 53
|
1151 |
+
add.s64 %rd98, %rd83, 6030840;
|
1152 |
+
mov.u16 %rs1, 0x0;
|
1153 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd83 + 0 ];
|
1154 |
+
@!%p88 mov.u16 %rs1, %rs2;
|
1155 |
+
mov.u16 %rs3, 0x0;
|
1156 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd84 + 0 ];
|
1157 |
+
@!%p88 mov.u16 %rs3, %rs2;
|
1158 |
+
mov.u16 %rs5, 0x0;
|
1159 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd85 + 0 ];
|
1160 |
+
@!%p88 mov.u16 %rs5, %rs2;
|
1161 |
+
mov.u16 %rs7, 0x0;
|
1162 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd86 + 0 ];
|
1163 |
+
@!%p88 mov.u16 %rs7, %rs2;
|
1164 |
+
mov.u16 %rs9, 0x0;
|
1165 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd87 + 0 ];
|
1166 |
+
@!%p88 mov.u16 %rs9, %rs2;
|
1167 |
+
mov.u16 %rs11, 0x0;
|
1168 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd88 + 0 ];
|
1169 |
+
@!%p88 mov.u16 %rs11, %rs2;
|
1170 |
+
mov.u16 %rs13, 0x0;
|
1171 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd89 + 0 ];
|
1172 |
+
@!%p88 mov.u16 %rs13, %rs2;
|
1173 |
+
mov.u16 %rs15, 0x0;
|
1174 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd90 + 0 ];
|
1175 |
+
@!%p88 mov.u16 %rs15, %rs2;
|
1176 |
+
mov.u16 %rs17, 0x0;
|
1177 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd91 + 0 ];
|
1178 |
+
@!%p88 mov.u16 %rs17, %rs2;
|
1179 |
+
mov.u16 %rs19, 0x0;
|
1180 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd92 + 0 ];
|
1181 |
+
@!%p88 mov.u16 %rs19, %rs2;
|
1182 |
+
mov.u16 %rs21, 0x0;
|
1183 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs21 }, [ %rd93 + 0 ];
|
1184 |
+
@!%p88 mov.u16 %rs21, %rs2;
|
1185 |
+
mov.u16 %rs23, 0x0;
|
1186 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs23 }, [ %rd94 + 0 ];
|
1187 |
+
@!%p88 mov.u16 %rs23, %rs2;
|
1188 |
+
mov.u16 %rs25, 0x0;
|
1189 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd95 + 0 ];
|
1190 |
+
@!%p88 mov.u16 %rs25, %rs2;
|
1191 |
+
mov.u16 %rs27, 0x0;
|
1192 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd96 + 0 ];
|
1193 |
+
@!%p88 mov.u16 %rs27, %rs2;
|
1194 |
+
mov.u16 %rs29, 0x0;
|
1195 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd97 + 0 ];
|
1196 |
+
@!%p88 mov.u16 %rs29, %rs2;
|
1197 |
+
mov.u16 %rs31, 0x0;
|
1198 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd98 + 0 ];
|
1199 |
+
@!%p88 mov.u16 %rs31, %rs2;
|
1200 |
+
.loc 1 55 105
|
1201 |
+
cvt.f32.bf16 %r316, %rs1;
|
1202 |
+
mov.b32 %f341, %r316;
|
1203 |
+
cvt.f32.bf16 %r317, %rs3;
|
1204 |
+
mov.b32 %f342, %r317;
|
1205 |
+
cvt.f32.bf16 %r318, %rs5;
|
1206 |
+
mov.b32 %f343, %r318;
|
1207 |
+
cvt.f32.bf16 %r319, %rs7;
|
1208 |
+
mov.b32 %f344, %r319;
|
1209 |
+
cvt.f32.bf16 %r320, %rs9;
|
1210 |
+
mov.b32 %f345, %r320;
|
1211 |
+
cvt.f32.bf16 %r321, %rs11;
|
1212 |
+
mov.b32 %f346, %r321;
|
1213 |
+
cvt.f32.bf16 %r322, %rs13;
|
1214 |
+
mov.b32 %f347, %r322;
|
1215 |
+
cvt.f32.bf16 %r323, %rs15;
|
1216 |
+
mov.b32 %f348, %r323;
|
1217 |
+
cvt.f32.bf16 %r324, %rs17;
|
1218 |
+
mov.b32 %f349, %r324;
|
1219 |
+
cvt.f32.bf16 %r325, %rs19;
|
1220 |
+
mov.b32 %f350, %r325;
|
1221 |
+
cvt.f32.bf16 %r326, %rs21;
|
1222 |
+
mov.b32 %f351, %r326;
|
1223 |
+
cvt.f32.bf16 %r327, %rs23;
|
1224 |
+
mov.b32 %f352, %r327;
|
1225 |
+
cvt.f32.bf16 %r328, %rs25;
|
1226 |
+
mov.b32 %f353, %r328;
|
1227 |
+
cvt.f32.bf16 %r329, %rs27;
|
1228 |
+
mov.b32 %f354, %r329;
|
1229 |
+
cvt.f32.bf16 %r330, %rs29;
|
1230 |
+
mov.b32 %f355, %r330;
|
1231 |
+
cvt.f32.bf16 %r331, %rs31;
|
1232 |
+
mov.b32 %f356, %r331;
|
1233 |
+
.loc 1 56 35
|
1234 |
+
add.s64 %rd99, %rd149, -12061680;
|
1235 |
+
add.s64 %rd100, %rd149, -11257568;
|
1236 |
+
add.s64 %rd101, %rd149, -10453456;
|
1237 |
+
add.s64 %rd102, %rd149, -9649344;
|
1238 |
+
add.s64 %rd103, %rd149, -8845232;
|
1239 |
+
add.s64 %rd104, %rd149, -8041120;
|
1240 |
+
add.s64 %rd105, %rd149, -7237008;
|
1241 |
+
add.s64 %rd106, %rd149, -6432896;
|
1242 |
+
add.s64 %rd107, %rd149, -5628784;
|
1243 |
+
add.s64 %rd108, %rd149, -4824672;
|
1244 |
+
add.s64 %rd109, %rd149, -4020560;
|
1245 |
+
add.s64 %rd110, %rd149, -3216448;
|
1246 |
+
add.s64 %rd111, %rd149, -2412336;
|
1247 |
+
add.s64 %rd112, %rd149, -1608224;
|
1248 |
+
add.s64 %rd113, %rd149, -804112;
|
1249 |
+
.loc 1 56 53
|
1250 |
+
mov.u32 %r332, 0x0;
|
1251 |
+
@%p88 ld.global.L1::evict_first.b32 { %r332 }, [ %rd99 + 0 ];
|
1252 |
+
@!%p88 mov.u32 %r332, %r333;
|
1253 |
+
mov.b32 %f357, %r332;
|
1254 |
+
mov.u32 %r334, 0x0;
|
1255 |
+
@%p88 ld.global.L1::evict_first.b32 { %r334 }, [ %rd100 + 0 ];
|
1256 |
+
@!%p88 mov.u32 %r334, %r333;
|
1257 |
+
mov.b32 %f358, %r334;
|
1258 |
+
mov.u32 %r336, 0x0;
|
1259 |
+
@%p88 ld.global.L1::evict_first.b32 { %r336 }, [ %rd101 + 0 ];
|
1260 |
+
@!%p88 mov.u32 %r336, %r333;
|
1261 |
+
mov.b32 %f359, %r336;
|
1262 |
+
mov.u32 %r338, 0x0;
|
1263 |
+
@%p88 ld.global.L1::evict_first.b32 { %r338 }, [ %rd102 + 0 ];
|
1264 |
+
@!%p88 mov.u32 %r338, %r333;
|
1265 |
+
mov.b32 %f360, %r338;
|
1266 |
+
mov.u32 %r340, 0x0;
|
1267 |
+
@%p88 ld.global.L1::evict_first.b32 { %r340 }, [ %rd103 + 0 ];
|
1268 |
+
@!%p88 mov.u32 %r340, %r333;
|
1269 |
+
mov.b32 %f361, %r340;
|
1270 |
+
mov.u32 %r342, 0x0;
|
1271 |
+
@%p88 ld.global.L1::evict_first.b32 { %r342 }, [ %rd104 + 0 ];
|
1272 |
+
@!%p88 mov.u32 %r342, %r333;
|
1273 |
+
mov.b32 %f362, %r342;
|
1274 |
+
mov.u32 %r344, 0x0;
|
1275 |
+
@%p88 ld.global.L1::evict_first.b32 { %r344 }, [ %rd105 + 0 ];
|
1276 |
+
@!%p88 mov.u32 %r344, %r333;
|
1277 |
+
mov.b32 %f363, %r344;
|
1278 |
+
mov.u32 %r346, 0x0;
|
1279 |
+
@%p88 ld.global.L1::evict_first.b32 { %r346 }, [ %rd106 + 0 ];
|
1280 |
+
@!%p88 mov.u32 %r346, %r333;
|
1281 |
+
mov.b32 %f364, %r346;
|
1282 |
+
mov.u32 %r348, 0x0;
|
1283 |
+
@%p88 ld.global.L1::evict_first.b32 { %r348 }, [ %rd107 + 0 ];
|
1284 |
+
@!%p88 mov.u32 %r348, %r333;
|
1285 |
+
mov.b32 %f365, %r348;
|
1286 |
+
mov.u32 %r350, 0x0;
|
1287 |
+
@%p88 ld.global.L1::evict_first.b32 { %r350 }, [ %rd108 + 0 ];
|
1288 |
+
@!%p88 mov.u32 %r350, %r333;
|
1289 |
+
mov.b32 %f366, %r350;
|
1290 |
+
mov.u32 %r352, 0x0;
|
1291 |
+
@%p88 ld.global.L1::evict_first.b32 { %r352 }, [ %rd109 + 0 ];
|
1292 |
+
@!%p88 mov.u32 %r352, %r333;
|
1293 |
+
mov.b32 %f367, %r352;
|
1294 |
+
mov.u32 %r354, 0x0;
|
1295 |
+
@%p88 ld.global.L1::evict_first.b32 { %r354 }, [ %rd110 + 0 ];
|
1296 |
+
@!%p88 mov.u32 %r354, %r333;
|
1297 |
+
mov.b32 %f368, %r354;
|
1298 |
+
mov.u32 %r356, 0x0;
|
1299 |
+
@%p88 ld.global.L1::evict_first.b32 { %r356 }, [ %rd111 + 0 ];
|
1300 |
+
@!%p88 mov.u32 %r356, %r333;
|
1301 |
+
mov.b32 %f369, %r356;
|
1302 |
+
mov.u32 %r358, 0x0;
|
1303 |
+
@%p88 ld.global.L1::evict_first.b32 { %r358 }, [ %rd112 + 0 ];
|
1304 |
+
@!%p88 mov.u32 %r358, %r333;
|
1305 |
+
mov.b32 %f370, %r358;
|
1306 |
+
mov.u32 %r360, 0x0;
|
1307 |
+
@%p88 ld.global.L1::evict_first.b32 { %r360 }, [ %rd113 + 0 ];
|
1308 |
+
@!%p88 mov.u32 %r360, %r333;
|
1309 |
+
mov.b32 %f371, %r360;
|
1310 |
+
mov.u32 %r362, 0x0;
|
1311 |
+
@%p88 ld.global.L1::evict_first.b32 { %r362 }, [ %rd149 + 0 ];
|
1312 |
+
@!%p88 mov.u32 %r362, %r333;
|
1313 |
+
mov.b32 %f372, %r362;
|
1314 |
+
.loc 1 57 35
|
1315 |
+
add.s64 %rd115, %rd9, %rd148;
|
1316 |
+
add.s64 %rd116, %rd115, 402056;
|
1317 |
+
add.s64 %rd117, %rd115, 804112;
|
1318 |
+
add.s64 %rd118, %rd115, 1206168;
|
1319 |
+
add.s64 %rd119, %rd115, 1608224;
|
1320 |
+
add.s64 %rd120, %rd115, 2010280;
|
1321 |
+
add.s64 %rd121, %rd115, 2412336;
|
1322 |
+
add.s64 %rd122, %rd115, 2814392;
|
1323 |
+
add.s64 %rd123, %rd115, 3216448;
|
1324 |
+
add.s64 %rd124, %rd115, 3618504;
|
1325 |
+
add.s64 %rd125, %rd115, 4020560;
|
1326 |
+
add.s64 %rd126, %rd115, 4422616;
|
1327 |
+
add.s64 %rd127, %rd115, 4824672;
|
1328 |
+
add.s64 %rd128, %rd115, 5226728;
|
1329 |
+
add.s64 %rd129, %rd115, 5628784;
|
1330 |
+
.loc 1 57 53
|
1331 |
+
add.s64 %rd130, %rd115, 6030840;
|
1332 |
+
mov.u16 %rs49, 0x0;
|
1333 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs49 }, [ %rd115 + 0 ];
|
1334 |
+
@!%p88 mov.u16 %rs49, %rs2;
|
1335 |
+
mov.u16 %rs51, 0x0;
|
1336 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs51 }, [ %rd116 + 0 ];
|
1337 |
+
@!%p88 mov.u16 %rs51, %rs2;
|
1338 |
+
mov.u16 %rs53, 0x0;
|
1339 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs53 }, [ %rd117 + 0 ];
|
1340 |
+
@!%p88 mov.u16 %rs53, %rs2;
|
1341 |
+
mov.u16 %rs55, 0x0;
|
1342 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs55 }, [ %rd118 + 0 ];
|
1343 |
+
@!%p88 mov.u16 %rs55, %rs2;
|
1344 |
+
mov.u16 %rs57, 0x0;
|
1345 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs57 }, [ %rd119 + 0 ];
|
1346 |
+
@!%p88 mov.u16 %rs57, %rs2;
|
1347 |
+
mov.u16 %rs59, 0x0;
|
1348 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs59 }, [ %rd120 + 0 ];
|
1349 |
+
@!%p88 mov.u16 %rs59, %rs2;
|
1350 |
+
mov.u16 %rs61, 0x0;
|
1351 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs61 }, [ %rd121 + 0 ];
|
1352 |
+
@!%p88 mov.u16 %rs61, %rs2;
|
1353 |
+
mov.u16 %rs63, 0x0;
|
1354 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs63 }, [ %rd122 + 0 ];
|
1355 |
+
@!%p88 mov.u16 %rs63, %rs2;
|
1356 |
+
mov.u16 %rs65, 0x0;
|
1357 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs65 }, [ %rd123 + 0 ];
|
1358 |
+
@!%p88 mov.u16 %rs65, %rs2;
|
1359 |
+
mov.u16 %rs67, 0x0;
|
1360 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs67 }, [ %rd124 + 0 ];
|
1361 |
+
@!%p88 mov.u16 %rs67, %rs2;
|
1362 |
+
mov.u16 %rs69, 0x0;
|
1363 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs69 }, [ %rd125 + 0 ];
|
1364 |
+
@!%p88 mov.u16 %rs69, %rs2;
|
1365 |
+
mov.u16 %rs71, 0x0;
|
1366 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs71 }, [ %rd126 + 0 ];
|
1367 |
+
@!%p88 mov.u16 %rs71, %rs2;
|
1368 |
+
mov.u16 %rs73, 0x0;
|
1369 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs73 }, [ %rd127 + 0 ];
|
1370 |
+
@!%p88 mov.u16 %rs73, %rs2;
|
1371 |
+
mov.u16 %rs75, 0x0;
|
1372 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs75 }, [ %rd128 + 0 ];
|
1373 |
+
@!%p88 mov.u16 %rs75, %rs2;
|
1374 |
+
mov.u16 %rs77, 0x0;
|
1375 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs77 }, [ %rd129 + 0 ];
|
1376 |
+
@!%p88 mov.u16 %rs77, %rs2;
|
1377 |
+
mov.u16 %rs79, 0x0;
|
1378 |
+
@%p88 ld.global.L1::evict_first.b16 { %rs79 }, [ %rd130 + 0 ];
|
1379 |
+
@!%p88 mov.u16 %rs79, %rs2;
|
1380 |
+
.loc 1 57 105
|
1381 |
+
cvt.f32.bf16 %r364, %rs49;
|
1382 |
+
mov.b32 %f373, %r364;
|
1383 |
+
cvt.f32.bf16 %r365, %rs51;
|
1384 |
+
mov.b32 %f374, %r365;
|
1385 |
+
cvt.f32.bf16 %r366, %rs53;
|
1386 |
+
mov.b32 %f375, %r366;
|
1387 |
+
cvt.f32.bf16 %r367, %rs55;
|
1388 |
+
mov.b32 %f376, %r367;
|
1389 |
+
cvt.f32.bf16 %r368, %rs57;
|
1390 |
+
mov.b32 %f377, %r368;
|
1391 |
+
cvt.f32.bf16 %r369, %rs59;
|
1392 |
+
mov.b32 %f378, %r369;
|
1393 |
+
cvt.f32.bf16 %r370, %rs61;
|
1394 |
+
mov.b32 %f379, %r370;
|
1395 |
+
cvt.f32.bf16 %r371, %rs63;
|
1396 |
+
mov.b32 %f380, %r371;
|
1397 |
+
cvt.f32.bf16 %r372, %rs65;
|
1398 |
+
mov.b32 %f381, %r372;
|
1399 |
+
cvt.f32.bf16 %r373, %rs67;
|
1400 |
+
mov.b32 %f382, %r373;
|
1401 |
+
cvt.f32.bf16 %r374, %rs69;
|
1402 |
+
mov.b32 %f383, %r374;
|
1403 |
+
cvt.f32.bf16 %r375, %rs71;
|
1404 |
+
mov.b32 %f384, %r375;
|
1405 |
+
cvt.f32.bf16 %r376, %rs73;
|
1406 |
+
mov.b32 %f385, %r376;
|
1407 |
+
cvt.f32.bf16 %r377, %rs75;
|
1408 |
+
mov.b32 %f386, %r377;
|
1409 |
+
cvt.f32.bf16 %r378, %rs77;
|
1410 |
+
mov.b32 %f387, %r378;
|
1411 |
+
cvt.f32.bf16 %r379, %rs79;
|
1412 |
+
mov.b32 %f388, %r379;
|
1413 |
+
.loc 1 65 23
|
1414 |
+
mul.f32 %f310, %f373, 0f3FB8AA3B;
|
1415 |
+
ex2.approx.f32 %f309, %f310;
|
1416 |
+
mul.f32 %f312, %f374, 0f3FB8AA3B;
|
1417 |
+
ex2.approx.f32 %f311, %f312;
|
1418 |
+
mul.f32 %f314, %f375, 0f3FB8AA3B;
|
1419 |
+
ex2.approx.f32 %f313, %f314;
|
1420 |
+
mul.f32 %f316, %f376, 0f3FB8AA3B;
|
1421 |
+
ex2.approx.f32 %f315, %f316;
|
1422 |
+
mul.f32 %f318, %f377, 0f3FB8AA3B;
|
1423 |
+
ex2.approx.f32 %f317, %f318;
|
1424 |
+
mul.f32 %f320, %f378, 0f3FB8AA3B;
|
1425 |
+
ex2.approx.f32 %f319, %f320;
|
1426 |
+
mul.f32 %f322, %f379, 0f3FB8AA3B;
|
1427 |
+
ex2.approx.f32 %f321, %f322;
|
1428 |
+
mul.f32 %f324, %f380, 0f3FB8AA3B;
|
1429 |
+
ex2.approx.f32 %f323, %f324;
|
1430 |
+
mul.f32 %f326, %f381, 0f3FB8AA3B;
|
1431 |
+
ex2.approx.f32 %f325, %f326;
|
1432 |
+
mul.f32 %f328, %f382, 0f3FB8AA3B;
|
1433 |
+
ex2.approx.f32 %f327, %f328;
|
1434 |
+
mul.f32 %f330, %f383, 0f3FB8AA3B;
|
1435 |
+
ex2.approx.f32 %f329, %f330;
|
1436 |
+
mul.f32 %f332, %f384, 0f3FB8AA3B;
|
1437 |
+
ex2.approx.f32 %f331, %f332;
|
1438 |
+
mul.f32 %f334, %f385, 0f3FB8AA3B;
|
1439 |
+
ex2.approx.f32 %f333, %f334;
|
1440 |
+
mul.f32 %f336, %f386, 0f3FB8AA3B;
|
1441 |
+
ex2.approx.f32 %f335, %f336;
|
1442 |
+
mul.f32 %f338, %f387, 0f3FB8AA3B;
|
1443 |
+
ex2.approx.f32 %f337, %f338;
|
1444 |
+
mul.f32 %f340, %f388, 0f3FB8AA3B;
|
1445 |
+
ex2.approx.f32 %f339, %f340;
|
1446 |
+
.loc 1 66 24
|
1447 |
+
mul.f32 %f389, %f49, %f309;
|
1448 |
+
mul.f32 %f390, %f50, %f311;
|
1449 |
+
mul.f32 %f391, %f51, %f313;
|
1450 |
+
mul.f32 %f392, %f52, %f315;
|
1451 |
+
mul.f32 %f393, %f53, %f317;
|
1452 |
+
mul.f32 %f394, %f54, %f319;
|
1453 |
+
mul.f32 %f395, %f55, %f321;
|
1454 |
+
mul.f32 %f396, %f56, %f323;
|
1455 |
+
mul.f32 %f397, %f57, %f325;
|
1456 |
+
mul.f32 %f398, %f58, %f327;
|
1457 |
+
mul.f32 %f399, %f59, %f329;
|
1458 |
+
mul.f32 %f400, %f60, %f331;
|
1459 |
+
mul.f32 %f401, %f61, %f333;
|
1460 |
+
mul.f32 %f402, %f62, %f335;
|
1461 |
+
mul.f32 %f403, %f63, %f337;
|
1462 |
+
mul.f32 %f404, %f64, %f339;
|
1463 |
+
.loc 1 67 24
|
1464 |
+
neg.f32 %f405, %f389;
|
1465 |
+
fma.rn.f32 %f406, %f1, %f357, %f405;
|
1466 |
+
neg.f32 %f407, %f390;
|
1467 |
+
fma.rn.f32 %f408, %f2, %f358, %f407;
|
1468 |
+
neg.f32 %f409, %f391;
|
1469 |
+
fma.rn.f32 %f410, %f3, %f359, %f409;
|
1470 |
+
neg.f32 %f411, %f392;
|
1471 |
+
fma.rn.f32 %f412, %f4, %f360, %f411;
|
1472 |
+
neg.f32 %f413, %f393;
|
1473 |
+
fma.rn.f32 %f414, %f5, %f361, %f413;
|
1474 |
+
neg.f32 %f415, %f394;
|
1475 |
+
fma.rn.f32 %f416, %f6, %f362, %f415;
|
1476 |
+
neg.f32 %f417, %f395;
|
1477 |
+
fma.rn.f32 %f418, %f7, %f363, %f417;
|
1478 |
+
neg.f32 %f419, %f396;
|
1479 |
+
fma.rn.f32 %f420, %f8, %f364, %f419;
|
1480 |
+
neg.f32 %f421, %f397;
|
1481 |
+
fma.rn.f32 %f422, %f9, %f365, %f421;
|
1482 |
+
neg.f32 %f423, %f398;
|
1483 |
+
fma.rn.f32 %f424, %f10, %f366, %f423;
|
1484 |
+
neg.f32 %f425, %f399;
|
1485 |
+
fma.rn.f32 %f426, %f11, %f367, %f425;
|
1486 |
+
neg.f32 %f427, %f400;
|
1487 |
+
fma.rn.f32 %f428, %f12, %f368, %f427;
|
1488 |
+
neg.f32 %f429, %f401;
|
1489 |
+
fma.rn.f32 %f430, %f13, %f369, %f429;
|
1490 |
+
neg.f32 %f431, %f402;
|
1491 |
+
fma.rn.f32 %f432, %f14, %f370, %f431;
|
1492 |
+
neg.f32 %f433, %f403;
|
1493 |
+
fma.rn.f32 %f434, %f15, %f371, %f433;
|
1494 |
+
neg.f32 %f435, %f404;
|
1495 |
+
fma.rn.f32 %f436, %f16, %f372, %f435;
|
1496 |
+
.loc 1 69 24
|
1497 |
+
add.f32 %f437, %f341, %f406;
|
1498 |
+
add.f32 %f438, %f342, %f408;
|
1499 |
+
add.f32 %f439, %f343, %f410;
|
1500 |
+
add.f32 %f440, %f344, %f412;
|
1501 |
+
add.f32 %f441, %f345, %f414;
|
1502 |
+
add.f32 %f442, %f346, %f416;
|
1503 |
+
add.f32 %f443, %f347, %f418;
|
1504 |
+
add.f32 %f444, %f348, %f420;
|
1505 |
+
add.f32 %f445, %f349, %f422;
|
1506 |
+
add.f32 %f446, %f350, %f424;
|
1507 |
+
add.f32 %f447, %f351, %f426;
|
1508 |
+
add.f32 %f448, %f352, %f428;
|
1509 |
+
add.f32 %f449, %f353, %f430;
|
1510 |
+
add.f32 %f450, %f354, %f432;
|
1511 |
+
add.f32 %f451, %f355, %f434;
|
1512 |
+
add.f32 %f452, %f356, %f436;
|
1513 |
+
.loc 1 70 29
|
1514 |
+
add.s64 %rd131, %rd7, %rd148;
|
1515 |
+
add.s64 %rd132, %rd131, 402056;
|
1516 |
+
add.s64 %rd133, %rd131, 804112;
|
1517 |
+
add.s64 %rd134, %rd131, 1206168;
|
1518 |
+
add.s64 %rd135, %rd131, 1608224;
|
1519 |
+
add.s64 %rd136, %rd131, 2010280;
|
1520 |
+
add.s64 %rd137, %rd131, 2412336;
|
1521 |
+
add.s64 %rd138, %rd131, 2814392;
|
1522 |
+
add.s64 %rd139, %rd131, 3216448;
|
1523 |
+
add.s64 %rd140, %rd131, 3618504;
|
1524 |
+
add.s64 %rd141, %rd131, 4020560;
|
1525 |
+
add.s64 %rd142, %rd131, 4422616;
|
1526 |
+
add.s64 %rd143, %rd131, 4824672;
|
1527 |
+
add.s64 %rd144, %rd131, 5226728;
|
1528 |
+
add.s64 %rd145, %rd131, 5628784;
|
1529 |
+
.loc 1 70 54
|
1530 |
+
add.s64 %rd146, %rd131, 6030840;
|
1531 |
+
mov.b32 %r380, %f437;
|
1532 |
+
cvt.rn.bf16.f32 %rs97, %r380;
|
1533 |
+
mov.b32 %r381, %f438;
|
1534 |
+
cvt.rn.bf16.f32 %rs98, %r381;
|
1535 |
+
mov.b32 %r382, %f439;
|
1536 |
+
cvt.rn.bf16.f32 %rs99, %r382;
|
1537 |
+
mov.b32 %r383, %f440;
|
1538 |
+
cvt.rn.bf16.f32 %rs100, %r383;
|
1539 |
+
mov.b32 %r384, %f441;
|
1540 |
+
cvt.rn.bf16.f32 %rs101, %r384;
|
1541 |
+
mov.b32 %r385, %f442;
|
1542 |
+
cvt.rn.bf16.f32 %rs102, %r385;
|
1543 |
+
mov.b32 %r386, %f443;
|
1544 |
+
cvt.rn.bf16.f32 %rs103, %r386;
|
1545 |
+
mov.b32 %r387, %f444;
|
1546 |
+
cvt.rn.bf16.f32 %rs104, %r387;
|
1547 |
+
mov.b32 %r388, %f445;
|
1548 |
+
cvt.rn.bf16.f32 %rs105, %r388;
|
1549 |
+
mov.b32 %r389, %f446;
|
1550 |
+
cvt.rn.bf16.f32 %rs106, %r389;
|
1551 |
+
mov.b32 %r390, %f447;
|
1552 |
+
cvt.rn.bf16.f32 %rs107, %r390;
|
1553 |
+
mov.b32 %r391, %f448;
|
1554 |
+
cvt.rn.bf16.f32 %rs108, %r391;
|
1555 |
+
mov.b32 %r392, %f449;
|
1556 |
+
cvt.rn.bf16.f32 %rs109, %r392;
|
1557 |
+
mov.b32 %r393, %f450;
|
1558 |
+
cvt.rn.bf16.f32 %rs110, %r393;
|
1559 |
+
mov.b32 %r394, %f451;
|
1560 |
+
cvt.rn.bf16.f32 %rs111, %r394;
|
1561 |
+
mov.b32 %r395, %f452;
|
1562 |
+
cvt.rn.bf16.f32 %rs112, %r395;
|
1563 |
+
@%p88 st.global.b16 [ %rd131 + 0 ], { %rs97 };
|
1564 |
+
@%p88 st.global.b16 [ %rd132 + 0 ], { %rs98 };
|
1565 |
+
@%p88 st.global.b16 [ %rd133 + 0 ], { %rs99 };
|
1566 |
+
@%p88 st.global.b16 [ %rd134 + 0 ], { %rs100 };
|
1567 |
+
@%p88 st.global.b16 [ %rd135 + 0 ], { %rs101 };
|
1568 |
+
@%p88 st.global.b16 [ %rd136 + 0 ], { %rs102 };
|
1569 |
+
@%p88 st.global.b16 [ %rd137 + 0 ], { %rs103 };
|
1570 |
+
@%p88 st.global.b16 [ %rd138 + 0 ], { %rs104 };
|
1571 |
+
@%p88 st.global.b16 [ %rd139 + 0 ], { %rs105 };
|
1572 |
+
@%p88 st.global.b16 [ %rd140 + 0 ], { %rs106 };
|
1573 |
+
@%p88 st.global.b16 [ %rd141 + 0 ], { %rs107 };
|
1574 |
+
@%p88 st.global.b16 [ %rd142 + 0 ], { %rs108 };
|
1575 |
+
@%p88 st.global.b16 [ %rd143 + 0 ], { %rs109 };
|
1576 |
+
@%p88 st.global.b16 [ %rd144 + 0 ], { %rs110 };
|
1577 |
+
@%p88 st.global.b16 [ %rd145 + 0 ], { %rs111 };
|
1578 |
+
@%p88 st.global.b16 [ %rd146 + 0 ], { %rs112 };
|
1579 |
+
.loc 1 51 36
|
1580 |
+
add.s64 %rd149, %rd149, 256;
|
1581 |
+
add.s64 %rd148, %rd148, 128;
|
1582 |
+
setp.lt.u32 %p200, %r398, 50193;
|
1583 |
+
@%p200 bra $L__BB0_3;
|
1584 |
+
.loc 1 51 4
|
1585 |
+
ret;
|
1586 |
+
$L__tmp165:
|
1587 |
+
$L__func_end0:
|
1588 |
+
|
1589 |
+
}
|
1590 |
+
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
|
1591 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
1592 |
+
.section .debug_abbrev
|
1593 |
+
{
|
1594 |
+
.b8 1
|
1595 |
+
.b8 17
|
1596 |
+
.b8 1
|
1597 |
+
.b8 37
|
1598 |
+
.b8 8
|
1599 |
+
.b8 19
|
1600 |
+
.b8 5
|
1601 |
+
.b8 3
|
1602 |
+
.b8 8
|
1603 |
+
.b8 16
|
1604 |
+
.b8 6
|
1605 |
+
.b8 27
|
1606 |
+
.b8 8
|
1607 |
+
.b8 180
|
1608 |
+
.b8 66
|
1609 |
+
.b8 12
|
1610 |
+
.b8 17
|
1611 |
+
.b8 1
|
1612 |
+
.b8 18
|
1613 |
+
.b8 1
|
1614 |
+
.b8 0
|
1615 |
+
.b8 0
|
1616 |
+
.b8 2
|
1617 |
+
.b8 46
|
1618 |
+
.b8 0
|
1619 |
+
.b8 135
|
1620 |
+
.b8 64
|
1621 |
+
.b8 8
|
1622 |
+
.b8 3
|
1623 |
+
.b8 8
|
1624 |
+
.b8 58
|
1625 |
+
.b8 11
|
1626 |
+
.b8 59
|
1627 |
+
.b8 11
|
1628 |
+
.b8 63
|
1629 |
+
.b8 12
|
1630 |
+
.b8 32
|
1631 |
+
.b8 11
|
1632 |
+
.b8 0
|
1633 |
+
.b8 0
|
1634 |
+
.b8 3
|
1635 |
+
.b8 46
|
1636 |
+
.b8 1
|
1637 |
+
.b8 17
|
1638 |
+
.b8 1
|
1639 |
+
.b8 18
|
1640 |
+
.b8 1
|
1641 |
+
.b8 64
|
1642 |
+
.b8 10
|
1643 |
+
.b8 49
|
1644 |
+
.b8 19
|
1645 |
+
.b8 0
|
1646 |
+
.b8 0
|
1647 |
+
.b8 4
|
1648 |
+
.b8 29
|
1649 |
+
.b8 0
|
1650 |
+
.b8 49
|
1651 |
+
.b8 19
|
1652 |
+
.b8 17
|
1653 |
+
.b8 1
|
1654 |
+
.b8 18
|
1655 |
+
.b8 1
|
1656 |
+
.b8 88
|
1657 |
+
.b8 11
|
1658 |
+
.b8 89
|
1659 |
+
.b8 11
|
1660 |
+
.b8 87
|
1661 |
+
.b8 11
|
1662 |
+
.b8 0
|
1663 |
+
.b8 0
|
1664 |
+
.b8 5
|
1665 |
+
.b8 29
|
1666 |
+
.b8 1
|
1667 |
+
.b8 49
|
1668 |
+
.b8 19
|
1669 |
+
.b8 17
|
1670 |
+
.b8 1
|
1671 |
+
.b8 18
|
1672 |
+
.b8 1
|
1673 |
+
.b8 88
|
1674 |
+
.b8 11
|
1675 |
+
.b8 89
|
1676 |
+
.b8 11
|
1677 |
+
.b8 87
|
1678 |
+
.b8 11
|
1679 |
+
.b8 0
|
1680 |
+
.b8 0
|
1681 |
+
.b8 0
|
1682 |
+
}
|
1683 |
+
.section .debug_info
|
1684 |
+
{
|
1685 |
+
.b32 278
|
1686 |
+
.b8 2
|
1687 |
+
.b8 0
|
1688 |
+
.b32 .debug_abbrev
|
1689 |
+
.b8 8
|
1690 |
+
.b8 1
|
1691 |
+
.b8 116
|
1692 |
+
.b8 114
|
1693 |
+
.b8 105
|
1694 |
+
.b8 116
|
1695 |
+
.b8 111
|
1696 |
+
.b8 110
|
1697 |
+
.b8 0
|
1698 |
+
.b8 2
|
1699 |
+
.b8 0
|
1700 |
+
.b8 99
|
1701 |
+
.b8 107
|
1702 |
+
.b8 122
|
1703 |
+
.b8 103
|
1704 |
+
.b8 108
|
1705 |
+
.b8 55
|
1706 |
+
.b8 116
|
1707 |
+
.b8 104
|
1708 |
+
.b8 98
|
1709 |
+
.b8 52
|
1710 |
+
.b8 120
|
1711 |
+
.b8 100
|
1712 |
+
.b8 102
|
1713 |
+
.b8 107
|
1714 |
+
.b8 102
|
1715 |
+
.b8 110
|
1716 |
+
.b8 100
|
1717 |
+
.b8 50
|
1718 |
+
.b8 116
|
1719 |
+
.b8 105
|
1720 |
+
.b8 100
|
1721 |
+
.b8 107
|
1722 |
+
.b8 115
|
1723 |
+
.b8 54
|
1724 |
+
.b8 109
|
1725 |
+
.b8 116
|
1726 |
+
.b8 53
|
1727 |
+
.b8 102
|
1728 |
+
.b8 51
|
1729 |
+
.b8 104
|
1730 |
+
.b8 97
|
1731 |
+
.b8 117
|
1732 |
+
.b8 119
|
1733 |
+
.b8 102
|
1734 |
+
.b8 121
|
1735 |
+
.b8 106
|
1736 |
+
.b8 102
|
1737 |
+
.b8 108
|
1738 |
+
.b8 98
|
1739 |
+
.b8 116
|
1740 |
+
.b8 122
|
1741 |
+
.b8 121
|
1742 |
+
.b8 101
|
1743 |
+
.b8 112
|
1744 |
+
.b8 111
|
1745 |
+
.b8 53
|
1746 |
+
.b8 111
|
1747 |
+
.b8 120
|
1748 |
+
.b8 107
|
1749 |
+
.b8 118
|
1750 |
+
.b8 104
|
1751 |
+
.b8 107
|
1752 |
+
.b8 46
|
1753 |
+
.b8 112
|
1754 |
+
.b8 121
|
1755 |
+
.b8 0
|
1756 |
+
.b32 .debug_line
|
1757 |
+
.b8 47
|
1758 |
+
.b8 116
|
1759 |
+
.b8 109
|
1760 |
+
.b8 112
|
1761 |
+
.b8 47
|
1762 |
+
.b8 116
|
1763 |
+
.b8 111
|
1764 |
+
.b8 114
|
1765 |
+
.b8 99
|
1766 |
+
.b8 104
|
1767 |
+
.b8 105
|
1768 |
+
.b8 110
|
1769 |
+
.b8 100
|
1770 |
+
.b8 117
|
1771 |
+
.b8 99
|
1772 |
+
.b8 116
|
1773 |
+
.b8 111
|
1774 |
+
.b8 114
|
1775 |
+
.b8 95
|
1776 |
+
.b8 114
|
1777 |
+
.b8 111
|
1778 |
+
.b8 111
|
1779 |
+
.b8 116
|
1780 |
+
.b8 47
|
1781 |
+
.b8 107
|
1782 |
+
.b8 122
|
1783 |
+
.b8 0
|
1784 |
+
.b8 1
|
1785 |
+
.b64 $L__func_begin0
|
1786 |
+
.b64 $L__func_end0
|
1787 |
+
.b8 2
|
1788 |
+
.b8 116
|
1789 |
+
.b8 114
|
1790 |
+
.b8 105
|
1791 |
+
.b8 116
|
1792 |
+
.b8 111
|
1793 |
+
.b8 110
|
1794 |
+
.b8 95
|
1795 |
+
.b8 95
|
1796 |
+
.b8 48
|
1797 |
+
.b8 100
|
1798 |
+
.b8 49
|
1799 |
+
.b8 100
|
1800 |
+
.b8 50
|
1801 |
+
.b8 100
|
1802 |
+
.b8 51
|
1803 |
+
.b8 100
|
1804 |
+
.b8 52
|
1805 |
+
.b8 100
|
1806 |
+
.b8 53
|
1807 |
+
.b8 100
|
1808 |
+
.b8 54
|
1809 |
+
.b8 100
|
1810 |
+
.b8 55
|
1811 |
+
.b8 100
|
1812 |
+
.b8 101
|
1813 |
+
.b8 56
|
1814 |
+
.b8 0
|
1815 |
+
.b8 116
|
1816 |
+
.b8 114
|
1817 |
+
.b8 105
|
1818 |
+
.b8 116
|
1819 |
+
.b8 111
|
1820 |
+
.b8 110
|
1821 |
+
.b8 95
|
1822 |
+
.b8 95
|
1823 |
+
.b8 48
|
1824 |
+
.b8 100
|
1825 |
+
.b8 49
|
1826 |
+
.b8 100
|
1827 |
+
.b8 50
|
1828 |
+
.b8 100
|
1829 |
+
.b8 51
|
1830 |
+
.b8 100
|
1831 |
+
.b8 52
|
1832 |
+
.b8 100
|
1833 |
+
.b8 53
|
1834 |
+
.b8 100
|
1835 |
+
.b8 54
|
1836 |
+
.b8 100
|
1837 |
+
.b8 55
|
1838 |
+
.b8 100
|
1839 |
+
.b8 101
|
1840 |
+
.b8 56
|
1841 |
+
.b8 0
|
1842 |
+
.b8 1
|
1843 |
+
.b8 18
|
1844 |
+
.b8 1
|
1845 |
+
.b8 1
|
1846 |
+
.b8 3
|
1847 |
+
.b64 $L__func_begin0
|
1848 |
+
.b64 $L__func_end0
|
1849 |
+
.b8 1
|
1850 |
+
.b8 156
|
1851 |
+
.b32 125
|
1852 |
+
.b8 4
|
1853 |
+
.b32 125
|
1854 |
+
.b64 $L__tmp1
|
1855 |
+
.b64 $L__tmp164
|
1856 |
+
.b8 2
|
1857 |
+
.b8 46
|
1858 |
+
.b8 27
|
1859 |
+
.b8 5
|
1860 |
+
.b32 125
|
1861 |
+
.b64 $L__tmp2
|
1862 |
+
.b64 $L__tmp163
|
1863 |
+
.b8 2
|
1864 |
+
.b8 46
|
1865 |
+
.b8 27
|
1866 |
+
.b8 4
|
1867 |
+
.b32 125
|
1868 |
+
.b64 $L__tmp2
|
1869 |
+
.b64 $L__tmp163
|
1870 |
+
.b8 2
|
1871 |
+
.b8 243
|
1872 |
+
.b8 36
|
1873 |
+
.b8 0
|
1874 |
+
.b8 0
|
1875 |
+
.b8 0
|
1876 |
+
}
|
1877 |
+
.section .debug_pubnames
|
1878 |
+
{
|
1879 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1880 |
+
$L__pubNames_start0:
|
1881 |
+
.b8 2
|
1882 |
+
.b8 0
|
1883 |
+
.b32 .debug_info
|
1884 |
+
.b32 282
|
1885 |
+
.b32 125
|
1886 |
+
.b8 116
|
1887 |
+
.b8 114
|
1888 |
+
.b8 105
|
1889 |
+
.b8 116
|
1890 |
+
.b8 111
|
1891 |
+
.b8 110
|
1892 |
+
.b8 95
|
1893 |
+
.b8 95
|
1894 |
+
.b8 48
|
1895 |
+
.b8 100
|
1896 |
+
.b8 49
|
1897 |
+
.b8 100
|
1898 |
+
.b8 50
|
1899 |
+
.b8 100
|
1900 |
+
.b8 51
|
1901 |
+
.b8 100
|
1902 |
+
.b8 52
|
1903 |
+
.b8 100
|
1904 |
+
.b8 53
|
1905 |
+
.b8 100
|
1906 |
+
.b8 54
|
1907 |
+
.b8 100
|
1908 |
+
.b8 55
|
1909 |
+
.b8 100
|
1910 |
+
.b8 101
|
1911 |
+
.b8 56
|
1912 |
+
.b8 0
|
1913 |
+
.b32 0
|
1914 |
+
$L__pubNames_end0:
|
1915 |
+
}
|
1916 |
+
.section .debug_pubtypes
|
1917 |
+
{
|
1918 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1919 |
+
$L__pubTypes_start0:
|
1920 |
+
.b8 2
|
1921 |
+
.b8 0
|
1922 |
+
.b32 .debug_info
|
1923 |
+
.b32 282
|
1924 |
+
.b32 0
|
1925 |
+
$L__pubTypes_end0:
|
1926 |
+
}
|
1927 |
+
.section .debug_loc { }
|
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir
ADDED
@@ -0,0 +1,860 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
5 |
+
|
6 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
|
7 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
8 |
+
%5 = shl i32 %4, 3, !dbg !10
|
9 |
+
%6 = and i32 %5, 1016, !dbg !10
|
10 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
|
11 |
+
%8 = shl i32 %7, 10, !dbg !12
|
12 |
+
%9 = or i32 %8, %6, !dbg !13
|
13 |
+
%10 = sext i32 %9 to i64, !dbg !14
|
14 |
+
%11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
|
15 |
+
%12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
|
16 |
+
%13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !15
|
17 |
+
%14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !15
|
18 |
+
%15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !15
|
19 |
+
%16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !15
|
20 |
+
%17 = trunc i32 %13 to i16, !dbg !15
|
21 |
+
%extelt.offset = lshr i32 %13, 16, !dbg !15
|
22 |
+
%18 = trunc i32 %extelt.offset to i16, !dbg !15
|
23 |
+
%19 = trunc i32 %14 to i16, !dbg !15
|
24 |
+
%extelt.offset1 = lshr i32 %14, 16, !dbg !15
|
25 |
+
%20 = trunc i32 %extelt.offset1 to i16, !dbg !15
|
26 |
+
%21 = trunc i32 %15 to i16, !dbg !15
|
27 |
+
%extelt.offset2 = lshr i32 %15, 16, !dbg !15
|
28 |
+
%22 = trunc i32 %extelt.offset2 to i16, !dbg !15
|
29 |
+
%23 = trunc i32 %16 to i16, !dbg !15
|
30 |
+
%extelt.offset3 = lshr i32 %16, 16, !dbg !15
|
31 |
+
%24 = trunc i32 %extelt.offset3 to i16, !dbg !15
|
32 |
+
%25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
|
33 |
+
%26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
|
34 |
+
%27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
|
35 |
+
%28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
|
36 |
+
%29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
|
37 |
+
%30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
|
38 |
+
%31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
|
39 |
+
%32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #4, !dbg !16
|
40 |
+
%33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17
|
41 |
+
%34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17
|
42 |
+
%35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17
|
43 |
+
%36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17
|
44 |
+
%37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17
|
45 |
+
%38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17
|
46 |
+
%39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17
|
47 |
+
%40 = fmul float %32, 0x3FE6A09E60000000, !dbg !17
|
48 |
+
%41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
49 |
+
%.not.i = icmp eq i32 %41, 0, !dbg !18
|
50 |
+
%42 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18
|
51 |
+
%43 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18
|
52 |
+
%.0.i = select i1 %.not.i, float %43, float %42, !dbg !18
|
53 |
+
%44 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
|
54 |
+
br i1 %44, label %__nv_fabsf.exit1.i, label %46, !dbg !18
|
55 |
+
|
56 |
+
__nv_fabsf.exit1.i: ; preds = %3
|
57 |
+
%45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
58 |
+
%.not1.i = icmp eq i32 %45, 0, !dbg !18
|
59 |
+
%.01.i = select i1 %.not1.i, float %43, float %42, !dbg !18
|
60 |
+
br label %__internal_fmad.exit.i, !dbg !18
|
61 |
+
|
62 |
+
46: ; preds = %3
|
63 |
+
%47 = fmul float %33, %33, !dbg !18
|
64 |
+
br label %__internal_fmad.exit.i, !dbg !18
|
65 |
+
|
66 |
+
__internal_fmad.exit.i: ; preds = %46, %__nv_fabsf.exit1.i
|
67 |
+
%48 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %46 ], !dbg !18
|
68 |
+
%49 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %46 ], !dbg !18
|
69 |
+
%50 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %46 ], !dbg !18
|
70 |
+
%51 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %46 ], !dbg !18
|
71 |
+
%52 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %46 ], !dbg !18
|
72 |
+
%53 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %46 ], !dbg !18
|
73 |
+
%54 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %46 ], !dbg !18
|
74 |
+
%55 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %47, %46 ], !dbg !18
|
75 |
+
%56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
76 |
+
%.not2.i = icmp eq i32 %56, 0, !dbg !18
|
77 |
+
%57 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %54, float %55, float %53) #4, !dbg !18
|
78 |
+
%58 = tail call float @llvm.nvvm.fma.rn.f(float %54, float %55, float %53) #4, !dbg !18
|
79 |
+
%.02.i = select i1 %.not2.i, float %58, float %57, !dbg !18
|
80 |
+
%59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
81 |
+
%.not3.i = icmp eq i32 %59, 0, !dbg !18
|
82 |
+
%60 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %55, float %52) #4, !dbg !18
|
83 |
+
%61 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %55, float %52) #4, !dbg !18
|
84 |
+
%.03.i = select i1 %.not3.i, float %61, float %60, !dbg !18
|
85 |
+
%62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
86 |
+
%.not4.i = icmp eq i32 %62, 0, !dbg !18
|
87 |
+
%63 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %55, float %51) #4, !dbg !18
|
88 |
+
%64 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %55, float %51) #4, !dbg !18
|
89 |
+
%.04.i = select i1 %.not4.i, float %64, float %63, !dbg !18
|
90 |
+
%65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
91 |
+
%.not5.i = icmp eq i32 %65, 0, !dbg !18
|
92 |
+
%66 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %55, float %50) #4, !dbg !18
|
93 |
+
%67 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %55, float %50) #4, !dbg !18
|
94 |
+
%.05.i = select i1 %.not5.i, float %67, float %66, !dbg !18
|
95 |
+
%68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
96 |
+
%.not6.i = icmp eq i32 %68, 0, !dbg !18
|
97 |
+
%69 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %55, float %49) #4, !dbg !18
|
98 |
+
%70 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %55, float %49) #4, !dbg !18
|
99 |
+
%.06.i = select i1 %.not6.i, float %70, float %69, !dbg !18
|
100 |
+
%71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
101 |
+
%.not7.i = icmp eq i32 %71, 0, !dbg !18
|
102 |
+
%72 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %55, float %48) #4, !dbg !18
|
103 |
+
%73 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %55, float %48) #4, !dbg !18
|
104 |
+
%.07.i = select i1 %.not7.i, float %73, float %72, !dbg !18
|
105 |
+
%74 = fneg float %55, !dbg !18
|
106 |
+
%75 = select i1 %44, float %74, float %33, !dbg !18
|
107 |
+
%76 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
108 |
+
%.not8.i = icmp eq i32 %76, 0, !dbg !18
|
109 |
+
%77 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %75, float %75) #4, !dbg !18
|
110 |
+
%78 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %75, float %75) #4, !dbg !18
|
111 |
+
%.08.i = select i1 %.not8.i, float %78, float %77, !dbg !18
|
112 |
+
br i1 %44, label %79, label %__nv_erff.exit, !dbg !18
|
113 |
+
|
114 |
+
79: ; preds = %__internal_fmad.exit.i
|
115 |
+
%80 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
|
116 |
+
%81 = fsub float 1.000000e+00, %80, !dbg !18
|
117 |
+
%82 = bitcast float %81 to i32, !dbg !18
|
118 |
+
%83 = bitcast float %33 to i32, !dbg !18
|
119 |
+
%84 = and i32 %83, -2147483648, !dbg !18
|
120 |
+
%85 = or i32 %84, %82, !dbg !18
|
121 |
+
%86 = bitcast i32 %85 to float, !dbg !18
|
122 |
+
br label %__nv_erff.exit, !dbg !18
|
123 |
+
|
124 |
+
__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %79
|
125 |
+
%r.0.i = phi float [ %86, %79 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
|
126 |
+
%87 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
127 |
+
%.not.i4 = icmp eq i32 %87, 0, !dbg !18
|
128 |
+
%88 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18
|
129 |
+
%89 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18
|
130 |
+
%.0.i5 = select i1 %.not.i4, float %89, float %88, !dbg !18
|
131 |
+
%90 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18
|
132 |
+
br i1 %90, label %__nv_fabsf.exit1.i22, label %92, !dbg !18
|
133 |
+
|
134 |
+
__nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit
|
135 |
+
%91 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
136 |
+
%.not1.i23 = icmp eq i32 %91, 0, !dbg !18
|
137 |
+
%.01.i24 = select i1 %.not1.i23, float %89, float %88, !dbg !18
|
138 |
+
br label %__internal_fmad.exit.i6, !dbg !18
|
139 |
+
|
140 |
+
92: ; preds = %__nv_erff.exit
|
141 |
+
%93 = fmul float %34, %34, !dbg !18
|
142 |
+
br label %__internal_fmad.exit.i6, !dbg !18
|
143 |
+
|
144 |
+
__internal_fmad.exit.i6: ; preds = %92, %__nv_fabsf.exit1.i22
|
145 |
+
%94 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %92 ], !dbg !18
|
146 |
+
%95 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %92 ], !dbg !18
|
147 |
+
%96 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %92 ], !dbg !18
|
148 |
+
%97 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %92 ], !dbg !18
|
149 |
+
%98 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %92 ], !dbg !18
|
150 |
+
%99 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %92 ], !dbg !18
|
151 |
+
%100 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %92 ], !dbg !18
|
152 |
+
%101 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %93, %92 ], !dbg !18
|
153 |
+
%102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
154 |
+
%.not2.i7 = icmp eq i32 %102, 0, !dbg !18
|
155 |
+
%103 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %100, float %101, float %99) #4, !dbg !18
|
156 |
+
%104 = tail call float @llvm.nvvm.fma.rn.f(float %100, float %101, float %99) #4, !dbg !18
|
157 |
+
%.02.i8 = select i1 %.not2.i7, float %104, float %103, !dbg !18
|
158 |
+
%105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
159 |
+
%.not3.i9 = icmp eq i32 %105, 0, !dbg !18
|
160 |
+
%106 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %101, float %98) #4, !dbg !18
|
161 |
+
%107 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %101, float %98) #4, !dbg !18
|
162 |
+
%.03.i10 = select i1 %.not3.i9, float %107, float %106, !dbg !18
|
163 |
+
%108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
164 |
+
%.not4.i11 = icmp eq i32 %108, 0, !dbg !18
|
165 |
+
%109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %101, float %97) #4, !dbg !18
|
166 |
+
%110 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %101, float %97) #4, !dbg !18
|
167 |
+
%.04.i12 = select i1 %.not4.i11, float %110, float %109, !dbg !18
|
168 |
+
%111 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
169 |
+
%.not5.i13 = icmp eq i32 %111, 0, !dbg !18
|
170 |
+
%112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %101, float %96) #4, !dbg !18
|
171 |
+
%113 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %101, float %96) #4, !dbg !18
|
172 |
+
%.05.i14 = select i1 %.not5.i13, float %113, float %112, !dbg !18
|
173 |
+
%114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
174 |
+
%.not6.i15 = icmp eq i32 %114, 0, !dbg !18
|
175 |
+
%115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %101, float %95) #4, !dbg !18
|
176 |
+
%116 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %101, float %95) #4, !dbg !18
|
177 |
+
%.06.i16 = select i1 %.not6.i15, float %116, float %115, !dbg !18
|
178 |
+
%117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
179 |
+
%.not7.i17 = icmp eq i32 %117, 0, !dbg !18
|
180 |
+
%118 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %101, float %94) #4, !dbg !18
|
181 |
+
%119 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %101, float %94) #4, !dbg !18
|
182 |
+
%.07.i18 = select i1 %.not7.i17, float %119, float %118, !dbg !18
|
183 |
+
%120 = fneg float %101, !dbg !18
|
184 |
+
%121 = select i1 %90, float %120, float %34, !dbg !18
|
185 |
+
%122 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
186 |
+
%.not8.i19 = icmp eq i32 %122, 0, !dbg !18
|
187 |
+
%123 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %121, float %121) #4, !dbg !18
|
188 |
+
%124 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %121, float %121) #4, !dbg !18
|
189 |
+
%.08.i20 = select i1 %.not8.i19, float %124, float %123, !dbg !18
|
190 |
+
br i1 %90, label %125, label %__nv_erff.exit25, !dbg !18
|
191 |
+
|
192 |
+
125: ; preds = %__internal_fmad.exit.i6
|
193 |
+
%126 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18
|
194 |
+
%127 = fsub float 1.000000e+00, %126, !dbg !18
|
195 |
+
%128 = bitcast float %127 to i32, !dbg !18
|
196 |
+
%129 = bitcast float %34 to i32, !dbg !18
|
197 |
+
%130 = and i32 %129, -2147483648, !dbg !18
|
198 |
+
%131 = or i32 %130, %128, !dbg !18
|
199 |
+
%132 = bitcast i32 %131 to float, !dbg !18
|
200 |
+
br label %__nv_erff.exit25, !dbg !18
|
201 |
+
|
202 |
+
__nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %125
|
203 |
+
%r.0.i21 = phi float [ %132, %125 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18
|
204 |
+
%133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
205 |
+
%.not.i26 = icmp eq i32 %133, 0, !dbg !18
|
206 |
+
%134 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18
|
207 |
+
%135 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18
|
208 |
+
%.0.i27 = select i1 %.not.i26, float %135, float %134, !dbg !18
|
209 |
+
%136 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18
|
210 |
+
br i1 %136, label %__nv_fabsf.exit1.i44, label %138, !dbg !18
|
211 |
+
|
212 |
+
__nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25
|
213 |
+
%137 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
214 |
+
%.not1.i45 = icmp eq i32 %137, 0, !dbg !18
|
215 |
+
%.01.i46 = select i1 %.not1.i45, float %135, float %134, !dbg !18
|
216 |
+
br label %__internal_fmad.exit.i28, !dbg !18
|
217 |
+
|
218 |
+
138: ; preds = %__nv_erff.exit25
|
219 |
+
%139 = fmul float %35, %35, !dbg !18
|
220 |
+
br label %__internal_fmad.exit.i28, !dbg !18
|
221 |
+
|
222 |
+
__internal_fmad.exit.i28: ; preds = %138, %__nv_fabsf.exit1.i44
|
223 |
+
%140 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %138 ], !dbg !18
|
224 |
+
%141 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %138 ], !dbg !18
|
225 |
+
%142 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %138 ], !dbg !18
|
226 |
+
%143 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %138 ], !dbg !18
|
227 |
+
%144 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %138 ], !dbg !18
|
228 |
+
%145 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %138 ], !dbg !18
|
229 |
+
%146 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %138 ], !dbg !18
|
230 |
+
%147 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %139, %138 ], !dbg !18
|
231 |
+
%148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
232 |
+
%.not2.i29 = icmp eq i32 %148, 0, !dbg !18
|
233 |
+
%149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %146, float %147, float %145) #4, !dbg !18
|
234 |
+
%150 = tail call float @llvm.nvvm.fma.rn.f(float %146, float %147, float %145) #4, !dbg !18
|
235 |
+
%.02.i30 = select i1 %.not2.i29, float %150, float %149, !dbg !18
|
236 |
+
%151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
237 |
+
%.not3.i31 = icmp eq i32 %151, 0, !dbg !18
|
238 |
+
%152 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %147, float %144) #4, !dbg !18
|
239 |
+
%153 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %147, float %144) #4, !dbg !18
|
240 |
+
%.03.i32 = select i1 %.not3.i31, float %153, float %152, !dbg !18
|
241 |
+
%154 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
242 |
+
%.not4.i33 = icmp eq i32 %154, 0, !dbg !18
|
243 |
+
%155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %147, float %143) #4, !dbg !18
|
244 |
+
%156 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %147, float %143) #4, !dbg !18
|
245 |
+
%.04.i34 = select i1 %.not4.i33, float %156, float %155, !dbg !18
|
246 |
+
%157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
247 |
+
%.not5.i35 = icmp eq i32 %157, 0, !dbg !18
|
248 |
+
%158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %147, float %142) #4, !dbg !18
|
249 |
+
%159 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %147, float %142) #4, !dbg !18
|
250 |
+
%.05.i36 = select i1 %.not5.i35, float %159, float %158, !dbg !18
|
251 |
+
%160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
252 |
+
%.not6.i37 = icmp eq i32 %160, 0, !dbg !18
|
253 |
+
%161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %147, float %141) #4, !dbg !18
|
254 |
+
%162 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %147, float %141) #4, !dbg !18
|
255 |
+
%.06.i38 = select i1 %.not6.i37, float %162, float %161, !dbg !18
|
256 |
+
%163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
257 |
+
%.not7.i39 = icmp eq i32 %163, 0, !dbg !18
|
258 |
+
%164 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %147, float %140) #4, !dbg !18
|
259 |
+
%165 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %147, float %140) #4, !dbg !18
|
260 |
+
%.07.i40 = select i1 %.not7.i39, float %165, float %164, !dbg !18
|
261 |
+
%166 = fneg float %147, !dbg !18
|
262 |
+
%167 = select i1 %136, float %166, float %35, !dbg !18
|
263 |
+
%168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
264 |
+
%.not8.i41 = icmp eq i32 %168, 0, !dbg !18
|
265 |
+
%169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %167, float %167) #4, !dbg !18
|
266 |
+
%170 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %167, float %167) #4, !dbg !18
|
267 |
+
%.08.i42 = select i1 %.not8.i41, float %170, float %169, !dbg !18
|
268 |
+
br i1 %136, label %171, label %__nv_erff.exit47, !dbg !18
|
269 |
+
|
270 |
+
171: ; preds = %__internal_fmad.exit.i28
|
271 |
+
%172 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18
|
272 |
+
%173 = fsub float 1.000000e+00, %172, !dbg !18
|
273 |
+
%174 = bitcast float %173 to i32, !dbg !18
|
274 |
+
%175 = bitcast float %35 to i32, !dbg !18
|
275 |
+
%176 = and i32 %175, -2147483648, !dbg !18
|
276 |
+
%177 = or i32 %176, %174, !dbg !18
|
277 |
+
%178 = bitcast i32 %177 to float, !dbg !18
|
278 |
+
br label %__nv_erff.exit47, !dbg !18
|
279 |
+
|
280 |
+
__nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %171
|
281 |
+
%r.0.i43 = phi float [ %178, %171 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18
|
282 |
+
%179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
283 |
+
%.not.i48 = icmp eq i32 %179, 0, !dbg !18
|
284 |
+
%180 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18
|
285 |
+
%181 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18
|
286 |
+
%.0.i49 = select i1 %.not.i48, float %181, float %180, !dbg !18
|
287 |
+
%182 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18
|
288 |
+
br i1 %182, label %__nv_fabsf.exit1.i66, label %184, !dbg !18
|
289 |
+
|
290 |
+
__nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47
|
291 |
+
%183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
292 |
+
%.not1.i67 = icmp eq i32 %183, 0, !dbg !18
|
293 |
+
%.01.i68 = select i1 %.not1.i67, float %181, float %180, !dbg !18
|
294 |
+
br label %__internal_fmad.exit.i50, !dbg !18
|
295 |
+
|
296 |
+
184: ; preds = %__nv_erff.exit47
|
297 |
+
%185 = fmul float %36, %36, !dbg !18
|
298 |
+
br label %__internal_fmad.exit.i50, !dbg !18
|
299 |
+
|
300 |
+
__internal_fmad.exit.i50: ; preds = %184, %__nv_fabsf.exit1.i66
|
301 |
+
%186 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %184 ], !dbg !18
|
302 |
+
%187 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %184 ], !dbg !18
|
303 |
+
%188 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %184 ], !dbg !18
|
304 |
+
%189 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %184 ], !dbg !18
|
305 |
+
%190 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %184 ], !dbg !18
|
306 |
+
%191 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %184 ], !dbg !18
|
307 |
+
%192 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %184 ], !dbg !18
|
308 |
+
%193 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %185, %184 ], !dbg !18
|
309 |
+
%194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
310 |
+
%.not2.i51 = icmp eq i32 %194, 0, !dbg !18
|
311 |
+
%195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %192, float %193, float %191) #4, !dbg !18
|
312 |
+
%196 = tail call float @llvm.nvvm.fma.rn.f(float %192, float %193, float %191) #4, !dbg !18
|
313 |
+
%.02.i52 = select i1 %.not2.i51, float %196, float %195, !dbg !18
|
314 |
+
%197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
315 |
+
%.not3.i53 = icmp eq i32 %197, 0, !dbg !18
|
316 |
+
%198 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %193, float %190) #4, !dbg !18
|
317 |
+
%199 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %193, float %190) #4, !dbg !18
|
318 |
+
%.03.i54 = select i1 %.not3.i53, float %199, float %198, !dbg !18
|
319 |
+
%200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
320 |
+
%.not4.i55 = icmp eq i32 %200, 0, !dbg !18
|
321 |
+
%201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %193, float %189) #4, !dbg !18
|
322 |
+
%202 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %193, float %189) #4, !dbg !18
|
323 |
+
%.04.i56 = select i1 %.not4.i55, float %202, float %201, !dbg !18
|
324 |
+
%203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
325 |
+
%.not5.i57 = icmp eq i32 %203, 0, !dbg !18
|
326 |
+
%204 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %193, float %188) #4, !dbg !18
|
327 |
+
%205 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %193, float %188) #4, !dbg !18
|
328 |
+
%.05.i58 = select i1 %.not5.i57, float %205, float %204, !dbg !18
|
329 |
+
%206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
330 |
+
%.not6.i59 = icmp eq i32 %206, 0, !dbg !18
|
331 |
+
%207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %193, float %187) #4, !dbg !18
|
332 |
+
%208 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %193, float %187) #4, !dbg !18
|
333 |
+
%.06.i60 = select i1 %.not6.i59, float %208, float %207, !dbg !18
|
334 |
+
%209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
335 |
+
%.not7.i61 = icmp eq i32 %209, 0, !dbg !18
|
336 |
+
%210 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %193, float %186) #4, !dbg !18
|
337 |
+
%211 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %193, float %186) #4, !dbg !18
|
338 |
+
%.07.i62 = select i1 %.not7.i61, float %211, float %210, !dbg !18
|
339 |
+
%212 = fneg float %193, !dbg !18
|
340 |
+
%213 = select i1 %182, float %212, float %36, !dbg !18
|
341 |
+
%214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
342 |
+
%.not8.i63 = icmp eq i32 %214, 0, !dbg !18
|
343 |
+
%215 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %213, float %213) #4, !dbg !18
|
344 |
+
%216 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %213, float %213) #4, !dbg !18
|
345 |
+
%.08.i64 = select i1 %.not8.i63, float %216, float %215, !dbg !18
|
346 |
+
br i1 %182, label %217, label %__nv_erff.exit69, !dbg !18
|
347 |
+
|
348 |
+
217: ; preds = %__internal_fmad.exit.i50
|
349 |
+
%218 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18
|
350 |
+
%219 = fsub float 1.000000e+00, %218, !dbg !18
|
351 |
+
%220 = bitcast float %219 to i32, !dbg !18
|
352 |
+
%221 = bitcast float %36 to i32, !dbg !18
|
353 |
+
%222 = and i32 %221, -2147483648, !dbg !18
|
354 |
+
%223 = or i32 %222, %220, !dbg !18
|
355 |
+
%224 = bitcast i32 %223 to float, !dbg !18
|
356 |
+
br label %__nv_erff.exit69, !dbg !18
|
357 |
+
|
358 |
+
__nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %217
|
359 |
+
%r.0.i65 = phi float [ %224, %217 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18
|
360 |
+
%225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
361 |
+
%.not.i70 = icmp eq i32 %225, 0, !dbg !18
|
362 |
+
%226 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18
|
363 |
+
%227 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18
|
364 |
+
%.0.i71 = select i1 %.not.i70, float %227, float %226, !dbg !18
|
365 |
+
%228 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18
|
366 |
+
br i1 %228, label %__nv_fabsf.exit1.i88, label %230, !dbg !18
|
367 |
+
|
368 |
+
__nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69
|
369 |
+
%229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
370 |
+
%.not1.i89 = icmp eq i32 %229, 0, !dbg !18
|
371 |
+
%.01.i90 = select i1 %.not1.i89, float %227, float %226, !dbg !18
|
372 |
+
br label %__internal_fmad.exit.i72, !dbg !18
|
373 |
+
|
374 |
+
230: ; preds = %__nv_erff.exit69
|
375 |
+
%231 = fmul float %37, %37, !dbg !18
|
376 |
+
br label %__internal_fmad.exit.i72, !dbg !18
|
377 |
+
|
378 |
+
__internal_fmad.exit.i72: ; preds = %230, %__nv_fabsf.exit1.i88
|
379 |
+
%232 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %230 ], !dbg !18
|
380 |
+
%233 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %230 ], !dbg !18
|
381 |
+
%234 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %230 ], !dbg !18
|
382 |
+
%235 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %230 ], !dbg !18
|
383 |
+
%236 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %230 ], !dbg !18
|
384 |
+
%237 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %230 ], !dbg !18
|
385 |
+
%238 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %230 ], !dbg !18
|
386 |
+
%239 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %231, %230 ], !dbg !18
|
387 |
+
%240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
388 |
+
%.not2.i73 = icmp eq i32 %240, 0, !dbg !18
|
389 |
+
%241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %238, float %239, float %237) #4, !dbg !18
|
390 |
+
%242 = tail call float @llvm.nvvm.fma.rn.f(float %238, float %239, float %237) #4, !dbg !18
|
391 |
+
%.02.i74 = select i1 %.not2.i73, float %242, float %241, !dbg !18
|
392 |
+
%243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
393 |
+
%.not3.i75 = icmp eq i32 %243, 0, !dbg !18
|
394 |
+
%244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %239, float %236) #4, !dbg !18
|
395 |
+
%245 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %239, float %236) #4, !dbg !18
|
396 |
+
%.03.i76 = select i1 %.not3.i75, float %245, float %244, !dbg !18
|
397 |
+
%246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
398 |
+
%.not4.i77 = icmp eq i32 %246, 0, !dbg !18
|
399 |
+
%247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %239, float %235) #4, !dbg !18
|
400 |
+
%248 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %239, float %235) #4, !dbg !18
|
401 |
+
%.04.i78 = select i1 %.not4.i77, float %248, float %247, !dbg !18
|
402 |
+
%249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
403 |
+
%.not5.i79 = icmp eq i32 %249, 0, !dbg !18
|
404 |
+
%250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %239, float %234) #4, !dbg !18
|
405 |
+
%251 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %239, float %234) #4, !dbg !18
|
406 |
+
%.05.i80 = select i1 %.not5.i79, float %251, float %250, !dbg !18
|
407 |
+
%252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
408 |
+
%.not6.i81 = icmp eq i32 %252, 0, !dbg !18
|
409 |
+
%253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %239, float %233) #4, !dbg !18
|
410 |
+
%254 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %239, float %233) #4, !dbg !18
|
411 |
+
%.06.i82 = select i1 %.not6.i81, float %254, float %253, !dbg !18
|
412 |
+
%255 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
413 |
+
%.not7.i83 = icmp eq i32 %255, 0, !dbg !18
|
414 |
+
%256 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %239, float %232) #4, !dbg !18
|
415 |
+
%257 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %239, float %232) #4, !dbg !18
|
416 |
+
%.07.i84 = select i1 %.not7.i83, float %257, float %256, !dbg !18
|
417 |
+
%258 = fneg float %239, !dbg !18
|
418 |
+
%259 = select i1 %228, float %258, float %37, !dbg !18
|
419 |
+
%260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
420 |
+
%.not8.i85 = icmp eq i32 %260, 0, !dbg !18
|
421 |
+
%261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %259, float %259) #4, !dbg !18
|
422 |
+
%262 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %259, float %259) #4, !dbg !18
|
423 |
+
%.08.i86 = select i1 %.not8.i85, float %262, float %261, !dbg !18
|
424 |
+
br i1 %228, label %263, label %__nv_erff.exit91, !dbg !18
|
425 |
+
|
426 |
+
263: ; preds = %__internal_fmad.exit.i72
|
427 |
+
%264 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18
|
428 |
+
%265 = fsub float 1.000000e+00, %264, !dbg !18
|
429 |
+
%266 = bitcast float %265 to i32, !dbg !18
|
430 |
+
%267 = bitcast float %37 to i32, !dbg !18
|
431 |
+
%268 = and i32 %267, -2147483648, !dbg !18
|
432 |
+
%269 = or i32 %268, %266, !dbg !18
|
433 |
+
%270 = bitcast i32 %269 to float, !dbg !18
|
434 |
+
br label %__nv_erff.exit91, !dbg !18
|
435 |
+
|
436 |
+
__nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %263
|
437 |
+
%r.0.i87 = phi float [ %270, %263 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18
|
438 |
+
%271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
439 |
+
%.not.i92 = icmp eq i32 %271, 0, !dbg !18
|
440 |
+
%272 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18
|
441 |
+
%273 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18
|
442 |
+
%.0.i93 = select i1 %.not.i92, float %273, float %272, !dbg !18
|
443 |
+
%274 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18
|
444 |
+
br i1 %274, label %__nv_fabsf.exit1.i110, label %276, !dbg !18
|
445 |
+
|
446 |
+
__nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91
|
447 |
+
%275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
448 |
+
%.not1.i111 = icmp eq i32 %275, 0, !dbg !18
|
449 |
+
%.01.i112 = select i1 %.not1.i111, float %273, float %272, !dbg !18
|
450 |
+
br label %__internal_fmad.exit.i94, !dbg !18
|
451 |
+
|
452 |
+
276: ; preds = %__nv_erff.exit91
|
453 |
+
%277 = fmul float %38, %38, !dbg !18
|
454 |
+
br label %__internal_fmad.exit.i94, !dbg !18
|
455 |
+
|
456 |
+
__internal_fmad.exit.i94: ; preds = %276, %__nv_fabsf.exit1.i110
|
457 |
+
%278 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %276 ], !dbg !18
|
458 |
+
%279 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %276 ], !dbg !18
|
459 |
+
%280 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %276 ], !dbg !18
|
460 |
+
%281 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %276 ], !dbg !18
|
461 |
+
%282 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %276 ], !dbg !18
|
462 |
+
%283 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %276 ], !dbg !18
|
463 |
+
%284 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %276 ], !dbg !18
|
464 |
+
%285 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %277, %276 ], !dbg !18
|
465 |
+
%286 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
466 |
+
%.not2.i95 = icmp eq i32 %286, 0, !dbg !18
|
467 |
+
%287 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %284, float %285, float %283) #4, !dbg !18
|
468 |
+
%288 = tail call float @llvm.nvvm.fma.rn.f(float %284, float %285, float %283) #4, !dbg !18
|
469 |
+
%.02.i96 = select i1 %.not2.i95, float %288, float %287, !dbg !18
|
470 |
+
%289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
471 |
+
%.not3.i97 = icmp eq i32 %289, 0, !dbg !18
|
472 |
+
%290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %285, float %282) #4, !dbg !18
|
473 |
+
%291 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %285, float %282) #4, !dbg !18
|
474 |
+
%.03.i98 = select i1 %.not3.i97, float %291, float %290, !dbg !18
|
475 |
+
%292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
476 |
+
%.not4.i99 = icmp eq i32 %292, 0, !dbg !18
|
477 |
+
%293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %285, float %281) #4, !dbg !18
|
478 |
+
%294 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %285, float %281) #4, !dbg !18
|
479 |
+
%.04.i100 = select i1 %.not4.i99, float %294, float %293, !dbg !18
|
480 |
+
%295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
481 |
+
%.not5.i101 = icmp eq i32 %295, 0, !dbg !18
|
482 |
+
%296 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %285, float %280) #4, !dbg !18
|
483 |
+
%297 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %285, float %280) #4, !dbg !18
|
484 |
+
%.05.i102 = select i1 %.not5.i101, float %297, float %296, !dbg !18
|
485 |
+
%298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
486 |
+
%.not6.i103 = icmp eq i32 %298, 0, !dbg !18
|
487 |
+
%299 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %285, float %279) #4, !dbg !18
|
488 |
+
%300 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %285, float %279) #4, !dbg !18
|
489 |
+
%.06.i104 = select i1 %.not6.i103, float %300, float %299, !dbg !18
|
490 |
+
%301 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
491 |
+
%.not7.i105 = icmp eq i32 %301, 0, !dbg !18
|
492 |
+
%302 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %285, float %278) #4, !dbg !18
|
493 |
+
%303 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %285, float %278) #4, !dbg !18
|
494 |
+
%.07.i106 = select i1 %.not7.i105, float %303, float %302, !dbg !18
|
495 |
+
%304 = fneg float %285, !dbg !18
|
496 |
+
%305 = select i1 %274, float %304, float %38, !dbg !18
|
497 |
+
%306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
498 |
+
%.not8.i107 = icmp eq i32 %306, 0, !dbg !18
|
499 |
+
%307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %305, float %305) #4, !dbg !18
|
500 |
+
%308 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %305, float %305) #4, !dbg !18
|
501 |
+
%.08.i108 = select i1 %.not8.i107, float %308, float %307, !dbg !18
|
502 |
+
br i1 %274, label %309, label %__nv_erff.exit113, !dbg !18
|
503 |
+
|
504 |
+
309: ; preds = %__internal_fmad.exit.i94
|
505 |
+
%310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18
|
506 |
+
%311 = fsub float 1.000000e+00, %310, !dbg !18
|
507 |
+
%312 = bitcast float %311 to i32, !dbg !18
|
508 |
+
%313 = bitcast float %38 to i32, !dbg !18
|
509 |
+
%314 = and i32 %313, -2147483648, !dbg !18
|
510 |
+
%315 = or i32 %314, %312, !dbg !18
|
511 |
+
%316 = bitcast i32 %315 to float, !dbg !18
|
512 |
+
br label %__nv_erff.exit113, !dbg !18
|
513 |
+
|
514 |
+
__nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %309
|
515 |
+
%r.0.i109 = phi float [ %316, %309 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18
|
516 |
+
%317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
517 |
+
%.not.i114 = icmp eq i32 %317, 0, !dbg !18
|
518 |
+
%318 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18
|
519 |
+
%319 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18
|
520 |
+
%.0.i115 = select i1 %.not.i114, float %319, float %318, !dbg !18
|
521 |
+
%320 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18
|
522 |
+
br i1 %320, label %__nv_fabsf.exit1.i132, label %322, !dbg !18
|
523 |
+
|
524 |
+
__nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113
|
525 |
+
%321 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
526 |
+
%.not1.i133 = icmp eq i32 %321, 0, !dbg !18
|
527 |
+
%.01.i134 = select i1 %.not1.i133, float %319, float %318, !dbg !18
|
528 |
+
br label %__internal_fmad.exit.i116, !dbg !18
|
529 |
+
|
530 |
+
322: ; preds = %__nv_erff.exit113
|
531 |
+
%323 = fmul float %39, %39, !dbg !18
|
532 |
+
br label %__internal_fmad.exit.i116, !dbg !18
|
533 |
+
|
534 |
+
__internal_fmad.exit.i116: ; preds = %322, %__nv_fabsf.exit1.i132
|
535 |
+
%324 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %322 ], !dbg !18
|
536 |
+
%325 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %322 ], !dbg !18
|
537 |
+
%326 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %322 ], !dbg !18
|
538 |
+
%327 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %322 ], !dbg !18
|
539 |
+
%328 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %322 ], !dbg !18
|
540 |
+
%329 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %322 ], !dbg !18
|
541 |
+
%330 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %322 ], !dbg !18
|
542 |
+
%331 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %323, %322 ], !dbg !18
|
543 |
+
%332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
544 |
+
%.not2.i117 = icmp eq i32 %332, 0, !dbg !18
|
545 |
+
%333 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %330, float %331, float %329) #4, !dbg !18
|
546 |
+
%334 = tail call float @llvm.nvvm.fma.rn.f(float %330, float %331, float %329) #4, !dbg !18
|
547 |
+
%.02.i118 = select i1 %.not2.i117, float %334, float %333, !dbg !18
|
548 |
+
%335 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
549 |
+
%.not3.i119 = icmp eq i32 %335, 0, !dbg !18
|
550 |
+
%336 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %331, float %328) #4, !dbg !18
|
551 |
+
%337 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %331, float %328) #4, !dbg !18
|
552 |
+
%.03.i120 = select i1 %.not3.i119, float %337, float %336, !dbg !18
|
553 |
+
%338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
554 |
+
%.not4.i121 = icmp eq i32 %338, 0, !dbg !18
|
555 |
+
%339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %331, float %327) #4, !dbg !18
|
556 |
+
%340 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %331, float %327) #4, !dbg !18
|
557 |
+
%.04.i122 = select i1 %.not4.i121, float %340, float %339, !dbg !18
|
558 |
+
%341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
559 |
+
%.not5.i123 = icmp eq i32 %341, 0, !dbg !18
|
560 |
+
%342 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %331, float %326) #4, !dbg !18
|
561 |
+
%343 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %331, float %326) #4, !dbg !18
|
562 |
+
%.05.i124 = select i1 %.not5.i123, float %343, float %342, !dbg !18
|
563 |
+
%344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
564 |
+
%.not6.i125 = icmp eq i32 %344, 0, !dbg !18
|
565 |
+
%345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %331, float %325) #4, !dbg !18
|
566 |
+
%346 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %331, float %325) #4, !dbg !18
|
567 |
+
%.06.i126 = select i1 %.not6.i125, float %346, float %345, !dbg !18
|
568 |
+
%347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
569 |
+
%.not7.i127 = icmp eq i32 %347, 0, !dbg !18
|
570 |
+
%348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %331, float %324) #4, !dbg !18
|
571 |
+
%349 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %331, float %324) #4, !dbg !18
|
572 |
+
%.07.i128 = select i1 %.not7.i127, float %349, float %348, !dbg !18
|
573 |
+
%350 = fneg float %331, !dbg !18
|
574 |
+
%351 = select i1 %320, float %350, float %39, !dbg !18
|
575 |
+
%352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
576 |
+
%.not8.i129 = icmp eq i32 %352, 0, !dbg !18
|
577 |
+
%353 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %351, float %351) #4, !dbg !18
|
578 |
+
%354 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %351, float %351) #4, !dbg !18
|
579 |
+
%.08.i130 = select i1 %.not8.i129, float %354, float %353, !dbg !18
|
580 |
+
br i1 %320, label %355, label %__nv_erff.exit135, !dbg !18
|
581 |
+
|
582 |
+
355: ; preds = %__internal_fmad.exit.i116
|
583 |
+
%356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18
|
584 |
+
%357 = fsub float 1.000000e+00, %356, !dbg !18
|
585 |
+
%358 = bitcast float %357 to i32, !dbg !18
|
586 |
+
%359 = bitcast float %39 to i32, !dbg !18
|
587 |
+
%360 = and i32 %359, -2147483648, !dbg !18
|
588 |
+
%361 = or i32 %360, %358, !dbg !18
|
589 |
+
%362 = bitcast i32 %361 to float, !dbg !18
|
590 |
+
br label %__nv_erff.exit135, !dbg !18
|
591 |
+
|
592 |
+
__nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %355
|
593 |
+
%r.0.i131 = phi float [ %362, %355 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18
|
594 |
+
%363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
595 |
+
%.not.i136 = icmp eq i32 %363, 0, !dbg !18
|
596 |
+
%364 = tail call float @llvm.nvvm.fabs.ftz.f(float %40) #4, !dbg !18
|
597 |
+
%365 = tail call float @llvm.nvvm.fabs.f(float %40) #4, !dbg !18
|
598 |
+
%.0.i137 = select i1 %.not.i136, float %365, float %364, !dbg !18
|
599 |
+
%366 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18
|
600 |
+
br i1 %366, label %__nv_fabsf.exit1.i154, label %368, !dbg !18
|
601 |
+
|
602 |
+
__nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135
|
603 |
+
%367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
604 |
+
%.not1.i155 = icmp eq i32 %367, 0, !dbg !18
|
605 |
+
%.01.i156 = select i1 %.not1.i155, float %365, float %364, !dbg !18
|
606 |
+
br label %__internal_fmad.exit.i138, !dbg !18
|
607 |
+
|
608 |
+
368: ; preds = %__nv_erff.exit135
|
609 |
+
%369 = fmul float %40, %40, !dbg !18
|
610 |
+
br label %__internal_fmad.exit.i138, !dbg !18
|
611 |
+
|
612 |
+
__internal_fmad.exit.i138: ; preds = %368, %__nv_fabsf.exit1.i154
|
613 |
+
%370 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %368 ], !dbg !18
|
614 |
+
%371 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %368 ], !dbg !18
|
615 |
+
%372 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %368 ], !dbg !18
|
616 |
+
%373 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %368 ], !dbg !18
|
617 |
+
%374 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %368 ], !dbg !18
|
618 |
+
%375 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %368 ], !dbg !18
|
619 |
+
%376 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %368 ], !dbg !18
|
620 |
+
%377 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %369, %368 ], !dbg !18
|
621 |
+
%378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
622 |
+
%.not2.i139 = icmp eq i32 %378, 0, !dbg !18
|
623 |
+
%379 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float %377, float %375) #4, !dbg !18
|
624 |
+
%380 = tail call float @llvm.nvvm.fma.rn.f(float %376, float %377, float %375) #4, !dbg !18
|
625 |
+
%.02.i140 = select i1 %.not2.i139, float %380, float %379, !dbg !18
|
626 |
+
%381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
627 |
+
%.not3.i141 = icmp eq i32 %381, 0, !dbg !18
|
628 |
+
%382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %377, float %374) #4, !dbg !18
|
629 |
+
%383 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %377, float %374) #4, !dbg !18
|
630 |
+
%.03.i142 = select i1 %.not3.i141, float %383, float %382, !dbg !18
|
631 |
+
%384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
632 |
+
%.not4.i143 = icmp eq i32 %384, 0, !dbg !18
|
633 |
+
%385 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %377, float %373) #4, !dbg !18
|
634 |
+
%386 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %377, float %373) #4, !dbg !18
|
635 |
+
%.04.i144 = select i1 %.not4.i143, float %386, float %385, !dbg !18
|
636 |
+
%387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
637 |
+
%.not5.i145 = icmp eq i32 %387, 0, !dbg !18
|
638 |
+
%388 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %377, float %372) #4, !dbg !18
|
639 |
+
%389 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %377, float %372) #4, !dbg !18
|
640 |
+
%.05.i146 = select i1 %.not5.i145, float %389, float %388, !dbg !18
|
641 |
+
%390 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
642 |
+
%.not6.i147 = icmp eq i32 %390, 0, !dbg !18
|
643 |
+
%391 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %377, float %371) #4, !dbg !18
|
644 |
+
%392 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %377, float %371) #4, !dbg !18
|
645 |
+
%.06.i148 = select i1 %.not6.i147, float %392, float %391, !dbg !18
|
646 |
+
%393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
647 |
+
%.not7.i149 = icmp eq i32 %393, 0, !dbg !18
|
648 |
+
%394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %377, float %370) #4, !dbg !18
|
649 |
+
%395 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %377, float %370) #4, !dbg !18
|
650 |
+
%.07.i150 = select i1 %.not7.i149, float %395, float %394, !dbg !18
|
651 |
+
%396 = fneg float %377, !dbg !18
|
652 |
+
%397 = select i1 %366, float %396, float %40, !dbg !18
|
653 |
+
%398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
654 |
+
%.not8.i151 = icmp eq i32 %398, 0, !dbg !18
|
655 |
+
%399 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %397, float %397) #4, !dbg !18
|
656 |
+
%400 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %397, float %397) #4, !dbg !18
|
657 |
+
%.08.i152 = select i1 %.not8.i151, float %400, float %399, !dbg !18
|
658 |
+
br i1 %366, label %401, label %__nv_erff.exit157, !dbg !18
|
659 |
+
|
660 |
+
401: ; preds = %__internal_fmad.exit.i138
|
661 |
+
%402 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18
|
662 |
+
%403 = fsub float 1.000000e+00, %402, !dbg !18
|
663 |
+
%404 = bitcast float %403 to i32, !dbg !18
|
664 |
+
%405 = bitcast float %40 to i32, !dbg !18
|
665 |
+
%406 = and i32 %405, -2147483648, !dbg !18
|
666 |
+
%407 = or i32 %406, %404, !dbg !18
|
667 |
+
%408 = bitcast i32 %407 to float, !dbg !18
|
668 |
+
br label %__nv_erff.exit157, !dbg !18
|
669 |
+
|
670 |
+
__nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %401
|
671 |
+
%r.0.i153 = phi float [ %408, %401 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18
|
672 |
+
%409 = fmul float %32, 5.000000e-01, !dbg !19
|
673 |
+
%410 = fmul float %31, 5.000000e-01, !dbg !19
|
674 |
+
%411 = fmul float %30, 5.000000e-01, !dbg !19
|
675 |
+
%412 = fmul float %29, 5.000000e-01, !dbg !19
|
676 |
+
%413 = fmul float %28, 5.000000e-01, !dbg !19
|
677 |
+
%414 = fmul float %27, 5.000000e-01, !dbg !19
|
678 |
+
%415 = fmul float %26, 5.000000e-01, !dbg !19
|
679 |
+
%416 = fmul float %25, 5.000000e-01, !dbg !19
|
680 |
+
%417 = fadd float %r.0.i, 1.000000e+00, !dbg !20
|
681 |
+
%418 = fadd float %r.0.i21, 1.000000e+00, !dbg !20
|
682 |
+
%419 = fadd float %r.0.i43, 1.000000e+00, !dbg !20
|
683 |
+
%420 = fadd float %r.0.i65, 1.000000e+00, !dbg !20
|
684 |
+
%421 = fadd float %r.0.i87, 1.000000e+00, !dbg !20
|
685 |
+
%422 = fadd float %r.0.i109, 1.000000e+00, !dbg !20
|
686 |
+
%423 = fadd float %r.0.i131, 1.000000e+00, !dbg !20
|
687 |
+
%424 = fadd float %r.0.i153, 1.000000e+00, !dbg !20
|
688 |
+
%425 = fmul float %416, %417, !dbg !21
|
689 |
+
%426 = fmul float %415, %418, !dbg !21
|
690 |
+
%427 = fmul float %414, %419, !dbg !21
|
691 |
+
%428 = fmul float %413, %420, !dbg !21
|
692 |
+
%429 = fmul float %412, %421, !dbg !21
|
693 |
+
%430 = fmul float %411, %422, !dbg !21
|
694 |
+
%431 = fmul float %410, %423, !dbg !21
|
695 |
+
%432 = fmul float %409, %424, !dbg !21
|
696 |
+
%433 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !22
|
697 |
+
%434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !23
|
698 |
+
%435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !23
|
699 |
+
%436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !23
|
700 |
+
%437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !23
|
701 |
+
%438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !23
|
702 |
+
%439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !23
|
703 |
+
%440 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !23
|
704 |
+
%441 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %432) #4, !dbg !23
|
705 |
+
%442 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !23
|
706 |
+
%443 = insertelement <2 x i16> %442, i16 %435, i64 1, !dbg !23
|
707 |
+
%444 = bitcast <2 x i16> %443 to i32, !dbg !23
|
708 |
+
%445 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !23
|
709 |
+
%446 = insertelement <2 x i16> %445, i16 %437, i64 1, !dbg !23
|
710 |
+
%447 = bitcast <2 x i16> %446 to i32, !dbg !23
|
711 |
+
%448 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !23
|
712 |
+
%449 = insertelement <2 x i16> %448, i16 %439, i64 1, !dbg !23
|
713 |
+
%450 = bitcast <2 x i16> %449 to i32, !dbg !23
|
714 |
+
%451 = insertelement <2 x i16> undef, i16 %440, i64 0, !dbg !23
|
715 |
+
%452 = insertelement <2 x i16> %451, i16 %441, i64 1, !dbg !23
|
716 |
+
%453 = bitcast <2 x i16> %452 to i32, !dbg !23
|
717 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %444, i32 %447, i32 %450, i32 %453, ptr addrspace(1) %433, i1 true) #4, !dbg !23
|
718 |
+
ret void, !dbg !24
|
719 |
+
}
|
720 |
+
|
721 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
722 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
723 |
+
|
724 |
+
; Function Attrs: alwaysinline nounwind
|
725 |
+
define float @__nv_erff(float %a) local_unnamed_addr #1 {
|
726 |
+
__nv_fabsf.exit:
|
727 |
+
%0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
728 |
+
%.not = icmp eq i32 %0, 0
|
729 |
+
%1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
|
730 |
+
%2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
|
731 |
+
%.0 = select i1 %.not, float %2, float %1
|
732 |
+
%3 = fcmp oge float %.0, 0x3FF00C1FC0000000
|
733 |
+
br i1 %3, label %__nv_fabsf.exit1, label %5
|
734 |
+
|
735 |
+
__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
|
736 |
+
%4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
737 |
+
%.not1 = icmp eq i32 %4, 0
|
738 |
+
%.01 = select i1 %.not1, float %2, float %1
|
739 |
+
br label %__internal_fmad.exit
|
740 |
+
|
741 |
+
5: ; preds = %__nv_fabsf.exit
|
742 |
+
%6 = fmul float %a, %a
|
743 |
+
br label %__internal_fmad.exit
|
744 |
+
|
745 |
+
__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
|
746 |
+
%7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
|
747 |
+
%8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
|
748 |
+
%9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
|
749 |
+
%10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
|
750 |
+
%11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
|
751 |
+
%12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
|
752 |
+
%13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
|
753 |
+
%14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
|
754 |
+
%15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
755 |
+
%.not2 = icmp eq i32 %15, 0
|
756 |
+
%16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
|
757 |
+
%17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
|
758 |
+
%.02 = select i1 %.not2, float %17, float %16
|
759 |
+
%18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
760 |
+
%.not3 = icmp eq i32 %18, 0
|
761 |
+
%19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
|
762 |
+
%20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
|
763 |
+
%.03 = select i1 %.not3, float %20, float %19
|
764 |
+
%21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
765 |
+
%.not4 = icmp eq i32 %21, 0
|
766 |
+
%22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
|
767 |
+
%23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
|
768 |
+
%.04 = select i1 %.not4, float %23, float %22
|
769 |
+
%24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
770 |
+
%.not5 = icmp eq i32 %24, 0
|
771 |
+
%25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
|
772 |
+
%26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
|
773 |
+
%.05 = select i1 %.not5, float %26, float %25
|
774 |
+
%27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
775 |
+
%.not6 = icmp eq i32 %27, 0
|
776 |
+
%28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
|
777 |
+
%29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
|
778 |
+
%.06 = select i1 %.not6, float %29, float %28
|
779 |
+
%30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
780 |
+
%.not7 = icmp eq i32 %30, 0
|
781 |
+
%31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
|
782 |
+
%32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
|
783 |
+
%.07 = select i1 %.not7, float %32, float %31
|
784 |
+
%33 = fneg float %14
|
785 |
+
%34 = select i1 %3, float %33, float %a
|
786 |
+
%35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
787 |
+
%.not8 = icmp eq i32 %35, 0
|
788 |
+
%36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
|
789 |
+
%37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
|
790 |
+
%.08 = select i1 %.not8, float %37, float %36
|
791 |
+
br i1 %3, label %38, label %46
|
792 |
+
|
793 |
+
38: ; preds = %__internal_fmad.exit
|
794 |
+
%39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
|
795 |
+
%40 = fsub float 1.000000e+00, %39
|
796 |
+
%41 = bitcast float %40 to i32
|
797 |
+
%42 = bitcast float %a to i32
|
798 |
+
%43 = and i32 %42, -2147483648
|
799 |
+
%44 = or i32 %43, %41
|
800 |
+
%45 = bitcast i32 %44 to float
|
801 |
+
br label %46
|
802 |
+
|
803 |
+
46: ; preds = %38, %__internal_fmad.exit
|
804 |
+
%r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
|
805 |
+
ret float %r.0
|
806 |
+
}
|
807 |
+
|
808 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
|
809 |
+
|
810 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
811 |
+
declare float @llvm.nvvm.fabs.ftz.f(float) #0
|
812 |
+
|
813 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
814 |
+
declare float @llvm.nvvm.fabs.f(float) #0
|
815 |
+
|
816 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
817 |
+
declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
|
818 |
+
|
819 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
820 |
+
declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
|
821 |
+
|
822 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
823 |
+
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
|
824 |
+
|
825 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
826 |
+
attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
827 |
+
attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
828 |
+
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
829 |
+
attributes #4 = { nounwind }
|
830 |
+
|
831 |
+
!llvm.module.flags = !{!0, !1}
|
832 |
+
!llvm.dbg.cu = !{!2}
|
833 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
834 |
+
!llvm.ident = !{!6}
|
835 |
+
|
836 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
837 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
838 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
839 |
+
!3 = !DIFile(filename: "cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py", directory: "/tmp/torchinductor_root/jf")
|
840 |
+
!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
841 |
+
!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
842 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
843 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
844 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
845 |
+
!9 = !{}
|
846 |
+
!10 = !DILocation(line: 21, column: 36, scope: !7)
|
847 |
+
!11 = !DILocation(line: 20, column: 28, scope: !7)
|
848 |
+
!12 = !DILocation(line: 20, column: 33, scope: !7)
|
849 |
+
!13 = !DILocation(line: 21, column: 23, scope: !7)
|
850 |
+
!14 = !DILocation(line: 24, column: 30, scope: !7)
|
851 |
+
!15 = !DILocation(line: 24, column: 35, scope: !7)
|
852 |
+
!16 = !DILocation(line: 24, column: 44, scope: !7)
|
853 |
+
!17 = !DILocation(line: 29, column: 18, scope: !7)
|
854 |
+
!18 = !DILocation(line: 30, column: 23, scope: !7)
|
855 |
+
!19 = !DILocation(line: 27, column: 18, scope: !7)
|
856 |
+
!20 = !DILocation(line: 32, column: 18, scope: !7)
|
857 |
+
!21 = !DILocation(line: 33, column: 18, scope: !7)
|
858 |
+
!22 = !DILocation(line: 35, column: 25, scope: !7)
|
859 |
+
!23 = !DILocation(line: 35, column: 37, scope: !7)
|
860 |
+
!24 = !DILocation(line: 35, column: 4, scope: !7)
|
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
|
4 |
+
%cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
|
5 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
|
6 |
+
%c1024_i32 = arith.constant 1024 : i32
|
7 |
+
%0 = tt.get_program_id x : i32
|
8 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
9 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
10 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
11 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
12 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
13 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
14 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
15 |
+
%8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
|
16 |
+
%9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
|
17 |
+
%10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
|
18 |
+
%11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
|
19 |
+
%12 = arith.addf %11, %cst : tensor<1024xf32>
|
20 |
+
%13 = arith.mulf %9, %12 : tensor<1024xf32>
|
21 |
+
%14 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
22 |
+
%15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
23 |
+
%16 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
|
24 |
+
tt.store %15, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
|
25 |
+
tt.return
|
26 |
+
}
|
27 |
+
}
|
.triton/dump/4993935f9a0e5939755cfb42600362cf/triton_.llir
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
5 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%5 = shl i32 %4, 1, !dbg !8
|
7 |
+
%6 = and i32 %5, 510, !dbg !8
|
8 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%8 = shl i32 %7, 9, !dbg !10
|
10 |
+
%9 = or i32 %8, %6, !dbg !11
|
11 |
+
%10 = sext i32 %9 to i64, !dbg !12
|
12 |
+
%11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
|
13 |
+
%12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
|
14 |
+
%13 = extractvalue { i32, i32 } %12, 0, !dbg !13
|
15 |
+
%14 = extractvalue { i32, i32 } %12, 1, !dbg !13
|
16 |
+
%15 = bitcast i32 %13 to float, !dbg !13
|
17 |
+
%16 = bitcast i32 %14 to float, !dbg !13
|
18 |
+
%17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
|
19 |
+
%18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15
|
20 |
+
%19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15
|
21 |
+
%20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15
|
22 |
+
%21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15
|
23 |
+
%22 = bitcast <2 x i16> %21 to i32, !dbg !15
|
24 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15
|
25 |
+
ret void, !dbg !16
|
26 |
+
}
|
27 |
+
|
28 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
29 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
30 |
+
|
31 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
32 |
+
attributes #1 = { nounwind }
|
33 |
+
|
34 |
+
!llvm.module.flags = !{!0}
|
35 |
+
!llvm.dbg.cu = !{!1}
|
36 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
37 |
+
|
38 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
39 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
40 |
+
!2 = !DIFile(filename: "czjxjqxojsyyr4zmce6q6twysnucw6p4l5ujgp6ts2ecrm3ue3ex.py", directory: "/tmp/torchinductor_root/zj")
|
41 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
42 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
|
43 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
44 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
45 |
+
!7 = !{}
|
46 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
47 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
48 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
49 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
50 |
+
!12 = !DILocation(line: 24, column: 30, scope: !5)
|
51 |
+
!13 = !DILocation(line: 24, column: 35, scope: !5)
|
52 |
+
!14 = !DILocation(line: 26, column: 25, scope: !5)
|
53 |
+
!15 = !DILocation(line: 26, column: 36, scope: !5)
|
54 |
+
!16 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin
ADDED
Binary file (13.7 kB). View file
|
|
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin
ADDED
Binary file (19.5 kB). View file
|
|
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c512_i32 = arith.constant 512 : i32
|
4 |
+
%c256_i32 = arith.constant 256 : i32
|
5 |
+
%cst = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_0 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_1 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_3 = arith.constant dense<256> : tensor<1xi64>
|
10 |
+
%cst_4 = arith.constant dense<50257> : tensor<1xi64>
|
11 |
+
%cst_5 = arith.constant dense<0> : tensor<1xi64>
|
12 |
+
%cst_6 = arith.constant dense<256> : tensor<256xi32>
|
13 |
+
%0 = tt.get_program_id x : i32
|
14 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
15 |
+
%2 = arith.cmpi slt, %1, %cst_6 : tensor<256xi32>
|
16 |
+
%3 = arith.remsi %0, %c512_i32 : i32
|
17 |
+
%4 = tt.addptr %arg1, %0 : !tt.ptr<i64, 1>, i32
|
18 |
+
%5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>>
|
19 |
+
%6 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64>
|
20 |
+
%7 = arith.muli %3, %c256_i32 : i32
|
21 |
+
%8 = tt.splat %7 : (i32) -> tensor<256xi32>
|
22 |
+
%9 = arith.addi %1, %8 : tensor<256xi32>
|
23 |
+
%10 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
24 |
+
%11 = tt.addptr %10, %9 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
25 |
+
%12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
26 |
+
%13 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
27 |
+
%14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
28 |
+
%15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
29 |
+
%16 = arith.addi %6, %cst_4 : tensor<1xi64>
|
30 |
+
%17 = arith.cmpi slt, %6, %cst_5 : tensor<1xi64>
|
31 |
+
%18 = arith.select %17, %16, %6 : tensor<1xi1>, tensor<1xi64>
|
32 |
+
%19 = arith.cmpi sge, %18, %cst_5 : tensor<1xi64>
|
33 |
+
%20 = arith.cmpi slt, %18, %cst_4 : tensor<1xi64>
|
34 |
+
%21 = arith.andi %19, %20 : tensor<1xi1>
|
35 |
+
tt.assert %21, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1>
|
36 |
+
%22 = arith.muli %18, %cst_3 : tensor<1xi64>
|
37 |
+
%23 = tt.broadcast %22 : (tensor<1xi64>) -> tensor<256xi64>
|
38 |
+
%24 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64>
|
39 |
+
%25 = arith.addi %24, %23 : tensor<256xi64>
|
40 |
+
%26 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
41 |
+
%27 = tt.addptr %26, %25 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi64>
|
42 |
+
%28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
43 |
+
%29 = arith.addf %28, %12 : tensor<256xf32>
|
44 |
+
%30 = arith.select %2, %29, %cst_2 : tensor<256xi1>, tensor<256xf32>
|
45 |
+
%31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
|
46 |
+
^bb0(%arg10: f32, %arg11: f32):
|
47 |
+
%60 = arith.addf %arg10, %arg11 : f32
|
48 |
+
tt.reduce.return %60 : f32
|
49 |
+
}) : (tensor<256xf32>) -> f32
|
50 |
+
%32 = arith.addf %31, %cst : f32
|
51 |
+
%33 = arith.divf %32, %cst_0 : f32
|
52 |
+
%34 = tt.splat %33 : (f32) -> tensor<1xf32>
|
53 |
+
%35 = tt.splat %33 : (f32) -> tensor<256xf32>
|
54 |
+
%36 = arith.subf %29, %35 : tensor<256xf32>
|
55 |
+
%37 = arith.mulf %36, %36 : tensor<256xf32>
|
56 |
+
%38 = arith.select %2, %37, %cst_2 : tensor<256xi1>, tensor<256xf32>
|
57 |
+
%39 = "tt.reduce"(%38) <{axis = 0 : i32}> ({
|
58 |
+
^bb0(%arg10: f32, %arg11: f32):
|
59 |
+
%60 = arith.addf %arg10, %arg11 : f32
|
60 |
+
tt.reduce.return %60 : f32
|
61 |
+
}) : (tensor<256xf32>) -> f32
|
62 |
+
%40 = arith.addf %39, %cst : f32
|
63 |
+
%41 = arith.divf %40, %cst_0 : f32
|
64 |
+
%42 = arith.addf %41, %cst_1 : f32
|
65 |
+
%43 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
66 |
+
%44 = tt.splat %43 : (f32) -> tensor<1xf32>
|
67 |
+
%45 = tt.splat %43 : (f32) -> tensor<256xf32>
|
68 |
+
%46 = arith.mulf %36, %45 : tensor<256xf32>
|
69 |
+
%47 = arith.mulf %46, %15 : tensor<256xf32>
|
70 |
+
%48 = arith.muli %0, %c256_i32 : i32
|
71 |
+
%49 = tt.splat %48 : (i32) -> tensor<256xi32>
|
72 |
+
%50 = arith.addi %1, %49 : tensor<256xi32>
|
73 |
+
%51 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
74 |
+
%52 = tt.addptr %51, %50 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
75 |
+
tt.store %52, %29, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
76 |
+
gpu.barrier
|
77 |
+
%53 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
78 |
+
%54 = tt.splat %53 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
79 |
+
tt.store %54, %44 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
|
80 |
+
%55 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
81 |
+
%56 = tt.addptr %55, %50 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
82 |
+
%57 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16>
|
83 |
+
tt.store %56, %57, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
84 |
+
%58 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
|
85 |
+
%59 = tt.splat %58 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
86 |
+
tt.store %59, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
|
87 |
+
tt.return
|
88 |
+
}
|
89 |
+
}
|
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin
ADDED
Binary file (5.16 kB). View file
|
|
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
5 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%5 = shl i32 %4, 1, !dbg !8
|
7 |
+
%6 = and i32 %5, 510, !dbg !8
|
8 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%8 = shl i32 %7, 9, !dbg !10
|
10 |
+
%9 = or i32 %8, %6, !dbg !11
|
11 |
+
%10 = icmp slt i32 %9, 12865792, !dbg !12
|
12 |
+
%11 = sext i32 %9 to i64, !dbg !13
|
13 |
+
%12 = getelementptr i16, ptr addrspace(1) %0, i64 %11, !dbg !13
|
14 |
+
%13 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %12, i1 %10) #1, !dbg !14
|
15 |
+
%14 = trunc i32 %13 to i16, !dbg !14
|
16 |
+
%extelt.offset = lshr i32 %13, 16, !dbg !14
|
17 |
+
%15 = trunc i32 %extelt.offset to i16, !dbg !14
|
18 |
+
%16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !15
|
19 |
+
%17 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %15) #1, !dbg !15
|
20 |
+
%18 = getelementptr float, ptr addrspace(1) %1, i64 %11, !dbg !16
|
21 |
+
%19 = bitcast float %16 to i32, !dbg !17
|
22 |
+
%20 = bitcast float %17 to i32, !dbg !17
|
23 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %19, i32 %20, ptr addrspace(1) %18, i1 %10) #1, !dbg !17
|
24 |
+
ret void, !dbg !18
|
25 |
+
}
|
26 |
+
|
27 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
28 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
29 |
+
|
30 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
31 |
+
attributes #1 = { nounwind }
|
32 |
+
|
33 |
+
!llvm.module.flags = !{!0}
|
34 |
+
!llvm.dbg.cu = !{!1}
|
35 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
36 |
+
|
37 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
38 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
39 |
+
!2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
|
40 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
41 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
|
42 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
43 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
44 |
+
!7 = !{}
|
45 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
46 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
47 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
48 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
49 |
+
!12 = !DILocation(line: 22, column: 21, scope: !5)
|
50 |
+
!13 = !DILocation(line: 24, column: 30, scope: !5)
|
51 |
+
!14 = !DILocation(line: 24, column: 35, scope: !5)
|
52 |
+
!15 = !DILocation(line: 24, column: 45, scope: !5)
|
53 |
+
!16 = !DILocation(line: 26, column: 25, scope: !5)
|
54 |
+
!17 = !DILocation(line: 26, column: 36, scope: !5)
|
55 |
+
!18 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = shl i32 %3, 2, !dbg !8
|
7 |
+
%5 = and i32 %4, 508, !dbg !8
|
8 |
+
%6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%7 = shl i32 %6, 10, !dbg !10
|
10 |
+
%8 = or i32 %7, %5, !dbg !11
|
11 |
+
%9 = or i32 %8, 512, !dbg !11
|
12 |
+
%10 = sext i32 %8 to i64, !dbg !12
|
13 |
+
%11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
|
14 |
+
%12 = sext i32 %9 to i64, !dbg !12
|
15 |
+
%13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !12
|
16 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %11, i1 true) #1, !dbg !13
|
17 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 true) #1, !dbg !13
|
18 |
+
ret void, !dbg !14
|
19 |
+
}
|
20 |
+
|
21 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
22 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
23 |
+
|
24 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
25 |
+
attributes #1 = { nounwind }
|
26 |
+
|
27 |
+
!llvm.module.flags = !{!0}
|
28 |
+
!llvm.dbg.cu = !{!1}
|
29 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
30 |
+
|
31 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
32 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
33 |
+
!2 = !DIFile(filename: "c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py", directory: "/tmp/torchinductor_root/7w")
|
34 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
35 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
|
36 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
37 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
38 |
+
!7 = !{}
|
39 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
40 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
41 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
42 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
43 |
+
!12 = !DILocation(line: 25, column: 25, scope: !5)
|
44 |
+
!13 = !DILocation(line: 25, column: 36, scope: !5)
|
45 |
+
!14 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<512> : tensor<128xi32, #blocked>
|
5 |
+
%c128_i32 = arith.constant 128 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c128_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<128xi32, #blocked>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<128xi32, #blocked>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<128xi32, #blocked>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<128x!tt.ptr<i64, 1>, #blocked>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<128x!tt.ptr<i64, 1>, #blocked>, tensor<128xi32, #blocked>
|
14 |
+
%8 = arith.extsi %4 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked>
|
15 |
+
tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i64 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = shl i32 %3, 2, !dbg !8
|
7 |
+
%5 = and i32 %4, 508, !dbg !8
|
8 |
+
%6 = or i32 %5, 512, !dbg !8
|
9 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
10 |
+
%8 = sext i32 %7 to i64, !dbg !10
|
11 |
+
%9 = shl nsw i64 %8, 10, !dbg !11
|
12 |
+
%10 = zext nneg i32 %5 to i64
|
13 |
+
%11 = zext nneg i32 %6 to i64
|
14 |
+
%12 = or i64 %9, %10, !dbg !12
|
15 |
+
%13 = or i64 %9, %11, !dbg !12
|
16 |
+
%14 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
|
17 |
+
%15 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !13
|
18 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %14, i1 true) #1, !dbg !14
|
19 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 true) #1, !dbg !14
|
20 |
+
ret void, !dbg !15
|
21 |
+
}
|
22 |
+
|
23 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
24 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
25 |
+
|
26 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
27 |
+
attributes #1 = { nounwind }
|
28 |
+
|
29 |
+
!llvm.module.flags = !{!0}
|
30 |
+
!llvm.dbg.cu = !{!1}
|
31 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
32 |
+
|
33 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
34 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
35 |
+
!2 = !DIFile(filename: "cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py", directory: "/tmp/torchinductor_root/pk")
|
36 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
37 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
|
38 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
39 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
40 |
+
!7 = !{}
|
41 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
42 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
43 |
+
!10 = !DILocation(line: 20, column: 34, scope: !5)
|
44 |
+
!11 = !DILocation(line: 20, column: 46, scope: !5)
|
45 |
+
!12 = !DILocation(line: 21, column: 23, scope: !5)
|
46 |
+
!13 = !DILocation(line: 25, column: 25, scope: !5)
|
47 |
+
!14 = !DILocation(line: 25, column: 36, scope: !5)
|
48 |
+
!15 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
|
5 |
+
%c1024_i64 = arith.constant 1024 : i64
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.extsi %0 : i32 to i64
|
8 |
+
%2 = arith.muli %1, %c1024_i64 : i64
|
9 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
10 |
+
%4 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked>
|
11 |
+
%5 = tt.splat %2 : (i64) -> tensor<1024xi64, #blocked>
|
12 |
+
%6 = arith.addi %5, %4 : tensor<1024xi64, #blocked>
|
13 |
+
%7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
|
14 |
+
%8 = tt.addptr %7, %6 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi64, #blocked>
|
15 |
+
tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin
ADDED
Binary file (4.78 kB). View file
|
|
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<512> : tensor<256xi32, #blocked>
|
5 |
+
%c256_i32 = arith.constant 256 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c256_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<256xi32, #blocked>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<256xi32, #blocked>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<256x!tt.ptr<i64, 1>, #blocked>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<256x!tt.ptr<i64, 1>, #blocked>, tensor<256xi32, #blocked>
|
14 |
+
%8 = arith.extsi %4 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
|
15 |
+
tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<12865792> : tensor<1024xi32, #blocked1>
|
7 |
+
%c1024_i32 = arith.constant 1024 : i32
|
8 |
+
%0 = tt.get_program_id x : i32
|
9 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
10 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
11 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
|
12 |
+
%4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
13 |
+
%5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
|
14 |
+
%6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
|
15 |
+
%7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
|
16 |
+
%8 = arith.cmpi slt, %6, %cst : tensor<1024xi32, #blocked>
|
17 |
+
%9 = arith.cmpi slt, %7, %cst_0 : tensor<1024xi32, #blocked1>
|
18 |
+
%10 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
19 |
+
%11 = tt.addptr %10, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
20 |
+
%12 = tt.load %11, %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
21 |
+
%13 = triton_gpu.convert_layout %12 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
|
22 |
+
%14 = arith.extf %13 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
|
23 |
+
%15 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
|
24 |
+
%16 = tt.addptr %15, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
|
25 |
+
tt.store %16, %14, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
|
26 |
+
tt.return
|
27 |
+
}
|
28 |
+
}
|
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<12865792> : tensor<1024xi32>
|
4 |
+
%c1024_i32 = arith.constant 1024 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
10 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<1024xi32>
|
11 |
+
%6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
12 |
+
%7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
13 |
+
%8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
14 |
+
%9 = arith.extf %8 : tensor<1024xbf16> to tensor<1024xf32>
|
15 |
+
%10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
16 |
+
%11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
17 |
+
tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
18 |
+
tt.return
|
19 |
+
}
|
20 |
+
}
|
.triton/dump/962d1809855a53123762906133b1d960/triton_.llir
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = shl i32 %3, 2, !dbg !8
|
7 |
+
%5 = and i32 %4, 508, !dbg !8
|
8 |
+
%6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%7 = shl i32 %6, 10, !dbg !10
|
10 |
+
%8 = or i32 %7, %5, !dbg !11
|
11 |
+
%9 = or i32 %8, 512, !dbg !11
|
12 |
+
%10 = icmp slt i32 %8, 12865792, !dbg !12
|
13 |
+
%11 = icmp slt i32 %9, 12865792, !dbg !12
|
14 |
+
%12 = sext i32 %8 to i64, !dbg !13
|
15 |
+
%13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
|
16 |
+
%14 = sext i32 %9 to i64, !dbg !13
|
17 |
+
%15 = getelementptr float, ptr addrspace(1) %0, i64 %14, !dbg !13
|
18 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 %10) #1, !dbg !14
|
19 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 %11) #1, !dbg !14
|
20 |
+
ret void, !dbg !15
|
21 |
+
}
|
22 |
+
|
23 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
24 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
25 |
+
|
26 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
27 |
+
attributes #1 = { nounwind }
|
28 |
+
|
29 |
+
!llvm.module.flags = !{!0}
|
30 |
+
!llvm.dbg.cu = !{!1}
|
31 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
32 |
+
|
33 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
34 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
35 |
+
!2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y")
|
36 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
37 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
|
38 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
39 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
40 |
+
!7 = !{}
|
41 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
42 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
43 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
44 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
45 |
+
!12 = !DILocation(line: 22, column: 21, scope: !5)
|
46 |
+
!13 = !DILocation(line: 25, column: 25, scope: !5)
|
47 |
+
!14 = !DILocation(line: 25, column: 36, scope: !5)
|
48 |
+
!15 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin
ADDED
Binary file (49.4 kB). View file
|
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx
ADDED
@@ -0,0 +1,771 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7d8d9d10de11de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7,
|
21 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8,
|
22 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9,
|
23 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10,
|
24 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11
|
25 |
+
)
|
26 |
+
.maxntid 128, 1, 1
|
27 |
+
{
|
28 |
+
.reg .pred %p<38>;
|
29 |
+
.reg .b16 %rs<13>;
|
30 |
+
.reg .b32 %r<135>;
|
31 |
+
.reg .f32 %f<103>;
|
32 |
+
.reg .b64 %rd<41>;
|
33 |
+
.loc 1 18 0
|
34 |
+
$L__func_begin0:
|
35 |
+
.loc 1 18 0
|
36 |
+
|
37 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9];
|
38 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8];
|
39 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5];
|
40 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2];
|
41 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1];
|
42 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0];
|
43 |
+
$L__tmp0:
|
44 |
+
.loc 1 22 44
|
45 |
+
mov.u32 %r1, %tid.x;
|
46 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3];
|
47 |
+
shl.b32 %r17, %r1, 2;
|
48 |
+
ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4];
|
49 |
+
and.b32 %r18, %r17, 60;
|
50 |
+
bfe.u32 %r19, %r1, 5, 2;
|
51 |
+
ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6];
|
52 |
+
bfe.u32 %r20, %r1, 1, 4;
|
53 |
+
ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7];
|
54 |
+
shl.b32 %r21, %r19, 4;
|
55 |
+
or.b32 %r2, %r21, %r20;
|
56 |
+
.loc 1 24 33
|
57 |
+
and.b32 %r22, %r17, 4;
|
58 |
+
bfe.u32 %r23, %r1, 4, 1;
|
59 |
+
shl.b32 %r24, %r19, 1;
|
60 |
+
or.b32 %r25, %r24, %r23;
|
61 |
+
.loc 1 21 28
|
62 |
+
mov.u32 %r15, %ctaid.x;
|
63 |
+
.loc 1 21 33
|
64 |
+
shl.b32 %r3, %r15, 6;
|
65 |
+
.loc 1 22 23
|
66 |
+
or.b32 %r26, %r3, %r18;
|
67 |
+
or.b32 %r27, %r3, %r2;
|
68 |
+
.loc 1 26 20
|
69 |
+
shr.s32 %r29, %r26, 31;
|
70 |
+
shr.u32 %r30, %r29, 24;
|
71 |
+
add.s32 %r31, %r26, %r30;
|
72 |
+
shr.s32 %r32, %r31, 8;
|
73 |
+
bfe.s32 %r33, %r15, 25, 1;
|
74 |
+
shr.u32 %r34, %r33, 24;
|
75 |
+
add.s32 %r35, %r27, %r34;
|
76 |
+
shr.s32 %r36, %r35, 8;
|
77 |
+
.loc 1 37 44
|
78 |
+
shl.b32 %r37, %r36, 7;
|
79 |
+
mul.lo.s32 %r38, %r18, 12;
|
80 |
+
or.b32 %r39, %r25, %r38;
|
81 |
+
shl.b32 %r40, %r39, 1;
|
82 |
+
mov.u32 %r41, global_smem;
|
83 |
+
add.s32 %r4, %r41, %r40;
|
84 |
+
mad.lo.s32 %r42, %r2, 12, %r22;
|
85 |
+
shl.b32 %r43, %r42, 1;
|
86 |
+
add.s32 %r6, %r41, %r43;
|
87 |
+
shl.b32 %r44, %r39, 2;
|
88 |
+
add.s32 %r7, %r41, %r44;
|
89 |
+
shl.b32 %r45, %r42, 2;
|
90 |
+
add.s32 %r9, %r41, %r45;
|
91 |
+
.loc 1 30 36
|
92 |
+
mad.lo.s32 %r46, %r32, 32512, %r26;
|
93 |
+
shl.b32 %r47, %r19, 9;
|
94 |
+
add.s32 %r48, %r46, %r47;
|
95 |
+
shl.b32 %r49, %r23, 8;
|
96 |
+
add.s32 %r133, %r48, %r49;
|
97 |
+
or.b32 %r50, %r37, %r22;
|
98 |
+
mul.wide.s32 %rd23, %r50, 4;
|
99 |
+
add.s64 %rd40, %rd22, %rd23;
|
100 |
+
add.s64 %rd39, %rd21, %rd23;
|
101 |
+
add.s64 %rd38, %rd20, %rd23;
|
102 |
+
add.s64 %rd37, %rd19, %rd23;
|
103 |
+
mov.f32 %f95, 0f00000000;
|
104 |
+
mov.b32 %r134, -8;
|
105 |
+
mov.pred %p1, -1;
|
106 |
+
mov.f32 %f96, %f95;
|
107 |
+
mov.f32 %f97, %f95;
|
108 |
+
mov.f32 %f98, %f95;
|
109 |
+
mov.f32 %f99, %f95;
|
110 |
+
mov.f32 %f100, %f95;
|
111 |
+
mov.f32 %f101, %f95;
|
112 |
+
mov.f32 %f102, %f95;
|
113 |
+
$L__BB0_1:
|
114 |
+
.loc 1 34 34
|
115 |
+
mul.wide.s32 %rd32, %r133, 2;
|
116 |
+
add.s64 %rd24, %rd13, %rd32;
|
117 |
+
mov.b32 %r53, 0;
|
118 |
+
.loc 1 34 63
|
119 |
+
mov.u32 %r51, 0x0;
|
120 |
+
mov.u32 %r52, 0x0;
|
121 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r51, %r52 }, [ %rd24 + 0 ];
|
122 |
+
@!%p1 mov.u32 %r51, %r53;
|
123 |
+
@!%p1 mov.u32 %r52, %r53;
|
124 |
+
shr.u32 %r115, %r51, 16;
|
125 |
+
shr.u32 %r116, %r52, 16;
|
126 |
+
.loc 1 34 115
|
127 |
+
bar.sync 0;
|
128 |
+
st.shared.u16 [%r4], %r51;
|
129 |
+
st.shared.u16 [%r4+24], %r115;
|
130 |
+
st.shared.u16 [%r4+48], %r52;
|
131 |
+
st.shared.u16 [%r4+72], %r116;
|
132 |
+
bar.sync 0;
|
133 |
+
ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%r6];
|
134 |
+
cvt.f32.bf16 %r55, %rs1;
|
135 |
+
mov.b32 %f25, %r55;
|
136 |
+
cvt.f32.bf16 %r56, %rs2;
|
137 |
+
mov.b32 %f26, %r56;
|
138 |
+
cvt.f32.bf16 %r57, %rs3;
|
139 |
+
mov.b32 %f27, %r57;
|
140 |
+
cvt.f32.bf16 %r58, %rs4;
|
141 |
+
mov.b32 %f28, %r58;
|
142 |
+
.loc 1 35 34
|
143 |
+
mul.wide.s32 %rd33, %r133, 4;
|
144 |
+
add.s64 %rd25, %rd14, %rd33;
|
145 |
+
.loc 1 35 63
|
146 |
+
mov.u32 %r59, 0x0;
|
147 |
+
mov.u32 %r60, 0x0;
|
148 |
+
mov.u32 %r61, 0x0;
|
149 |
+
mov.u32 %r62, 0x0;
|
150 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r59, %r60, %r61, %r62 }, [ %rd25 + 0 ];
|
151 |
+
@!%p1 mov.u32 %r59, %r53;
|
152 |
+
@!%p1 mov.u32 %r60, %r53;
|
153 |
+
@!%p1 mov.u32 %r61, %r53;
|
154 |
+
@!%p1 mov.u32 %r62, %r53;
|
155 |
+
mov.b32 %f29, %r59;
|
156 |
+
mov.b32 %f30, %r60;
|
157 |
+
mov.b32 %f31, %r61;
|
158 |
+
mov.b32 %f32, %r62;
|
159 |
+
bar.sync 0;
|
160 |
+
st.shared.u32 [%r7], %r59;
|
161 |
+
st.shared.u32 [%r7+48], %r60;
|
162 |
+
st.shared.u32 [%r7+96], %r61;
|
163 |
+
st.shared.u32 [%r7+144], %r62;
|
164 |
+
bar.sync 0;
|
165 |
+
ld.shared.v4.f32 {%f33, %f34, %f35, %f36}, [%r9];
|
166 |
+
.loc 1 36 34
|
167 |
+
add.s64 %rd26, %rd15, %rd32;
|
168 |
+
.loc 1 36 63
|
169 |
+
mov.u32 %r67, 0x0;
|
170 |
+
mov.u32 %r68, 0x0;
|
171 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r67, %r68 }, [ %rd26 + 0 ];
|
172 |
+
@!%p1 mov.u32 %r67, %r53;
|
173 |
+
@!%p1 mov.u32 %r68, %r53;
|
174 |
+
cvt.u16.u32 %rs5, %r67;
|
175 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r67; }
|
176 |
+
cvt.u16.u32 %rs7, %r68;
|
177 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r68; }
|
178 |
+
.loc 1 36 115
|
179 |
+
cvt.f32.bf16 %r71, %rs5;
|
180 |
+
mov.b32 %f37, %r71;
|
181 |
+
cvt.f32.bf16 %r72, %rs6;
|
182 |
+
mov.b32 %f38, %r72;
|
183 |
+
cvt.f32.bf16 %r73, %rs7;
|
184 |
+
mov.b32 %f39, %r73;
|
185 |
+
cvt.f32.bf16 %r74, %rs8;
|
186 |
+
mov.b32 %f40, %r74;
|
187 |
+
.loc 1 37 50
|
188 |
+
mov.u32 %r75, 0x0;
|
189 |
+
mov.u32 %r76, 0x0;
|
190 |
+
mov.u32 %r77, 0x0;
|
191 |
+
mov.u32 %r78, 0x0;
|
192 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r75, %r76, %r77, %r78 }, [ %rd37 + 0 ];
|
193 |
+
@!%p1 mov.u32 %r75, %r53;
|
194 |
+
@!%p1 mov.u32 %r76, %r53;
|
195 |
+
@!%p1 mov.u32 %r77, %r53;
|
196 |
+
@!%p1 mov.u32 %r78, %r53;
|
197 |
+
.loc 1 38 50
|
198 |
+
mov.u32 %r83, 0x0;
|
199 |
+
mov.u32 %r84, 0x0;
|
200 |
+
mov.u32 %r85, 0x0;
|
201 |
+
mov.u32 %r86, 0x0;
|
202 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r83, %r84, %r85, %r86 }, [ %rd38 + 0 ];
|
203 |
+
@!%p1 mov.u32 %r83, %r53;
|
204 |
+
@!%p1 mov.u32 %r84, %r53;
|
205 |
+
@!%p1 mov.u32 %r85, %r53;
|
206 |
+
@!%p1 mov.u32 %r86, %r53;
|
207 |
+
.loc 1 39 35
|
208 |
+
add.s64 %rd29, %rd16, %rd32;
|
209 |
+
.loc 1 39 64
|
210 |
+
mov.u32 %r91, 0x0;
|
211 |
+
mov.u32 %r92, 0x0;
|
212 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r91, %r92 }, [ %rd29 + 0 ];
|
213 |
+
@!%p1 mov.u32 %r91, %r53;
|
214 |
+
@!%p1 mov.u32 %r92, %r53;
|
215 |
+
shr.u32 %r117, %r91, 16;
|
216 |
+
shr.u32 %r118, %r92, 16;
|
217 |
+
.loc 1 39 116
|
218 |
+
bar.sync 0;
|
219 |
+
st.shared.u16 [%r4], %r91;
|
220 |
+
st.shared.u16 [%r4+24], %r117;
|
221 |
+
st.shared.u16 [%r4+48], %r92;
|
222 |
+
st.shared.u16 [%r4+72], %r118;
|
223 |
+
bar.sync 0;
|
224 |
+
ld.shared.v4.u16 {%rs9, %rs10, %rs11, %rs12}, [%r6];
|
225 |
+
cvt.f32.bf16 %r95, %rs9;
|
226 |
+
mov.b32 %f41, %r95;
|
227 |
+
cvt.f32.bf16 %r96, %rs10;
|
228 |
+
mov.b32 %f42, %r96;
|
229 |
+
cvt.f32.bf16 %r97, %rs11;
|
230 |
+
mov.b32 %f43, %r97;
|
231 |
+
cvt.f32.bf16 %r98, %rs12;
|
232 |
+
mov.b32 %f44, %r98;
|
233 |
+
.loc 1 40 51
|
234 |
+
mov.u32 %r99, 0x0;
|
235 |
+
mov.u32 %r100, 0x0;
|
236 |
+
mov.u32 %r101, 0x0;
|
237 |
+
mov.u32 %r102, 0x0;
|
238 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd39 + 0 ];
|
239 |
+
@!%p1 mov.u32 %r99, %r53;
|
240 |
+
@!%p1 mov.u32 %r100, %r53;
|
241 |
+
@!%p1 mov.u32 %r101, %r53;
|
242 |
+
@!%p1 mov.u32 %r102, %r53;
|
243 |
+
.loc 1 41 51
|
244 |
+
mov.u32 %r107, 0x0;
|
245 |
+
mov.u32 %r108, 0x0;
|
246 |
+
mov.u32 %r109, 0x0;
|
247 |
+
mov.u32 %r110, 0x0;
|
248 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd40 + 0 ];
|
249 |
+
@!%p1 mov.u32 %r107, %r53;
|
250 |
+
@!%p1 mov.u32 %r108, %r53;
|
251 |
+
@!%p1 mov.u32 %r109, %r53;
|
252 |
+
@!%p1 mov.u32 %r110, %r53;
|
253 |
+
.loc 1 44 22
|
254 |
+
add.f32 %f45, %f37, %f29;
|
255 |
+
add.f32 %f46, %f38, %f30;
|
256 |
+
add.f32 %f47, %f39, %f31;
|
257 |
+
add.f32 %f48, %f40, %f32;
|
258 |
+
bar.sync 0;
|
259 |
+
st.shared.f32 [%r7], %f45;
|
260 |
+
st.shared.f32 [%r7+48], %f46;
|
261 |
+
st.shared.f32 [%r7+96], %f47;
|
262 |
+
st.shared.f32 [%r7+144], %f48;
|
263 |
+
bar.sync 0;
|
264 |
+
ld.shared.v4.f32 {%f49, %f50, %f51, %f52}, [%r9];
|
265 |
+
.loc 1 40 51
|
266 |
+
mov.b32 %f53, %r75;
|
267 |
+
mov.b32 %f54, %r76;
|
268 |
+
mov.b32 %f55, %r77;
|
269 |
+
mov.b32 %f56, %r78;
|
270 |
+
mov.b32 %f57, %r99;
|
271 |
+
mov.b32 %f58, %r100;
|
272 |
+
mov.b32 %f59, %r101;
|
273 |
+
mov.b32 %f60, %r102;
|
274 |
+
.loc 1 41 51
|
275 |
+
mov.b32 %f61, %r110;
|
276 |
+
mov.b32 %f62, %r109;
|
277 |
+
mov.b32 %f63, %r108;
|
278 |
+
mov.b32 %f64, %r107;
|
279 |
+
mov.b32 %f65, %r86;
|
280 |
+
mov.b32 %f66, %r85;
|
281 |
+
mov.b32 %f67, %r84;
|
282 |
+
mov.b32 %f68, %r83;
|
283 |
+
.loc 1 52 23
|
284 |
+
sub.f32 %f69, %f36, %f60;
|
285 |
+
sub.f32 %f70, %f35, %f59;
|
286 |
+
sub.f32 %f71, %f34, %f58;
|
287 |
+
sub.f32 %f72, %f33, %f57;
|
288 |
+
sub.f32 %f73, %f52, %f56;
|
289 |
+
sub.f32 %f74, %f51, %f55;
|
290 |
+
sub.f32 %f75, %f50, %f54;
|
291 |
+
sub.f32 %f76, %f49, %f53;
|
292 |
+
.loc 1 53 24
|
293 |
+
mul.f32 %f77, %f76, %f68;
|
294 |
+
mul.f32 %f78, %f75, %f67;
|
295 |
+
mul.f32 %f79, %f74, %f66;
|
296 |
+
mul.f32 %f80, %f73, %f65;
|
297 |
+
mul.f32 %f81, %f72, %f64;
|
298 |
+
mul.f32 %f82, %f71, %f63;
|
299 |
+
mul.f32 %f83, %f70, %f62;
|
300 |
+
mul.f32 %f84, %f69, %f61;
|
301 |
+
.loc 1 57 40
|
302 |
+
fma.rn.f32 %f98, %f44, %f84, %f98;
|
303 |
+
fma.rn.f32 %f97, %f43, %f83, %f97;
|
304 |
+
fma.rn.f32 %f96, %f42, %f82, %f96;
|
305 |
+
fma.rn.f32 %f95, %f41, %f81, %f95;
|
306 |
+
fma.rn.f32 %f102, %f28, %f80, %f102;
|
307 |
+
fma.rn.f32 %f101, %f27, %f79, %f101;
|
308 |
+
fma.rn.f32 %f100, %f26, %f78, %f100;
|
309 |
+
fma.rn.f32 %f99, %f25, %f77, %f99;
|
310 |
+
.loc 1 30 36
|
311 |
+
add.s32 %r134, %r134, 8;
|
312 |
+
add.s32 %r133, %r133, 2048;
|
313 |
+
add.s64 %rd40, %rd40, 32;
|
314 |
+
add.s64 %rd39, %rd39, 32;
|
315 |
+
add.s64 %rd38, %rd38, 32;
|
316 |
+
add.s64 %rd37, %rd37, 32;
|
317 |
+
setp.lt.u32 %p35, %r134, 120;
|
318 |
+
@%p35 bra $L__BB0_1;
|
319 |
+
.loc 1 22 44
|
320 |
+
and.b32 %r121, %r1, 63;
|
321 |
+
.loc 1 22 23
|
322 |
+
or.b32 %r122, %r3, %r121;
|
323 |
+
$L__tmp1:
|
324 |
+
.loc 2 233 15
|
325 |
+
add.f32 %f85, %f99, %f100;
|
326 |
+
add.f32 %f86, %f101, %f85;
|
327 |
+
add.f32 %f87, %f102, %f86;
|
328 |
+
$L__tmp2:
|
329 |
+
.loc 2 243 36
|
330 |
+
mov.b32 %r123, %f87;
|
331 |
+
shfl.sync.bfly.b32 %r124, %r123, 1, 31, -1;
|
332 |
+
mov.b32 %f88, %r124;
|
333 |
+
$L__tmp3:
|
334 |
+
.loc 2 233 15
|
335 |
+
add.f32 %f89, %f87, %f88;
|
336 |
+
$L__tmp4:
|
337 |
+
.loc 1 58 30
|
338 |
+
bar.sync 0;
|
339 |
+
shl.b32 %r125, %r2, 2;
|
340 |
+
add.s32 %r127, %r41, %r125;
|
341 |
+
st.shared.f32 [%r127], %f89;
|
342 |
+
bar.sync 0;
|
343 |
+
shl.b32 %r128, %r121, 2;
|
344 |
+
add.s32 %r129, %r41, %r128;
|
345 |
+
ld.shared.u32 %r119, [%r129];
|
346 |
+
.loc 1 59 25
|
347 |
+
mul.wide.s32 %rd36, %r122, 4;
|
348 |
+
add.s64 %rd34, %rd17, %rd36;
|
349 |
+
.loc 1 59 37
|
350 |
+
and.b32 %r130, %r1, 64;
|
351 |
+
setp.eq.s32 %p36, %r130, 0;
|
352 |
+
@%p36 st.global.b32 [ %rd34 + 0 ], { %r119 };
|
353 |
+
$L__tmp5:
|
354 |
+
.loc 2 233 15
|
355 |
+
add.f32 %f90, %f95, %f96;
|
356 |
+
add.f32 %f91, %f97, %f90;
|
357 |
+
add.f32 %f92, %f98, %f91;
|
358 |
+
$L__tmp6:
|
359 |
+
.loc 2 243 36
|
360 |
+
mov.b32 %r131, %f92;
|
361 |
+
shfl.sync.bfly.b32 %r132, %r131, 1, 31, -1;
|
362 |
+
mov.b32 %f93, %r132;
|
363 |
+
$L__tmp7:
|
364 |
+
.loc 2 233 15
|
365 |
+
add.f32 %f94, %f92, %f93;
|
366 |
+
$L__tmp8:
|
367 |
+
.loc 1 60 30
|
368 |
+
bar.sync 0;
|
369 |
+
st.shared.f32 [%r127], %f94;
|
370 |
+
bar.sync 0;
|
371 |
+
ld.shared.u32 %r120, [%r129];
|
372 |
+
.loc 1 61 25
|
373 |
+
add.s64 %rd35, %rd18, %rd36;
|
374 |
+
.loc 1 61 37
|
375 |
+
@%p36 st.global.b32 [ %rd35 + 0 ], { %r120 };
|
376 |
+
.loc 1 61 4
|
377 |
+
ret;
|
378 |
+
$L__tmp9:
|
379 |
+
$L__func_end0:
|
380 |
+
|
381 |
+
}
|
382 |
+
.file 1 "/tmp/torchinductor_root/3x/c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py"
|
383 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
384 |
+
.section .debug_abbrev
|
385 |
+
{
|
386 |
+
.b8 1
|
387 |
+
.b8 17
|
388 |
+
.b8 1
|
389 |
+
.b8 37
|
390 |
+
.b8 8
|
391 |
+
.b8 19
|
392 |
+
.b8 5
|
393 |
+
.b8 3
|
394 |
+
.b8 8
|
395 |
+
.b8 16
|
396 |
+
.b8 6
|
397 |
+
.b8 27
|
398 |
+
.b8 8
|
399 |
+
.b8 180
|
400 |
+
.b8 66
|
401 |
+
.b8 12
|
402 |
+
.b8 17
|
403 |
+
.b8 1
|
404 |
+
.b8 18
|
405 |
+
.b8 1
|
406 |
+
.b8 0
|
407 |
+
.b8 0
|
408 |
+
.b8 2
|
409 |
+
.b8 46
|
410 |
+
.b8 0
|
411 |
+
.b8 135
|
412 |
+
.b8 64
|
413 |
+
.b8 8
|
414 |
+
.b8 3
|
415 |
+
.b8 8
|
416 |
+
.b8 58
|
417 |
+
.b8 11
|
418 |
+
.b8 59
|
419 |
+
.b8 11
|
420 |
+
.b8 63
|
421 |
+
.b8 12
|
422 |
+
.b8 32
|
423 |
+
.b8 11
|
424 |
+
.b8 0
|
425 |
+
.b8 0
|
426 |
+
.b8 3
|
427 |
+
.b8 46
|
428 |
+
.b8 1
|
429 |
+
.b8 17
|
430 |
+
.b8 1
|
431 |
+
.b8 18
|
432 |
+
.b8 1
|
433 |
+
.b8 64
|
434 |
+
.b8 10
|
435 |
+
.b8 49
|
436 |
+
.b8 19
|
437 |
+
.b8 0
|
438 |
+
.b8 0
|
439 |
+
.b8 4
|
440 |
+
.b8 29
|
441 |
+
.b8 1
|
442 |
+
.b8 49
|
443 |
+
.b8 19
|
444 |
+
.b8 17
|
445 |
+
.b8 1
|
446 |
+
.b8 18
|
447 |
+
.b8 1
|
448 |
+
.b8 88
|
449 |
+
.b8 11
|
450 |
+
.b8 89
|
451 |
+
.b8 11
|
452 |
+
.b8 87
|
453 |
+
.b8 11
|
454 |
+
.b8 0
|
455 |
+
.b8 0
|
456 |
+
.b8 5
|
457 |
+
.b8 29
|
458 |
+
.b8 0
|
459 |
+
.b8 49
|
460 |
+
.b8 19
|
461 |
+
.b8 17
|
462 |
+
.b8 1
|
463 |
+
.b8 18
|
464 |
+
.b8 1
|
465 |
+
.b8 88
|
466 |
+
.b8 11
|
467 |
+
.b8 89
|
468 |
+
.b8 11
|
469 |
+
.b8 87
|
470 |
+
.b8 11
|
471 |
+
.b8 0
|
472 |
+
.b8 0
|
473 |
+
.b8 0
|
474 |
+
}
|
475 |
+
.section .debug_info
|
476 |
+
{
|
477 |
+
.b32 371
|
478 |
+
.b8 2
|
479 |
+
.b8 0
|
480 |
+
.b32 .debug_abbrev
|
481 |
+
.b8 8
|
482 |
+
.b8 1
|
483 |
+
.b8 116
|
484 |
+
.b8 114
|
485 |
+
.b8 105
|
486 |
+
.b8 116
|
487 |
+
.b8 111
|
488 |
+
.b8 110
|
489 |
+
.b8 0
|
490 |
+
.b8 2
|
491 |
+
.b8 0
|
492 |
+
.b8 99
|
493 |
+
.b8 51
|
494 |
+
.b8 120
|
495 |
+
.b8 120
|
496 |
+
.b8 115
|
497 |
+
.b8 122
|
498 |
+
.b8 118
|
499 |
+
.b8 103
|
500 |
+
.b8 116
|
501 |
+
.b8 102
|
502 |
+
.b8 110
|
503 |
+
.b8 106
|
504 |
+
.b8 98
|
505 |
+
.b8 55
|
506 |
+
.b8 119
|
507 |
+
.b8 101
|
508 |
+
.b8 108
|
509 |
+
.b8 113
|
510 |
+
.b8 118
|
511 |
+
.b8 114
|
512 |
+
.b8 51
|
513 |
+
.b8 51
|
514 |
+
.b8 122
|
515 |
+
.b8 52
|
516 |
+
.b8 99
|
517 |
+
.b8 113
|
518 |
+
.b8 111
|
519 |
+
.b8 117
|
520 |
+
.b8 120
|
521 |
+
.b8 104
|
522 |
+
.b8 113
|
523 |
+
.b8 106
|
524 |
+
.b8 121
|
525 |
+
.b8 51
|
526 |
+
.b8 100
|
527 |
+
.b8 112
|
528 |
+
.b8 119
|
529 |
+
.b8 97
|
530 |
+
.b8 50
|
531 |
+
.b8 113
|
532 |
+
.b8 109
|
533 |
+
.b8 109
|
534 |
+
.b8 120
|
535 |
+
.b8 50
|
536 |
+
.b8 120
|
537 |
+
.b8 116
|
538 |
+
.b8 111
|
539 |
+
.b8 54
|
540 |
+
.b8 115
|
541 |
+
.b8 103
|
542 |
+
.b8 118
|
543 |
+
.b8 122
|
544 |
+
.b8 46
|
545 |
+
.b8 112
|
546 |
+
.b8 121
|
547 |
+
.b8 0
|
548 |
+
.b32 .debug_line
|
549 |
+
.b8 47
|
550 |
+
.b8 116
|
551 |
+
.b8 109
|
552 |
+
.b8 112
|
553 |
+
.b8 47
|
554 |
+
.b8 116
|
555 |
+
.b8 111
|
556 |
+
.b8 114
|
557 |
+
.b8 99
|
558 |
+
.b8 104
|
559 |
+
.b8 105
|
560 |
+
.b8 110
|
561 |
+
.b8 100
|
562 |
+
.b8 117
|
563 |
+
.b8 99
|
564 |
+
.b8 116
|
565 |
+
.b8 111
|
566 |
+
.b8 114
|
567 |
+
.b8 95
|
568 |
+
.b8 114
|
569 |
+
.b8 111
|
570 |
+
.b8 111
|
571 |
+
.b8 116
|
572 |
+
.b8 47
|
573 |
+
.b8 51
|
574 |
+
.b8 120
|
575 |
+
.b8 0
|
576 |
+
.b8 1
|
577 |
+
.b64 $L__func_begin0
|
578 |
+
.b64 $L__func_end0
|
579 |
+
.b8 2
|
580 |
+
.b8 116
|
581 |
+
.b8 114
|
582 |
+
.b8 105
|
583 |
+
.b8 116
|
584 |
+
.b8 111
|
585 |
+
.b8 110
|
586 |
+
.b8 95
|
587 |
+
.b8 95
|
588 |
+
.b8 48
|
589 |
+
.b8 100
|
590 |
+
.b8 49
|
591 |
+
.b8 100
|
592 |
+
.b8 50
|
593 |
+
.b8 100
|
594 |
+
.b8 51
|
595 |
+
.b8 100
|
596 |
+
.b8 52
|
597 |
+
.b8 100
|
598 |
+
.b8 53
|
599 |
+
.b8 100
|
600 |
+
.b8 54
|
601 |
+
.b8 100
|
602 |
+
.b8 55
|
603 |
+
.b8 100
|
604 |
+
.b8 56
|
605 |
+
.b8 100
|
606 |
+
.b8 57
|
607 |
+
.b8 100
|
608 |
+
.b8 49
|
609 |
+
.b8 48
|
610 |
+
.b8 100
|
611 |
+
.b8 101
|
612 |
+
.b8 49
|
613 |
+
.b8 49
|
614 |
+
.b8 100
|
615 |
+
.b8 101
|
616 |
+
.b8 0
|
617 |
+
.b8 116
|
618 |
+
.b8 114
|
619 |
+
.b8 105
|
620 |
+
.b8 116
|
621 |
+
.b8 111
|
622 |
+
.b8 110
|
623 |
+
.b8 95
|
624 |
+
.b8 95
|
625 |
+
.b8 48
|
626 |
+
.b8 100
|
627 |
+
.b8 49
|
628 |
+
.b8 100
|
629 |
+
.b8 50
|
630 |
+
.b8 100
|
631 |
+
.b8 51
|
632 |
+
.b8 100
|
633 |
+
.b8 52
|
634 |
+
.b8 100
|
635 |
+
.b8 53
|
636 |
+
.b8 100
|
637 |
+
.b8 54
|
638 |
+
.b8 100
|
639 |
+
.b8 55
|
640 |
+
.b8 100
|
641 |
+
.b8 56
|
642 |
+
.b8 100
|
643 |
+
.b8 57
|
644 |
+
.b8 100
|
645 |
+
.b8 49
|
646 |
+
.b8 48
|
647 |
+
.b8 100
|
648 |
+
.b8 101
|
649 |
+
.b8 49
|
650 |
+
.b8 49
|
651 |
+
.b8 100
|
652 |
+
.b8 101
|
653 |
+
.b8 0
|
654 |
+
.b8 1
|
655 |
+
.b8 18
|
656 |
+
.b8 1
|
657 |
+
.b8 1
|
658 |
+
.b8 3
|
659 |
+
.b64 $L__func_begin0
|
660 |
+
.b64 $L__func_end0
|
661 |
+
.b8 1
|
662 |
+
.b8 156
|
663 |
+
.b32 125
|
664 |
+
.b8 4
|
665 |
+
.b32 125
|
666 |
+
.b64 $L__tmp1
|
667 |
+
.b64 $L__tmp4
|
668 |
+
.b8 2
|
669 |
+
.b8 58
|
670 |
+
.b8 27
|
671 |
+
.b8 5
|
672 |
+
.b32 125
|
673 |
+
.b64 $L__tmp1
|
674 |
+
.b64 $L__tmp4
|
675 |
+
.b8 2
|
676 |
+
.b8 243
|
677 |
+
.b8 36
|
678 |
+
.b8 0
|
679 |
+
.b8 5
|
680 |
+
.b32 125
|
681 |
+
.b64 $L__tmp2
|
682 |
+
.b64 $L__tmp3
|
683 |
+
.b8 2
|
684 |
+
.b8 58
|
685 |
+
.b8 27
|
686 |
+
.b8 4
|
687 |
+
.b32 125
|
688 |
+
.b64 $L__tmp5
|
689 |
+
.b64 $L__tmp8
|
690 |
+
.b8 2
|
691 |
+
.b8 60
|
692 |
+
.b8 27
|
693 |
+
.b8 5
|
694 |
+
.b32 125
|
695 |
+
.b64 $L__tmp5
|
696 |
+
.b64 $L__tmp8
|
697 |
+
.b8 2
|
698 |
+
.b8 243
|
699 |
+
.b8 36
|
700 |
+
.b8 0
|
701 |
+
.b8 5
|
702 |
+
.b32 125
|
703 |
+
.b64 $L__tmp6
|
704 |
+
.b64 $L__tmp7
|
705 |
+
.b8 2
|
706 |
+
.b8 60
|
707 |
+
.b8 27
|
708 |
+
.b8 0
|
709 |
+
.b8 0
|
710 |
+
}
|
711 |
+
.section .debug_pubnames
|
712 |
+
{
|
713 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
714 |
+
$L__pubNames_start0:
|
715 |
+
.b8 2
|
716 |
+
.b8 0
|
717 |
+
.b32 .debug_info
|
718 |
+
.b32 375
|
719 |
+
.b32 125
|
720 |
+
.b8 116
|
721 |
+
.b8 114
|
722 |
+
.b8 105
|
723 |
+
.b8 116
|
724 |
+
.b8 111
|
725 |
+
.b8 110
|
726 |
+
.b8 95
|
727 |
+
.b8 95
|
728 |
+
.b8 48
|
729 |
+
.b8 100
|
730 |
+
.b8 49
|
731 |
+
.b8 100
|
732 |
+
.b8 50
|
733 |
+
.b8 100
|
734 |
+
.b8 51
|
735 |
+
.b8 100
|
736 |
+
.b8 52
|
737 |
+
.b8 100
|
738 |
+
.b8 53
|
739 |
+
.b8 100
|
740 |
+
.b8 54
|
741 |
+
.b8 100
|
742 |
+
.b8 55
|
743 |
+
.b8 100
|
744 |
+
.b8 56
|
745 |
+
.b8 100
|
746 |
+
.b8 57
|
747 |
+
.b8 100
|
748 |
+
.b8 49
|
749 |
+
.b8 48
|
750 |
+
.b8 100
|
751 |
+
.b8 101
|
752 |
+
.b8 49
|
753 |
+
.b8 49
|
754 |
+
.b8 100
|
755 |
+
.b8 101
|
756 |
+
.b8 0
|
757 |
+
.b32 0
|
758 |
+
$L__pubNames_end0:
|
759 |
+
}
|
760 |
+
.section .debug_pubtypes
|
761 |
+
{
|
762 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
763 |
+
$L__pubTypes_start0:
|
764 |
+
.b8 2
|
765 |
+
.b8 0
|
766 |
+
.b32 .debug_info
|
767 |
+
.b32 375
|
768 |
+
.b32 0
|
769 |
+
$L__pubTypes_end0:
|
770 |
+
}
|
771 |
+
.section .debug_loc { }
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
|
4 |
+
%c8_i32 = arith.constant 8 : i32
|
5 |
+
%c128_i32 = arith.constant 128 : i32
|
6 |
+
%c0_i32 = arith.constant 0 : i32
|
7 |
+
%cst_0 = arith.constant dense<128> : tensor<64x1xi32>
|
8 |
+
%cst_1 = arith.constant dense<32768> : tensor<64x1xi32>
|
9 |
+
%cst_2 = arith.constant dense<256> : tensor<1x8xi32>
|
10 |
+
%cst_3 = arith.constant dense<128> : tensor<1x8xi32>
|
11 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
|
12 |
+
%cst_5 = arith.constant dense<256> : tensor<64x1xi32>
|
13 |
+
%c64_i32 = arith.constant 64 : i32
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
16 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
17 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
18 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
19 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
20 |
+
%6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
|
21 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
|
22 |
+
%8 = arith.remsi %5, %cst_5 : tensor<64x1xi32>
|
23 |
+
%9 = arith.divsi %5, %cst_5 : tensor<64x1xi32>
|
24 |
+
%10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
25 |
+
%11 = arith.muli %9, %cst_1 : tensor<64x1xi32>
|
26 |
+
%12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
27 |
+
%13 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
28 |
+
%14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
29 |
+
%15 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
30 |
+
%16 = arith.muli %9, %cst_0 : tensor<64x1xi32>
|
31 |
+
%17 = tt.broadcast %16 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
32 |
+
%18 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
33 |
+
%19 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
34 |
+
%20 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
35 |
+
%21 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
36 |
+
%22 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
37 |
+
%23:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_4, %arg14 = %cst_4) -> (tensor<64x8xf32>, tensor<64x8xf32>) : i32 {
|
38 |
+
%32 = tt.splat %arg12 : (i32) -> tensor<1x8xi32>
|
39 |
+
%33 = arith.addi %32, %7 : tensor<1x8xi32>
|
40 |
+
%34 = arith.cmpi slt, %33, %cst_3 : tensor<1x8xi32>
|
41 |
+
%35 = arith.muli %33, %cst_2 : tensor<1x8xi32>
|
42 |
+
%36 = tt.broadcast %35 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
43 |
+
%37 = arith.addi %10, %36 : tensor<64x8xi32>
|
44 |
+
%38 = arith.addi %37, %12 : tensor<64x8xi32>
|
45 |
+
%39 = tt.addptr %13, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
46 |
+
%40 = tt.broadcast %34 : (tensor<1x8xi1>) -> tensor<64x8xi1>
|
47 |
+
%41 = tt.load %39, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
|
48 |
+
%42 = arith.extf %41 : tensor<64x8xbf16> to tensor<64x8xf32>
|
49 |
+
%43 = tt.addptr %14, %38 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
50 |
+
%44 = tt.load %43, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
|
51 |
+
%45 = tt.addptr %15, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
52 |
+
%46 = tt.load %45, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
|
53 |
+
%47 = arith.extf %46 : tensor<64x8xbf16> to tensor<64x8xf32>
|
54 |
+
%48 = tt.broadcast %33 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
55 |
+
%49 = arith.addi %48, %17 : tensor<64x8xi32>
|
56 |
+
%50 = tt.addptr %18, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
57 |
+
%51 = tt.load %50, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
58 |
+
%52 = tt.addptr %19, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
59 |
+
%53 = tt.load %52, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
60 |
+
%54 = tt.addptr %20, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
61 |
+
%55 = tt.load %54, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
|
62 |
+
%56 = arith.extf %55 : tensor<64x8xbf16> to tensor<64x8xf32>
|
63 |
+
%57 = tt.addptr %21, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
64 |
+
%58 = tt.load %57, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
65 |
+
%59 = tt.addptr %22, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
66 |
+
%60 = tt.load %59, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
67 |
+
%61 = arith.addf %44, %47 : tensor<64x8xf32>
|
68 |
+
%62 = arith.subf %61, %51 : tensor<64x8xf32>
|
69 |
+
%63 = arith.mulf %62, %53 : tensor<64x8xf32>
|
70 |
+
%64 = arith.mulf %42, %63 : tensor<64x8xf32>
|
71 |
+
%65 = arith.addf %arg13, %64 : tensor<64x8xf32>
|
72 |
+
%66 = arith.select %40, %65, %arg13 : tensor<64x8xi1>, tensor<64x8xf32>
|
73 |
+
%67 = arith.subf %44, %58 : tensor<64x8xf32>
|
74 |
+
%68 = arith.mulf %67, %60 : tensor<64x8xf32>
|
75 |
+
%69 = arith.mulf %56, %68 : tensor<64x8xf32>
|
76 |
+
%70 = arith.addf %arg14, %69 : tensor<64x8xf32>
|
77 |
+
%71 = arith.select %40, %70, %arg14 : tensor<64x8xi1>, tensor<64x8xf32>
|
78 |
+
scf.yield %66, %71 : tensor<64x8xf32>, tensor<64x8xf32>
|
79 |
+
}
|
80 |
+
%24 = "tt.reduce"(%23#0) <{axis = 1 : i32}> ({
|
81 |
+
^bb0(%arg12: f32, %arg13: f32):
|
82 |
+
%32 = arith.addf %arg12, %arg13 : f32
|
83 |
+
tt.reduce.return %32 : f32
|
84 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32>
|
85 |
+
%25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
86 |
+
%26 = tt.splat %arg8 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
|
87 |
+
%27 = tt.addptr %26, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
|
88 |
+
tt.store %27, %25 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
|
89 |
+
%28 = "tt.reduce"(%23#1) <{axis = 1 : i32}> ({
|
90 |
+
^bb0(%arg12: f32, %arg13: f32):
|
91 |
+
%32 = arith.addf %arg12, %arg13 : f32
|
92 |
+
tt.reduce.return %32 : f32
|
93 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32>
|
94 |
+
%29 = tt.expand_dims %28 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
95 |
+
%30 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
|
96 |
+
%31 = tt.addptr %30, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
|
97 |
+
tt.store %31, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
|
98 |
+
tt.return
|
99 |
+
}
|
100 |
+
}
|
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ptx
ADDED
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2de(
|
12 |
+
.param .u64 triton__0d1d2de_param_0,
|
13 |
+
.param .u64 triton__0d1d2de_param_1,
|
14 |
+
.param .u32 triton__0d1d2de_param_2
|
15 |
+
)
|
16 |
+
.maxntid 256, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<3>;
|
19 |
+
.reg .b16 %rs<3>;
|
20 |
+
.reg .b32 %r<13>;
|
21 |
+
.reg .b64 %rd<7>;
|
22 |
+
.loc 1 18 0
|
23 |
+
$L__func_begin0:
|
24 |
+
.loc 1 18 0
|
25 |
+
|
26 |
+
ld.param.u64 %rd3, [triton__0d1d2de_param_0];
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_1];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r7, %tid.x;
|
31 |
+
shl.b32 %r8, %r7, 1;
|
32 |
+
and.b32 %r9, %r8, 510;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r10, %r1, 9;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r11, %r10, %r9;
|
39 |
+
.loc 1 24 30
|
40 |
+
mul.wide.s32 %rd5, %r11, 4;
|
41 |
+
add.s64 %rd1, %rd3, %rd5;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 24 35
|
44 |
+
mov.u32 %r4, 0x0;
|
45 |
+
mov.u32 %r5, 0x0;
|
46 |
+
@%p1 ld.global.v2.b32 { %r4, %r5 }, [ %rd1 + 0 ];
|
47 |
+
.loc 1 26 25
|
48 |
+
mul.wide.s32 %rd6, %r11, 2;
|
49 |
+
add.s64 %rd2, %rd4, %rd6;
|
50 |
+
.loc 1 26 36
|
51 |
+
cvt.rn.bf16.f32 %rs1, %r4;
|
52 |
+
cvt.rn.bf16.f32 %rs2, %r5;
|
53 |
+
mov.b32 %r12, {%rs1, %rs2};
|
54 |
+
@%p1 st.global.b32 [ %rd2 + 0 ], { %r12 };
|
55 |
+
.loc 1 26 4
|
56 |
+
ret;
|
57 |
+
$L__tmp1:
|
58 |
+
$L__func_end0:
|
59 |
+
|
60 |
+
}
|
61 |
+
.file 1 "/tmp/torchinductor_root/5t/c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py"
|
62 |
+
.section .debug_abbrev
|
63 |
+
{
|
64 |
+
.b8 1
|
65 |
+
.b8 17
|
66 |
+
.b8 1
|
67 |
+
.b8 37
|
68 |
+
.b8 8
|
69 |
+
.b8 19
|
70 |
+
.b8 5
|
71 |
+
.b8 3
|
72 |
+
.b8 8
|
73 |
+
.b8 16
|
74 |
+
.b8 6
|
75 |
+
.b8 27
|
76 |
+
.b8 8
|
77 |
+
.b8 180
|
78 |
+
.b8 66
|
79 |
+
.b8 12
|
80 |
+
.b8 17
|
81 |
+
.b8 1
|
82 |
+
.b8 18
|
83 |
+
.b8 1
|
84 |
+
.b8 0
|
85 |
+
.b8 0
|
86 |
+
.b8 2
|
87 |
+
.b8 46
|
88 |
+
.b8 0
|
89 |
+
.b8 17
|
90 |
+
.b8 1
|
91 |
+
.b8 18
|
92 |
+
.b8 1
|
93 |
+
.b8 64
|
94 |
+
.b8 10
|
95 |
+
.b8 135
|
96 |
+
.b8 64
|
97 |
+
.b8 8
|
98 |
+
.b8 3
|
99 |
+
.b8 8
|
100 |
+
.b8 58
|
101 |
+
.b8 11
|
102 |
+
.b8 59
|
103 |
+
.b8 11
|
104 |
+
.b8 63
|
105 |
+
.b8 12
|
106 |
+
.b8 0
|
107 |
+
.b8 0
|
108 |
+
.b8 0
|
109 |
+
}
|
110 |
+
.section .debug_info
|
111 |
+
{
|
112 |
+
.b32 176
|
113 |
+
.b8 2
|
114 |
+
.b8 0
|
115 |
+
.b32 .debug_abbrev
|
116 |
+
.b8 8
|
117 |
+
.b8 1
|
118 |
+
.b8 116
|
119 |
+
.b8 114
|
120 |
+
.b8 105
|
121 |
+
.b8 116
|
122 |
+
.b8 111
|
123 |
+
.b8 110
|
124 |
+
.b8 0
|
125 |
+
.b8 2
|
126 |
+
.b8 0
|
127 |
+
.b8 99
|
128 |
+
.b8 53
|
129 |
+
.b8 116
|
130 |
+
.b8 114
|
131 |
+
.b8 121
|
132 |
+
.b8 112
|
133 |
+
.b8 53
|
134 |
+
.b8 113
|
135 |
+
.b8 119
|
136 |
+
.b8 107
|
137 |
+
.b8 104
|
138 |
+
.b8 114
|
139 |
+
.b8 101
|
140 |
+
.b8 105
|
141 |
+
.b8 106
|
142 |
+
.b8 107
|
143 |
+
.b8 55
|
144 |
+
.b8 115
|
145 |
+
.b8 53
|
146 |
+
.b8 120
|
147 |
+
.b8 51
|
148 |
+
.b8 50
|
149 |
+
.b8 55
|
150 |
+
.b8 119
|
151 |
+
.b8 111
|
152 |
+
.b8 102
|
153 |
+
.b8 122
|
154 |
+
.b8 53
|
155 |
+
.b8 52
|
156 |
+
.b8 108
|
157 |
+
.b8 119
|
158 |
+
.b8 106
|
159 |
+
.b8 52
|
160 |
+
.b8 107
|
161 |
+
.b8 118
|
162 |
+
.b8 99
|
163 |
+
.b8 116
|
164 |
+
.b8 117
|
165 |
+
.b8 113
|
166 |
+
.b8 100
|
167 |
+
.b8 122
|
168 |
+
.b8 118
|
169 |
+
.b8 50
|
170 |
+
.b8 118
|
171 |
+
.b8 114
|
172 |
+
.b8 102
|
173 |
+
.b8 50
|
174 |
+
.b8 120
|
175 |
+
.b8 121
|
176 |
+
.b8 111
|
177 |
+
.b8 110
|
178 |
+
.b8 115
|
179 |
+
.b8 46
|
180 |
+
.b8 112
|
181 |
+
.b8 121
|
182 |
+
.b8 0
|
183 |
+
.b32 .debug_line
|
184 |
+
.b8 47
|
185 |
+
.b8 116
|
186 |
+
.b8 109
|
187 |
+
.b8 112
|
188 |
+
.b8 47
|
189 |
+
.b8 116
|
190 |
+
.b8 111
|
191 |
+
.b8 114
|
192 |
+
.b8 99
|
193 |
+
.b8 104
|
194 |
+
.b8 105
|
195 |
+
.b8 110
|
196 |
+
.b8 100
|
197 |
+
.b8 117
|
198 |
+
.b8 99
|
199 |
+
.b8 116
|
200 |
+
.b8 111
|
201 |
+
.b8 114
|
202 |
+
.b8 95
|
203 |
+
.b8 114
|
204 |
+
.b8 111
|
205 |
+
.b8 111
|
206 |
+
.b8 116
|
207 |
+
.b8 47
|
208 |
+
.b8 53
|
209 |
+
.b8 116
|
210 |
+
.b8 0
|
211 |
+
.b8 1
|
212 |
+
.b64 $L__func_begin0
|
213 |
+
.b64 $L__func_end0
|
214 |
+
.b8 2
|
215 |
+
.b64 $L__func_begin0
|
216 |
+
.b64 $L__func_end0
|
217 |
+
.b8 1
|
218 |
+
.b8 156
|
219 |
+
.b8 116
|
220 |
+
.b8 114
|
221 |
+
.b8 105
|
222 |
+
.b8 116
|
223 |
+
.b8 111
|
224 |
+
.b8 110
|
225 |
+
.b8 95
|
226 |
+
.b8 95
|
227 |
+
.b8 48
|
228 |
+
.b8 100
|
229 |
+
.b8 49
|
230 |
+
.b8 100
|
231 |
+
.b8 50
|
232 |
+
.b8 100
|
233 |
+
.b8 101
|
234 |
+
.b8 0
|
235 |
+
.b8 116
|
236 |
+
.b8 114
|
237 |
+
.b8 105
|
238 |
+
.b8 116
|
239 |
+
.b8 111
|
240 |
+
.b8 110
|
241 |
+
.b8 95
|
242 |
+
.b8 95
|
243 |
+
.b8 48
|
244 |
+
.b8 100
|
245 |
+
.b8 49
|
246 |
+
.b8 100
|
247 |
+
.b8 50
|
248 |
+
.b8 100
|
249 |
+
.b8 101
|
250 |
+
.b8 0
|
251 |
+
.b8 1
|
252 |
+
.b8 18
|
253 |
+
.b8 1
|
254 |
+
.b8 0
|
255 |
+
}
|
256 |
+
.section .debug_pubnames
|
257 |
+
{
|
258 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
259 |
+
$L__pubNames_start0:
|
260 |
+
.b8 2
|
261 |
+
.b8 0
|
262 |
+
.b32 .debug_info
|
263 |
+
.b32 180
|
264 |
+
.b32 125
|
265 |
+
.b8 116
|
266 |
+
.b8 114
|
267 |
+
.b8 105
|
268 |
+
.b8 116
|
269 |
+
.b8 111
|
270 |
+
.b8 110
|
271 |
+
.b8 95
|
272 |
+
.b8 95
|
273 |
+
.b8 48
|
274 |
+
.b8 100
|
275 |
+
.b8 49
|
276 |
+
.b8 100
|
277 |
+
.b8 50
|
278 |
+
.b8 100
|
279 |
+
.b8 101
|
280 |
+
.b8 0
|
281 |
+
.b32 0
|
282 |
+
$L__pubNames_end0:
|
283 |
+
}
|
284 |
+
.section .debug_pubtypes
|
285 |
+
{
|
286 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
287 |
+
$L__pubTypes_start0:
|
288 |
+
.b8 2
|
289 |
+
.b8 0
|
290 |
+
.b32 .debug_info
|
291 |
+
.b32 180
|
292 |
+
.b32 0
|
293 |
+
$L__pubTypes_end0:
|
294 |
+
}
|
295 |
+
.section .debug_loc { }
|
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c512_i32 = arith.constant 512 : i32
|
4 |
+
%0 = tt.get_program_id x : i32
|
5 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
6 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
7 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32>
|
8 |
+
%4 = arith.addi %3, %2 : tensor<512xi32>
|
9 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
|
10 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
|
11 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32>
|
12 |
+
%8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
|
13 |
+
%9 = tt.addptr %8, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
|
14 |
+
%10 = arith.truncf %7 : tensor<512xf32> to tensor<512xbf16>
|
15 |
+
tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ptx
ADDED
@@ -0,0 +1,565 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4e
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4e(
|
13 |
+
.param .u64 triton__0d1d2d3de4e_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4e_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4e_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4e_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4e_param_4
|
18 |
+
)
|
19 |
+
.maxntid 128, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<18>;
|
22 |
+
.reg .b32 %r<92>;
|
23 |
+
.reg .f32 %f<43>;
|
24 |
+
.reg .b64 %rd<16>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2];
|
30 |
+
ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1];
|
31 |
+
ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0];
|
32 |
+
$L__tmp0:
|
33 |
+
.loc 1 22 44
|
34 |
+
mov.u32 %r1, %tid.x;
|
35 |
+
and.b32 %r2, %r1, 31;
|
36 |
+
shl.b32 %r13, %r1, 2;
|
37 |
+
and.b32 %r3, %r13, 60;
|
38 |
+
.loc 1 24 33
|
39 |
+
bfe.u32 %r4, %r1, 5, 2;
|
40 |
+
.loc 1 21 28
|
41 |
+
mov.u32 %r11, %ctaid.x;
|
42 |
+
.loc 1 21 33
|
43 |
+
shl.b32 %r5, %r11, 6;
|
44 |
+
.loc 1 27 36
|
45 |
+
shl.b32 %r14, %r4, 18;
|
46 |
+
shl.b32 %r15, %r1, 13;
|
47 |
+
and.b32 %r16, %r15, 131072;
|
48 |
+
or.b32 %r17, %r14, %r16;
|
49 |
+
add.s32 %r18, %r17, %r5;
|
50 |
+
or.b32 %r90, %r18, %r3;
|
51 |
+
mov.f32 %f39, 0f00000000;
|
52 |
+
mov.b32 %r91, -8;
|
53 |
+
mov.pred %p1, -1;
|
54 |
+
mov.f32 %f40, %f39;
|
55 |
+
mov.f32 %f41, %f39;
|
56 |
+
mov.f32 %f42, %f39;
|
57 |
+
$L__BB0_1:
|
58 |
+
.loc 1 31 34
|
59 |
+
mul.wide.s32 %rd5, %r90, 4;
|
60 |
+
add.s64 %rd4, %rd1, %rd5;
|
61 |
+
mov.b32 %r23, 0;
|
62 |
+
.loc 1 31 53
|
63 |
+
mov.u32 %r19, 0x0;
|
64 |
+
mov.u32 %r20, 0x0;
|
65 |
+
mov.u32 %r21, 0x0;
|
66 |
+
mov.u32 %r22, 0x0;
|
67 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r19, %r20, %r21, %r22 }, [ %rd4 + 0 ];
|
68 |
+
@!%p1 mov.u32 %r19, %r23;
|
69 |
+
@!%p1 mov.u32 %r20, %r23;
|
70 |
+
@!%p1 mov.u32 %r21, %r23;
|
71 |
+
@!%p1 mov.u32 %r22, %r23;
|
72 |
+
mov.b32 %f13, %r19;
|
73 |
+
mov.b32 %f14, %r20;
|
74 |
+
mov.b32 %f15, %r21;
|
75 |
+
mov.b32 %f16, %r22;
|
76 |
+
.loc 1 34 38
|
77 |
+
add.f32 %f42, %f42, %f16;
|
78 |
+
add.f32 %f41, %f41, %f15;
|
79 |
+
add.f32 %f40, %f40, %f14;
|
80 |
+
add.f32 %f39, %f39, %f13;
|
81 |
+
.loc 1 27 36
|
82 |
+
add.s32 %r91, %r91, 8;
|
83 |
+
add.s32 %r90, %r90, 1048576;
|
84 |
+
setp.lt.u32 %p6, %r91, 112;
|
85 |
+
@%p6 bra $L__BB0_1;
|
86 |
+
.loc 1 22 44
|
87 |
+
and.b32 %r45, %r1, 63;
|
88 |
+
.loc 1 22 23
|
89 |
+
or.b32 %r46, %r5, %r45;
|
90 |
+
$L__tmp1:
|
91 |
+
.loc 2 243 36
|
92 |
+
mov.b32 %r47, %f39;
|
93 |
+
shfl.sync.bfly.b32 %r48, %r47, 16, 31, -1;
|
94 |
+
mov.b32 %f17, %r48;
|
95 |
+
$L__tmp2:
|
96 |
+
.loc 2 233 15
|
97 |
+
add.f32 %f18, %f39, %f17;
|
98 |
+
$L__tmp3:
|
99 |
+
.loc 2 243 36
|
100 |
+
mov.b32 %r49, %f40;
|
101 |
+
shfl.sync.bfly.b32 %r50, %r49, 16, 31, -1;
|
102 |
+
mov.b32 %f19, %r50;
|
103 |
+
$L__tmp4:
|
104 |
+
.loc 2 233 15
|
105 |
+
add.f32 %f20, %f40, %f19;
|
106 |
+
$L__tmp5:
|
107 |
+
.loc 2 243 36
|
108 |
+
mov.b32 %r51, %f41;
|
109 |
+
shfl.sync.bfly.b32 %r52, %r51, 16, 31, -1;
|
110 |
+
mov.b32 %f21, %r52;
|
111 |
+
$L__tmp6:
|
112 |
+
.loc 2 233 15
|
113 |
+
add.f32 %f22, %f41, %f21;
|
114 |
+
$L__tmp7:
|
115 |
+
.loc 2 243 36
|
116 |
+
mov.b32 %r53, %f42;
|
117 |
+
shfl.sync.bfly.b32 %r54, %r53, 16, 31, -1;
|
118 |
+
mov.b32 %f23, %r54;
|
119 |
+
$L__tmp8:
|
120 |
+
.loc 2 233 15
|
121 |
+
add.f32 %f24, %f42, %f23;
|
122 |
+
$L__tmp9:
|
123 |
+
.loc 2 243 36
|
124 |
+
setp.lt.u32 %p7, %r2, 16;
|
125 |
+
shl.b32 %r55, %r3, 2;
|
126 |
+
or.b32 %r56, %r55, %r4;
|
127 |
+
shl.b32 %r57, %r56, 2;
|
128 |
+
mov.u32 %r58, global_smem;
|
129 |
+
add.s32 %r27, %r58, %r57;
|
130 |
+
mov.b32 %r28, %f18;
|
131 |
+
@%p7 st.shared.b32 [ %r27 + 0 ], %r28;
|
132 |
+
shl.b32 %r59, %r4, 2;
|
133 |
+
shl.b32 %r60, %r3, 4;
|
134 |
+
or.b32 %r61, %r60, 16;
|
135 |
+
or.b32 %r62, %r61, %r59;
|
136 |
+
add.s32 %r29, %r58, %r62;
|
137 |
+
mov.b32 %r30, %f20;
|
138 |
+
@%p7 st.shared.b32 [ %r29 + 0 ], %r30;
|
139 |
+
or.b32 %r63, %r60, 32;
|
140 |
+
or.b32 %r64, %r63, %r59;
|
141 |
+
add.s32 %r31, %r58, %r64;
|
142 |
+
mov.b32 %r32, %f22;
|
143 |
+
@%p7 st.shared.b32 [ %r31 + 0 ], %r32;
|
144 |
+
or.b32 %r65, %r60, 48;
|
145 |
+
or.b32 %r66, %r65, %r59;
|
146 |
+
add.s32 %r33, %r58, %r66;
|
147 |
+
mov.b32 %r34, %f24;
|
148 |
+
@%p7 st.shared.b32 [ %r33 + 0 ], %r34;
|
149 |
+
bar.sync 0;
|
150 |
+
setp.lt.s32 %p11, %r1, 256;
|
151 |
+
add.s32 %r36, %r58, %r13;
|
152 |
+
@%p11 ld.shared.b32 %r35, [ %r36 + 0 ];
|
153 |
+
mov.b32 %f25, %r35;
|
154 |
+
shfl.sync.bfly.b32 %r68, %r35, 2, 31, -1;
|
155 |
+
mov.b32 %f26, %r68;
|
156 |
+
$L__tmp10:
|
157 |
+
.loc 2 233 15
|
158 |
+
add.f32 %f27, %f25, %f26;
|
159 |
+
$L__tmp11:
|
160 |
+
.loc 2 243 36
|
161 |
+
mov.b32 %r69, %f27;
|
162 |
+
shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1;
|
163 |
+
mov.b32 %f28, %r70;
|
164 |
+
$L__tmp12:
|
165 |
+
.loc 2 233 15
|
166 |
+
add.f32 %f29, %f27, %f28;
|
167 |
+
$L__tmp13:
|
168 |
+
.loc 2 243 36
|
169 |
+
and.b32 %r71, %r1, 3;
|
170 |
+
setp.eq.s32 %p17, %r71, 0;
|
171 |
+
and.pred %p12, %p11, %p17;
|
172 |
+
mov.b32 %r38, %f29;
|
173 |
+
@%p12 st.shared.b32 [ %r36 + 0 ], %r38;
|
174 |
+
add.s32 %r40, %r36, 512;
|
175 |
+
@%p11 ld.shared.b32 %r39, [ %r40 + 0 ];
|
176 |
+
mov.b32 %f30, %r39;
|
177 |
+
shfl.sync.bfly.b32 %r72, %r39, 2, 31, -1;
|
178 |
+
mov.b32 %f31, %r72;
|
179 |
+
$L__tmp14:
|
180 |
+
.loc 2 233 15
|
181 |
+
add.f32 %f32, %f30, %f31;
|
182 |
+
$L__tmp15:
|
183 |
+
.loc 2 243 36
|
184 |
+
mov.b32 %r73, %f32;
|
185 |
+
shfl.sync.bfly.b32 %r74, %r73, 1, 31, -1;
|
186 |
+
mov.b32 %f33, %r74;
|
187 |
+
$L__tmp16:
|
188 |
+
.loc 2 233 15
|
189 |
+
add.f32 %f34, %f32, %f33;
|
190 |
+
$L__tmp17:
|
191 |
+
.loc 2 243 36
|
192 |
+
mov.b32 %r42, %f34;
|
193 |
+
@%p12 st.shared.b32 [ %r40 + 0 ], %r42;
|
194 |
+
bar.sync 0;
|
195 |
+
add.s32 %r75, %r58, %r60;
|
196 |
+
ld.shared.f32 %f35, [%r75];
|
197 |
+
add.s32 %r76, %r58, %r61;
|
198 |
+
ld.shared.f32 %f36, [%r76];
|
199 |
+
add.s32 %r77, %r58, %r63;
|
200 |
+
ld.shared.f32 %f37, [%r77];
|
201 |
+
add.s32 %r78, %r58, %r65;
|
202 |
+
ld.shared.f32 %f38, [%r78];
|
203 |
+
$L__tmp18:
|
204 |
+
.loc 1 35 28
|
205 |
+
bar.sync 0;
|
206 |
+
add.s32 %r79, %r58, %r55;
|
207 |
+
st.shared.f32 [%r79], %f35;
|
208 |
+
st.shared.f32 [%r79+4], %f36;
|
209 |
+
st.shared.f32 [%r79+8], %f37;
|
210 |
+
st.shared.f32 [%r79+12], %f38;
|
211 |
+
bar.sync 0;
|
212 |
+
shl.b32 %r80, %r45, 2;
|
213 |
+
add.s32 %r81, %r58, %r80;
|
214 |
+
.loc 1 36 20
|
215 |
+
shr.s32 %r83, %r46, 31;
|
216 |
+
shr.u32 %r84, %r83, 24;
|
217 |
+
add.s32 %r85, %r46, %r84;
|
218 |
+
shr.s32 %r86, %r85, 8;
|
219 |
+
and.b32 %r87, %r85, -256;
|
220 |
+
sub.s32 %r88, %r46, %r87;
|
221 |
+
.loc 1 38 30
|
222 |
+
mul.wide.s32 %rd9, %r86, 8;
|
223 |
+
add.s64 %rd7, %rd2, %rd9;
|
224 |
+
.loc 1 45 55
|
225 |
+
ld.shared.u32 %r44, [%r81];
|
226 |
+
.loc 1 38 35
|
227 |
+
mov.u64 %rd6, 0x0;
|
228 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ];
|
229 |
+
.loc 1 41 32
|
230 |
+
shr.u64 %rd10, %rd6, 54;
|
231 |
+
and.b64 %rd11, %rd10, 512;
|
232 |
+
add.s64 %rd12, %rd11, %rd6;
|
233 |
+
.loc 1 45 30
|
234 |
+
shl.b64 %rd13, %rd12, 10;
|
235 |
+
add.s64 %rd14, %rd3, %rd13;
|
236 |
+
mul.wide.s32 %rd15, %r88, 4;
|
237 |
+
add.s64 %rd8, %rd14, %rd15;
|
238 |
+
.loc 1 45 55
|
239 |
+
and.b32 %r89, %r1, 64;
|
240 |
+
setp.eq.s32 %p16, %r89, 0;
|
241 |
+
mov.u32 %r43, 0x0;
|
242 |
+
@%p16 atom.global.gpu.acq_rel.add.f32 %r43, [ %rd8 + 0 ], %r44;
|
243 |
+
.loc 1 45 4
|
244 |
+
ret;
|
245 |
+
$L__tmp19:
|
246 |
+
$L__func_end0:
|
247 |
+
|
248 |
+
}
|
249 |
+
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
|
250 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
251 |
+
.section .debug_abbrev
|
252 |
+
{
|
253 |
+
.b8 1
|
254 |
+
.b8 17
|
255 |
+
.b8 1
|
256 |
+
.b8 37
|
257 |
+
.b8 8
|
258 |
+
.b8 19
|
259 |
+
.b8 5
|
260 |
+
.b8 3
|
261 |
+
.b8 8
|
262 |
+
.b8 16
|
263 |
+
.b8 6
|
264 |
+
.b8 27
|
265 |
+
.b8 8
|
266 |
+
.b8 180
|
267 |
+
.b8 66
|
268 |
+
.b8 12
|
269 |
+
.b8 17
|
270 |
+
.b8 1
|
271 |
+
.b8 18
|
272 |
+
.b8 1
|
273 |
+
.b8 0
|
274 |
+
.b8 0
|
275 |
+
.b8 2
|
276 |
+
.b8 46
|
277 |
+
.b8 0
|
278 |
+
.b8 135
|
279 |
+
.b8 64
|
280 |
+
.b8 8
|
281 |
+
.b8 3
|
282 |
+
.b8 8
|
283 |
+
.b8 58
|
284 |
+
.b8 11
|
285 |
+
.b8 59
|
286 |
+
.b8 11
|
287 |
+
.b8 63
|
288 |
+
.b8 12
|
289 |
+
.b8 32
|
290 |
+
.b8 11
|
291 |
+
.b8 0
|
292 |
+
.b8 0
|
293 |
+
.b8 3
|
294 |
+
.b8 46
|
295 |
+
.b8 1
|
296 |
+
.b8 17
|
297 |
+
.b8 1
|
298 |
+
.b8 18
|
299 |
+
.b8 1
|
300 |
+
.b8 64
|
301 |
+
.b8 10
|
302 |
+
.b8 49
|
303 |
+
.b8 19
|
304 |
+
.b8 0
|
305 |
+
.b8 0
|
306 |
+
.b8 4
|
307 |
+
.b8 29
|
308 |
+
.b8 0
|
309 |
+
.b8 49
|
310 |
+
.b8 19
|
311 |
+
.b8 17
|
312 |
+
.b8 1
|
313 |
+
.b8 18
|
314 |
+
.b8 1
|
315 |
+
.b8 88
|
316 |
+
.b8 11
|
317 |
+
.b8 89
|
318 |
+
.b8 11
|
319 |
+
.b8 87
|
320 |
+
.b8 11
|
321 |
+
.b8 0
|
322 |
+
.b8 0
|
323 |
+
.b8 5
|
324 |
+
.b8 29
|
325 |
+
.b8 1
|
326 |
+
.b8 49
|
327 |
+
.b8 19
|
328 |
+
.b8 17
|
329 |
+
.b8 1
|
330 |
+
.b8 18
|
331 |
+
.b8 1
|
332 |
+
.b8 88
|
333 |
+
.b8 11
|
334 |
+
.b8 89
|
335 |
+
.b8 11
|
336 |
+
.b8 87
|
337 |
+
.b8 11
|
338 |
+
.b8 0
|
339 |
+
.b8 0
|
340 |
+
.b8 0
|
341 |
+
}
|
342 |
+
.section .debug_info
|
343 |
+
{
|
344 |
+
.b32 264
|
345 |
+
.b8 2
|
346 |
+
.b8 0
|
347 |
+
.b32 .debug_abbrev
|
348 |
+
.b8 8
|
349 |
+
.b8 1
|
350 |
+
.b8 116
|
351 |
+
.b8 114
|
352 |
+
.b8 105
|
353 |
+
.b8 116
|
354 |
+
.b8 111
|
355 |
+
.b8 110
|
356 |
+
.b8 0
|
357 |
+
.b8 2
|
358 |
+
.b8 0
|
359 |
+
.b8 99
|
360 |
+
.b8 54
|
361 |
+
.b8 105
|
362 |
+
.b8 107
|
363 |
+
.b8 53
|
364 |
+
.b8 118
|
365 |
+
.b8 120
|
366 |
+
.b8 55
|
367 |
+
.b8 112
|
368 |
+
.b8 50
|
369 |
+
.b8 50
|
370 |
+
.b8 102
|
371 |
+
.b8 112
|
372 |
+
.b8 107
|
373 |
+
.b8 52
|
374 |
+
.b8 100
|
375 |
+
.b8 99
|
376 |
+
.b8 118
|
377 |
+
.b8 104
|
378 |
+
.b8 53
|
379 |
+
.b8 53
|
380 |
+
.b8 122
|
381 |
+
.b8 105
|
382 |
+
.b8 109
|
383 |
+
.b8 119
|
384 |
+
.b8 52
|
385 |
+
.b8 116
|
386 |
+
.b8 53
|
387 |
+
.b8 110
|
388 |
+
.b8 114
|
389 |
+
.b8 53
|
390 |
+
.b8 122
|
391 |
+
.b8 110
|
392 |
+
.b8 50
|
393 |
+
.b8 98
|
394 |
+
.b8 55
|
395 |
+
.b8 105
|
396 |
+
.b8 110
|
397 |
+
.b8 117
|
398 |
+
.b8 106
|
399 |
+
.b8 120
|
400 |
+
.b8 106
|
401 |
+
.b8 97
|
402 |
+
.b8 117
|
403 |
+
.b8 120
|
404 |
+
.b8 115
|
405 |
+
.b8 104
|
406 |
+
.b8 108
|
407 |
+
.b8 106
|
408 |
+
.b8 117
|
409 |
+
.b8 109
|
410 |
+
.b8 109
|
411 |
+
.b8 46
|
412 |
+
.b8 112
|
413 |
+
.b8 121
|
414 |
+
.b8 0
|
415 |
+
.b32 .debug_line
|
416 |
+
.b8 47
|
417 |
+
.b8 116
|
418 |
+
.b8 109
|
419 |
+
.b8 112
|
420 |
+
.b8 47
|
421 |
+
.b8 116
|
422 |
+
.b8 111
|
423 |
+
.b8 114
|
424 |
+
.b8 99
|
425 |
+
.b8 104
|
426 |
+
.b8 105
|
427 |
+
.b8 110
|
428 |
+
.b8 100
|
429 |
+
.b8 117
|
430 |
+
.b8 99
|
431 |
+
.b8 116
|
432 |
+
.b8 111
|
433 |
+
.b8 114
|
434 |
+
.b8 95
|
435 |
+
.b8 114
|
436 |
+
.b8 111
|
437 |
+
.b8 111
|
438 |
+
.b8 116
|
439 |
+
.b8 47
|
440 |
+
.b8 54
|
441 |
+
.b8 105
|
442 |
+
.b8 0
|
443 |
+
.b8 1
|
444 |
+
.b64 $L__func_begin0
|
445 |
+
.b64 $L__func_end0
|
446 |
+
.b8 2
|
447 |
+
.b8 116
|
448 |
+
.b8 114
|
449 |
+
.b8 105
|
450 |
+
.b8 116
|
451 |
+
.b8 111
|
452 |
+
.b8 110
|
453 |
+
.b8 95
|
454 |
+
.b8 95
|
455 |
+
.b8 48
|
456 |
+
.b8 100
|
457 |
+
.b8 49
|
458 |
+
.b8 100
|
459 |
+
.b8 50
|
460 |
+
.b8 100
|
461 |
+
.b8 51
|
462 |
+
.b8 100
|
463 |
+
.b8 101
|
464 |
+
.b8 52
|
465 |
+
.b8 101
|
466 |
+
.b8 0
|
467 |
+
.b8 116
|
468 |
+
.b8 114
|
469 |
+
.b8 105
|
470 |
+
.b8 116
|
471 |
+
.b8 111
|
472 |
+
.b8 110
|
473 |
+
.b8 95
|
474 |
+
.b8 95
|
475 |
+
.b8 48
|
476 |
+
.b8 100
|
477 |
+
.b8 49
|
478 |
+
.b8 100
|
479 |
+
.b8 50
|
480 |
+
.b8 100
|
481 |
+
.b8 51
|
482 |
+
.b8 100
|
483 |
+
.b8 101
|
484 |
+
.b8 52
|
485 |
+
.b8 101
|
486 |
+
.b8 0
|
487 |
+
.b8 1
|
488 |
+
.b8 18
|
489 |
+
.b8 1
|
490 |
+
.b8 1
|
491 |
+
.b8 3
|
492 |
+
.b64 $L__func_begin0
|
493 |
+
.b64 $L__func_end0
|
494 |
+
.b8 1
|
495 |
+
.b8 156
|
496 |
+
.b32 125
|
497 |
+
.b8 4
|
498 |
+
.b32 125
|
499 |
+
.b64 $L__tmp1
|
500 |
+
.b64 $L__tmp18
|
501 |
+
.b8 2
|
502 |
+
.b8 35
|
503 |
+
.b8 25
|
504 |
+
.b8 5
|
505 |
+
.b32 125
|
506 |
+
.b64 $L__tmp2
|
507 |
+
.b64 $L__tmp17
|
508 |
+
.b8 2
|
509 |
+
.b8 35
|
510 |
+
.b8 25
|
511 |
+
.b8 4
|
512 |
+
.b32 125
|
513 |
+
.b64 $L__tmp2
|
514 |
+
.b64 $L__tmp17
|
515 |
+
.b8 2
|
516 |
+
.b8 243
|
517 |
+
.b8 36
|
518 |
+
.b8 0
|
519 |
+
.b8 0
|
520 |
+
.b8 0
|
521 |
+
}
|
522 |
+
.section .debug_pubnames
|
523 |
+
{
|
524 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
525 |
+
$L__pubNames_start0:
|
526 |
+
.b8 2
|
527 |
+
.b8 0
|
528 |
+
.b32 .debug_info
|
529 |
+
.b32 268
|
530 |
+
.b32 125
|
531 |
+
.b8 116
|
532 |
+
.b8 114
|
533 |
+
.b8 105
|
534 |
+
.b8 116
|
535 |
+
.b8 111
|
536 |
+
.b8 110
|
537 |
+
.b8 95
|
538 |
+
.b8 95
|
539 |
+
.b8 48
|
540 |
+
.b8 100
|
541 |
+
.b8 49
|
542 |
+
.b8 100
|
543 |
+
.b8 50
|
544 |
+
.b8 100
|
545 |
+
.b8 51
|
546 |
+
.b8 100
|
547 |
+
.b8 101
|
548 |
+
.b8 52
|
549 |
+
.b8 101
|
550 |
+
.b8 0
|
551 |
+
.b32 0
|
552 |
+
$L__pubNames_end0:
|
553 |
+
}
|
554 |
+
.section .debug_pubtypes
|
555 |
+
{
|
556 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
557 |
+
$L__pubTypes_start0:
|
558 |
+
.b8 2
|
559 |
+
.b8 0
|
560 |
+
.b32 .debug_info
|
561 |
+
.b32 268
|
562 |
+
.b32 0
|
563 |
+
$L__pubTypes_end0:
|
564 |
+
}
|
565 |
+
.section .debug_loc { }
|
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttgir
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<64x1xi64, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<131072> : tensor<1x8xi32, #blocked1>
|
10 |
+
%cst_4 = arith.constant dense<120> : tensor<1x8xi32, #blocked1>
|
11 |
+
%c0_i32 = arith.constant 0 : i32
|
12 |
+
%c120_i32 = arith.constant 120 : i32
|
13 |
+
%c8_i32 = arith.constant 8 : i32
|
14 |
+
%cst_5 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1>
|
15 |
+
%cst_6 = arith.constant dense<true> : tensor<64x1xi1, #blocked>
|
16 |
+
%c64_i32 = arith.constant 64 : i32
|
17 |
+
%0 = tt.get_program_id x : i32
|
18 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
19 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
20 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
21 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
22 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
23 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
24 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
25 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked1>
|
26 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked>
|
27 |
+
%10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
28 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x8xi32, #blocked1>
|
29 |
+
%12 = tt.broadcast %8 : (tensor<64x1xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
|
30 |
+
%13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
31 |
+
%14 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c8_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x8xf32, #blocked1>) : i32 {
|
32 |
+
%32 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked1>
|
33 |
+
%33 = arith.addi %32, %11 : tensor<1x8xi32, #blocked1>
|
34 |
+
%34 = arith.cmpi slt, %33, %cst_4 : tensor<1x8xi32, #blocked1>
|
35 |
+
%35 = arith.muli %33, %cst_3 : tensor<1x8xi32, #blocked1>
|
36 |
+
%36 = tt.broadcast %35 : (tensor<1x8xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
|
37 |
+
%37 = arith.addi %12, %36 : tensor<64x8xi32, #blocked1>
|
38 |
+
%38 = tt.addptr %13, %37 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
39 |
+
%39 = tt.broadcast %34 : (tensor<1x8xi1, #blocked1>) -> tensor<64x8xi1, #blocked1>
|
40 |
+
%40 = tt.load %38, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
41 |
+
%41 = arith.addf %arg6, %40 : tensor<64x8xf32, #blocked1>
|
42 |
+
%42 = arith.select %39, %41, %arg6 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
|
43 |
+
scf.yield %42 : tensor<64x8xf32, #blocked1>
|
44 |
+
}
|
45 |
+
%15 = "tt.reduce"(%14) <{axis = 1 : i32}> ({
|
46 |
+
^bb0(%arg5: f32, %arg6: f32):
|
47 |
+
%32 = arith.addf %arg5, %arg6 : f32
|
48 |
+
tt.reduce.return %32 : f32
|
49 |
+
}) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
50 |
+
%16 = triton_gpu.convert_layout %15 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
51 |
+
%17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
52 |
+
%18 = arith.divsi %9, %cst_2 : tensor<64x1xi32, #blocked>
|
53 |
+
%19 = arith.remsi %9, %cst_2 : tensor<64x1xi32, #blocked>
|
54 |
+
%20 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
|
55 |
+
%21 = tt.addptr %20, %18 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
|
56 |
+
%22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
|
57 |
+
%23 = arith.addi %22, %cst_1 : tensor<64x1xi64, #blocked>
|
58 |
+
%24 = arith.cmpi slt, %22, %cst_0 : tensor<64x1xi64, #blocked>
|
59 |
+
%25 = arith.select %24, %23, %22 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
|
60 |
+
%26 = arith.muli %25, %cst : tensor<64x1xi64, #blocked>
|
61 |
+
%27 = arith.extsi %19 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
|
62 |
+
%28 = arith.addi %27, %26 : tensor<64x1xi64, #blocked>
|
63 |
+
%29 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked>
|
64 |
+
%30 = tt.addptr %29, %28 : tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xi64, #blocked>
|
65 |
+
%31 = "tt.atomic_rmw"(%30, %17, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked>
|
66 |
+
tt.return
|
67 |
+
}
|
68 |
+
}
|
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttir
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<256> : tensor<64x1xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<64x1xi64>
|
5 |
+
%cst_1 = arith.constant dense<512> : tensor<64x1xi64>
|
6 |
+
%c8_i32 = arith.constant 8 : i32
|
7 |
+
%c120_i32 = arith.constant 120 : i32
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%cst_2 = arith.constant dense<true> : tensor<64x1xi1>
|
10 |
+
%cst_3 = arith.constant dense<256> : tensor<64x1xi32>
|
11 |
+
%cst_4 = arith.constant dense<131072> : tensor<1x8xi32>
|
12 |
+
%cst_5 = arith.constant dense<120> : tensor<1x8xi32>
|
13 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
|
14 |
+
%c64_i32 = arith.constant 64 : i32
|
15 |
+
%0 = tt.get_program_id x : i32
|
16 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
17 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
18 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
19 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
20 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
21 |
+
%6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
|
22 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
|
23 |
+
%8 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
24 |
+
%9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
25 |
+
%10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c8_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x8xf32>) : i32 {
|
26 |
+
%27 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
|
27 |
+
%28 = arith.addi %27, %7 : tensor<1x8xi32>
|
28 |
+
%29 = arith.cmpi slt, %28, %cst_5 : tensor<1x8xi32>
|
29 |
+
%30 = arith.muli %28, %cst_4 : tensor<1x8xi32>
|
30 |
+
%31 = tt.broadcast %30 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
31 |
+
%32 = arith.addi %8, %31 : tensor<64x8xi32>
|
32 |
+
%33 = tt.addptr %9, %32 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
33 |
+
%34 = tt.broadcast %29 : (tensor<1x8xi1>) -> tensor<64x8xi1>
|
34 |
+
%35 = tt.load %33, %34, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
|
35 |
+
%36 = arith.addf %arg6, %35 : tensor<64x8xf32>
|
36 |
+
%37 = arith.select %34, %36, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
|
37 |
+
scf.yield %37 : tensor<64x8xf32>
|
38 |
+
}
|
39 |
+
%11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
|
40 |
+
^bb0(%arg5: f32, %arg6: f32):
|
41 |
+
%27 = arith.addf %arg5, %arg6 : f32
|
42 |
+
tt.reduce.return %27 : f32
|
43 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32>
|
44 |
+
%12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
45 |
+
%13 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
|
46 |
+
%14 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
|
47 |
+
%15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
48 |
+
%16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
|
49 |
+
%17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
50 |
+
%18 = arith.addi %17, %cst_1 : tensor<64x1xi64>
|
51 |
+
%19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64>
|
52 |
+
%20 = arith.select %19, %18, %17 : tensor<64x1xi1>, tensor<64x1xi64>
|
53 |
+
%21 = arith.muli %20, %cst : tensor<64x1xi64>
|
54 |
+
%22 = arith.extsi %14 : tensor<64x1xi32> to tensor<64x1xi64>
|
55 |
+
%23 = arith.addi %22, %21 : tensor<64x1xi64>
|
56 |
+
%24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
|
57 |
+
%25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi64>
|
58 |
+
%26 = "tt.atomic_rmw"(%25, %12, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32>
|
59 |
+
tt.return
|
60 |
+
}
|
61 |
+
}
|
.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ptx
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2de(
|
12 |
+
.param .u64 triton__0d1d2de_param_0,
|
13 |
+
.param .u64 triton__0d1d2de_param_1,
|
14 |
+
.param .u32 triton__0d1d2de_param_2
|
15 |
+
)
|
16 |
+
.maxntid 256, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<3>;
|
19 |
+
.reg .b16 %rs<3>;
|
20 |
+
.reg .b32 %r<12>;
|
21 |
+
.reg .b64 %rd<7>;
|
22 |
+
.loc 1 18 0
|
23 |
+
$L__func_begin0:
|
24 |
+
.loc 1 18 0
|
25 |
+
|
26 |
+
ld.param.u64 %rd3, [triton__0d1d2de_param_0];
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_1];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r7, %tid.x;
|
31 |
+
shl.b32 %r8, %r7, 1;
|
32 |
+
and.b32 %r9, %r8, 510;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r10, %r1, 9;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r11, %r10, %r9;
|
39 |
+
.loc 1 24 30
|
40 |
+
mul.wide.s32 %rd5, %r11, 2;
|
41 |
+
add.s64 %rd1, %rd3, %rd5;
|
42 |
+
mov.pred %p1, -1;
|
43 |
+
.loc 1 24 35
|
44 |
+
mov.u32 %r2, 0x0;
|
45 |
+
@%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ];
|
46 |
+
cvt.u16.u32 %rs1, %r2;
|
47 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
48 |
+
.loc 1 24 44
|
49 |
+
cvt.f32.bf16 %r5, %rs1;
|
50 |
+
cvt.f32.bf16 %r6, %rs2;
|
51 |
+
.loc 1 26 25
|
52 |
+
mul.wide.s32 %rd6, %r11, 4;
|
53 |
+
add.s64 %rd2, %rd4, %rd6;
|
54 |
+
.loc 1 26 36
|
55 |
+
@%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 };
|
56 |
+
.loc 1 26 4
|
57 |
+
ret;
|
58 |
+
$L__tmp1:
|
59 |
+
$L__func_end0:
|
60 |
+
|
61 |
+
}
|
62 |
+
.file 1 "/tmp/torchinductor_root/k6/ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py"
|
63 |
+
.section .debug_abbrev
|
64 |
+
{
|
65 |
+
.b8 1
|
66 |
+
.b8 17
|
67 |
+
.b8 1
|
68 |
+
.b8 37
|
69 |
+
.b8 8
|
70 |
+
.b8 19
|
71 |
+
.b8 5
|
72 |
+
.b8 3
|
73 |
+
.b8 8
|
74 |
+
.b8 16
|
75 |
+
.b8 6
|
76 |
+
.b8 27
|
77 |
+
.b8 8
|
78 |
+
.b8 180
|
79 |
+
.b8 66
|
80 |
+
.b8 12
|
81 |
+
.b8 17
|
82 |
+
.b8 1
|
83 |
+
.b8 18
|
84 |
+
.b8 1
|
85 |
+
.b8 0
|
86 |
+
.b8 0
|
87 |
+
.b8 2
|
88 |
+
.b8 46
|
89 |
+
.b8 0
|
90 |
+
.b8 17
|
91 |
+
.b8 1
|
92 |
+
.b8 18
|
93 |
+
.b8 1
|
94 |
+
.b8 64
|
95 |
+
.b8 10
|
96 |
+
.b8 135
|
97 |
+
.b8 64
|
98 |
+
.b8 8
|
99 |
+
.b8 3
|
100 |
+
.b8 8
|
101 |
+
.b8 58
|
102 |
+
.b8 11
|
103 |
+
.b8 59
|
104 |
+
.b8 11
|
105 |
+
.b8 63
|
106 |
+
.b8 12
|
107 |
+
.b8 0
|
108 |
+
.b8 0
|
109 |
+
.b8 0
|
110 |
+
}
|
111 |
+
.section .debug_info
|
112 |
+
{
|
113 |
+
.b32 176
|
114 |
+
.b8 2
|
115 |
+
.b8 0
|
116 |
+
.b32 .debug_abbrev
|
117 |
+
.b8 8
|
118 |
+
.b8 1
|
119 |
+
.b8 116
|
120 |
+
.b8 114
|
121 |
+
.b8 105
|
122 |
+
.b8 116
|
123 |
+
.b8 111
|
124 |
+
.b8 110
|
125 |
+
.b8 0
|
126 |
+
.b8 2
|
127 |
+
.b8 0
|
128 |
+
.b8 99
|
129 |
+
.b8 107
|
130 |
+
.b8 54
|
131 |
+
.b8 50
|
132 |
+
.b8 107
|
133 |
+
.b8 50
|
134 |
+
.b8 120
|
135 |
+
.b8 122
|
136 |
+
.b8 98
|
137 |
+
.b8 98
|
138 |
+
.b8 54
|
139 |
+
.b8 53
|
140 |
+
.b8 55
|
141 |
+
.b8 115
|
142 |
+
.b8 110
|
143 |
+
.b8 102
|
144 |
+
.b8 100
|
145 |
+
.b8 111
|
146 |
+
.b8 119
|
147 |
+
.b8 119
|
148 |
+
.b8 97
|
149 |
+
.b8 110
|
150 |
+
.b8 122
|
151 |
+
.b8 115
|
152 |
+
.b8 122
|
153 |
+
.b8 97
|
154 |
+
.b8 105
|
155 |
+
.b8 106
|
156 |
+
.b8 54
|
157 |
+
.b8 113
|
158 |
+
.b8 122
|
159 |
+
.b8 119
|
160 |
+
.b8 54
|
161 |
+
.b8 118
|
162 |
+
.b8 117
|
163 |
+
.b8 99
|
164 |
+
.b8 55
|
165 |
+
.b8 99
|
166 |
+
.b8 102
|
167 |
+
.b8 105
|
168 |
+
.b8 100
|
169 |
+
.b8 111
|
170 |
+
.b8 109
|
171 |
+
.b8 106
|
172 |
+
.b8 112
|
173 |
+
.b8 107
|
174 |
+
.b8 107
|
175 |
+
.b8 54
|
176 |
+
.b8 105
|
177 |
+
.b8 103
|
178 |
+
.b8 99
|
179 |
+
.b8 109
|
180 |
+
.b8 46
|
181 |
+
.b8 112
|
182 |
+
.b8 121
|
183 |
+
.b8 0
|
184 |
+
.b32 .debug_line
|
185 |
+
.b8 47
|
186 |
+
.b8 116
|
187 |
+
.b8 109
|
188 |
+
.b8 112
|
189 |
+
.b8 47
|
190 |
+
.b8 116
|
191 |
+
.b8 111
|
192 |
+
.b8 114
|
193 |
+
.b8 99
|
194 |
+
.b8 104
|
195 |
+
.b8 105
|
196 |
+
.b8 110
|
197 |
+
.b8 100
|
198 |
+
.b8 117
|
199 |
+
.b8 99
|
200 |
+
.b8 116
|
201 |
+
.b8 111
|
202 |
+
.b8 114
|
203 |
+
.b8 95
|
204 |
+
.b8 114
|
205 |
+
.b8 111
|
206 |
+
.b8 111
|
207 |
+
.b8 116
|
208 |
+
.b8 47
|
209 |
+
.b8 107
|
210 |
+
.b8 54
|
211 |
+
.b8 0
|
212 |
+
.b8 1
|
213 |
+
.b64 $L__func_begin0
|
214 |
+
.b64 $L__func_end0
|
215 |
+
.b8 2
|
216 |
+
.b64 $L__func_begin0
|
217 |
+
.b64 $L__func_end0
|
218 |
+
.b8 1
|
219 |
+
.b8 156
|
220 |
+
.b8 116
|
221 |
+
.b8 114
|
222 |
+
.b8 105
|
223 |
+
.b8 116
|
224 |
+
.b8 111
|
225 |
+
.b8 110
|
226 |
+
.b8 95
|
227 |
+
.b8 95
|
228 |
+
.b8 48
|
229 |
+
.b8 100
|
230 |
+
.b8 49
|
231 |
+
.b8 100
|
232 |
+
.b8 50
|
233 |
+
.b8 100
|
234 |
+
.b8 101
|
235 |
+
.b8 0
|
236 |
+
.b8 116
|
237 |
+
.b8 114
|
238 |
+
.b8 105
|
239 |
+
.b8 116
|
240 |
+
.b8 111
|
241 |
+
.b8 110
|
242 |
+
.b8 95
|
243 |
+
.b8 95
|
244 |
+
.b8 48
|
245 |
+
.b8 100
|
246 |
+
.b8 49
|
247 |
+
.b8 100
|
248 |
+
.b8 50
|
249 |
+
.b8 100
|
250 |
+
.b8 101
|
251 |
+
.b8 0
|
252 |
+
.b8 1
|
253 |
+
.b8 18
|
254 |
+
.b8 1
|
255 |
+
.b8 0
|
256 |
+
}
|
257 |
+
.section .debug_pubnames
|
258 |
+
{
|
259 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
260 |
+
$L__pubNames_start0:
|
261 |
+
.b8 2
|
262 |
+
.b8 0
|
263 |
+
.b32 .debug_info
|
264 |
+
.b32 180
|
265 |
+
.b32 125
|
266 |
+
.b8 116
|
267 |
+
.b8 114
|
268 |
+
.b8 105
|
269 |
+
.b8 116
|
270 |
+
.b8 111
|
271 |
+
.b8 110
|
272 |
+
.b8 95
|
273 |
+
.b8 95
|
274 |
+
.b8 48
|
275 |
+
.b8 100
|
276 |
+
.b8 49
|
277 |
+
.b8 100
|
278 |
+
.b8 50
|
279 |
+
.b8 100
|
280 |
+
.b8 101
|
281 |
+
.b8 0
|
282 |
+
.b32 0
|
283 |
+
$L__pubNames_end0:
|
284 |
+
}
|
285 |
+
.section .debug_pubtypes
|
286 |
+
{
|
287 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
288 |
+
$L__pubTypes_start0:
|
289 |
+
.b8 2
|
290 |
+
.b8 0
|
291 |
+
.b32 .debug_info
|
292 |
+
.b32 180
|
293 |
+
.b32 0
|
294 |
+
$L__pubTypes_end0:
|
295 |
+
}
|
296 |
+
.section .debug_loc { }
|
.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c512_i32 = arith.constant 512 : i32
|
4 |
+
%0 = tt.get_program_id x : i32
|
5 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
6 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
7 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32>
|
8 |
+
%4 = arith.addi %3, %2 : tensor<512xi32>
|
9 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
|
10 |
+
%6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
|
11 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
|
12 |
+
%8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
|
13 |
+
%9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
|
14 |
+
%10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
|
15 |
+
tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/a69784da01a97187168f22847465505f/triton_.ttir
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
19 |
+
%9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
20 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
21 |
+
%11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
22 |
+
%12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
24 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
26 |
+
%16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
|
27 |
+
%17 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
28 |
+
%18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
29 |
+
%19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
30 |
+
%20 = arith.addf %8, %12 : tensor<256xf32>
|
31 |
+
%21 = arith.addf %20, %16 : tensor<256xf32>
|
32 |
+
%22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
33 |
+
%23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
|
34 |
+
^bb0(%arg9: f32, %arg10: f32):
|
35 |
+
%47 = arith.addf %arg9, %arg10 : f32
|
36 |
+
tt.reduce.return %47 : f32
|
37 |
+
}) : (tensor<256xf32>) -> f32
|
38 |
+
%24 = arith.addf %23, %cst_0 : f32
|
39 |
+
%25 = arith.divf %24, %cst_1 : f32
|
40 |
+
%26 = tt.splat %25 : (f32) -> tensor<1xf32>
|
41 |
+
%27 = tt.splat %25 : (f32) -> tensor<256xf32>
|
42 |
+
%28 = arith.subf %21, %27 : tensor<256xf32>
|
43 |
+
%29 = arith.mulf %28, %28 : tensor<256xf32>
|
44 |
+
%30 = arith.select %2, %29, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
45 |
+
%31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
|
46 |
+
^bb0(%arg9: f32, %arg10: f32):
|
47 |
+
%47 = arith.addf %arg9, %arg10 : f32
|
48 |
+
tt.reduce.return %47 : f32
|
49 |
+
}) : (tensor<256xf32>) -> f32
|
50 |
+
%32 = arith.addf %31, %cst_0 : f32
|
51 |
+
%33 = arith.divf %32, %cst_1 : f32
|
52 |
+
%34 = arith.addf %33, %cst_2 : f32
|
53 |
+
%35 = tt.extern_elementwise %34 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
54 |
+
%36 = tt.splat %35 : (f32) -> tensor<1xf32>
|
55 |
+
%37 = tt.splat %35 : (f32) -> tensor<256xf32>
|
56 |
+
%38 = arith.mulf %28, %37 : tensor<256xf32>
|
57 |
+
%39 = arith.mulf %38, %19 : tensor<256xf32>
|
58 |
+
gpu.barrier
|
59 |
+
%40 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
60 |
+
%41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
61 |
+
tt.store %41, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
|
62 |
+
%42 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
63 |
+
%43 = tt.addptr %42, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
64 |
+
%44 = arith.truncf %39 : tensor<256xf32> to tensor<256xbf16>
|
65 |
+
tt.store %43, %44, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
66 |
+
%45 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
67 |
+
%46 = tt.splat %45 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
68 |
+
tt.store %46, %26 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
|
69 |
+
tt.return
|
70 |
+
}
|
71 |
+
}
|
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.llir
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
5 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%5 = shl i32 %4, 1, !dbg !8
|
7 |
+
%6 = and i32 %5, 254, !dbg !8
|
8 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%8 = shl i32 %7, 8, !dbg !10
|
10 |
+
%9 = or i32 %8, %6, !dbg !11
|
11 |
+
%10 = sext i32 %9 to i64, !dbg !12
|
12 |
+
%11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
|
13 |
+
%12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
|
14 |
+
%13 = extractvalue { i32, i32 } %12, 0, !dbg !13
|
15 |
+
%14 = extractvalue { i32, i32 } %12, 1, !dbg !13
|
16 |
+
%15 = bitcast i32 %13 to float, !dbg !13
|
17 |
+
%16 = bitcast i32 %14 to float, !dbg !13
|
18 |
+
%17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
|
19 |
+
%18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15
|
20 |
+
%19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15
|
21 |
+
%20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15
|
22 |
+
%21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15
|
23 |
+
%22 = bitcast <2 x i16> %21 to i32, !dbg !15
|
24 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15
|
25 |
+
ret void, !dbg !16
|
26 |
+
}
|
27 |
+
|
28 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
29 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
30 |
+
|
31 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
32 |
+
attributes #1 = { nounwind }
|
33 |
+
|
34 |
+
!llvm.module.flags = !{!0}
|
35 |
+
!llvm.dbg.cu = !{!1}
|
36 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
37 |
+
|
38 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
39 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
40 |
+
!2 = !DIFile(filename: "cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py", directory: "/tmp/torchinductor_root/pq")
|
41 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
42 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
43 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
44 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
45 |
+
!7 = !{}
|
46 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
47 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
48 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
49 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
50 |
+
!12 = !DILocation(line: 24, column: 30, scope: !5)
|
51 |
+
!13 = !DILocation(line: 24, column: 35, scope: !5)
|
52 |
+
!14 = !DILocation(line: 26, column: 25, scope: !5)
|
53 |
+
!15 = !DILocation(line: 26, column: 36, scope: !5)
|
54 |
+
!16 = !DILocation(line: 26, column: 4, scope: !5)
|