0-hero commited on
Commit
9ab9a5e
·
verified ·
1 Parent(s): 4bf7f8a

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .triton/dump/0359b089f02b5ddabaef8985c60f3daf/triton_.ttgir +21 -0
  2. .triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ptx +734 -0
  3. .triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir +184 -0
  4. .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir +38 -0
  5. .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir +34 -0
  6. .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir +19 -0
  7. .triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ttgir +18 -0
  8. .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin +0 -0
  9. .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx +525 -0
  10. .triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir +24 -0
  11. .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.llir +793 -0
  12. .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ptx +1517 -0
  13. .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttgir +92 -0
  14. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir +43 -0
  15. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx +278 -0
  16. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir +18 -0
  17. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir +17 -0
  18. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin +0 -0
  19. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir +296 -0
  20. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir +73 -0
  21. .triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ptx +1927 -0
  22. .triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir +860 -0
  23. .triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir +27 -0
  24. .triton/dump/4993935f9a0e5939755cfb42600362cf/triton_.llir +54 -0
  25. .triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin +0 -0
  26. .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin +0 -0
  27. .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir +89 -0
  28. .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin +0 -0
  29. .triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir +55 -0
  30. .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir +45 -0
  31. .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir +18 -0
  32. .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir +48 -0
  33. .triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir +18 -0
  34. .triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin +0 -0
  35. .triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir +18 -0
  36. .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir +28 -0
  37. .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir +20 -0
  38. .triton/dump/962d1809855a53123762906133b1d960/triton_.llir +48 -0
  39. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin +0 -0
  40. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx +771 -0
  41. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir +100 -0
  42. .triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ptx +295 -0
  43. .triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttir +18 -0
  44. .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ptx +565 -0
  45. .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttgir +68 -0
  46. .triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttir +61 -0
  47. .triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ptx +296 -0
  48. .triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttir +18 -0
  49. .triton/dump/a69784da01a97187168f22847465505f/triton_.ttir +71 -0
  50. .triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.llir +54 -0
.triton/dump/0359b089f02b5ddabaef8985c60f3daf/triton_.ttgir ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
11
+ %5 = arith.cmpi slt, %4, %cst : tensor<1024xi32, #blocked>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
13
+ %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
14
+ %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
15
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
16
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
17
+ %11 = arith.truncf %8 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
18
+ tt.store %10, %11, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
19
+ tt.return
20
+ }
21
+ }
.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ptx ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+
11
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
12
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
21
+ )
22
+ .maxntid 128, 1, 1
23
+ {
24
+ .reg .pred %p<49>;
25
+ .reg .b16 %rs<33>;
26
+ .reg .b32 %r<72>;
27
+ .reg .f32 %f<98>;
28
+ .reg .b64 %rd<66>;
29
+ .loc 1 18 0
30
+ $L__func_begin0:
31
+ .loc 1 18 0
32
+
33
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8_param_6];
34
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8_param_5];
35
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8_param_4];
36
+ ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7de8_param_0];
37
+ ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7de8_param_1];
38
+ $L__tmp0:
39
+ .loc 1 22 44
40
+ mov.u32 %r13, %tid.x;
41
+ ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_2];
42
+ bfe.u32 %r14, %r13, 3, 4;
43
+ ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7de8_param_3];
44
+ .loc 1 24 33
45
+ and.b32 %r1, %r13, 7;
46
+ .loc 1 21 28
47
+ mov.u32 %r6, %ctaid.x;
48
+ .loc 1 21 34
49
+ cvt.s64.s32 %rd1, %r6;
50
+ .loc 1 21 46
51
+ mul.wide.s32 %rd30, %r6, 64;
52
+ cvt.u64.u32 %rd2, %r14;
53
+ .loc 1 22 23
54
+ or.b64 %rd31, %rd30, %rd2;
55
+ .loc 1 26 30
56
+ shl.b64 %rd32, %rd31, 3;
57
+ add.s64 %rd19, %rd29, %rd32;
58
+ add.s64 %rd21, %rd19, 128;
59
+ add.s64 %rd23, %rd19, 256;
60
+ add.s64 %rd25, %rd19, 384;
61
+ mov.pred %p1, -1;
62
+ .loc 1 26 35
63
+ mov.u64 %rd18, 0x0;
64
+ @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
65
+ mov.u64 %rd20, 0x0;
66
+ @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
67
+ mov.u64 %rd22, 0x0;
68
+ @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd23 + 0 ];
69
+ mov.u64 %rd24, 0x0;
70
+ @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd25 + 0 ];
71
+ .loc 1 27 19
72
+ mov.u32 %r10, 0x0;
73
+ @%p1 ld.global.b32 { %r10 }, [ %rd26 + 0 ];
74
+ .loc 1 29 19
75
+ mov.u32 %r11, 0x0;
76
+ @%p1 ld.global.b32 { %r11 }, [ %rd27 + 0 ];
77
+ .loc 1 38 23
78
+ setp.eq.s64 %p7, %rd18, -1;
79
+ setp.eq.s64 %p8, %rd20, -1;
80
+ setp.eq.s64 %p9, %rd22, -1;
81
+ setp.eq.s64 %p10, %rd24, -1;
82
+ .loc 1 39 22
83
+ div.full.f32 %r9, %r10, %r11;
84
+ mov.b32 %f25, %r9;
85
+ .loc 1 41 37
86
+ selp.f32 %f4, 0f00000000, %f25, %p10;
87
+ selp.f32 %f3, 0f00000000, %f25, %p9;
88
+ selp.f32 %f2, 0f00000000, %f25, %p8;
89
+ selp.f32 %f1, 0f00000000, %f25, %p7;
90
+ .loc 1 32 36
91
+ mul.wide.s32 %rd33, %r6, 12865792;
92
+ mul.wide.u32 %rd34, %r14, 201028;
93
+ add.s64 %rd35, %rd33, %rd34;
94
+ cvt.u64.u32 %rd36, %r13;
95
+ and.b64 %rd3, %rd36, 7;
96
+ mul.wide.u32 %rd37, %r1, 4;
97
+ add.s64 %rd38, %rd35, %rd37;
98
+ add.s64 %rd39, %rd38, %rd28;
99
+ add.s64 %rd65, %rd39, 9649344;
100
+ mov.f32 %f94, 0f00000000;
101
+ mov.b32 %r70, -8;
102
+ mov.u64 %rd63, %rd65;
103
+ mov.f32 %f95, %f94;
104
+ mov.f32 %f96, %f94;
105
+ mov.f32 %f97, %f94;
106
+ $L__BB0_1:
107
+ add.s32 %r70, %r70, 8;
108
+ .loc 1 33 27
109
+ add.s32 %r23, %r70, %r1;
110
+ .loc 1 34 25
111
+ setp.lt.u32 %p11, %r23, 50257;
112
+ .loc 1 36 34
113
+ add.s64 %rd40, %rd63, -9649344;
114
+ add.s64 %rd41, %rd63, -6432896;
115
+ add.s64 %rd42, %rd63, -3216448;
116
+ mov.b32 %r54, 0;
117
+ .loc 1 36 52
118
+ mov.u32 %r15, 0x0;
119
+ @%p11 ld.global.L1::evict_last.b32 { %r15 }, [ %rd40 + 0 ];
120
+ @!%p11 mov.u32 %r15, %r54;
121
+ mov.u32 %r17, 0x0;
122
+ @%p11 ld.global.L1::evict_last.b32 { %r17 }, [ %rd41 + 0 ];
123
+ @!%p11 mov.u32 %r17, %r54;
124
+ mov.u32 %r19, 0x0;
125
+ @%p11 ld.global.L1::evict_last.b32 { %r19 }, [ %rd42 + 0 ];
126
+ @!%p11 mov.u32 %r19, %r54;
127
+ mov.u32 %r21, 0x0;
128
+ @%p11 ld.global.L1::evict_last.b32 { %r21 }, [ %rd63 + 0 ];
129
+ @!%p11 mov.u32 %r21, %r54;
130
+ mov.b32 %f26, %r21;
131
+ mov.b32 %f27, %r19;
132
+ mov.b32 %f28, %r17;
133
+ mov.b32 %f29, %r15;
134
+ .loc 1 42 23
135
+ mul.f32 %f30, %f1, %f29;
136
+ mul.f32 %f31, %f2, %f28;
137
+ mul.f32 %f32, %f3, %f27;
138
+ mul.f32 %f33, %f4, %f26;
139
+ .loc 1 45 40
140
+ selp.f32 %f34, %f33, 0f80000000, %p11;
141
+ selp.f32 %f35, %f32, 0f80000000, %p11;
142
+ selp.f32 %f36, %f31, 0f80000000, %p11;
143
+ selp.f32 %f37, %f30, 0f80000000, %p11;
144
+ add.f32 %f94, %f94, %f37;
145
+ add.f32 %f95, %f95, %f36;
146
+ add.f32 %f96, %f96, %f35;
147
+ add.f32 %f97, %f97, %f34;
148
+ .loc 1 32 36
149
+ add.s64 %rd63, %rd63, 32;
150
+ setp.lt.u32 %p19, %r70, 50249;
151
+ @%p19 bra $L__BB0_1;
152
+ $L__tmp1:
153
+ .loc 2 243 36
154
+ mov.b32 %r25, %f94;
155
+ shfl.sync.bfly.b32 %r26, %r25, 4, 31, -1;
156
+ mov.b32 %f38, %r26;
157
+ $L__tmp2:
158
+ .loc 2 233 15
159
+ add.f32 %f39, %f94, %f38;
160
+ $L__tmp3:
161
+ .loc 2 243 36
162
+ mov.b32 %r27, %f39;
163
+ shfl.sync.bfly.b32 %r28, %r27, 2, 31, -1;
164
+ mov.b32 %f40, %r28;
165
+ $L__tmp4:
166
+ .loc 2 233 15
167
+ add.f32 %f41, %f39, %f40;
168
+ $L__tmp5:
169
+ .loc 2 243 36
170
+ mov.b32 %r29, %f41;
171
+ shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1;
172
+ mov.b32 %f42, %r30;
173
+ $L__tmp6:
174
+ .loc 2 233 15
175
+ add.f32 %f13, %f41, %f42;
176
+ $L__tmp7:
177
+ .loc 2 243 36
178
+ mov.b32 %r31, %f95;
179
+ shfl.sync.bfly.b32 %r32, %r31, 4, 31, -1;
180
+ mov.b32 %f43, %r32;
181
+ $L__tmp8:
182
+ .loc 2 233 15
183
+ add.f32 %f44, %f95, %f43;
184
+ $L__tmp9:
185
+ .loc 2 243 36
186
+ mov.b32 %r33, %f44;
187
+ shfl.sync.bfly.b32 %r34, %r33, 2, 31, -1;
188
+ mov.b32 %f45, %r34;
189
+ $L__tmp10:
190
+ .loc 2 233 15
191
+ add.f32 %f46, %f44, %f45;
192
+ $L__tmp11:
193
+ .loc 2 243 36
194
+ mov.b32 %r35, %f46;
195
+ shfl.sync.bfly.b32 %r36, %r35, 1, 31, -1;
196
+ mov.b32 %f47, %r36;
197
+ $L__tmp12:
198
+ .loc 2 233 15
199
+ add.f32 %f14, %f46, %f47;
200
+ $L__tmp13:
201
+ .loc 2 243 36
202
+ mov.b32 %r37, %f96;
203
+ shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
204
+ mov.b32 %f48, %r38;
205
+ $L__tmp14:
206
+ .loc 2 233 15
207
+ add.f32 %f49, %f96, %f48;
208
+ $L__tmp15:
209
+ .loc 2 243 36
210
+ mov.b32 %r39, %f49;
211
+ shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
212
+ mov.b32 %f50, %r40;
213
+ $L__tmp16:
214
+ .loc 2 233 15
215
+ add.f32 %f51, %f49, %f50;
216
+ $L__tmp17:
217
+ .loc 2 243 36
218
+ mov.b32 %r41, %f51;
219
+ shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
220
+ mov.b32 %f52, %r42;
221
+ $L__tmp18:
222
+ .loc 2 233 15
223
+ add.f32 %f15, %f51, %f52;
224
+ $L__tmp19:
225
+ .loc 2 243 36
226
+ mov.b32 %r43, %f97;
227
+ shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1;
228
+ mov.b32 %f53, %r44;
229
+ $L__tmp20:
230
+ .loc 2 233 15
231
+ add.f32 %f54, %f97, %f53;
232
+ $L__tmp21:
233
+ .loc 2 243 36
234
+ mov.b32 %r45, %f54;
235
+ shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1;
236
+ mov.b32 %f55, %r46;
237
+ $L__tmp22:
238
+ .loc 2 233 15
239
+ add.f32 %f56, %f54, %f55;
240
+ $L__tmp23:
241
+ .loc 2 243 36
242
+ mov.b32 %r47, %f56;
243
+ shfl.sync.bfly.b32 %r48, %r47, 1, 31, -1;
244
+ mov.b32 %f57, %r48;
245
+ $L__tmp24:
246
+ .loc 2 233 15
247
+ add.f32 %f16, %f56, %f57;
248
+ $L__tmp25:
249
+ .loc 1 51 36
250
+ shl.b64 %rd44, %rd3, 1;
251
+ add.s64 %rd7, %rd17, %rd44;
252
+ mul.lo.s64 %rd45, %rd1, 6432896;
253
+ mul.lo.s64 %rd46, %rd2, 100514;
254
+ add.s64 %rd64, %rd45, %rd46;
255
+ add.s64 %rd9, %rd16, %rd44;
256
+ add.s64 %rd10, %rd15, %rd44;
257
+ mov.b32 %r71, -8;
258
+ mov.u16 %rs2, 0;
259
+ $L__BB0_3:
260
+ add.s32 %r71, %r71, 8;
261
+ .loc 1 52 27
262
+ add.s32 %r69, %r71, %r1;
263
+ .loc 1 53 25
264
+ setp.lt.u32 %p20, %r69, 50257;
265
+ .loc 1 55 35
266
+ add.s64 %rd47, %rd10, %rd64;
267
+ add.s64 %rd48, %rd47, 1608224;
268
+ add.s64 %rd49, %rd47, 3216448;
269
+ .loc 1 55 53
270
+ add.s64 %rd50, %rd47, 4824672;
271
+ mov.u16 %rs1, 0x0;
272
+ @%p20 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
273
+ @!%p20 mov.u16 %rs1, %rs2;
274
+ mov.u16 %rs3, 0x0;
275
+ @%p20 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd48 + 0 ];
276
+ @!%p20 mov.u16 %rs3, %rs2;
277
+ mov.u16 %rs5, 0x0;
278
+ @%p20 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd49 + 0 ];
279
+ @!%p20 mov.u16 %rs5, %rs2;
280
+ mov.u16 %rs7, 0x0;
281
+ @%p20 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd50 + 0 ];
282
+ @!%p20 mov.u16 %rs7, %rs2;
283
+ .loc 1 55 105
284
+ cvt.f32.bf16 %r49, %rs1;
285
+ mov.b32 %f66, %r49;
286
+ cvt.f32.bf16 %r50, %rs3;
287
+ mov.b32 %f67, %r50;
288
+ cvt.f32.bf16 %r51, %rs5;
289
+ mov.b32 %f68, %r51;
290
+ cvt.f32.bf16 %r52, %rs7;
291
+ mov.b32 %f69, %r52;
292
+ .loc 1 56 35
293
+ add.s64 %rd51, %rd65, -9649344;
294
+ add.s64 %rd52, %rd65, -6432896;
295
+ add.s64 %rd53, %rd65, -3216448;
296
+ .loc 1 56 53
297
+ mov.u32 %r53, 0x0;
298
+ @%p20 ld.global.L1::evict_first.b32 { %r53 }, [ %rd51 + 0 ];
299
+ @!%p20 mov.u32 %r53, %r54;
300
+ mov.b32 %f70, %r53;
301
+ mov.u32 %r55, 0x0;
302
+ @%p20 ld.global.L1::evict_first.b32 { %r55 }, [ %rd52 + 0 ];
303
+ @!%p20 mov.u32 %r55, %r54;
304
+ mov.b32 %f71, %r55;
305
+ mov.u32 %r57, 0x0;
306
+ @%p20 ld.global.L1::evict_first.b32 { %r57 }, [ %rd53 + 0 ];
307
+ @!%p20 mov.u32 %r57, %r54;
308
+ mov.b32 %f72, %r57;
309
+ mov.u32 %r59, 0x0;
310
+ @%p20 ld.global.L1::evict_first.b32 { %r59 }, [ %rd65 + 0 ];
311
+ @!%p20 mov.u32 %r59, %r54;
312
+ mov.b32 %f73, %r59;
313
+ .loc 1 57 35
314
+ add.s64 %rd55, %rd9, %rd64;
315
+ add.s64 %rd56, %rd55, 1608224;
316
+ add.s64 %rd57, %rd55, 3216448;
317
+ .loc 1 57 53
318
+ add.s64 %rd58, %rd55, 4824672;
319
+ mov.u16 %rs13, 0x0;
320
+ @%p20 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd55 + 0 ];
321
+ @!%p20 mov.u16 %rs13, %rs2;
322
+ mov.u16 %rs15, 0x0;
323
+ @%p20 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd56 + 0 ];
324
+ @!%p20 mov.u16 %rs15, %rs2;
325
+ mov.u16 %rs17, 0x0;
326
+ @%p20 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd57 + 0 ];
327
+ @!%p20 mov.u16 %rs17, %rs2;
328
+ mov.u16 %rs19, 0x0;
329
+ @%p20 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd58 + 0 ];
330
+ @!%p20 mov.u16 %rs19, %rs2;
331
+ .loc 1 57 105
332
+ cvt.f32.bf16 %r61, %rs13;
333
+ mov.b32 %f74, %r61;
334
+ cvt.f32.bf16 %r62, %rs15;
335
+ mov.b32 %f75, %r62;
336
+ cvt.f32.bf16 %r63, %rs17;
337
+ mov.b32 %f76, %r63;
338
+ cvt.f32.bf16 %r64, %rs19;
339
+ mov.b32 %f77, %r64;
340
+ .loc 1 65 23
341
+ mul.f32 %f59, %f74, 0f3FB8AA3B;
342
+ ex2.approx.f32 %f58, %f59;
343
+ mul.f32 %f61, %f75, 0f3FB8AA3B;
344
+ ex2.approx.f32 %f60, %f61;
345
+ mul.f32 %f63, %f76, 0f3FB8AA3B;
346
+ ex2.approx.f32 %f62, %f63;
347
+ mul.f32 %f65, %f77, 0f3FB8AA3B;
348
+ ex2.approx.f32 %f64, %f65;
349
+ .loc 1 66 24
350
+ mul.f32 %f78, %f13, %f58;
351
+ mul.f32 %f79, %f14, %f60;
352
+ mul.f32 %f80, %f15, %f62;
353
+ mul.f32 %f81, %f16, %f64;
354
+ .loc 1 67 24
355
+ neg.f32 %f82, %f78;
356
+ fma.rn.f32 %f83, %f1, %f70, %f82;
357
+ neg.f32 %f84, %f79;
358
+ fma.rn.f32 %f85, %f2, %f71, %f84;
359
+ neg.f32 %f86, %f80;
360
+ fma.rn.f32 %f87, %f3, %f72, %f86;
361
+ neg.f32 %f88, %f81;
362
+ fma.rn.f32 %f89, %f4, %f73, %f88;
363
+ .loc 1 69 24
364
+ add.f32 %f90, %f66, %f83;
365
+ add.f32 %f91, %f67, %f85;
366
+ add.f32 %f92, %f68, %f87;
367
+ add.f32 %f93, %f69, %f89;
368
+ .loc 1 70 29
369
+ add.s64 %rd59, %rd7, %rd64;
370
+ add.s64 %rd60, %rd59, 1608224;
371
+ add.s64 %rd61, %rd59, 3216448;
372
+ .loc 1 70 54
373
+ add.s64 %rd62, %rd59, 4824672;
374
+ mov.b32 %r65, %f90;
375
+ cvt.rn.bf16.f32 %rs25, %r65;
376
+ mov.b32 %r66, %f91;
377
+ cvt.rn.bf16.f32 %rs26, %r66;
378
+ mov.b32 %r67, %f92;
379
+ cvt.rn.bf16.f32 %rs27, %r67;
380
+ mov.b32 %r68, %f93;
381
+ cvt.rn.bf16.f32 %rs28, %r68;
382
+ @%p20 st.global.b16 [ %rd59 + 0 ], { %rs25 };
383
+ @%p20 st.global.b16 [ %rd60 + 0 ], { %rs26 };
384
+ @%p20 st.global.b16 [ %rd61 + 0 ], { %rs27 };
385
+ @%p20 st.global.b16 [ %rd62 + 0 ], { %rs28 };
386
+ .loc 1 51 36
387
+ add.s64 %rd65, %rd65, 32;
388
+ add.s64 %rd64, %rd64, 16;
389
+ setp.lt.u32 %p48, %r71, 50249;
390
+ @%p48 bra $L__BB0_3;
391
+ .loc 1 51 4
392
+ ret;
393
+ $L__tmp26:
394
+ $L__func_end0:
395
+
396
+ }
397
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
398
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
399
+ .section .debug_abbrev
400
+ {
401
+ .b8 1
402
+ .b8 17
403
+ .b8 1
404
+ .b8 37
405
+ .b8 8
406
+ .b8 19
407
+ .b8 5
408
+ .b8 3
409
+ .b8 8
410
+ .b8 16
411
+ .b8 6
412
+ .b8 27
413
+ .b8 8
414
+ .b8 180
415
+ .b8 66
416
+ .b8 12
417
+ .b8 17
418
+ .b8 1
419
+ .b8 18
420
+ .b8 1
421
+ .b8 0
422
+ .b8 0
423
+ .b8 2
424
+ .b8 46
425
+ .b8 0
426
+ .b8 135
427
+ .b8 64
428
+ .b8 8
429
+ .b8 3
430
+ .b8 8
431
+ .b8 58
432
+ .b8 11
433
+ .b8 59
434
+ .b8 11
435
+ .b8 63
436
+ .b8 12
437
+ .b8 32
438
+ .b8 11
439
+ .b8 0
440
+ .b8 0
441
+ .b8 3
442
+ .b8 46
443
+ .b8 1
444
+ .b8 17
445
+ .b8 1
446
+ .b8 18
447
+ .b8 1
448
+ .b8 64
449
+ .b8 10
450
+ .b8 49
451
+ .b8 19
452
+ .b8 0
453
+ .b8 0
454
+ .b8 4
455
+ .b8 29
456
+ .b8 0
457
+ .b8 49
458
+ .b8 19
459
+ .b8 17
460
+ .b8 1
461
+ .b8 18
462
+ .b8 1
463
+ .b8 88
464
+ .b8 11
465
+ .b8 89
466
+ .b8 11
467
+ .b8 87
468
+ .b8 11
469
+ .b8 0
470
+ .b8 0
471
+ .b8 5
472
+ .b8 29
473
+ .b8 1
474
+ .b8 49
475
+ .b8 19
476
+ .b8 17
477
+ .b8 1
478
+ .b8 18
479
+ .b8 1
480
+ .b8 88
481
+ .b8 11
482
+ .b8 89
483
+ .b8 11
484
+ .b8 87
485
+ .b8 11
486
+ .b8 0
487
+ .b8 0
488
+ .b8 0
489
+ }
490
+ .section .debug_info
491
+ {
492
+ .b32 278
493
+ .b8 2
494
+ .b8 0
495
+ .b32 .debug_abbrev
496
+ .b8 8
497
+ .b8 1
498
+ .b8 116
499
+ .b8 114
500
+ .b8 105
501
+ .b8 116
502
+ .b8 111
503
+ .b8 110
504
+ .b8 0
505
+ .b8 2
506
+ .b8 0
507
+ .b8 99
508
+ .b8 107
509
+ .b8 122
510
+ .b8 103
511
+ .b8 108
512
+ .b8 55
513
+ .b8 116
514
+ .b8 104
515
+ .b8 98
516
+ .b8 52
517
+ .b8 120
518
+ .b8 100
519
+ .b8 102
520
+ .b8 107
521
+ .b8 102
522
+ .b8 110
523
+ .b8 100
524
+ .b8 50
525
+ .b8 116
526
+ .b8 105
527
+ .b8 100
528
+ .b8 107
529
+ .b8 115
530
+ .b8 54
531
+ .b8 109
532
+ .b8 116
533
+ .b8 53
534
+ .b8 102
535
+ .b8 51
536
+ .b8 104
537
+ .b8 97
538
+ .b8 117
539
+ .b8 119
540
+ .b8 102
541
+ .b8 121
542
+ .b8 106
543
+ .b8 102
544
+ .b8 108
545
+ .b8 98
546
+ .b8 116
547
+ .b8 122
548
+ .b8 121
549
+ .b8 101
550
+ .b8 112
551
+ .b8 111
552
+ .b8 53
553
+ .b8 111
554
+ .b8 120
555
+ .b8 107
556
+ .b8 118
557
+ .b8 104
558
+ .b8 107
559
+ .b8 46
560
+ .b8 112
561
+ .b8 121
562
+ .b8 0
563
+ .b32 .debug_line
564
+ .b8 47
565
+ .b8 116
566
+ .b8 109
567
+ .b8 112
568
+ .b8 47
569
+ .b8 116
570
+ .b8 111
571
+ .b8 114
572
+ .b8 99
573
+ .b8 104
574
+ .b8 105
575
+ .b8 110
576
+ .b8 100
577
+ .b8 117
578
+ .b8 99
579
+ .b8 116
580
+ .b8 111
581
+ .b8 114
582
+ .b8 95
583
+ .b8 114
584
+ .b8 111
585
+ .b8 111
586
+ .b8 116
587
+ .b8 47
588
+ .b8 107
589
+ .b8 122
590
+ .b8 0
591
+ .b8 1
592
+ .b64 $L__func_begin0
593
+ .b64 $L__func_end0
594
+ .b8 2
595
+ .b8 116
596
+ .b8 114
597
+ .b8 105
598
+ .b8 116
599
+ .b8 111
600
+ .b8 110
601
+ .b8 95
602
+ .b8 95
603
+ .b8 48
604
+ .b8 100
605
+ .b8 49
606
+ .b8 100
607
+ .b8 50
608
+ .b8 100
609
+ .b8 51
610
+ .b8 100
611
+ .b8 52
612
+ .b8 100
613
+ .b8 53
614
+ .b8 100
615
+ .b8 54
616
+ .b8 100
617
+ .b8 55
618
+ .b8 100
619
+ .b8 101
620
+ .b8 56
621
+ .b8 0
622
+ .b8 116
623
+ .b8 114
624
+ .b8 105
625
+ .b8 116
626
+ .b8 111
627
+ .b8 110
628
+ .b8 95
629
+ .b8 95
630
+ .b8 48
631
+ .b8 100
632
+ .b8 49
633
+ .b8 100
634
+ .b8 50
635
+ .b8 100
636
+ .b8 51
637
+ .b8 100
638
+ .b8 52
639
+ .b8 100
640
+ .b8 53
641
+ .b8 100
642
+ .b8 54
643
+ .b8 100
644
+ .b8 55
645
+ .b8 100
646
+ .b8 101
647
+ .b8 56
648
+ .b8 0
649
+ .b8 1
650
+ .b8 18
651
+ .b8 1
652
+ .b8 1
653
+ .b8 3
654
+ .b64 $L__func_begin0
655
+ .b64 $L__func_end0
656
+ .b8 1
657
+ .b8 156
658
+ .b32 125
659
+ .b8 4
660
+ .b32 125
661
+ .b64 $L__tmp1
662
+ .b64 $L__tmp24
663
+ .b8 2
664
+ .b8 46
665
+ .b8 27
666
+ .b8 5
667
+ .b32 125
668
+ .b64 $L__tmp2
669
+ .b64 $L__tmp25
670
+ .b8 2
671
+ .b8 46
672
+ .b8 27
673
+ .b8 4
674
+ .b32 125
675
+ .b64 $L__tmp2
676
+ .b64 $L__tmp25
677
+ .b8 2
678
+ .b8 243
679
+ .b8 36
680
+ .b8 0
681
+ .b8 0
682
+ .b8 0
683
+ }
684
+ .section .debug_pubnames
685
+ {
686
+ .b32 $L__pubNames_end0-$L__pubNames_start0
687
+ $L__pubNames_start0:
688
+ .b8 2
689
+ .b8 0
690
+ .b32 .debug_info
691
+ .b32 282
692
+ .b32 125
693
+ .b8 116
694
+ .b8 114
695
+ .b8 105
696
+ .b8 116
697
+ .b8 111
698
+ .b8 110
699
+ .b8 95
700
+ .b8 95
701
+ .b8 48
702
+ .b8 100
703
+ .b8 49
704
+ .b8 100
705
+ .b8 50
706
+ .b8 100
707
+ .b8 51
708
+ .b8 100
709
+ .b8 52
710
+ .b8 100
711
+ .b8 53
712
+ .b8 100
713
+ .b8 54
714
+ .b8 100
715
+ .b8 55
716
+ .b8 100
717
+ .b8 101
718
+ .b8 56
719
+ .b8 0
720
+ .b32 0
721
+ $L__pubNames_end0:
722
+ }
723
+ .section .debug_pubtypes
724
+ {
725
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
726
+ $L__pubTypes_start0:
727
+ .b8 2
728
+ .b8 0
729
+ .b32 .debug_info
730
+ .b32 282
731
+ .b32 0
732
+ $L__pubTypes_end0:
733
+ }
734
+ .section .debug_loc { }
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp7 < 50257"
7
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
8
+
9
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
10
+
11
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2) local_unnamed_addr !dbg !7 {
12
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
13
+ %5 = and i32 %4, 127, !dbg !10
14
+ %6 = shl nuw nsw i32 %5, 1, !dbg !10
15
+ %7 = or i32 %6, 1, !dbg !10
16
+ %8 = or i32 %6, 256, !dbg !10
17
+ %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !11
18
+ %10 = sext i32 %9 to i64, !dbg !12
19
+ %11 = shl nsw i64 %10, 9, !dbg !13
20
+ %12 = zext nneg i32 %6 to i64
21
+ %13 = zext nneg i32 %8 to i64
22
+ %14 = or i64 %11, %12, !dbg !14
23
+ %15 = or i64 %11, %13, !dbg !14
24
+ %16 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !15
25
+ %17 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !15
26
+ %18 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %16, i1 true) #2, !dbg !16
27
+ %19 = extractvalue { i64, i64 } %18, 0, !dbg !16
28
+ %20 = extractvalue { i64, i64 } %18, 1, !dbg !16
29
+ %21 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %17, i1 true) #2, !dbg !16
30
+ %22 = extractvalue { i64, i64 } %21, 0, !dbg !16
31
+ %23 = extractvalue { i64, i64 } %21, 1, !dbg !16
32
+ %24 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !17
33
+ %25 = insertelement <4 x i64> %24, i64 %22, i64 1, !dbg !17
34
+ %26 = insertelement <4 x i64> %25, i64 %20, i64 2, !dbg !17
35
+ %27 = insertelement <4 x i64> %26, i64 %19, i64 3, !dbg !17
36
+ %28 = icmp eq <4 x i64> %27, <i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !17
37
+ %29 = select <4 x i1> %28, <4 x i64> zeroinitializer, <4 x i64> %27, !dbg !18
38
+ %30 = add <4 x i64> %29, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !19
39
+ %31 = icmp slt <4 x i64> %29, zeroinitializer, !dbg !20
40
+ %32 = select <4 x i1> %31, <4 x i64> %30, <4 x i64> %29, !dbg !21
41
+ %33 = icmp ult <4 x i64> %32, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
42
+ %34 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %12, !dbg !22
43
+ %35 = extractelement <4 x i1> %33, i64 3, !dbg !22
44
+ %36 = zext i1 %35 to i8, !dbg !22
45
+ %37 = insertelement <1 x i8> undef, i8 %36, i64 0, !dbg !22
46
+ store <1 x i8> %37, ptr addrspace(3) %34, align 1, !dbg !22
47
+ %38 = zext nneg i32 %7 to i64, !dbg !22
48
+ %39 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %38, !dbg !22
49
+ %40 = extractelement <4 x i1> %33, i64 2, !dbg !22
50
+ %41 = zext i1 %40 to i8, !dbg !22
51
+ %42 = insertelement <1 x i8> undef, i8 %41, i64 0, !dbg !22
52
+ store <1 x i8> %42, ptr addrspace(3) %39, align 1, !dbg !22
53
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
54
+ %43 = zext nneg i32 %5 to i64, !dbg !22
55
+ %44 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %43, !dbg !22
56
+ %45 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
57
+ %46 = or i32 %5, 128, !dbg !22
58
+ %47 = zext nneg i32 %46 to i64, !dbg !22
59
+ %48 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %47, !dbg !22
60
+ %49 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
61
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
62
+ %50 = extractelement <4 x i1> %33, i64 1, !dbg !22
63
+ %51 = zext i1 %50 to i8, !dbg !22
64
+ %52 = insertelement <1 x i8> undef, i8 %51, i64 0, !dbg !22
65
+ store <1 x i8> %52, ptr addrspace(3) %34, align 1, !dbg !22
66
+ %53 = extractelement <4 x i1> %33, i64 0, !dbg !22
67
+ %54 = zext i1 %53 to i8, !dbg !22
68
+ %55 = insertelement <1 x i8> undef, i8 %54, i64 0, !dbg !22
69
+ store <1 x i8> %55, ptr addrspace(3) %39, align 1, !dbg !22
70
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
71
+ %56 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
72
+ %57 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
73
+ %58 = insertelement <4 x i8> poison, i8 %49, i64 0, !dbg !22
74
+ %59 = insertelement <4 x i8> %58, i8 %45, i64 1, !dbg !22
75
+ %60 = insertelement <4 x i8> %59, i8 %56, i64 2, !dbg !22
76
+ %61 = insertelement <4 x i8> %60, i8 %57, i64 3, !dbg !22
77
+ %62 = icmp eq <4 x i8> %61, zeroinitializer, !dbg !22
78
+ %63 = bitcast <4 x i1> %62 to i4, !dbg !23
79
+ %.not = icmp eq i4 %63, 0, !dbg !23
80
+ br i1 %.not, label %65, label %64, !dbg !23
81
+
82
+ 64: ; preds = %3
83
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !23
84
+ br label %65, !dbg !23
85
+
86
+ 65: ; preds = %64, %3
87
+ %66 = or i32 %6, 257, !dbg !10
88
+ %67 = zext nneg i32 %66 to i64
89
+ %68 = or i64 %11, %67, !dbg !14
90
+ %69 = or i64 %11, %38, !dbg !14
91
+ %70 = mul nsw i64 %14, 50257, !dbg !24
92
+ %71 = mul nsw i64 %69, 50257, !dbg !24
93
+ %72 = mul nsw i64 %15, 50257, !dbg !24
94
+ %73 = mul nsw i64 %68, 50257, !dbg !24
95
+ %74 = extractelement <4 x i64> %32, i64 3, !dbg !25
96
+ %75 = getelementptr float, ptr addrspace(1) %1, i64 %74, !dbg !25
97
+ %76 = getelementptr float, ptr addrspace(1) %75, i64 %70, !dbg !25
98
+ %77 = extractelement <4 x i64> %32, i64 2, !dbg !25
99
+ %78 = getelementptr float, ptr addrspace(1) %1, i64 %77, !dbg !25
100
+ %79 = getelementptr float, ptr addrspace(1) %78, i64 %71, !dbg !25
101
+ %80 = extractelement <4 x i64> %32, i64 1, !dbg !25
102
+ %81 = getelementptr float, ptr addrspace(1) %1, i64 %80, !dbg !25
103
+ %82 = getelementptr float, ptr addrspace(1) %81, i64 %72, !dbg !25
104
+ %83 = extractelement <4 x i64> %32, i64 0, !dbg !25
105
+ %84 = getelementptr float, ptr addrspace(1) %1, i64 %83, !dbg !25
106
+ %85 = getelementptr float, ptr addrspace(1) %84, i64 %73, !dbg !25
107
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
108
+ %86 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %12, !dbg !26
109
+ %87 = ptrtoint ptr addrspace(1) %76 to i64, !dbg !26
110
+ %88 = insertelement <1 x i64> undef, i64 %87, i64 0, !dbg !26
111
+ store <1 x i64> %88, ptr addrspace(3) %86, align 8, !dbg !26
112
+ %89 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %38, !dbg !26
113
+ %90 = ptrtoint ptr addrspace(1) %79 to i64, !dbg !26
114
+ %91 = insertelement <1 x i64> undef, i64 %90, i64 0, !dbg !26
115
+ store <1 x i64> %91, ptr addrspace(3) %89, align 8, !dbg !26
116
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
117
+ %92 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %43, !dbg !26
118
+ %93 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
119
+ %94 = inttoptr i64 %93 to ptr addrspace(1), !dbg !26
120
+ %95 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %47, !dbg !26
121
+ %96 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
122
+ %97 = inttoptr i64 %96 to ptr addrspace(1), !dbg !26
123
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
124
+ %98 = ptrtoint ptr addrspace(1) %82 to i64, !dbg !26
125
+ %99 = insertelement <1 x i64> undef, i64 %98, i64 0, !dbg !26
126
+ store <1 x i64> %99, ptr addrspace(3) %86, align 8, !dbg !26
127
+ %100 = ptrtoint ptr addrspace(1) %85 to i64, !dbg !26
128
+ %101 = insertelement <1 x i64> undef, i64 %100, i64 0, !dbg !26
129
+ store <1 x i64> %101, ptr addrspace(3) %89, align 8, !dbg !26
130
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
131
+ %102 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
132
+ %103 = inttoptr i64 %102 to ptr addrspace(1), !dbg !26
133
+ %104 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
134
+ %105 = inttoptr i64 %104 to ptr addrspace(1), !dbg !26
135
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %94, i1 true) #2, !dbg !26
136
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %97, i1 true) #2, !dbg !26
137
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %103, i1 true) #2, !dbg !26
138
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %105, i1 true) #2, !dbg !26
139
+ ret void, !dbg !27
140
+ }
141
+
142
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
143
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
144
+
145
+ ; Function Attrs: convergent nocallback nounwind
146
+ declare void @llvm.nvvm.barrier0() #1
147
+
148
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
149
+ attributes #1 = { convergent nocallback nounwind }
150
+ attributes #2 = { nounwind }
151
+
152
+ !llvm.module.flags = !{!0, !1}
153
+ !llvm.dbg.cu = !{!2}
154
+ !nvvm.annotations = !{!4, !5, !5, !4}
155
+ !llvm.ident = !{!6}
156
+
157
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
158
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
159
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
160
+ !3 = !DIFile(filename: "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py", directory: "/tmp/torchinductor_root/hl")
161
+ !4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
162
+ !5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
163
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
164
+ !7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
165
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
166
+ !9 = !{}
167
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
168
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
169
+ !12 = !DILocation(line: 20, column: 34, scope: !7)
170
+ !13 = !DILocation(line: 20, column: 46, scope: !7)
171
+ !14 = !DILocation(line: 21, column: 23, scope: !7)
172
+ !15 = !DILocation(line: 24, column: 30, scope: !7)
173
+ !16 = !DILocation(line: 24, column: 35, scope: !7)
174
+ !17 = !DILocation(line: 26, column: 19, scope: !7)
175
+ !18 = !DILocation(line: 28, column: 32, scope: !7)
176
+ !19 = !DILocation(line: 29, column: 18, scope: !7)
177
+ !20 = !DILocation(line: 30, column: 18, scope: !7)
178
+ !21 = !DILocation(line: 31, column: 32, scope: !7)
179
+ !22 = !DILocation(line: 32, column: 36, scope: !7)
180
+ !23 = !DILocation(line: 32, column: 51, scope: !7)
181
+ !24 = !DILocation(line: 34, column: 39, scope: !7)
182
+ !25 = !DILocation(line: 34, column: 25, scope: !7)
183
+ !26 = !DILocation(line: 34, column: 51, scope: !7)
184
+ !27 = !DILocation(line: 34, column: 4, scope: !7)
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<50257> : tensor<512xi64, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<512xi64, #blocked>
7
+ %cst_1 = arith.constant dense<-1> : tensor<512xi64, #blocked>
8
+ %cst_2 = arith.constant dense<-1.000000e+00> : tensor<512xf32, #blocked1>
9
+ %c512_i64 = arith.constant 512 : i64
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = arith.extsi %0 : i32 to i64
12
+ %2 = arith.muli %1, %c512_i64 : i64
13
+ %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
14
+ %4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked>
15
+ %5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked>
16
+ %6 = arith.addi %5, %4 : tensor<512xi64, #blocked>
17
+ %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>, #blocked>
18
+ %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>, #blocked>, tensor<512xi64, #blocked>
19
+ %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64, #blocked>
20
+ %10 = arith.cmpi ne, %9, %cst_1 : tensor<512xi64, #blocked>
21
+ %11 = arith.select %10, %9, %cst_0 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
22
+ %12 = arith.addi %11, %cst : tensor<512xi64, #blocked>
23
+ %13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64, #blocked>
24
+ %14 = arith.select %13, %12, %11 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
25
+ %15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64, #blocked>
26
+ %16 = arith.cmpi slt, %14, %cst : tensor<512xi64, #blocked>
27
+ %17 = arith.andi %15, %16 : tensor<512xi1, #blocked>
28
+ %18 = triton_gpu.convert_layout %17 : (tensor<512xi1, #blocked>) -> tensor<512xi1, #blocked1>
29
+ tt.assert %18, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1, #blocked1>
30
+ %19 = arith.muli %6, %cst : tensor<512xi64, #blocked>
31
+ %20 = arith.addi %14, %19 : tensor<512xi64, #blocked>
32
+ %21 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
33
+ %22 = tt.addptr %21, %20 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi64, #blocked>
34
+ %23 = triton_gpu.convert_layout %22 : (tensor<512x!tt.ptr<f32, 1>, #blocked>) -> tensor<512x!tt.ptr<f32, 1>, #blocked1>
35
+ tt.store %23, %cst_2 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked1>
36
+ tt.return
37
+ }
38
+ }
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<50257> : tensor<512xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<512xi64>
5
+ %c512_i64 = arith.constant 512 : i64
6
+ %cst_1 = arith.constant dense<-1.000000e+00> : tensor<512xf32>
7
+ %cst_2 = arith.constant dense<-1> : tensor<512xi64>
8
+ %0 = tt.get_program_id x : i32
9
+ %1 = arith.extsi %0 : i32 to i64
10
+ %2 = arith.muli %1, %c512_i64 : i64
11
+ %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
12
+ %4 = arith.extsi %3 : tensor<512xi32> to tensor<512xi64>
13
+ %5 = tt.splat %2 : (i64) -> tensor<512xi64>
14
+ %6 = arith.addi %5, %4 : tensor<512xi64>
15
+ %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>>
16
+ %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>>, tensor<512xi64>
17
+ %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64>
18
+ %10 = arith.cmpi ne, %9, %cst_2 : tensor<512xi64>
19
+ %11 = arith.select %10, %9, %cst_0 : tensor<512xi1>, tensor<512xi64>
20
+ %12 = arith.addi %11, %cst : tensor<512xi64>
21
+ %13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64>
22
+ %14 = arith.select %13, %12, %11 : tensor<512xi1>, tensor<512xi64>
23
+ %15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64>
24
+ %16 = arith.cmpi slt, %14, %cst : tensor<512xi64>
25
+ %17 = arith.andi %15, %16 : tensor<512xi1>
26
+ tt.assert %17, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1>
27
+ %18 = arith.muli %6, %cst : tensor<512xi64>
28
+ %19 = arith.addi %14, %18 : tensor<512xi64>
29
+ %20 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
30
+ %21 = tt.addptr %20, %19 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi64>
31
+ tt.store %21, %cst_1 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
32
+ tt.return
33
+ }
34
+ }
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %c1024_i32 = arith.constant 1024 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c1024_i32 : i32
7
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
8
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
9
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
10
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
11
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
12
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
13
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
14
+ %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
15
+ %10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
16
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
17
+ tt.return
18
+ }
19
+ }
.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
5
+ %c512_i64 = arith.constant 512 : i64
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.extsi %0 : i32 to i64
8
+ %2 = arith.muli %1, %c512_i64 : i64
9
+ %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
10
+ %4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked>
11
+ %5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked>
12
+ %6 = arith.addi %5, %4 : tensor<512xi64, #blocked>
13
+ %7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
14
+ %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi64, #blocked>
15
+ tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin ADDED
Binary file (10.5 kB). View file
 
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+
11
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
12
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
21
+ )
22
+ .maxntid 256, 1, 1
23
+ {
24
+ .reg .pred %p<16>;
25
+ .reg .b16 %rs<9>;
26
+ .reg .b32 %r<31>;
27
+ .reg .f32 %f<23>;
28
+ .reg .b64 %rd<51>;
29
+ .loc 1 18 0
30
+ $L__func_begin0:
31
+ .loc 1 18 0
32
+
33
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8_param_6];
34
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8_param_5];
35
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8_param_4];
36
+ ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7de8_param_0];
37
+ ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_1];
38
+ $L__tmp0:
39
+ .loc 1 22 44
40
+ mov.u32 %r13, %tid.x;
41
+ ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7de8_param_2];
42
+ bfe.u32 %r14, %r13, 2, 6;
43
+ ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7de8_param_3];
44
+ .loc 1 24 33
45
+ and.b32 %r1, %r13, 3;
46
+ .loc 1 21 28
47
+ mov.u32 %r6, %ctaid.x;
48
+ .loc 1 21 34
49
+ cvt.s64.s32 %rd1, %r6;
50
+ .loc 1 21 46
51
+ mul.wide.s32 %rd27, %r6, 64;
52
+ cvt.u64.u32 %rd2, %r14;
53
+ .loc 1 22 23
54
+ or.b64 %rd28, %rd27, %rd2;
55
+ .loc 1 26 30
56
+ shl.b64 %rd29, %rd28, 3;
57
+ add.s64 %rd22, %rd26, %rd29;
58
+ mov.pred %p1, -1;
59
+ .loc 1 26 35
60
+ mov.u64 %rd21, 0x0;
61
+ @%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd22 + 0 ];
62
+ .loc 1 27 19
63
+ mov.u32 %r10, 0x0;
64
+ @%p1 ld.global.b32 { %r10 }, [ %rd23 + 0 ];
65
+ .loc 1 29 19
66
+ mov.u32 %r11, 0x0;
67
+ @%p1 ld.global.b32 { %r11 }, [ %rd24 + 0 ];
68
+ .loc 1 38 23
69
+ setp.eq.s64 %p4, %rd21, -1;
70
+ .loc 1 39 22
71
+ div.full.f32 %r9, %r10, %r11;
72
+ mov.b32 %f6, %r9;
73
+ .loc 1 41 37
74
+ selp.f32 %f1, 0f00000000, %f6, %p4;
75
+ .loc 1 32 36
76
+ mul.wide.s32 %rd30, %r6, 12865792;
77
+ mul.wide.u32 %rd31, %r14, 201028;
78
+ add.s64 %rd32, %rd30, %rd31;
79
+ cvt.u64.u32 %rd33, %r13;
80
+ and.b64 %rd3, %rd33, 3;
81
+ mul.wide.u32 %rd34, %r1, 4;
82
+ add.s64 %rd35, %rd32, %rd34;
83
+ add.s64 %rd50, %rd25, %rd35;
84
+ mov.f32 %f22, 0f00000000;
85
+ mov.b32 %r29, -4;
86
+ mov.u64 %rd46, %rd50;
87
+ $L__BB0_1:
88
+ add.s32 %r29, %r29, 4;
89
+ .loc 1 33 27
90
+ add.s32 %r17, %r29, %r1;
91
+ .loc 1 34 25
92
+ setp.lt.u32 %p5, %r17, 50257;
93
+ mov.b32 %r16, 0;
94
+ .loc 1 36 52
95
+ mov.u32 %r15, 0x0;
96
+ @%p5 ld.global.L1::evict_last.b32 { %r15 }, [ %rd46 + 0 ];
97
+ @!%p5 mov.u32 %r15, %r16;
98
+ mov.b32 %f7, %r15;
99
+ .loc 1 42 23
100
+ mul.f32 %f8, %f1, %f7;
101
+ .loc 1 45 40
102
+ selp.f32 %f9, %f8, 0f80000000, %p5;
103
+ add.f32 %f22, %f22, %f9;
104
+ .loc 1 32 36
105
+ add.s64 %rd46, %rd46, 16;
106
+ setp.lt.u32 %p7, %r29, 50253;
107
+ @%p7 bra $L__BB0_1;
108
+ $L__tmp1:
109
+ .loc 2 243 36
110
+ mov.b32 %r19, %f22;
111
+ shfl.sync.bfly.b32 %r20, %r19, 2, 31, -1;
112
+ mov.b32 %f10, %r20;
113
+ $L__tmp2:
114
+ .loc 2 233 15
115
+ add.f32 %f11, %f22, %f10;
116
+ $L__tmp3:
117
+ .loc 2 243 36
118
+ mov.b32 %r21, %f11;
119
+ shfl.sync.bfly.b32 %r22, %r21, 1, 31, -1;
120
+ mov.b32 %f12, %r22;
121
+ $L__tmp4:
122
+ .loc 2 233 15
123
+ add.f32 %f4, %f11, %f12;
124
+ $L__tmp5:
125
+ .loc 1 51 36
126
+ mul.lo.s64 %rd37, %rd1, 3216448;
127
+ mul.lo.s64 %rd38, %rd2, 50257;
128
+ add.s64 %rd39, %rd37, %rd38;
129
+ add.s64 %rd40, %rd39, %rd3;
130
+ shl.b64 %rd41, %rd40, 1;
131
+ add.s64 %rd49, %rd20, %rd41;
132
+ add.s64 %rd48, %rd19, %rd41;
133
+ add.s64 %rd47, %rd18, %rd41;
134
+ mov.b32 %r30, -4;
135
+ mov.u16 %rs2, 0;
136
+ $L__BB0_3:
137
+ add.s32 %r30, %r30, 4;
138
+ .loc 1 52 27
139
+ add.s32 %r28, %r30, %r1;
140
+ .loc 1 53 25
141
+ setp.lt.u32 %p8, %r28, 50257;
142
+ .loc 1 55 53
143
+ mov.u16 %rs1, 0x0;
144
+ @%p8 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
145
+ @!%p8 mov.u16 %rs1, %rs2;
146
+ .loc 1 55 105
147
+ cvt.f32.bf16 %r23, %rs1;
148
+ mov.b32 %f15, %r23;
149
+ .loc 1 56 53
150
+ mov.u32 %r24, 0x0;
151
+ @%p8 ld.global.L1::evict_first.b32 { %r24 }, [ %rd50 + 0 ];
152
+ @!%p8 mov.u32 %r24, %r16;
153
+ mov.b32 %f16, %r24;
154
+ .loc 1 57 53
155
+ mov.u16 %rs4, 0x0;
156
+ @%p8 ld.global.L1::evict_first.b16 { %rs4 }, [ %rd48 + 0 ];
157
+ @!%p8 mov.u16 %rs4, %rs2;
158
+ .loc 1 57 105
159
+ cvt.f32.bf16 %r26, %rs4;
160
+ mov.b32 %f17, %r26;
161
+ .loc 1 65 23
162
+ mul.f32 %f14, %f17, 0f3FB8AA3B;
163
+ ex2.approx.f32 %f13, %f14;
164
+ .loc 1 66 24
165
+ mul.f32 %f18, %f4, %f13;
166
+ .loc 1 67 24
167
+ neg.f32 %f19, %f18;
168
+ fma.rn.f32 %f20, %f1, %f16, %f19;
169
+ .loc 1 69 24
170
+ add.f32 %f21, %f15, %f20;
171
+ .loc 1 70 54
172
+ mov.b32 %r27, %f21;
173
+ cvt.rn.bf16.f32 %rs7, %r27;
174
+ @%p8 st.global.b16 [ %rd49 + 0 ], { %rs7 };
175
+ .loc 1 51 36
176
+ add.s64 %rd50, %rd50, 16;
177
+ add.s64 %rd49, %rd49, 8;
178
+ add.s64 %rd48, %rd48, 8;
179
+ add.s64 %rd47, %rd47, 8;
180
+ setp.lt.u32 %p15, %r30, 50253;
181
+ @%p15 bra $L__BB0_3;
182
+ .loc 1 51 4
183
+ ret;
184
+ $L__tmp6:
185
+ $L__func_end0:
186
+
187
+ }
188
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
189
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
190
+ .section .debug_abbrev
191
+ {
192
+ .b8 1
193
+ .b8 17
194
+ .b8 1
195
+ .b8 37
196
+ .b8 8
197
+ .b8 19
198
+ .b8 5
199
+ .b8 3
200
+ .b8 8
201
+ .b8 16
202
+ .b8 6
203
+ .b8 27
204
+ .b8 8
205
+ .b8 180
206
+ .b8 66
207
+ .b8 12
208
+ .b8 17
209
+ .b8 1
210
+ .b8 18
211
+ .b8 1
212
+ .b8 0
213
+ .b8 0
214
+ .b8 2
215
+ .b8 46
216
+ .b8 0
217
+ .b8 135
218
+ .b8 64
219
+ .b8 8
220
+ .b8 3
221
+ .b8 8
222
+ .b8 58
223
+ .b8 11
224
+ .b8 59
225
+ .b8 11
226
+ .b8 63
227
+ .b8 12
228
+ .b8 32
229
+ .b8 11
230
+ .b8 0
231
+ .b8 0
232
+ .b8 3
233
+ .b8 46
234
+ .b8 1
235
+ .b8 17
236
+ .b8 1
237
+ .b8 18
238
+ .b8 1
239
+ .b8 64
240
+ .b8 10
241
+ .b8 49
242
+ .b8 19
243
+ .b8 0
244
+ .b8 0
245
+ .b8 4
246
+ .b8 29
247
+ .b8 0
248
+ .b8 49
249
+ .b8 19
250
+ .b8 17
251
+ .b8 1
252
+ .b8 18
253
+ .b8 1
254
+ .b8 88
255
+ .b8 11
256
+ .b8 89
257
+ .b8 11
258
+ .b8 87
259
+ .b8 11
260
+ .b8 0
261
+ .b8 0
262
+ .b8 5
263
+ .b8 29
264
+ .b8 1
265
+ .b8 49
266
+ .b8 19
267
+ .b8 17
268
+ .b8 1
269
+ .b8 18
270
+ .b8 1
271
+ .b8 88
272
+ .b8 11
273
+ .b8 89
274
+ .b8 11
275
+ .b8 87
276
+ .b8 11
277
+ .b8 0
278
+ .b8 0
279
+ .b8 0
280
+ }
281
+ .section .debug_info
282
+ {
283
+ .b32 278
284
+ .b8 2
285
+ .b8 0
286
+ .b32 .debug_abbrev
287
+ .b8 8
288
+ .b8 1
289
+ .b8 116
290
+ .b8 114
291
+ .b8 105
292
+ .b8 116
293
+ .b8 111
294
+ .b8 110
295
+ .b8 0
296
+ .b8 2
297
+ .b8 0
298
+ .b8 99
299
+ .b8 107
300
+ .b8 122
301
+ .b8 103
302
+ .b8 108
303
+ .b8 55
304
+ .b8 116
305
+ .b8 104
306
+ .b8 98
307
+ .b8 52
308
+ .b8 120
309
+ .b8 100
310
+ .b8 102
311
+ .b8 107
312
+ .b8 102
313
+ .b8 110
314
+ .b8 100
315
+ .b8 50
316
+ .b8 116
317
+ .b8 105
318
+ .b8 100
319
+ .b8 107
320
+ .b8 115
321
+ .b8 54
322
+ .b8 109
323
+ .b8 116
324
+ .b8 53
325
+ .b8 102
326
+ .b8 51
327
+ .b8 104
328
+ .b8 97
329
+ .b8 117
330
+ .b8 119
331
+ .b8 102
332
+ .b8 121
333
+ .b8 106
334
+ .b8 102
335
+ .b8 108
336
+ .b8 98
337
+ .b8 116
338
+ .b8 122
339
+ .b8 121
340
+ .b8 101
341
+ .b8 112
342
+ .b8 111
343
+ .b8 53
344
+ .b8 111
345
+ .b8 120
346
+ .b8 107
347
+ .b8 118
348
+ .b8 104
349
+ .b8 107
350
+ .b8 46
351
+ .b8 112
352
+ .b8 121
353
+ .b8 0
354
+ .b32 .debug_line
355
+ .b8 47
356
+ .b8 116
357
+ .b8 109
358
+ .b8 112
359
+ .b8 47
360
+ .b8 116
361
+ .b8 111
362
+ .b8 114
363
+ .b8 99
364
+ .b8 104
365
+ .b8 105
366
+ .b8 110
367
+ .b8 100
368
+ .b8 117
369
+ .b8 99
370
+ .b8 116
371
+ .b8 111
372
+ .b8 114
373
+ .b8 95
374
+ .b8 114
375
+ .b8 111
376
+ .b8 111
377
+ .b8 116
378
+ .b8 47
379
+ .b8 107
380
+ .b8 122
381
+ .b8 0
382
+ .b8 1
383
+ .b64 $L__func_begin0
384
+ .b64 $L__func_end0
385
+ .b8 2
386
+ .b8 116
387
+ .b8 114
388
+ .b8 105
389
+ .b8 116
390
+ .b8 111
391
+ .b8 110
392
+ .b8 95
393
+ .b8 95
394
+ .b8 48
395
+ .b8 100
396
+ .b8 49
397
+ .b8 100
398
+ .b8 50
399
+ .b8 100
400
+ .b8 51
401
+ .b8 100
402
+ .b8 52
403
+ .b8 100
404
+ .b8 53
405
+ .b8 100
406
+ .b8 54
407
+ .b8 100
408
+ .b8 55
409
+ .b8 100
410
+ .b8 101
411
+ .b8 56
412
+ .b8 0
413
+ .b8 116
414
+ .b8 114
415
+ .b8 105
416
+ .b8 116
417
+ .b8 111
418
+ .b8 110
419
+ .b8 95
420
+ .b8 95
421
+ .b8 48
422
+ .b8 100
423
+ .b8 49
424
+ .b8 100
425
+ .b8 50
426
+ .b8 100
427
+ .b8 51
428
+ .b8 100
429
+ .b8 52
430
+ .b8 100
431
+ .b8 53
432
+ .b8 100
433
+ .b8 54
434
+ .b8 100
435
+ .b8 55
436
+ .b8 100
437
+ .b8 101
438
+ .b8 56
439
+ .b8 0
440
+ .b8 1
441
+ .b8 18
442
+ .b8 1
443
+ .b8 1
444
+ .b8 3
445
+ .b64 $L__func_begin0
446
+ .b64 $L__func_end0
447
+ .b8 1
448
+ .b8 156
449
+ .b32 125
450
+ .b8 4
451
+ .b32 125
452
+ .b64 $L__tmp1
453
+ .b64 $L__tmp4
454
+ .b8 2
455
+ .b8 46
456
+ .b8 27
457
+ .b8 5
458
+ .b32 125
459
+ .b64 $L__tmp2
460
+ .b64 $L__tmp5
461
+ .b8 2
462
+ .b8 46
463
+ .b8 27
464
+ .b8 4
465
+ .b32 125
466
+ .b64 $L__tmp2
467
+ .b64 $L__tmp5
468
+ .b8 2
469
+ .b8 243
470
+ .b8 36
471
+ .b8 0
472
+ .b8 0
473
+ .b8 0
474
+ }
475
+ .section .debug_pubnames
476
+ {
477
+ .b32 $L__pubNames_end0-$L__pubNames_start0
478
+ $L__pubNames_start0:
479
+ .b8 2
480
+ .b8 0
481
+ .b32 .debug_info
482
+ .b32 282
483
+ .b32 125
484
+ .b8 116
485
+ .b8 114
486
+ .b8 105
487
+ .b8 116
488
+ .b8 111
489
+ .b8 110
490
+ .b8 95
491
+ .b8 95
492
+ .b8 48
493
+ .b8 100
494
+ .b8 49
495
+ .b8 100
496
+ .b8 50
497
+ .b8 100
498
+ .b8 51
499
+ .b8 100
500
+ .b8 52
501
+ .b8 100
502
+ .b8 53
503
+ .b8 100
504
+ .b8 54
505
+ .b8 100
506
+ .b8 55
507
+ .b8 100
508
+ .b8 101
509
+ .b8 56
510
+ .b8 0
511
+ .b32 0
512
+ $L__pubNames_end0:
513
+ }
514
+ .section .debug_pubtypes
515
+ {
516
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
517
+ $L__pubTypes_start0:
518
+ .b8 2
519
+ .b8 0
520
+ .b32 .debug_info
521
+ .b32 282
522
+ .b32 0
523
+ $L__pubTypes_end0:
524
+ }
525
+ .section .debug_loc { }
.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
9
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
10
+ %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
11
+ %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
12
+ %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
13
+ %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
14
+ %8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
15
+ %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
16
+ %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
17
+ %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
18
+ %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
19
+ %13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
20
+ %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
21
+ tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
22
+ tt.return
23
+ }
24
+ }
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.llir ADDED
@@ -0,0 +1,793 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
7
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %11 = lshr i32 %10, 5, !dbg !8
9
+ %urem = and i32 %10, 255, !dbg !9
10
+ %12 = or i32 %urem, 256, !dbg !9
11
+ %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
12
+ %14 = sext i32 %13 to i64, !dbg !11
13
+ %15 = shl nsw i64 %14, 3, !dbg !12
14
+ %16 = or i64 %15, 1, !dbg !13
15
+ %17 = or i64 %15, 2, !dbg !13
16
+ %18 = or i64 %15, 3, !dbg !13
17
+ %19 = or i64 %15, 4, !dbg !13
18
+ %20 = or i64 %15, 5, !dbg !13
19
+ %21 = or i64 %15, 6, !dbg !13
20
+ %22 = or i64 %15, 7, !dbg !13
21
+ %23 = insertelement <2 x i32> poison, i32 %urem, i64 0
22
+ %24 = insertelement <2 x i32> %23, i32 %12, i64 1
23
+ %25 = zext nneg <2 x i32> %24 to <2 x i64>
24
+ %26 = getelementptr i64, ptr addrspace(1) %1, i64 %15, !dbg !14
25
+ %27 = getelementptr i64, ptr addrspace(1) %1, i64 %16, !dbg !14
26
+ %28 = getelementptr i64, ptr addrspace(1) %1, i64 %17, !dbg !14
27
+ %29 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14
28
+ %30 = getelementptr i64, ptr addrspace(1) %1, i64 %19, !dbg !14
29
+ %31 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !14
30
+ %32 = getelementptr i64, ptr addrspace(1) %1, i64 %21, !dbg !14
31
+ %33 = getelementptr i64, ptr addrspace(1) %1, i64 %22, !dbg !14
32
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #3, !dbg !15
33
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #3, !dbg !15
34
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #3, !dbg !15
35
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %29, i1 true) #3, !dbg !15
36
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !15
37
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %31, i1 true) #3, !dbg !15
38
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %32, i1 true) #3, !dbg !15
39
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %33, i1 true) #3, !dbg !15
40
+ %42 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !16
41
+ %43 = bitcast i32 %42 to float, !dbg !16
42
+ %44 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !17
43
+ %45 = bitcast i32 %44 to float, !dbg !17
44
+ %46 = mul nsw i64 %14, 402056, !dbg !18
45
+ %47 = mul nsw i64 %16, 50257, !dbg !18
46
+ %48 = mul nsw i64 %17, 50257, !dbg !18
47
+ %49 = mul nsw i64 %18, 50257, !dbg !18
48
+ %50 = mul nsw i64 %19, 50257, !dbg !18
49
+ %51 = mul nsw i64 %20, 50257, !dbg !18
50
+ %52 = mul nsw i64 %21, 50257, !dbg !18
51
+ %53 = mul nsw i64 %22, 50257, !dbg !18
52
+ %54 = insertelement <8 x i64> poison, i64 %34, i64 0, !dbg !19
53
+ %55 = insertelement <8 x i64> %54, i64 %35, i64 1, !dbg !19
54
+ %56 = insertelement <8 x i64> %55, i64 %36, i64 2, !dbg !19
55
+ %57 = insertelement <8 x i64> %56, i64 %37, i64 3, !dbg !19
56
+ %58 = insertelement <8 x i64> %57, i64 %38, i64 4, !dbg !19
57
+ %59 = insertelement <8 x i64> %58, i64 %39, i64 5, !dbg !19
58
+ %60 = insertelement <8 x i64> %59, i64 %40, i64 6, !dbg !19
59
+ %61 = insertelement <8 x i64> %60, i64 %41, i64 7, !dbg !19
60
+ %62 = icmp eq <8 x i64> %61, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !19
61
+ %63 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %43, float %45) #3, !dbg !20
62
+ %64 = insertelement <8 x float> poison, float %63, i64 0, !dbg !21
63
+ %65 = shufflevector <8 x float> %64, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !21
64
+ %66 = select <8 x i1> %62, <8 x float> zeroinitializer, <8 x float> %65, !dbg !21
65
+ %67 = shufflevector <8 x float> %66, <8 x float> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>, !dbg !21
66
+ br label %68, !dbg !22
67
+
68
+ 68: ; preds = %9, %68
69
+ %69 = phi i32 [ 0, %9 ], [ %135, %68 ]
70
+ %70 = phi <16 x float> [ zeroinitializer, %9 ], [ %134, %68 ]
71
+ %71 = zext nneg i32 %69 to i64, !dbg !23
72
+ %72 = insertelement <2 x i64> poison, i64 %71, i64 0, !dbg !23
73
+ %73 = shufflevector <2 x i64> %72, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !23
74
+ %74 = or <2 x i64> %73, %25, !dbg !23
75
+ %75 = icmp ult <2 x i64> %74, <i64 50257, i64 50257>, !dbg !24
76
+ %76 = shufflevector <2 x i1> %75, <2 x i1> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, !dbg !24
77
+ %77 = extractelement <2 x i64> %74, i64 0, !dbg !25
78
+ %78 = getelementptr float, ptr addrspace(1) %0, i64 %77, !dbg !25
79
+ %79 = getelementptr float, ptr addrspace(1) %78, i64 %46, !dbg !25
80
+ %80 = extractelement <2 x i64> %74, i64 1, !dbg !25
81
+ %81 = getelementptr float, ptr addrspace(1) %0, i64 %80, !dbg !25
82
+ %82 = getelementptr float, ptr addrspace(1) %81, i64 %46, !dbg !25
83
+ %83 = getelementptr float, ptr addrspace(1) %78, i64 %47, !dbg !25
84
+ %84 = getelementptr float, ptr addrspace(1) %81, i64 %47, !dbg !25
85
+ %85 = getelementptr float, ptr addrspace(1) %78, i64 %48, !dbg !25
86
+ %86 = getelementptr float, ptr addrspace(1) %81, i64 %48, !dbg !25
87
+ %87 = getelementptr float, ptr addrspace(1) %78, i64 %49, !dbg !25
88
+ %88 = getelementptr float, ptr addrspace(1) %81, i64 %49, !dbg !25
89
+ %89 = getelementptr float, ptr addrspace(1) %78, i64 %50, !dbg !25
90
+ %90 = getelementptr float, ptr addrspace(1) %81, i64 %50, !dbg !25
91
+ %91 = getelementptr float, ptr addrspace(1) %78, i64 %51, !dbg !25
92
+ %92 = getelementptr float, ptr addrspace(1) %81, i64 %51, !dbg !25
93
+ %93 = getelementptr float, ptr addrspace(1) %78, i64 %52, !dbg !25
94
+ %94 = getelementptr float, ptr addrspace(1) %81, i64 %52, !dbg !25
95
+ %95 = getelementptr float, ptr addrspace(1) %78, i64 %53, !dbg !25
96
+ %96 = getelementptr float, ptr addrspace(1) %81, i64 %53, !dbg !25
97
+ %97 = extractelement <2 x i1> %75, i64 0, !dbg !26
98
+ %98 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %79, i1 %97, i32 0, i1 %97) #3, !dbg !26
99
+ %99 = extractelement <2 x i1> %75, i64 1, !dbg !26
100
+ %100 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 %99, i32 0, i1 %99) #3, !dbg !26
101
+ %101 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %83, i1 %97, i32 0, i1 %97) #3, !dbg !26
102
+ %102 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %84, i1 %99, i32 0, i1 %99) #3, !dbg !26
103
+ %103 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %85, i1 %97, i32 0, i1 %97) #3, !dbg !26
104
+ %104 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %86, i1 %99, i32 0, i1 %99) #3, !dbg !26
105
+ %105 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %87, i1 %97, i32 0, i1 %97) #3, !dbg !26
106
+ %106 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %88, i1 %99, i32 0, i1 %99) #3, !dbg !26
107
+ %107 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %89, i1 %97, i32 0, i1 %97) #3, !dbg !26
108
+ %108 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %90, i1 %99, i32 0, i1 %99) #3, !dbg !26
109
+ %109 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 %97, i32 0, i1 %97) #3, !dbg !26
110
+ %110 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %92, i1 %99, i32 0, i1 %99) #3, !dbg !26
111
+ %111 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %93, i1 %97, i32 0, i1 %97) #3, !dbg !26
112
+ %112 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %94, i1 %99, i32 0, i1 %99) #3, !dbg !26
113
+ %113 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %95, i1 %97, i32 0, i1 %97) #3, !dbg !26
114
+ %114 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %96, i1 %99, i32 0, i1 %99) #3, !dbg !26
115
+ %115 = insertelement <16 x i32> poison, i32 %98, i64 0, !dbg !26
116
+ %116 = insertelement <16 x i32> %115, i32 %100, i64 1, !dbg !26
117
+ %117 = insertelement <16 x i32> %116, i32 %101, i64 2, !dbg !26
118
+ %118 = insertelement <16 x i32> %117, i32 %102, i64 3, !dbg !26
119
+ %119 = insertelement <16 x i32> %118, i32 %103, i64 4, !dbg !26
120
+ %120 = insertelement <16 x i32> %119, i32 %104, i64 5, !dbg !26
121
+ %121 = insertelement <16 x i32> %120, i32 %105, i64 6, !dbg !26
122
+ %122 = insertelement <16 x i32> %121, i32 %106, i64 7, !dbg !26
123
+ %123 = insertelement <16 x i32> %122, i32 %107, i64 8, !dbg !26
124
+ %124 = insertelement <16 x i32> %123, i32 %108, i64 9, !dbg !26
125
+ %125 = insertelement <16 x i32> %124, i32 %109, i64 10, !dbg !26
126
+ %126 = insertelement <16 x i32> %125, i32 %110, i64 11, !dbg !26
127
+ %127 = insertelement <16 x i32> %126, i32 %111, i64 12, !dbg !26
128
+ %128 = insertelement <16 x i32> %127, i32 %112, i64 13, !dbg !26
129
+ %129 = insertelement <16 x i32> %128, i32 %113, i64 14, !dbg !26
130
+ %130 = insertelement <16 x i32> %129, i32 %114, i64 15, !dbg !26
131
+ %131 = bitcast <16 x i32> %130 to <16 x float>, !dbg !26
132
+ %132 = fmul <16 x float> %67, %131, !dbg !27
133
+ %133 = select <16 x i1> %76, <16 x float> %132, <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !28
134
+ %134 = fadd <16 x float> %70, %133, !dbg !28
135
+ %135 = add nuw nsw i32 %69, 512, !dbg !22
136
+ %136 = icmp ult i32 %69, 49745, !dbg !22
137
+ br i1 %136, label %68, label %137, !dbg !22
138
+
139
+ 137: ; preds = %68
140
+ %138 = and i32 %10, 31, !dbg !8
141
+ %139 = and i32 %11, 7, !dbg !9
142
+ %shift = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
143
+ %140 = fadd <16 x float> %134, %shift, !dbg !29
144
+ %141 = extractelement <16 x float> %140, i64 0, !dbg !29
145
+ %shift54 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
146
+ %142 = fadd <16 x float> %134, %shift54, !dbg !29
147
+ %143 = extractelement <16 x float> %142, i64 2, !dbg !29
148
+ %shift55 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
149
+ %144 = fadd <16 x float> %134, %shift55, !dbg !29
150
+ %145 = extractelement <16 x float> %144, i64 4, !dbg !29
151
+ %shift56 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
152
+ %146 = fadd <16 x float> %134, %shift56, !dbg !29
153
+ %147 = extractelement <16 x float> %146, i64 6, !dbg !29
154
+ %shift57 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
155
+ %148 = fadd <16 x float> %134, %shift57, !dbg !29
156
+ %149 = extractelement <16 x float> %148, i64 8, !dbg !29
157
+ %shift58 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
158
+ %150 = fadd <16 x float> %134, %shift58, !dbg !29
159
+ %151 = extractelement <16 x float> %150, i64 10, !dbg !29
160
+ %shift59 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 13, i32 poison, i32 poison, i32 poison>, !dbg !29
161
+ %152 = fadd <16 x float> %134, %shift59, !dbg !29
162
+ %153 = extractelement <16 x float> %152, i64 12, !dbg !29
163
+ %shift60 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 15, i32 poison>, !dbg !29
164
+ %154 = fadd <16 x float> %134, %shift60, !dbg !29
165
+ %155 = extractelement <16 x float> %154, i64 14, !dbg !29
166
+ %156 = bitcast float %141 to i32, !dbg !35
167
+ %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 16, i32 31), !dbg !35
168
+ %158 = bitcast i32 %157 to float, !dbg !35
169
+ %159 = fadd float %141, %158, !dbg !29
170
+ %160 = bitcast float %159 to i32, !dbg !35
171
+ %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 8, i32 31), !dbg !35
172
+ %162 = bitcast i32 %161 to float, !dbg !35
173
+ %163 = fadd float %159, %162, !dbg !29
174
+ %164 = bitcast float %163 to i32, !dbg !35
175
+ %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 4, i32 31), !dbg !35
176
+ %166 = bitcast i32 %165 to float, !dbg !35
177
+ %167 = fadd float %163, %166, !dbg !29
178
+ %168 = bitcast float %167 to i32, !dbg !35
179
+ %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 2, i32 31), !dbg !35
180
+ %170 = bitcast i32 %169 to float, !dbg !35
181
+ %171 = fadd float %167, %170, !dbg !29
182
+ %172 = bitcast float %171 to i32, !dbg !35
183
+ %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 1, i32 31), !dbg !35
184
+ %174 = bitcast i32 %173 to float, !dbg !35
185
+ %175 = fadd float %171, %174, !dbg !29
186
+ %176 = bitcast float %143 to i32, !dbg !35
187
+ %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 16, i32 31), !dbg !35
188
+ %178 = bitcast i32 %177 to float, !dbg !35
189
+ %179 = fadd float %143, %178, !dbg !29
190
+ %180 = bitcast float %179 to i32, !dbg !35
191
+ %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !35
192
+ %182 = bitcast i32 %181 to float, !dbg !35
193
+ %183 = fadd float %179, %182, !dbg !29
194
+ %184 = bitcast float %183 to i32, !dbg !35
195
+ %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 4, i32 31), !dbg !35
196
+ %186 = bitcast i32 %185 to float, !dbg !35
197
+ %187 = fadd float %183, %186, !dbg !29
198
+ %188 = bitcast float %187 to i32, !dbg !35
199
+ %189 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 2, i32 31), !dbg !35
200
+ %190 = bitcast i32 %189 to float, !dbg !35
201
+ %191 = fadd float %187, %190, !dbg !29
202
+ %192 = bitcast float %191 to i32, !dbg !35
203
+ %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !35
204
+ %194 = bitcast i32 %193 to float, !dbg !35
205
+ %195 = fadd float %191, %194, !dbg !29
206
+ %196 = bitcast float %145 to i32, !dbg !35
207
+ %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 16, i32 31), !dbg !35
208
+ %198 = bitcast i32 %197 to float, !dbg !35
209
+ %199 = fadd float %145, %198, !dbg !29
210
+ %200 = bitcast float %199 to i32, !dbg !35
211
+ %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 8, i32 31), !dbg !35
212
+ %202 = bitcast i32 %201 to float, !dbg !35
213
+ %203 = fadd float %199, %202, !dbg !29
214
+ %204 = bitcast float %203 to i32, !dbg !35
215
+ %205 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %204, i32 4, i32 31), !dbg !35
216
+ %206 = bitcast i32 %205 to float, !dbg !35
217
+ %207 = fadd float %203, %206, !dbg !29
218
+ %208 = bitcast float %207 to i32, !dbg !35
219
+ %209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 2, i32 31), !dbg !35
220
+ %210 = bitcast i32 %209 to float, !dbg !35
221
+ %211 = fadd float %207, %210, !dbg !29
222
+ %212 = bitcast float %211 to i32, !dbg !35
223
+ %213 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 1, i32 31), !dbg !35
224
+ %214 = bitcast i32 %213 to float, !dbg !35
225
+ %215 = fadd float %211, %214, !dbg !29
226
+ %216 = bitcast float %147 to i32, !dbg !35
227
+ %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 16, i32 31), !dbg !35
228
+ %218 = bitcast i32 %217 to float, !dbg !35
229
+ %219 = fadd float %147, %218, !dbg !29
230
+ %220 = bitcast float %219 to i32, !dbg !35
231
+ %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 8, i32 31), !dbg !35
232
+ %222 = bitcast i32 %221 to float, !dbg !35
233
+ %223 = fadd float %219, %222, !dbg !29
234
+ %224 = bitcast float %223 to i32, !dbg !35
235
+ %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 4, i32 31), !dbg !35
236
+ %226 = bitcast i32 %225 to float, !dbg !35
237
+ %227 = fadd float %223, %226, !dbg !29
238
+ %228 = bitcast float %227 to i32, !dbg !35
239
+ %229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 2, i32 31), !dbg !35
240
+ %230 = bitcast i32 %229 to float, !dbg !35
241
+ %231 = fadd float %227, %230, !dbg !29
242
+ %232 = bitcast float %231 to i32, !dbg !35
243
+ %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !35
244
+ %234 = bitcast i32 %233 to float, !dbg !35
245
+ %235 = fadd float %231, %234, !dbg !29
246
+ %236 = bitcast float %149 to i32, !dbg !35
247
+ %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 16, i32 31), !dbg !35
248
+ %238 = bitcast i32 %237 to float, !dbg !35
249
+ %239 = fadd float %149, %238, !dbg !29
250
+ %240 = bitcast float %239 to i32, !dbg !35
251
+ %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !35
252
+ %242 = bitcast i32 %241 to float, !dbg !35
253
+ %243 = fadd float %239, %242, !dbg !29
254
+ %244 = bitcast float %243 to i32, !dbg !35
255
+ %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 4, i32 31), !dbg !35
256
+ %246 = bitcast i32 %245 to float, !dbg !35
257
+ %247 = fadd float %243, %246, !dbg !29
258
+ %248 = bitcast float %247 to i32, !dbg !35
259
+ %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 2, i32 31), !dbg !35
260
+ %250 = bitcast i32 %249 to float, !dbg !35
261
+ %251 = fadd float %247, %250, !dbg !29
262
+ %252 = bitcast float %251 to i32, !dbg !35
263
+ %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 1, i32 31), !dbg !35
264
+ %254 = bitcast i32 %253 to float, !dbg !35
265
+ %255 = fadd float %251, %254, !dbg !29
266
+ %256 = bitcast float %151 to i32, !dbg !35
267
+ %257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 16, i32 31), !dbg !35
268
+ %258 = bitcast i32 %257 to float, !dbg !35
269
+ %259 = fadd float %151, %258, !dbg !29
270
+ %260 = bitcast float %259 to i32, !dbg !35
271
+ %261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 8, i32 31), !dbg !35
272
+ %262 = bitcast i32 %261 to float, !dbg !35
273
+ %263 = fadd float %259, %262, !dbg !29
274
+ %264 = bitcast float %263 to i32, !dbg !35
275
+ %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 4, i32 31), !dbg !35
276
+ %266 = bitcast i32 %265 to float, !dbg !35
277
+ %267 = fadd float %263, %266, !dbg !29
278
+ %268 = bitcast float %267 to i32, !dbg !35
279
+ %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 2, i32 31), !dbg !35
280
+ %270 = bitcast i32 %269 to float, !dbg !35
281
+ %271 = fadd float %267, %270, !dbg !29
282
+ %272 = bitcast float %271 to i32, !dbg !35
283
+ %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !35
284
+ %274 = bitcast i32 %273 to float, !dbg !35
285
+ %275 = fadd float %271, %274, !dbg !29
286
+ %276 = bitcast float %153 to i32, !dbg !35
287
+ %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 16, i32 31), !dbg !35
288
+ %278 = bitcast i32 %277 to float, !dbg !35
289
+ %279 = fadd float %153, %278, !dbg !29
290
+ %280 = bitcast float %279 to i32, !dbg !35
291
+ %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 8, i32 31), !dbg !35
292
+ %282 = bitcast i32 %281 to float, !dbg !35
293
+ %283 = fadd float %279, %282, !dbg !29
294
+ %284 = bitcast float %283 to i32, !dbg !35
295
+ %285 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 4, i32 31), !dbg !35
296
+ %286 = bitcast i32 %285 to float, !dbg !35
297
+ %287 = fadd float %283, %286, !dbg !29
298
+ %288 = bitcast float %287 to i32, !dbg !35
299
+ %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 2, i32 31), !dbg !35
300
+ %290 = bitcast i32 %289 to float, !dbg !35
301
+ %291 = fadd float %287, %290, !dbg !29
302
+ %292 = bitcast float %291 to i32, !dbg !35
303
+ %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 1, i32 31), !dbg !35
304
+ %294 = bitcast i32 %293 to float, !dbg !35
305
+ %295 = fadd float %291, %294, !dbg !29
306
+ %296 = bitcast float %155 to i32, !dbg !35
307
+ %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 16, i32 31), !dbg !35
308
+ %298 = bitcast i32 %297 to float, !dbg !35
309
+ %299 = fadd float %155, %298, !dbg !29
310
+ %300 = bitcast float %299 to i32, !dbg !35
311
+ %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %300, i32 8, i32 31), !dbg !35
312
+ %302 = bitcast i32 %301 to float, !dbg !35
313
+ %303 = fadd float %299, %302, !dbg !29
314
+ %304 = bitcast float %303 to i32, !dbg !35
315
+ %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 4, i32 31), !dbg !35
316
+ %306 = bitcast i32 %305 to float, !dbg !35
317
+ %307 = fadd float %303, %306, !dbg !29
318
+ %308 = bitcast float %307 to i32, !dbg !35
319
+ %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !35
320
+ %310 = bitcast i32 %309 to float, !dbg !35
321
+ %311 = fadd float %307, %310, !dbg !29
322
+ %312 = bitcast float %311 to i32, !dbg !35
323
+ %313 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 1, i32 31), !dbg !35
324
+ %314 = bitcast i32 %313 to float, !dbg !35
325
+ %315 = fadd float %311, %314, !dbg !29
326
+ %316 = icmp eq i32 %138, 0, !dbg !35
327
+ %317 = zext nneg i32 %139 to i64, !dbg !35
328
+ %318 = getelementptr float, ptr addrspace(3) @global_smem, i64 %317, !dbg !35
329
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %318, float %175, i1 %316) #3, !dbg !35
330
+ %319 = or i32 %139, 8, !dbg !35
331
+ %320 = zext nneg i32 %319 to i64, !dbg !35
332
+ %321 = getelementptr float, ptr addrspace(3) @global_smem, i64 %320, !dbg !35
333
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %321, float %195, i1 %316) #3, !dbg !35
334
+ %322 = or i32 %139, 16, !dbg !35
335
+ %323 = zext nneg i32 %322 to i64, !dbg !35
336
+ %324 = getelementptr float, ptr addrspace(3) @global_smem, i64 %323, !dbg !35
337
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %324, float %215, i1 %316) #3, !dbg !35
338
+ %325 = or i32 %139, 24, !dbg !35
339
+ %326 = zext nneg i32 %325 to i64, !dbg !35
340
+ %327 = getelementptr float, ptr addrspace(3) @global_smem, i64 %326, !dbg !35
341
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %327, float %235, i1 %316) #3, !dbg !35
342
+ %328 = or i32 %139, 32, !dbg !35
343
+ %329 = zext nneg i32 %328 to i64, !dbg !35
344
+ %330 = getelementptr float, ptr addrspace(3) @global_smem, i64 %329, !dbg !35
345
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %330, float %255, i1 %316) #3, !dbg !35
346
+ %331 = or i32 %139, 40, !dbg !35
347
+ %332 = zext nneg i32 %331 to i64, !dbg !35
348
+ %333 = getelementptr float, ptr addrspace(3) @global_smem, i64 %332, !dbg !35
349
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %333, float %275, i1 %316) #3, !dbg !35
350
+ %334 = or i32 %139, 48, !dbg !35
351
+ %335 = zext nneg i32 %334 to i64, !dbg !35
352
+ %336 = getelementptr float, ptr addrspace(3) @global_smem, i64 %335, !dbg !35
353
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %336, float %295, i1 %316) #3, !dbg !35
354
+ %337 = or i32 %139, 56, !dbg !35
355
+ %338 = zext nneg i32 %337 to i64, !dbg !35
356
+ %339 = getelementptr float, ptr addrspace(3) @global_smem, i64 %338, !dbg !35
357
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %339, float %315, i1 %316) #3, !dbg !35
358
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
359
+ %340 = icmp slt i32 %10, 64, !dbg !35
360
+ %341 = sext i32 %10 to i64, !dbg !35
361
+ %342 = getelementptr float, ptr addrspace(3) @global_smem, i64 %341, !dbg !35
362
+ %343 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %342, i1 %340) #3, !dbg !35
363
+ %344 = bitcast float %343 to i32, !dbg !35
364
+ %345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %344, i32 4, i32 31), !dbg !35
365
+ %346 = bitcast i32 %345 to float, !dbg !35
366
+ %347 = fadd float %343, %346, !dbg !29
367
+ %348 = bitcast float %347 to i32, !dbg !35
368
+ %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !35
369
+ %350 = bitcast i32 %349 to float, !dbg !35
370
+ %351 = fadd float %347, %350, !dbg !29
371
+ %352 = bitcast float %351 to i32, !dbg !35
372
+ %353 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %352, i32 1, i32 31), !dbg !35
373
+ %354 = bitcast i32 %353 to float, !dbg !35
374
+ %355 = fadd float %351, %354, !dbg !29
375
+ %356 = and i32 %10, 7, !dbg !35
376
+ %357 = icmp eq i32 %356, 0, !dbg !35
377
+ %358 = and i1 %340, %357, !dbg !35
378
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %342, float %355, i1 %358) #3, !dbg !35
379
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
380
+ %359 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !35
381
+ %360 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !35
382
+ %361 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 64), align 4, !dbg !35
383
+ %362 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 96), align 4, !dbg !35
384
+ %363 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 128), align 4, !dbg !35
385
+ %364 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 160), align 4, !dbg !35
386
+ %365 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 192), align 4, !dbg !35
387
+ %366 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 224), align 4, !dbg !35
388
+ %367 = extractelement <2 x i64> %25, i64 0, !dbg !37
389
+ %368 = extractelement <2 x i64> %25, i64 1, !dbg !37
390
+ %369 = extractelement <8 x float> %66, i64 0, !dbg !38
391
+ %370 = extractelement <8 x float> %66, i64 1, !dbg !38
392
+ %371 = extractelement <8 x float> %66, i64 2, !dbg !38
393
+ %372 = extractelement <8 x float> %66, i64 3, !dbg !38
394
+ %373 = extractelement <8 x float> %66, i64 4, !dbg !38
395
+ %374 = extractelement <8 x float> %66, i64 5, !dbg !38
396
+ %375 = extractelement <8 x float> %66, i64 6, !dbg !38
397
+ %376 = extractelement <8 x float> %66, i64 7, !dbg !38
398
+ br label %377, !dbg !39
399
+
400
+ 377: ; preds = %137, %377
401
+ %378 = phi i32 [ 0, %137 ], [ %672, %377 ]
402
+ %379 = zext nneg i32 %378 to i64, !dbg !37
403
+ %380 = or i64 %367, %379, !dbg !37
404
+ %381 = or i64 %368, %379, !dbg !37
405
+ %382 = icmp ult i64 %380, 50257, !dbg !40
406
+ %383 = icmp ult i64 %381, 50257, !dbg !40
407
+ %384 = add nsw i64 %380, %46, !dbg !41
408
+ %385 = add nsw i64 %381, %46, !dbg !41
409
+ %386 = add nsw i64 %380, %47, !dbg !41
410
+ %387 = add nsw i64 %381, %47, !dbg !41
411
+ %388 = add nsw i64 %380, %48, !dbg !41
412
+ %389 = add nsw i64 %381, %48, !dbg !41
413
+ %390 = add nsw i64 %380, %49, !dbg !41
414
+ %391 = add nsw i64 %381, %49, !dbg !41
415
+ %392 = add nsw i64 %380, %50, !dbg !41
416
+ %393 = add nsw i64 %381, %50, !dbg !41
417
+ %394 = add nsw i64 %380, %51, !dbg !41
418
+ %395 = add nsw i64 %381, %51, !dbg !41
419
+ %396 = add nsw i64 %380, %52, !dbg !41
420
+ %397 = add nsw i64 %381, %52, !dbg !41
421
+ %398 = add nsw i64 %380, %53, !dbg !41
422
+ %399 = add nsw i64 %381, %53, !dbg !41
423
+ %400 = getelementptr i16, ptr addrspace(1) %4, i64 %384, !dbg !42
424
+ %401 = getelementptr i16, ptr addrspace(1) %4, i64 %385, !dbg !42
425
+ %402 = getelementptr i16, ptr addrspace(1) %4, i64 %386, !dbg !42
426
+ %403 = getelementptr i16, ptr addrspace(1) %4, i64 %387, !dbg !42
427
+ %404 = getelementptr i16, ptr addrspace(1) %4, i64 %388, !dbg !42
428
+ %405 = getelementptr i16, ptr addrspace(1) %4, i64 %389, !dbg !42
429
+ %406 = getelementptr i16, ptr addrspace(1) %4, i64 %390, !dbg !42
430
+ %407 = getelementptr i16, ptr addrspace(1) %4, i64 %391, !dbg !42
431
+ %408 = getelementptr i16, ptr addrspace(1) %4, i64 %392, !dbg !42
432
+ %409 = getelementptr i16, ptr addrspace(1) %4, i64 %393, !dbg !42
433
+ %410 = getelementptr i16, ptr addrspace(1) %4, i64 %394, !dbg !42
434
+ %411 = getelementptr i16, ptr addrspace(1) %4, i64 %395, !dbg !42
435
+ %412 = getelementptr i16, ptr addrspace(1) %4, i64 %396, !dbg !42
436
+ %413 = getelementptr i16, ptr addrspace(1) %4, i64 %397, !dbg !42
437
+ %414 = getelementptr i16, ptr addrspace(1) %4, i64 %398, !dbg !42
438
+ %415 = getelementptr i16, ptr addrspace(1) %4, i64 %399, !dbg !42
439
+ %416 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %400, i1 %382, i16 0, i1 %382) #3, !dbg !43
440
+ %417 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %401, i1 %383, i16 0, i1 %383) #3, !dbg !43
441
+ %418 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %402, i1 %382, i16 0, i1 %382) #3, !dbg !43
442
+ %419 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %403, i1 %383, i16 0, i1 %383) #3, !dbg !43
443
+ %420 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %404, i1 %382, i16 0, i1 %382) #3, !dbg !43
444
+ %421 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %405, i1 %383, i16 0, i1 %383) #3, !dbg !43
445
+ %422 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %406, i1 %382, i16 0, i1 %382) #3, !dbg !43
446
+ %423 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %407, i1 %383, i16 0, i1 %383) #3, !dbg !43
447
+ %424 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %408, i1 %382, i16 0, i1 %382) #3, !dbg !43
448
+ %425 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %409, i1 %383, i16 0, i1 %383) #3, !dbg !43
449
+ %426 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %410, i1 %382, i16 0, i1 %382) #3, !dbg !43
450
+ %427 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %411, i1 %383, i16 0, i1 %383) #3, !dbg !43
451
+ %428 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %412, i1 %382, i16 0, i1 %382) #3, !dbg !43
452
+ %429 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %413, i1 %383, i16 0, i1 %383) #3, !dbg !43
453
+ %430 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %414, i1 %382, i16 0, i1 %382) #3, !dbg !43
454
+ %431 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %415, i1 %383, i16 0, i1 %383) #3, !dbg !43
455
+ %432 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %416) #3, !dbg !44
456
+ %433 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %417) #3, !dbg !44
457
+ %434 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %418) #3, !dbg !44
458
+ %435 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %419) #3, !dbg !44
459
+ %436 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %420) #3, !dbg !44
460
+ %437 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %421) #3, !dbg !44
461
+ %438 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %422) #3, !dbg !44
462
+ %439 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %423) #3, !dbg !44
463
+ %440 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %424) #3, !dbg !44
464
+ %441 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %425) #3, !dbg !44
465
+ %442 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %426) #3, !dbg !44
466
+ %443 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %427) #3, !dbg !44
467
+ %444 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %428) #3, !dbg !44
468
+ %445 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %429) #3, !dbg !44
469
+ %446 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %430) #3, !dbg !44
470
+ %447 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %431) #3, !dbg !44
471
+ %448 = getelementptr float, ptr addrspace(1) %0, i64 %384, !dbg !45
472
+ %449 = getelementptr float, ptr addrspace(1) %0, i64 %385, !dbg !45
473
+ %450 = getelementptr float, ptr addrspace(1) %0, i64 %386, !dbg !45
474
+ %451 = getelementptr float, ptr addrspace(1) %0, i64 %387, !dbg !45
475
+ %452 = getelementptr float, ptr addrspace(1) %0, i64 %388, !dbg !45
476
+ %453 = getelementptr float, ptr addrspace(1) %0, i64 %389, !dbg !45
477
+ %454 = getelementptr float, ptr addrspace(1) %0, i64 %390, !dbg !45
478
+ %455 = getelementptr float, ptr addrspace(1) %0, i64 %391, !dbg !45
479
+ %456 = getelementptr float, ptr addrspace(1) %0, i64 %392, !dbg !45
480
+ %457 = getelementptr float, ptr addrspace(1) %0, i64 %393, !dbg !45
481
+ %458 = getelementptr float, ptr addrspace(1) %0, i64 %394, !dbg !45
482
+ %459 = getelementptr float, ptr addrspace(1) %0, i64 %395, !dbg !45
483
+ %460 = getelementptr float, ptr addrspace(1) %0, i64 %396, !dbg !45
484
+ %461 = getelementptr float, ptr addrspace(1) %0, i64 %397, !dbg !45
485
+ %462 = getelementptr float, ptr addrspace(1) %0, i64 %398, !dbg !45
486
+ %463 = getelementptr float, ptr addrspace(1) %0, i64 %399, !dbg !45
487
+ %464 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %448, i1 %382, i32 0, i1 %382) #3, !dbg !46
488
+ %465 = bitcast i32 %464 to float, !dbg !46
489
+ %466 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %449, i1 %383, i32 0, i1 %383) #3, !dbg !46
490
+ %467 = bitcast i32 %466 to float, !dbg !46
491
+ %468 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %450, i1 %382, i32 0, i1 %382) #3, !dbg !46
492
+ %469 = bitcast i32 %468 to float, !dbg !46
493
+ %470 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %451, i1 %383, i32 0, i1 %383) #3, !dbg !46
494
+ %471 = bitcast i32 %470 to float, !dbg !46
495
+ %472 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %452, i1 %382, i32 0, i1 %382) #3, !dbg !46
496
+ %473 = bitcast i32 %472 to float, !dbg !46
497
+ %474 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %453, i1 %383, i32 0, i1 %383) #3, !dbg !46
498
+ %475 = bitcast i32 %474 to float, !dbg !46
499
+ %476 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %454, i1 %382, i32 0, i1 %382) #3, !dbg !46
500
+ %477 = bitcast i32 %476 to float, !dbg !46
501
+ %478 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %455, i1 %383, i32 0, i1 %383) #3, !dbg !46
502
+ %479 = bitcast i32 %478 to float, !dbg !46
503
+ %480 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %456, i1 %382, i32 0, i1 %382) #3, !dbg !46
504
+ %481 = bitcast i32 %480 to float, !dbg !46
505
+ %482 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %457, i1 %383, i32 0, i1 %383) #3, !dbg !46
506
+ %483 = bitcast i32 %482 to float, !dbg !46
507
+ %484 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %458, i1 %382, i32 0, i1 %382) #3, !dbg !46
508
+ %485 = bitcast i32 %484 to float, !dbg !46
509
+ %486 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %459, i1 %383, i32 0, i1 %383) #3, !dbg !46
510
+ %487 = bitcast i32 %486 to float, !dbg !46
511
+ %488 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %460, i1 %382, i32 0, i1 %382) #3, !dbg !46
512
+ %489 = bitcast i32 %488 to float, !dbg !46
513
+ %490 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %461, i1 %383, i32 0, i1 %383) #3, !dbg !46
514
+ %491 = bitcast i32 %490 to float, !dbg !46
515
+ %492 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %462, i1 %382, i32 0, i1 %382) #3, !dbg !46
516
+ %493 = bitcast i32 %492 to float, !dbg !46
517
+ %494 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %463, i1 %383, i32 0, i1 %383) #3, !dbg !46
518
+ %495 = bitcast i32 %494 to float, !dbg !46
519
+ %496 = getelementptr i16, ptr addrspace(1) %5, i64 %384, !dbg !47
520
+ %497 = getelementptr i16, ptr addrspace(1) %5, i64 %385, !dbg !47
521
+ %498 = getelementptr i16, ptr addrspace(1) %5, i64 %386, !dbg !47
522
+ %499 = getelementptr i16, ptr addrspace(1) %5, i64 %387, !dbg !47
523
+ %500 = getelementptr i16, ptr addrspace(1) %5, i64 %388, !dbg !47
524
+ %501 = getelementptr i16, ptr addrspace(1) %5, i64 %389, !dbg !47
525
+ %502 = getelementptr i16, ptr addrspace(1) %5, i64 %390, !dbg !47
526
+ %503 = getelementptr i16, ptr addrspace(1) %5, i64 %391, !dbg !47
527
+ %504 = getelementptr i16, ptr addrspace(1) %5, i64 %392, !dbg !47
528
+ %505 = getelementptr i16, ptr addrspace(1) %5, i64 %393, !dbg !47
529
+ %506 = getelementptr i16, ptr addrspace(1) %5, i64 %394, !dbg !47
530
+ %507 = getelementptr i16, ptr addrspace(1) %5, i64 %395, !dbg !47
531
+ %508 = getelementptr i16, ptr addrspace(1) %5, i64 %396, !dbg !47
532
+ %509 = getelementptr i16, ptr addrspace(1) %5, i64 %397, !dbg !47
533
+ %510 = getelementptr i16, ptr addrspace(1) %5, i64 %398, !dbg !47
534
+ %511 = getelementptr i16, ptr addrspace(1) %5, i64 %399, !dbg !47
535
+ %512 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %496, i1 %382, i16 0, i1 %382) #3, !dbg !48
536
+ %513 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %497, i1 %383, i16 0, i1 %383) #3, !dbg !48
537
+ %514 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %498, i1 %382, i16 0, i1 %382) #3, !dbg !48
538
+ %515 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %499, i1 %383, i16 0, i1 %383) #3, !dbg !48
539
+ %516 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %500, i1 %382, i16 0, i1 %382) #3, !dbg !48
540
+ %517 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %501, i1 %383, i16 0, i1 %383) #3, !dbg !48
541
+ %518 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %502, i1 %382, i16 0, i1 %382) #3, !dbg !48
542
+ %519 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %503, i1 %383, i16 0, i1 %383) #3, !dbg !48
543
+ %520 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %504, i1 %382, i16 0, i1 %382) #3, !dbg !48
544
+ %521 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %505, i1 %383, i16 0, i1 %383) #3, !dbg !48
545
+ %522 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %506, i1 %382, i16 0, i1 %382) #3, !dbg !48
546
+ %523 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %507, i1 %383, i16 0, i1 %383) #3, !dbg !48
547
+ %524 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %508, i1 %382, i16 0, i1 %382) #3, !dbg !48
548
+ %525 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %509, i1 %383, i16 0, i1 %383) #3, !dbg !48
549
+ %526 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %510, i1 %382, i16 0, i1 %382) #3, !dbg !48
550
+ %527 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %511, i1 %383, i16 0, i1 %383) #3, !dbg !48
551
+ %528 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %512) #3, !dbg !49
552
+ %529 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %513) #3, !dbg !49
553
+ %530 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %514) #3, !dbg !49
554
+ %531 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %515) #3, !dbg !49
555
+ %532 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %516) #3, !dbg !49
556
+ %533 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %517) #3, !dbg !49
557
+ %534 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %518) #3, !dbg !49
558
+ %535 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %519) #3, !dbg !49
559
+ %536 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %520) #3, !dbg !49
560
+ %537 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %521) #3, !dbg !49
561
+ %538 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %522) #3, !dbg !49
562
+ %539 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %523) #3, !dbg !49
563
+ %540 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %524) #3, !dbg !49
564
+ %541 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %525) #3, !dbg !49
565
+ %542 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %526) #3, !dbg !49
566
+ %543 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %527) #3, !dbg !49
567
+ %544 = fmul float %369, %465, !dbg !38
568
+ %545 = fmul float %369, %467, !dbg !38
569
+ %546 = fmul float %370, %469, !dbg !38
570
+ %547 = fmul float %370, %471, !dbg !38
571
+ %548 = fmul float %371, %473, !dbg !38
572
+ %549 = fmul float %371, %475, !dbg !38
573
+ %550 = fmul float %372, %477, !dbg !38
574
+ %551 = fmul float %372, %479, !dbg !38
575
+ %552 = fmul float %373, %481, !dbg !38
576
+ %553 = fmul float %373, %483, !dbg !38
577
+ %554 = fmul float %374, %485, !dbg !38
578
+ %555 = fmul float %374, %487, !dbg !38
579
+ %556 = fmul float %375, %489, !dbg !38
580
+ %557 = fmul float %375, %491, !dbg !38
581
+ %558 = fmul float %376, %493, !dbg !38
582
+ %559 = fmul float %376, %495, !dbg !38
583
+ %560 = fmul float %528, 0x3FF7154760000000, !dbg !50
584
+ %561 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %560) #3, !dbg !50
585
+ %562 = fmul float %529, 0x3FF7154760000000, !dbg !50
586
+ %563 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %562) #3, !dbg !50
587
+ %564 = fmul float %530, 0x3FF7154760000000, !dbg !50
588
+ %565 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %564) #3, !dbg !50
589
+ %566 = fmul float %531, 0x3FF7154760000000, !dbg !50
590
+ %567 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %566) #3, !dbg !50
591
+ %568 = fmul float %532, 0x3FF7154760000000, !dbg !50
592
+ %569 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %568) #3, !dbg !50
593
+ %570 = fmul float %533, 0x3FF7154760000000, !dbg !50
594
+ %571 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %570) #3, !dbg !50
595
+ %572 = fmul float %534, 0x3FF7154760000000, !dbg !50
596
+ %573 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %572) #3, !dbg !50
597
+ %574 = fmul float %535, 0x3FF7154760000000, !dbg !50
598
+ %575 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %574) #3, !dbg !50
599
+ %576 = fmul float %536, 0x3FF7154760000000, !dbg !50
600
+ %577 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %576) #3, !dbg !50
601
+ %578 = fmul float %537, 0x3FF7154760000000, !dbg !50
602
+ %579 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %578) #3, !dbg !50
603
+ %580 = fmul float %538, 0x3FF7154760000000, !dbg !50
604
+ %581 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %580) #3, !dbg !50
605
+ %582 = fmul float %539, 0x3FF7154760000000, !dbg !50
606
+ %583 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %582) #3, !dbg !50
607
+ %584 = fmul float %540, 0x3FF7154760000000, !dbg !50
608
+ %585 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %584) #3, !dbg !50
609
+ %586 = fmul float %541, 0x3FF7154760000000, !dbg !50
610
+ %587 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %586) #3, !dbg !50
611
+ %588 = fmul float %542, 0x3FF7154760000000, !dbg !50
612
+ %589 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %588) #3, !dbg !50
613
+ %590 = fmul float %543, 0x3FF7154760000000, !dbg !50
614
+ %591 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %590) #3, !dbg !50
615
+ %592 = fmul float %359, %561, !dbg !51
616
+ %593 = fmul float %359, %563, !dbg !51
617
+ %594 = fmul float %360, %565, !dbg !51
618
+ %595 = fmul float %360, %567, !dbg !51
619
+ %596 = fmul float %361, %569, !dbg !51
620
+ %597 = fmul float %361, %571, !dbg !51
621
+ %598 = fmul float %362, %573, !dbg !51
622
+ %599 = fmul float %362, %575, !dbg !51
623
+ %600 = fmul float %363, %577, !dbg !51
624
+ %601 = fmul float %363, %579, !dbg !51
625
+ %602 = fmul float %364, %581, !dbg !51
626
+ %603 = fmul float %364, %583, !dbg !51
627
+ %604 = fmul float %365, %585, !dbg !51
628
+ %605 = fmul float %365, %587, !dbg !51
629
+ %606 = fmul float %366, %589, !dbg !51
630
+ %607 = fmul float %366, %591, !dbg !51
631
+ %608 = fsub float %544, %592, !dbg !52
632
+ %609 = fsub float %545, %593, !dbg !52
633
+ %610 = fsub float %546, %594, !dbg !52
634
+ %611 = fsub float %547, %595, !dbg !52
635
+ %612 = fsub float %548, %596, !dbg !52
636
+ %613 = fsub float %549, %597, !dbg !52
637
+ %614 = fsub float %550, %598, !dbg !52
638
+ %615 = fsub float %551, %599, !dbg !52
639
+ %616 = fsub float %552, %600, !dbg !52
640
+ %617 = fsub float %553, %601, !dbg !52
641
+ %618 = fsub float %554, %602, !dbg !52
642
+ %619 = fsub float %555, %603, !dbg !52
643
+ %620 = fsub float %556, %604, !dbg !52
644
+ %621 = fsub float %557, %605, !dbg !52
645
+ %622 = fsub float %558, %606, !dbg !52
646
+ %623 = fsub float %559, %607, !dbg !52
647
+ %624 = fadd float %432, %608, !dbg !53
648
+ %625 = fadd float %433, %609, !dbg !53
649
+ %626 = fadd float %434, %610, !dbg !53
650
+ %627 = fadd float %435, %611, !dbg !53
651
+ %628 = fadd float %436, %612, !dbg !53
652
+ %629 = fadd float %437, %613, !dbg !53
653
+ %630 = fadd float %438, %614, !dbg !53
654
+ %631 = fadd float %439, %615, !dbg !53
655
+ %632 = fadd float %440, %616, !dbg !53
656
+ %633 = fadd float %441, %617, !dbg !53
657
+ %634 = fadd float %442, %618, !dbg !53
658
+ %635 = fadd float %443, %619, !dbg !53
659
+ %636 = fadd float %444, %620, !dbg !53
660
+ %637 = fadd float %445, %621, !dbg !53
661
+ %638 = fadd float %446, %622, !dbg !53
662
+ %639 = fadd float %447, %623, !dbg !53
663
+ %640 = getelementptr i16, ptr addrspace(1) %6, i64 %384, !dbg !54
664
+ %641 = getelementptr i16, ptr addrspace(1) %6, i64 %385, !dbg !54
665
+ %642 = getelementptr i16, ptr addrspace(1) %6, i64 %386, !dbg !54
666
+ %643 = getelementptr i16, ptr addrspace(1) %6, i64 %387, !dbg !54
667
+ %644 = getelementptr i16, ptr addrspace(1) %6, i64 %388, !dbg !54
668
+ %645 = getelementptr i16, ptr addrspace(1) %6, i64 %389, !dbg !54
669
+ %646 = getelementptr i16, ptr addrspace(1) %6, i64 %390, !dbg !54
670
+ %647 = getelementptr i16, ptr addrspace(1) %6, i64 %391, !dbg !54
671
+ %648 = getelementptr i16, ptr addrspace(1) %6, i64 %392, !dbg !54
672
+ %649 = getelementptr i16, ptr addrspace(1) %6, i64 %393, !dbg !54
673
+ %650 = getelementptr i16, ptr addrspace(1) %6, i64 %394, !dbg !54
674
+ %651 = getelementptr i16, ptr addrspace(1) %6, i64 %395, !dbg !54
675
+ %652 = getelementptr i16, ptr addrspace(1) %6, i64 %396, !dbg !54
676
+ %653 = getelementptr i16, ptr addrspace(1) %6, i64 %397, !dbg !54
677
+ %654 = getelementptr i16, ptr addrspace(1) %6, i64 %398, !dbg !54
678
+ %655 = getelementptr i16, ptr addrspace(1) %6, i64 %399, !dbg !54
679
+ %656 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %624) #3, !dbg !55
680
+ %657 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %625) #3, !dbg !55
681
+ %658 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %626) #3, !dbg !55
682
+ %659 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %627) #3, !dbg !55
683
+ %660 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %628) #3, !dbg !55
684
+ %661 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %629) #3, !dbg !55
685
+ %662 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %630) #3, !dbg !55
686
+ %663 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %631) #3, !dbg !55
687
+ %664 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %632) #3, !dbg !55
688
+ %665 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %633) #3, !dbg !55
689
+ %666 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %634) #3, !dbg !55
690
+ %667 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %635) #3, !dbg !55
691
+ %668 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %636) #3, !dbg !55
692
+ %669 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %637) #3, !dbg !55
693
+ %670 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %638) #3, !dbg !55
694
+ %671 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %639) #3, !dbg !55
695
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %656, ptr addrspace(1) %640, i1 %382) #3, !dbg !55
696
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %657, ptr addrspace(1) %641, i1 %383) #3, !dbg !55
697
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %658, ptr addrspace(1) %642, i1 %382) #3, !dbg !55
698
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %659, ptr addrspace(1) %643, i1 %383) #3, !dbg !55
699
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %660, ptr addrspace(1) %644, i1 %382) #3, !dbg !55
700
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %661, ptr addrspace(1) %645, i1 %383) #3, !dbg !55
701
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %662, ptr addrspace(1) %646, i1 %382) #3, !dbg !55
702
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %663, ptr addrspace(1) %647, i1 %383) #3, !dbg !55
703
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %664, ptr addrspace(1) %648, i1 %382) #3, !dbg !55
704
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %665, ptr addrspace(1) %649, i1 %383) #3, !dbg !55
705
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %666, ptr addrspace(1) %650, i1 %382) #3, !dbg !55
706
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %667, ptr addrspace(1) %651, i1 %383) #3, !dbg !55
707
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %668, ptr addrspace(1) %652, i1 %382) #3, !dbg !55
708
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %669, ptr addrspace(1) %653, i1 %383) #3, !dbg !55
709
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %670, ptr addrspace(1) %654, i1 %382) #3, !dbg !55
710
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %671, ptr addrspace(1) %655, i1 %383) #3, !dbg !55
711
+ %672 = add nuw nsw i32 %378, 512, !dbg !39
712
+ %673 = icmp ult i32 %378, 49745, !dbg !39
713
+ br i1 %673, label %377, label %674, !dbg !39
714
+
715
+ 674: ; preds = %377
716
+ ret void, !dbg !56
717
+ }
718
+
719
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
720
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
721
+
722
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
723
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
724
+
725
+ ; Function Attrs: convergent nocallback nounwind
726
+ declare void @llvm.nvvm.barrier0() #2
727
+
728
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
729
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
730
+ attributes #2 = { convergent nocallback nounwind }
731
+ attributes #3 = { nounwind }
732
+
733
+ !llvm.module.flags = !{!0}
734
+ !llvm.dbg.cu = !{!1}
735
+ !nvvm.annotations = !{!3, !4, !4, !3}
736
+
737
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
738
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
739
+ !2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
740
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
741
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
742
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
743
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
744
+ !7 = !{}
745
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
746
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
747
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
748
+ !11 = !DILocation(line: 21, column: 34, scope: !5)
749
+ !12 = !DILocation(line: 21, column: 46, scope: !5)
750
+ !13 = !DILocation(line: 22, column: 23, scope: !5)
751
+ !14 = !DILocation(line: 26, column: 30, scope: !5)
752
+ !15 = !DILocation(line: 26, column: 35, scope: !5)
753
+ !16 = !DILocation(line: 27, column: 19, scope: !5)
754
+ !17 = !DILocation(line: 29, column: 19, scope: !5)
755
+ !18 = !DILocation(line: 36, column: 46, scope: !5)
756
+ !19 = !DILocation(line: 38, column: 23, scope: !5)
757
+ !20 = !DILocation(line: 39, column: 22, scope: !5)
758
+ !21 = !DILocation(line: 41, column: 37, scope: !5)
759
+ !22 = !DILocation(line: 32, column: 36, scope: !5)
760
+ !23 = !DILocation(line: 33, column: 27, scope: !5)
761
+ !24 = !DILocation(line: 34, column: 25, scope: !5)
762
+ !25 = !DILocation(line: 36, column: 34, scope: !5)
763
+ !26 = !DILocation(line: 36, column: 52, scope: !5)
764
+ !27 = !DILocation(line: 42, column: 23, scope: !5)
765
+ !28 = !DILocation(line: 45, column: 40, scope: !5)
766
+ !29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !33)
767
+ !30 = distinct !DILexicalBlockFile(scope: !32, file: !31, discriminator: 0)
768
+ !31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
769
+ !32 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
770
+ !33 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !34)
771
+ !34 = !DILocation(line: 46, column: 27, scope: !30)
772
+ !35 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !36)
773
+ !36 = !DILocation(line: 46, column: 27, scope: !32)
774
+ !37 = !DILocation(line: 52, column: 27, scope: !5)
775
+ !38 = !DILocation(line: 63, column: 24, scope: !5)
776
+ !39 = !DILocation(line: 51, column: 36, scope: !5)
777
+ !40 = !DILocation(line: 53, column: 25, scope: !5)
778
+ !41 = !DILocation(line: 55, column: 41, scope: !5)
779
+ !42 = !DILocation(line: 55, column: 35, scope: !5)
780
+ !43 = !DILocation(line: 55, column: 53, scope: !5)
781
+ !44 = !DILocation(line: 55, column: 105, scope: !5)
782
+ !45 = !DILocation(line: 56, column: 35, scope: !5)
783
+ !46 = !DILocation(line: 56, column: 53, scope: !5)
784
+ !47 = !DILocation(line: 57, column: 35, scope: !5)
785
+ !48 = !DILocation(line: 57, column: 53, scope: !5)
786
+ !49 = !DILocation(line: 57, column: 105, scope: !5)
787
+ !50 = !DILocation(line: 65, column: 23, scope: !5)
788
+ !51 = !DILocation(line: 66, column: 24, scope: !5)
789
+ !52 = !DILocation(line: 67, column: 24, scope: !5)
790
+ !53 = !DILocation(line: 69, column: 24, scope: !5)
791
+ !54 = !DILocation(line: 70, column: 29, scope: !5)
792
+ !55 = !DILocation(line: 70, column: 54, scope: !5)
793
+ !56 = !DILocation(line: 51, column: 4, scope: !5)
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ptx ADDED
@@ -0,0 +1,1517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
22
+ )
23
+ .maxntid 256, 1, 1
24
+ {
25
+ .reg .pred %p<176>;
26
+ .reg .b16 %rs<129>;
27
+ .reg .b32 %r<238>;
28
+ .reg .f32 %f<393>;
29
+ .reg .b64 %rd<166>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd39, [triton__0d1d2d3d4d5d6d7de8_param_6];
35
+ ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6d7de8_param_5];
36
+ ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6d7de8_param_4];
37
+ ld.param.u64 %rd36, [triton__0d1d2d3d4d5d6d7de8_param_0];
38
+ $L__tmp0:
39
+ .loc 1 22 44
40
+ mov.u32 %r1, %tid.x;
41
+ ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6d7de8_param_1];
42
+ shr.u32 %r2, %r1, 5;
43
+ ld.param.u64 %rd56, [triton__0d1d2d3d4d5d6d7de8_param_2];
44
+ .loc 1 24 33
45
+ and.b32 %r9, %r1, 255;
46
+ ld.param.u64 %rd57, [triton__0d1d2d3d4d5d6d7de8_param_3];
47
+ or.b32 %r10, %r9, 256;
48
+ .loc 1 21 28
49
+ mov.u32 %r3, %ctaid.x;
50
+ .loc 1 21 34
51
+ cvt.s64.s32 %rd1, %r3;
52
+ .loc 1 21 46
53
+ mul.wide.s32 %rd60, %r3, 8;
54
+ .loc 1 22 23
55
+ or.b64 %rd61, %rd60, 1;
56
+ cvt.u64.u32 %rd2, %r9;
57
+ cvt.u64.u32 %rd3, %r10;
58
+ .loc 1 26 30
59
+ shl.b64 %rd62, %rd60, 3;
60
+ add.s64 %rd41, %rd59, %rd62;
61
+ add.s64 %rd43, %rd41, 8;
62
+ add.s64 %rd45, %rd41, 16;
63
+ add.s64 %rd47, %rd41, 24;
64
+ add.s64 %rd49, %rd41, 32;
65
+ add.s64 %rd51, %rd41, 40;
66
+ add.s64 %rd53, %rd41, 48;
67
+ add.s64 %rd55, %rd41, 56;
68
+ mov.pred %p1, -1;
69
+ .loc 1 26 35
70
+ mov.u64 %rd40, 0x0;
71
+ @%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd41 + 0 ];
72
+ mov.u64 %rd42, 0x0;
73
+ @%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd43 + 0 ];
74
+ mov.u64 %rd44, 0x0;
75
+ @%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd45 + 0 ];
76
+ mov.u64 %rd46, 0x0;
77
+ @%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
78
+ mov.u64 %rd48, 0x0;
79
+ @%p1 ld.global.L1::evict_last.b64 { %rd48 }, [ %rd49 + 0 ];
80
+ mov.u64 %rd50, 0x0;
81
+ @%p1 ld.global.L1::evict_last.b64 { %rd50 }, [ %rd51 + 0 ];
82
+ mov.u64 %rd52, 0x0;
83
+ @%p1 ld.global.L1::evict_last.b64 { %rd52 }, [ %rd53 + 0 ];
84
+ mov.u64 %rd54, 0x0;
85
+ @%p1 ld.global.L1::evict_last.b64 { %rd54 }, [ %rd55 + 0 ];
86
+ .loc 1 27 19
87
+ mov.u32 %r7, 0x0;
88
+ @%p1 ld.global.b32 { %r7 }, [ %rd56 + 0 ];
89
+ .loc 1 29 19
90
+ mov.u32 %r8, 0x0;
91
+ @%p1 ld.global.b32 { %r8 }, [ %rd57 + 0 ];
92
+ .loc 1 36 46
93
+ mul.wide.s32 %rd4, %r3, 402056;
94
+ mul.lo.s64 %rd5, %rd61, 50257;
95
+ .loc 1 38 23
96
+ setp.eq.s64 %p11, %rd40, -1;
97
+ setp.eq.s64 %p12, %rd42, -1;
98
+ setp.eq.s64 %p13, %rd44, -1;
99
+ setp.eq.s64 %p14, %rd46, -1;
100
+ setp.eq.s64 %p15, %rd48, -1;
101
+ setp.eq.s64 %p16, %rd50, -1;
102
+ setp.eq.s64 %p17, %rd52, -1;
103
+ setp.eq.s64 %p18, %rd54, -1;
104
+ .loc 1 39 22
105
+ div.full.f32 %r6, %r7, %r8;
106
+ mov.b32 %f89, %r6;
107
+ .loc 1 41 37
108
+ selp.f32 %f8, 0f00000000, %f89, %p18;
109
+ selp.f32 %f7, 0f00000000, %f89, %p17;
110
+ selp.f32 %f6, 0f00000000, %f89, %p16;
111
+ selp.f32 %f5, 0f00000000, %f89, %p15;
112
+ selp.f32 %f4, 0f00000000, %f89, %p14;
113
+ selp.f32 %f3, 0f00000000, %f89, %p13;
114
+ selp.f32 %f2, 0f00000000, %f89, %p12;
115
+ selp.f32 %f1, 0f00000000, %f89, %p11;
116
+ mov.f32 %f377, 0f00000000;
117
+ mov.u64 %rd157, 0;
118
+ shl.b64 %rd83, %rd4, 2;
119
+ shl.b64 %rd86, %rd5, 2;
120
+ mov.f32 %f378, %f377;
121
+ mov.f32 %f379, %f377;
122
+ mov.f32 %f380, %f377;
123
+ mov.f32 %f381, %f377;
124
+ mov.f32 %f382, %f377;
125
+ mov.f32 %f383, %f377;
126
+ mov.f32 %f384, %f377;
127
+ mov.f32 %f385, %f377;
128
+ mov.f32 %f386, %f377;
129
+ mov.f32 %f387, %f377;
130
+ mov.f32 %f388, %f377;
131
+ mov.f32 %f389, %f377;
132
+ mov.f32 %f390, %f377;
133
+ mov.f32 %f391, %f377;
134
+ mov.f32 %f392, %f377;
135
+ $L__BB0_1:
136
+ .loc 1 33 27
137
+ or.b64 %rd79, %rd157, %rd2;
138
+ or.b64 %rd80, %rd157, %rd3;
139
+ .loc 1 34 25
140
+ setp.lt.u64 %p22, %rd80, 50257;
141
+ setp.lt.u64 %p20, %rd79, 50257;
142
+ .loc 1 36 34
143
+ shl.b64 %rd81, %rd79, 2;
144
+ add.s64 %rd82, %rd36, %rd81;
145
+ add.s64 %rd63, %rd82, %rd83;
146
+ shl.b64 %rd84, %rd80, 2;
147
+ add.s64 %rd85, %rd36, %rd84;
148
+ add.s64 %rd64, %rd85, %rd83;
149
+ add.s64 %rd65, %rd82, %rd86;
150
+ add.s64 %rd66, %rd85, %rd86;
151
+ add.s64 %rd67, %rd65, 201028;
152
+ add.s64 %rd68, %rd66, 201028;
153
+ add.s64 %rd69, %rd65, 402056;
154
+ add.s64 %rd70, %rd66, 402056;
155
+ add.s64 %rd71, %rd65, 603084;
156
+ add.s64 %rd72, %rd66, 603084;
157
+ add.s64 %rd73, %rd65, 804112;
158
+ add.s64 %rd74, %rd66, 804112;
159
+ add.s64 %rd75, %rd65, 1005140;
160
+ add.s64 %rd76, %rd66, 1005140;
161
+ add.s64 %rd77, %rd65, 1206168;
162
+ add.s64 %rd78, %rd66, 1206168;
163
+ mov.b32 %r173, 0;
164
+ .loc 1 36 52
165
+ mov.u32 %r11, 0x0;
166
+ @%p20 ld.global.L1::evict_last.b32 { %r11 }, [ %rd63 + 0 ];
167
+ @!%p20 mov.u32 %r11, %r173;
168
+ mov.u32 %r13, 0x0;
169
+ @%p22 ld.global.L1::evict_last.b32 { %r13 }, [ %rd64 + 0 ];
170
+ @!%p22 mov.u32 %r13, %r173;
171
+ mov.u32 %r15, 0x0;
172
+ @%p20 ld.global.L1::evict_last.b32 { %r15 }, [ %rd65 + 0 ];
173
+ @!%p20 mov.u32 %r15, %r173;
174
+ mov.u32 %r17, 0x0;
175
+ @%p22 ld.global.L1::evict_last.b32 { %r17 }, [ %rd66 + 0 ];
176
+ @!%p22 mov.u32 %r17, %r173;
177
+ mov.u32 %r19, 0x0;
178
+ @%p20 ld.global.L1::evict_last.b32 { %r19 }, [ %rd67 + 0 ];
179
+ @!%p20 mov.u32 %r19, %r173;
180
+ mov.u32 %r21, 0x0;
181
+ @%p22 ld.global.L1::evict_last.b32 { %r21 }, [ %rd68 + 0 ];
182
+ @!%p22 mov.u32 %r21, %r173;
183
+ mov.u32 %r23, 0x0;
184
+ @%p20 ld.global.L1::evict_last.b32 { %r23 }, [ %rd69 + 0 ];
185
+ @!%p20 mov.u32 %r23, %r173;
186
+ mov.u32 %r25, 0x0;
187
+ @%p22 ld.global.L1::evict_last.b32 { %r25 }, [ %rd70 + 0 ];
188
+ @!%p22 mov.u32 %r25, %r173;
189
+ mov.u32 %r27, 0x0;
190
+ @%p20 ld.global.L1::evict_last.b32 { %r27 }, [ %rd71 + 0 ];
191
+ @!%p20 mov.u32 %r27, %r173;
192
+ mov.u32 %r29, 0x0;
193
+ @%p22 ld.global.L1::evict_last.b32 { %r29 }, [ %rd72 + 0 ];
194
+ @!%p22 mov.u32 %r29, %r173;
195
+ mov.u32 %r31, 0x0;
196
+ @%p20 ld.global.L1::evict_last.b32 { %r31 }, [ %rd73 + 0 ];
197
+ @!%p20 mov.u32 %r31, %r173;
198
+ mov.u32 %r33, 0x0;
199
+ @%p22 ld.global.L1::evict_last.b32 { %r33 }, [ %rd74 + 0 ];
200
+ @!%p22 mov.u32 %r33, %r173;
201
+ mov.u32 %r35, 0x0;
202
+ @%p20 ld.global.L1::evict_last.b32 { %r35 }, [ %rd75 + 0 ];
203
+ @!%p20 mov.u32 %r35, %r173;
204
+ mov.u32 %r37, 0x0;
205
+ @%p22 ld.global.L1::evict_last.b32 { %r37 }, [ %rd76 + 0 ];
206
+ @!%p22 mov.u32 %r37, %r173;
207
+ mov.u32 %r39, 0x0;
208
+ @%p20 ld.global.L1::evict_last.b32 { %r39 }, [ %rd77 + 0 ];
209
+ @!%p20 mov.u32 %r39, %r173;
210
+ mov.u32 %r41, 0x0;
211
+ @%p22 ld.global.L1::evict_last.b32 { %r41 }, [ %rd78 + 0 ];
212
+ @!%p22 mov.u32 %r41, %r173;
213
+ mov.b32 %f90, %r41;
214
+ mov.b32 %f91, %r39;
215
+ mov.b32 %f92, %r37;
216
+ mov.b32 %f93, %r35;
217
+ mov.b32 %f94, %r33;
218
+ mov.b32 %f95, %r31;
219
+ mov.b32 %f96, %r29;
220
+ mov.b32 %f97, %r27;
221
+ mov.b32 %f98, %r25;
222
+ mov.b32 %f99, %r23;
223
+ mov.b32 %f100, %r21;
224
+ mov.b32 %f101, %r19;
225
+ mov.b32 %f102, %r17;
226
+ mov.b32 %f103, %r15;
227
+ mov.b32 %f104, %r13;
228
+ mov.b32 %f105, %r11;
229
+ .loc 1 42 23
230
+ mul.f32 %f106, %f1, %f105;
231
+ mul.f32 %f107, %f1, %f104;
232
+ mul.f32 %f108, %f2, %f103;
233
+ mul.f32 %f109, %f2, %f102;
234
+ mul.f32 %f110, %f3, %f101;
235
+ mul.f32 %f111, %f3, %f100;
236
+ mul.f32 %f112, %f4, %f99;
237
+ mul.f32 %f113, %f4, %f98;
238
+ mul.f32 %f114, %f5, %f97;
239
+ mul.f32 %f115, %f5, %f96;
240
+ mul.f32 %f116, %f6, %f95;
241
+ mul.f32 %f117, %f6, %f94;
242
+ mul.f32 %f118, %f7, %f93;
243
+ mul.f32 %f119, %f7, %f92;
244
+ mul.f32 %f120, %f8, %f91;
245
+ mul.f32 %f121, %f8, %f90;
246
+ .loc 1 45 40
247
+ selp.f32 %f122, %f121, 0f80000000, %p22;
248
+ selp.f32 %f123, %f120, 0f80000000, %p20;
249
+ selp.f32 %f124, %f119, 0f80000000, %p22;
250
+ selp.f32 %f125, %f118, 0f80000000, %p20;
251
+ selp.f32 %f126, %f117, 0f80000000, %p22;
252
+ selp.f32 %f127, %f116, 0f80000000, %p20;
253
+ selp.f32 %f128, %f115, 0f80000000, %p22;
254
+ selp.f32 %f129, %f114, 0f80000000, %p20;
255
+ selp.f32 %f130, %f113, 0f80000000, %p22;
256
+ selp.f32 %f131, %f112, 0f80000000, %p20;
257
+ selp.f32 %f132, %f111, 0f80000000, %p22;
258
+ selp.f32 %f133, %f110, 0f80000000, %p20;
259
+ selp.f32 %f134, %f109, 0f80000000, %p22;
260
+ selp.f32 %f135, %f108, 0f80000000, %p20;
261
+ selp.f32 %f136, %f107, 0f80000000, %p22;
262
+ selp.f32 %f137, %f106, 0f80000000, %p20;
263
+ add.f32 %f377, %f377, %f137;
264
+ add.f32 %f378, %f378, %f136;
265
+ add.f32 %f379, %f379, %f135;
266
+ add.f32 %f380, %f380, %f134;
267
+ add.f32 %f381, %f381, %f133;
268
+ add.f32 %f382, %f382, %f132;
269
+ add.f32 %f383, %f383, %f131;
270
+ add.f32 %f384, %f384, %f130;
271
+ add.f32 %f385, %f385, %f129;
272
+ add.f32 %f386, %f386, %f128;
273
+ add.f32 %f387, %f387, %f127;
274
+ add.f32 %f388, %f388, %f126;
275
+ add.f32 %f389, %f389, %f125;
276
+ add.f32 %f390, %f390, %f124;
277
+ add.f32 %f391, %f391, %f123;
278
+ add.f32 %f392, %f392, %f122;
279
+ .loc 1 32 36
280
+ add.s64 %rd157, %rd157, 512;
281
+ cvt.u32.u64 %r43, %rd157;
282
+ add.s32 %r44, %r43, -512;
283
+ setp.lt.u32 %p51, %r44, 49745;
284
+ @%p51 bra $L__BB0_1;
285
+ .loc 1 22 44
286
+ and.b32 %r65, %r1, 31;
287
+ .loc 1 24 33
288
+ and.b32 %r66, %r2, 7;
289
+ $L__tmp1:
290
+ .loc 2 233 15
291
+ add.f32 %f138, %f377, %f378;
292
+ add.f32 %f139, %f379, %f380;
293
+ add.f32 %f140, %f381, %f382;
294
+ add.f32 %f141, %f383, %f384;
295
+ add.f32 %f142, %f385, %f386;
296
+ add.f32 %f143, %f387, %f388;
297
+ add.f32 %f144, %f389, %f390;
298
+ add.f32 %f145, %f391, %f392;
299
+ $L__tmp2:
300
+ .loc 2 243 36
301
+ mov.b32 %r67, %f138;
302
+ shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1;
303
+ mov.b32 %f146, %r68;
304
+ $L__tmp3:
305
+ .loc 2 233 15
306
+ add.f32 %f147, %f138, %f146;
307
+ $L__tmp4:
308
+ .loc 2 243 36
309
+ mov.b32 %r69, %f147;
310
+ shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1;
311
+ mov.b32 %f148, %r70;
312
+ $L__tmp5:
313
+ .loc 2 233 15
314
+ add.f32 %f149, %f147, %f148;
315
+ $L__tmp6:
316
+ .loc 2 243 36
317
+ mov.b32 %r71, %f149;
318
+ shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1;
319
+ mov.b32 %f150, %r72;
320
+ $L__tmp7:
321
+ .loc 2 233 15
322
+ add.f32 %f151, %f149, %f150;
323
+ $L__tmp8:
324
+ .loc 2 243 36
325
+ mov.b32 %r73, %f151;
326
+ shfl.sync.bfly.b32 %r74, %r73, 2, 31, -1;
327
+ mov.b32 %f152, %r74;
328
+ $L__tmp9:
329
+ .loc 2 233 15
330
+ add.f32 %f153, %f151, %f152;
331
+ $L__tmp10:
332
+ .loc 2 243 36
333
+ mov.b32 %r75, %f153;
334
+ shfl.sync.bfly.b32 %r76, %r75, 1, 31, -1;
335
+ mov.b32 %f154, %r76;
336
+ $L__tmp11:
337
+ .loc 2 233 15
338
+ add.f32 %f155, %f153, %f154;
339
+ $L__tmp12:
340
+ .loc 2 243 36
341
+ mov.b32 %r77, %f139;
342
+ shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1;
343
+ mov.b32 %f156, %r78;
344
+ $L__tmp13:
345
+ .loc 2 233 15
346
+ add.f32 %f157, %f139, %f156;
347
+ $L__tmp14:
348
+ .loc 2 243 36
349
+ mov.b32 %r79, %f157;
350
+ shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1;
351
+ mov.b32 %f158, %r80;
352
+ $L__tmp15:
353
+ .loc 2 233 15
354
+ add.f32 %f159, %f157, %f158;
355
+ $L__tmp16:
356
+ .loc 2 243 36
357
+ mov.b32 %r81, %f159;
358
+ shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1;
359
+ mov.b32 %f160, %r82;
360
+ $L__tmp17:
361
+ .loc 2 233 15
362
+ add.f32 %f161, %f159, %f160;
363
+ $L__tmp18:
364
+ .loc 2 243 36
365
+ mov.b32 %r83, %f161;
366
+ shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1;
367
+ mov.b32 %f162, %r84;
368
+ $L__tmp19:
369
+ .loc 2 233 15
370
+ add.f32 %f163, %f161, %f162;
371
+ $L__tmp20:
372
+ .loc 2 243 36
373
+ mov.b32 %r85, %f163;
374
+ shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1;
375
+ mov.b32 %f164, %r86;
376
+ $L__tmp21:
377
+ .loc 2 233 15
378
+ add.f32 %f165, %f163, %f164;
379
+ $L__tmp22:
380
+ .loc 2 243 36
381
+ mov.b32 %r87, %f140;
382
+ shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1;
383
+ mov.b32 %f166, %r88;
384
+ $L__tmp23:
385
+ .loc 2 233 15
386
+ add.f32 %f167, %f140, %f166;
387
+ $L__tmp24:
388
+ .loc 2 243 36
389
+ mov.b32 %r89, %f167;
390
+ shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1;
391
+ mov.b32 %f168, %r90;
392
+ $L__tmp25:
393
+ .loc 2 233 15
394
+ add.f32 %f169, %f167, %f168;
395
+ $L__tmp26:
396
+ .loc 2 243 36
397
+ mov.b32 %r91, %f169;
398
+ shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1;
399
+ mov.b32 %f170, %r92;
400
+ $L__tmp27:
401
+ .loc 2 233 15
402
+ add.f32 %f171, %f169, %f170;
403
+ $L__tmp28:
404
+ .loc 2 243 36
405
+ mov.b32 %r93, %f171;
406
+ shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
407
+ mov.b32 %f172, %r94;
408
+ $L__tmp29:
409
+ .loc 2 233 15
410
+ add.f32 %f173, %f171, %f172;
411
+ $L__tmp30:
412
+ .loc 2 243 36
413
+ mov.b32 %r95, %f173;
414
+ shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
415
+ mov.b32 %f174, %r96;
416
+ $L__tmp31:
417
+ .loc 2 233 15
418
+ add.f32 %f175, %f173, %f174;
419
+ $L__tmp32:
420
+ .loc 2 243 36
421
+ mov.b32 %r97, %f141;
422
+ shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
423
+ mov.b32 %f176, %r98;
424
+ $L__tmp33:
425
+ .loc 2 233 15
426
+ add.f32 %f177, %f141, %f176;
427
+ $L__tmp34:
428
+ .loc 2 243 36
429
+ mov.b32 %r99, %f177;
430
+ shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
431
+ mov.b32 %f178, %r100;
432
+ $L__tmp35:
433
+ .loc 2 233 15
434
+ add.f32 %f179, %f177, %f178;
435
+ $L__tmp36:
436
+ .loc 2 243 36
437
+ mov.b32 %r101, %f179;
438
+ shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
439
+ mov.b32 %f180, %r102;
440
+ $L__tmp37:
441
+ .loc 2 233 15
442
+ add.f32 %f181, %f179, %f180;
443
+ $L__tmp38:
444
+ .loc 2 243 36
445
+ mov.b32 %r103, %f181;
446
+ shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
447
+ mov.b32 %f182, %r104;
448
+ $L__tmp39:
449
+ .loc 2 233 15
450
+ add.f32 %f183, %f181, %f182;
451
+ $L__tmp40:
452
+ .loc 2 243 36
453
+ mov.b32 %r105, %f183;
454
+ shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
455
+ mov.b32 %f184, %r106;
456
+ $L__tmp41:
457
+ .loc 2 233 15
458
+ add.f32 %f185, %f183, %f184;
459
+ $L__tmp42:
460
+ .loc 2 243 36
461
+ mov.b32 %r107, %f142;
462
+ shfl.sync.bfly.b32 %r108, %r107, 16, 31, -1;
463
+ mov.b32 %f186, %r108;
464
+ $L__tmp43:
465
+ .loc 2 233 15
466
+ add.f32 %f187, %f142, %f186;
467
+ $L__tmp44:
468
+ .loc 2 243 36
469
+ mov.b32 %r109, %f187;
470
+ shfl.sync.bfly.b32 %r110, %r109, 8, 31, -1;
471
+ mov.b32 %f188, %r110;
472
+ $L__tmp45:
473
+ .loc 2 233 15
474
+ add.f32 %f189, %f187, %f188;
475
+ $L__tmp46:
476
+ .loc 2 243 36
477
+ mov.b32 %r111, %f189;
478
+ shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1;
479
+ mov.b32 %f190, %r112;
480
+ $L__tmp47:
481
+ .loc 2 233 15
482
+ add.f32 %f191, %f189, %f190;
483
+ $L__tmp48:
484
+ .loc 2 243 36
485
+ mov.b32 %r113, %f191;
486
+ shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1;
487
+ mov.b32 %f192, %r114;
488
+ $L__tmp49:
489
+ .loc 2 233 15
490
+ add.f32 %f193, %f191, %f192;
491
+ $L__tmp50:
492
+ .loc 2 243 36
493
+ mov.b32 %r115, %f193;
494
+ shfl.sync.bfly.b32 %r116, %r115, 1, 31, -1;
495
+ mov.b32 %f194, %r116;
496
+ $L__tmp51:
497
+ .loc 2 233 15
498
+ add.f32 %f195, %f193, %f194;
499
+ $L__tmp52:
500
+ .loc 2 243 36
501
+ mov.b32 %r117, %f143;
502
+ shfl.sync.bfly.b32 %r118, %r117, 16, 31, -1;
503
+ mov.b32 %f196, %r118;
504
+ $L__tmp53:
505
+ .loc 2 233 15
506
+ add.f32 %f197, %f143, %f196;
507
+ $L__tmp54:
508
+ .loc 2 243 36
509
+ mov.b32 %r119, %f197;
510
+ shfl.sync.bfly.b32 %r120, %r119, 8, 31, -1;
511
+ mov.b32 %f198, %r120;
512
+ $L__tmp55:
513
+ .loc 2 233 15
514
+ add.f32 %f199, %f197, %f198;
515
+ $L__tmp56:
516
+ .loc 2 243 36
517
+ mov.b32 %r121, %f199;
518
+ shfl.sync.bfly.b32 %r122, %r121, 4, 31, -1;
519
+ mov.b32 %f200, %r122;
520
+ $L__tmp57:
521
+ .loc 2 233 15
522
+ add.f32 %f201, %f199, %f200;
523
+ $L__tmp58:
524
+ .loc 2 243 36
525
+ mov.b32 %r123, %f201;
526
+ shfl.sync.bfly.b32 %r124, %r123, 2, 31, -1;
527
+ mov.b32 %f202, %r124;
528
+ $L__tmp59:
529
+ .loc 2 233 15
530
+ add.f32 %f203, %f201, %f202;
531
+ $L__tmp60:
532
+ .loc 2 243 36
533
+ mov.b32 %r125, %f203;
534
+ shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1;
535
+ mov.b32 %f204, %r126;
536
+ $L__tmp61:
537
+ .loc 2 233 15
538
+ add.f32 %f205, %f203, %f204;
539
+ $L__tmp62:
540
+ .loc 2 243 36
541
+ mov.b32 %r127, %f144;
542
+ shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1;
543
+ mov.b32 %f206, %r128;
544
+ $L__tmp63:
545
+ .loc 2 233 15
546
+ add.f32 %f207, %f144, %f206;
547
+ $L__tmp64:
548
+ .loc 2 243 36
549
+ mov.b32 %r129, %f207;
550
+ shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1;
551
+ mov.b32 %f208, %r130;
552
+ $L__tmp65:
553
+ .loc 2 233 15
554
+ add.f32 %f209, %f207, %f208;
555
+ $L__tmp66:
556
+ .loc 2 243 36
557
+ mov.b32 %r131, %f209;
558
+ shfl.sync.bfly.b32 %r132, %r131, 4, 31, -1;
559
+ mov.b32 %f210, %r132;
560
+ $L__tmp67:
561
+ .loc 2 233 15
562
+ add.f32 %f211, %f209, %f210;
563
+ $L__tmp68:
564
+ .loc 2 243 36
565
+ mov.b32 %r133, %f211;
566
+ shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1;
567
+ mov.b32 %f212, %r134;
568
+ $L__tmp69:
569
+ .loc 2 233 15
570
+ add.f32 %f213, %f211, %f212;
571
+ $L__tmp70:
572
+ .loc 2 243 36
573
+ mov.b32 %r135, %f213;
574
+ shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1;
575
+ mov.b32 %f214, %r136;
576
+ $L__tmp71:
577
+ .loc 2 233 15
578
+ add.f32 %f215, %f213, %f214;
579
+ $L__tmp72:
580
+ .loc 2 243 36
581
+ mov.b32 %r137, %f145;
582
+ shfl.sync.bfly.b32 %r138, %r137, 16, 31, -1;
583
+ mov.b32 %f216, %r138;
584
+ $L__tmp73:
585
+ .loc 2 233 15
586
+ add.f32 %f217, %f145, %f216;
587
+ $L__tmp74:
588
+ .loc 2 243 36
589
+ mov.b32 %r139, %f217;
590
+ shfl.sync.bfly.b32 %r140, %r139, 8, 31, -1;
591
+ mov.b32 %f218, %r140;
592
+ $L__tmp75:
593
+ .loc 2 233 15
594
+ add.f32 %f219, %f217, %f218;
595
+ $L__tmp76:
596
+ .loc 2 243 36
597
+ mov.b32 %r141, %f219;
598
+ shfl.sync.bfly.b32 %r142, %r141, 4, 31, -1;
599
+ mov.b32 %f220, %r142;
600
+ $L__tmp77:
601
+ .loc 2 233 15
602
+ add.f32 %f221, %f219, %f220;
603
+ $L__tmp78:
604
+ .loc 2 243 36
605
+ mov.b32 %r143, %f221;
606
+ shfl.sync.bfly.b32 %r144, %r143, 2, 31, -1;
607
+ mov.b32 %f222, %r144;
608
+ $L__tmp79:
609
+ .loc 2 233 15
610
+ add.f32 %f223, %f221, %f222;
611
+ $L__tmp80:
612
+ .loc 2 243 36
613
+ mov.b32 %r145, %f223;
614
+ shfl.sync.bfly.b32 %r146, %r145, 1, 31, -1;
615
+ mov.b32 %f224, %r146;
616
+ $L__tmp81:
617
+ .loc 2 233 15
618
+ add.f32 %f225, %f223, %f224;
619
+ $L__tmp82:
620
+ .loc 2 243 36
621
+ setp.eq.s32 %p52, %r65, 0;
622
+ shl.b32 %r147, %r66, 2;
623
+ mov.u32 %r148, global_smem;
624
+ add.s32 %r45, %r148, %r147;
625
+ mov.b32 %r46, %f155;
626
+ @%p52 st.shared.b32 [ %r45 + 0 ], %r46;
627
+ add.s32 %r47, %r45, 32;
628
+ mov.b32 %r48, %f165;
629
+ @%p52 st.shared.b32 [ %r47 + 0 ], %r48;
630
+ add.s32 %r49, %r45, 64;
631
+ mov.b32 %r50, %f175;
632
+ @%p52 st.shared.b32 [ %r49 + 0 ], %r50;
633
+ add.s32 %r51, %r45, 96;
634
+ mov.b32 %r52, %f185;
635
+ @%p52 st.shared.b32 [ %r51 + 0 ], %r52;
636
+ add.s32 %r53, %r45, 128;
637
+ mov.b32 %r54, %f195;
638
+ @%p52 st.shared.b32 [ %r53 + 0 ], %r54;
639
+ add.s32 %r55, %r45, 160;
640
+ mov.b32 %r56, %f205;
641
+ @%p52 st.shared.b32 [ %r55 + 0 ], %r56;
642
+ add.s32 %r57, %r45, 192;
643
+ mov.b32 %r58, %f215;
644
+ @%p52 st.shared.b32 [ %r57 + 0 ], %r58;
645
+ add.s32 %r59, %r45, 224;
646
+ mov.b32 %r60, %f225;
647
+ @%p52 st.shared.b32 [ %r59 + 0 ], %r60;
648
+ bar.sync 0;
649
+ setp.lt.s32 %p60, %r1, 64;
650
+ shl.b32 %r149, %r1, 2;
651
+ add.s32 %r62, %r148, %r149;
652
+ @%p60 ld.shared.b32 %r61, [ %r62 + 0 ];
653
+ mov.b32 %f226, %r61;
654
+ shfl.sync.bfly.b32 %r150, %r61, 4, 31, -1;
655
+ mov.b32 %f227, %r150;
656
+ $L__tmp83:
657
+ .loc 2 233 15
658
+ add.f32 %f228, %f226, %f227;
659
+ $L__tmp84:
660
+ .loc 2 243 36
661
+ mov.b32 %r151, %f228;
662
+ shfl.sync.bfly.b32 %r152, %r151, 2, 31, -1;
663
+ mov.b32 %f229, %r152;
664
+ $L__tmp85:
665
+ .loc 2 233 15
666
+ add.f32 %f230, %f228, %f229;
667
+ $L__tmp86:
668
+ .loc 2 243 36
669
+ mov.b32 %r153, %f230;
670
+ shfl.sync.bfly.b32 %r154, %r153, 1, 31, -1;
671
+ mov.b32 %f231, %r154;
672
+ $L__tmp87:
673
+ .loc 2 233 15
674
+ add.f32 %f232, %f230, %f231;
675
+ $L__tmp88:
676
+ .loc 2 243 36
677
+ and.b32 %r155, %r1, 7;
678
+ setp.eq.s32 %p62, %r155, 0;
679
+ and.pred %p61, %p60, %p62;
680
+ mov.b32 %r64, %f232;
681
+ @%p61 st.shared.b32 [ %r62 + 0 ], %r64;
682
+ bar.sync 0;
683
+ ld.shared.f32 %f57, [global_smem];
684
+ ld.shared.f32 %f58, [global_smem+32];
685
+ ld.shared.f32 %f59, [global_smem+64];
686
+ ld.shared.f32 %f60, [global_smem+96];
687
+ ld.shared.f32 %f61, [global_smem+128];
688
+ ld.shared.f32 %f62, [global_smem+160];
689
+ ld.shared.f32 %f63, [global_smem+192];
690
+ ld.shared.f32 %f64, [global_smem+224];
691
+ $L__tmp89:
692
+ .loc 1 51 36
693
+ mul.lo.s64 %rd10, %rd1, 804112;
694
+ shl.b64 %rd88, %rd3, 1;
695
+ add.s64 %rd164, %rd39, %rd88;
696
+ add.s64 %rd163, %rd38, %rd88;
697
+ shl.b64 %rd13, %rd3, 2;
698
+ mul.lo.s64 %rd89, %rd1, 1608224;
699
+ add.s64 %rd162, %rd36, %rd89;
700
+ add.s64 %rd161, %rd37, %rd88;
701
+ shl.b64 %rd90, %rd2, 1;
702
+ add.s64 %rd160, %rd39, %rd90;
703
+ add.s64 %rd159, %rd38, %rd90;
704
+ shl.b64 %rd18, %rd2, 2;
705
+ add.s64 %rd158, %rd37, %rd90;
706
+ mov.u64 %rd165, 0;
707
+ mov.u16 %rs2, 0;
708
+ $L__BB0_3:
709
+ .loc 1 52 27
710
+ add.s64 %rd155, %rd2, %rd165;
711
+ .loc 1 53 25
712
+ add.s64 %rd156, %rd3, %rd165;
713
+ setp.lt.u64 %p63, %rd155, 50257;
714
+ setp.lt.u64 %p65, %rd156, 50257;
715
+ .loc 1 55 35
716
+ add.s64 %rd91, %rd158, %rd10;
717
+ add.s64 %rd92, %rd161, %rd10;
718
+ add.s64 %rd93, %rd91, 100514;
719
+ add.s64 %rd94, %rd92, 100514;
720
+ add.s64 %rd95, %rd91, 201028;
721
+ add.s64 %rd96, %rd92, 201028;
722
+ add.s64 %rd97, %rd91, 301542;
723
+ add.s64 %rd98, %rd92, 301542;
724
+ add.s64 %rd99, %rd91, 402056;
725
+ add.s64 %rd100, %rd92, 402056;
726
+ add.s64 %rd101, %rd91, 502570;
727
+ add.s64 %rd102, %rd92, 502570;
728
+ add.s64 %rd103, %rd91, 603084;
729
+ add.s64 %rd104, %rd92, 603084;
730
+ add.s64 %rd105, %rd91, 703598;
731
+ .loc 1 55 53
732
+ add.s64 %rd106, %rd92, 703598;
733
+ mov.u16 %rs1, 0x0;
734
+ @%p63 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd91 + 0 ];
735
+ @!%p63 mov.u16 %rs1, %rs2;
736
+ mov.u16 %rs3, 0x0;
737
+ @%p65 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd92 + 0 ];
738
+ @!%p65 mov.u16 %rs3, %rs2;
739
+ mov.u16 %rs5, 0x0;
740
+ @%p63 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd93 + 0 ];
741
+ @!%p63 mov.u16 %rs5, %rs2;
742
+ mov.u16 %rs7, 0x0;
743
+ @%p65 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd94 + 0 ];
744
+ @!%p65 mov.u16 %rs7, %rs2;
745
+ mov.u16 %rs9, 0x0;
746
+ @%p63 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd95 + 0 ];
747
+ @!%p63 mov.u16 %rs9, %rs2;
748
+ mov.u16 %rs11, 0x0;
749
+ @%p65 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd96 + 0 ];
750
+ @!%p65 mov.u16 %rs11, %rs2;
751
+ mov.u16 %rs13, 0x0;
752
+ @%p63 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd97 + 0 ];
753
+ @!%p63 mov.u16 %rs13, %rs2;
754
+ mov.u16 %rs15, 0x0;
755
+ @%p65 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd98 + 0 ];
756
+ @!%p65 mov.u16 %rs15, %rs2;
757
+ mov.u16 %rs17, 0x0;
758
+ @%p63 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd99 + 0 ];
759
+ @!%p63 mov.u16 %rs17, %rs2;
760
+ mov.u16 %rs19, 0x0;
761
+ @%p65 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd100 + 0 ];
762
+ @!%p65 mov.u16 %rs19, %rs2;
763
+ mov.u16 %rs21, 0x0;
764
+ @%p63 ld.global.L1::evict_first.b16 { %rs21 }, [ %rd101 + 0 ];
765
+ @!%p63 mov.u16 %rs21, %rs2;
766
+ mov.u16 %rs23, 0x0;
767
+ @%p65 ld.global.L1::evict_first.b16 { %rs23 }, [ %rd102 + 0 ];
768
+ @!%p65 mov.u16 %rs23, %rs2;
769
+ mov.u16 %rs25, 0x0;
770
+ @%p63 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd103 + 0 ];
771
+ @!%p63 mov.u16 %rs25, %rs2;
772
+ mov.u16 %rs27, 0x0;
773
+ @%p65 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd104 + 0 ];
774
+ @!%p65 mov.u16 %rs27, %rs2;
775
+ mov.u16 %rs29, 0x0;
776
+ @%p63 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd105 + 0 ];
777
+ @!%p63 mov.u16 %rs29, %rs2;
778
+ mov.u16 %rs31, 0x0;
779
+ @%p65 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd106 + 0 ];
780
+ @!%p65 mov.u16 %rs31, %rs2;
781
+ .loc 1 55 105
782
+ cvt.f32.bf16 %r156, %rs1;
783
+ mov.b32 %f265, %r156;
784
+ cvt.f32.bf16 %r157, %rs3;
785
+ mov.b32 %f266, %r157;
786
+ cvt.f32.bf16 %r158, %rs5;
787
+ mov.b32 %f267, %r158;
788
+ cvt.f32.bf16 %r159, %rs7;
789
+ mov.b32 %f268, %r159;
790
+ cvt.f32.bf16 %r160, %rs9;
791
+ mov.b32 %f269, %r160;
792
+ cvt.f32.bf16 %r161, %rs11;
793
+ mov.b32 %f270, %r161;
794
+ cvt.f32.bf16 %r162, %rs13;
795
+ mov.b32 %f271, %r162;
796
+ cvt.f32.bf16 %r163, %rs15;
797
+ mov.b32 %f272, %r163;
798
+ cvt.f32.bf16 %r164, %rs17;
799
+ mov.b32 %f273, %r164;
800
+ cvt.f32.bf16 %r165, %rs19;
801
+ mov.b32 %f274, %r165;
802
+ cvt.f32.bf16 %r166, %rs21;
803
+ mov.b32 %f275, %r166;
804
+ cvt.f32.bf16 %r167, %rs23;
805
+ mov.b32 %f276, %r167;
806
+ cvt.f32.bf16 %r168, %rs25;
807
+ mov.b32 %f277, %r168;
808
+ cvt.f32.bf16 %r169, %rs27;
809
+ mov.b32 %f278, %r169;
810
+ cvt.f32.bf16 %r170, %rs29;
811
+ mov.b32 %f279, %r170;
812
+ cvt.f32.bf16 %r171, %rs31;
813
+ mov.b32 %f280, %r171;
814
+ .loc 1 56 35
815
+ add.s64 %rd107, %rd162, %rd18;
816
+ add.s64 %rd108, %rd162, %rd13;
817
+ add.s64 %rd109, %rd107, 201028;
818
+ add.s64 %rd110, %rd108, 201028;
819
+ add.s64 %rd111, %rd107, 402056;
820
+ add.s64 %rd112, %rd108, 402056;
821
+ add.s64 %rd113, %rd107, 603084;
822
+ add.s64 %rd114, %rd108, 603084;
823
+ add.s64 %rd115, %rd107, 804112;
824
+ add.s64 %rd116, %rd108, 804112;
825
+ add.s64 %rd117, %rd107, 1005140;
826
+ add.s64 %rd118, %rd108, 1005140;
827
+ add.s64 %rd119, %rd107, 1206168;
828
+ add.s64 %rd120, %rd108, 1206168;
829
+ add.s64 %rd121, %rd107, 1407196;
830
+ .loc 1 56 53
831
+ add.s64 %rd122, %rd108, 1407196;
832
+ mov.u32 %r172, 0x0;
833
+ @%p63 ld.global.L1::evict_first.b32 { %r172 }, [ %rd107 + 0 ];
834
+ @!%p63 mov.u32 %r172, %r173;
835
+ mov.b32 %f281, %r172;
836
+ mov.u32 %r174, 0x0;
837
+ @%p65 ld.global.L1::evict_first.b32 { %r174 }, [ %rd108 + 0 ];
838
+ @!%p65 mov.u32 %r174, %r173;
839
+ mov.b32 %f282, %r174;
840
+ mov.u32 %r176, 0x0;
841
+ @%p63 ld.global.L1::evict_first.b32 { %r176 }, [ %rd109 + 0 ];
842
+ @!%p63 mov.u32 %r176, %r173;
843
+ mov.b32 %f283, %r176;
844
+ mov.u32 %r178, 0x0;
845
+ @%p65 ld.global.L1::evict_first.b32 { %r178 }, [ %rd110 + 0 ];
846
+ @!%p65 mov.u32 %r178, %r173;
847
+ mov.b32 %f284, %r178;
848
+ mov.u32 %r180, 0x0;
849
+ @%p63 ld.global.L1::evict_first.b32 { %r180 }, [ %rd111 + 0 ];
850
+ @!%p63 mov.u32 %r180, %r173;
851
+ mov.b32 %f285, %r180;
852
+ mov.u32 %r182, 0x0;
853
+ @%p65 ld.global.L1::evict_first.b32 { %r182 }, [ %rd112 + 0 ];
854
+ @!%p65 mov.u32 %r182, %r173;
855
+ mov.b32 %f286, %r182;
856
+ mov.u32 %r184, 0x0;
857
+ @%p63 ld.global.L1::evict_first.b32 { %r184 }, [ %rd113 + 0 ];
858
+ @!%p63 mov.u32 %r184, %r173;
859
+ mov.b32 %f287, %r184;
860
+ mov.u32 %r186, 0x0;
861
+ @%p65 ld.global.L1::evict_first.b32 { %r186 }, [ %rd114 + 0 ];
862
+ @!%p65 mov.u32 %r186, %r173;
863
+ mov.b32 %f288, %r186;
864
+ mov.u32 %r188, 0x0;
865
+ @%p63 ld.global.L1::evict_first.b32 { %r188 }, [ %rd115 + 0 ];
866
+ @!%p63 mov.u32 %r188, %r173;
867
+ mov.b32 %f289, %r188;
868
+ mov.u32 %r190, 0x0;
869
+ @%p65 ld.global.L1::evict_first.b32 { %r190 }, [ %rd116 + 0 ];
870
+ @!%p65 mov.u32 %r190, %r173;
871
+ mov.b32 %f290, %r190;
872
+ mov.u32 %r192, 0x0;
873
+ @%p63 ld.global.L1::evict_first.b32 { %r192 }, [ %rd117 + 0 ];
874
+ @!%p63 mov.u32 %r192, %r173;
875
+ mov.b32 %f291, %r192;
876
+ mov.u32 %r194, 0x0;
877
+ @%p65 ld.global.L1::evict_first.b32 { %r194 }, [ %rd118 + 0 ];
878
+ @!%p65 mov.u32 %r194, %r173;
879
+ mov.b32 %f292, %r194;
880
+ mov.u32 %r196, 0x0;
881
+ @%p63 ld.global.L1::evict_first.b32 { %r196 }, [ %rd119 + 0 ];
882
+ @!%p63 mov.u32 %r196, %r173;
883
+ mov.b32 %f293, %r196;
884
+ mov.u32 %r198, 0x0;
885
+ @%p65 ld.global.L1::evict_first.b32 { %r198 }, [ %rd120 + 0 ];
886
+ @!%p65 mov.u32 %r198, %r173;
887
+ mov.b32 %f294, %r198;
888
+ mov.u32 %r200, 0x0;
889
+ @%p63 ld.global.L1::evict_first.b32 { %r200 }, [ %rd121 + 0 ];
890
+ @!%p63 mov.u32 %r200, %r173;
891
+ mov.b32 %f295, %r200;
892
+ mov.u32 %r202, 0x0;
893
+ @%p65 ld.global.L1::evict_first.b32 { %r202 }, [ %rd122 + 0 ];
894
+ @!%p65 mov.u32 %r202, %r173;
895
+ mov.b32 %f296, %r202;
896
+ .loc 1 57 35
897
+ add.s64 %rd123, %rd159, %rd10;
898
+ add.s64 %rd124, %rd163, %rd10;
899
+ add.s64 %rd125, %rd123, 100514;
900
+ add.s64 %rd126, %rd124, 100514;
901
+ add.s64 %rd127, %rd123, 201028;
902
+ add.s64 %rd128, %rd124, 201028;
903
+ add.s64 %rd129, %rd123, 301542;
904
+ add.s64 %rd130, %rd124, 301542;
905
+ add.s64 %rd131, %rd123, 402056;
906
+ add.s64 %rd132, %rd124, 402056;
907
+ add.s64 %rd133, %rd123, 502570;
908
+ add.s64 %rd134, %rd124, 502570;
909
+ add.s64 %rd135, %rd123, 603084;
910
+ add.s64 %rd136, %rd124, 603084;
911
+ add.s64 %rd137, %rd123, 703598;
912
+ .loc 1 57 53
913
+ add.s64 %rd138, %rd124, 703598;
914
+ mov.u16 %rs49, 0x0;
915
+ @%p63 ld.global.L1::evict_first.b16 { %rs49 }, [ %rd123 + 0 ];
916
+ @!%p63 mov.u16 %rs49, %rs2;
917
+ mov.u16 %rs51, 0x0;
918
+ @%p65 ld.global.L1::evict_first.b16 { %rs51 }, [ %rd124 + 0 ];
919
+ @!%p65 mov.u16 %rs51, %rs2;
920
+ mov.u16 %rs53, 0x0;
921
+ @%p63 ld.global.L1::evict_first.b16 { %rs53 }, [ %rd125 + 0 ];
922
+ @!%p63 mov.u16 %rs53, %rs2;
923
+ mov.u16 %rs55, 0x0;
924
+ @%p65 ld.global.L1::evict_first.b16 { %rs55 }, [ %rd126 + 0 ];
925
+ @!%p65 mov.u16 %rs55, %rs2;
926
+ mov.u16 %rs57, 0x0;
927
+ @%p63 ld.global.L1::evict_first.b16 { %rs57 }, [ %rd127 + 0 ];
928
+ @!%p63 mov.u16 %rs57, %rs2;
929
+ mov.u16 %rs59, 0x0;
930
+ @%p65 ld.global.L1::evict_first.b16 { %rs59 }, [ %rd128 + 0 ];
931
+ @!%p65 mov.u16 %rs59, %rs2;
932
+ mov.u16 %rs61, 0x0;
933
+ @%p63 ld.global.L1::evict_first.b16 { %rs61 }, [ %rd129 + 0 ];
934
+ @!%p63 mov.u16 %rs61, %rs2;
935
+ mov.u16 %rs63, 0x0;
936
+ @%p65 ld.global.L1::evict_first.b16 { %rs63 }, [ %rd130 + 0 ];
937
+ @!%p65 mov.u16 %rs63, %rs2;
938
+ mov.u16 %rs65, 0x0;
939
+ @%p63 ld.global.L1::evict_first.b16 { %rs65 }, [ %rd131 + 0 ];
940
+ @!%p63 mov.u16 %rs65, %rs2;
941
+ mov.u16 %rs67, 0x0;
942
+ @%p65 ld.global.L1::evict_first.b16 { %rs67 }, [ %rd132 + 0 ];
943
+ @!%p65 mov.u16 %rs67, %rs2;
944
+ mov.u16 %rs69, 0x0;
945
+ @%p63 ld.global.L1::evict_first.b16 { %rs69 }, [ %rd133 + 0 ];
946
+ @!%p63 mov.u16 %rs69, %rs2;
947
+ mov.u16 %rs71, 0x0;
948
+ @%p65 ld.global.L1::evict_first.b16 { %rs71 }, [ %rd134 + 0 ];
949
+ @!%p65 mov.u16 %rs71, %rs2;
950
+ mov.u16 %rs73, 0x0;
951
+ @%p63 ld.global.L1::evict_first.b16 { %rs73 }, [ %rd135 + 0 ];
952
+ @!%p63 mov.u16 %rs73, %rs2;
953
+ mov.u16 %rs75, 0x0;
954
+ @%p65 ld.global.L1::evict_first.b16 { %rs75 }, [ %rd136 + 0 ];
955
+ @!%p65 mov.u16 %rs75, %rs2;
956
+ mov.u16 %rs77, 0x0;
957
+ @%p63 ld.global.L1::evict_first.b16 { %rs77 }, [ %rd137 + 0 ];
958
+ @!%p63 mov.u16 %rs77, %rs2;
959
+ mov.u16 %rs79, 0x0;
960
+ @%p65 ld.global.L1::evict_first.b16 { %rs79 }, [ %rd138 + 0 ];
961
+ @!%p65 mov.u16 %rs79, %rs2;
962
+ .loc 1 57 105
963
+ cvt.f32.bf16 %r204, %rs49;
964
+ mov.b32 %f297, %r204;
965
+ cvt.f32.bf16 %r205, %rs51;
966
+ mov.b32 %f298, %r205;
967
+ cvt.f32.bf16 %r206, %rs53;
968
+ mov.b32 %f299, %r206;
969
+ cvt.f32.bf16 %r207, %rs55;
970
+ mov.b32 %f300, %r207;
971
+ cvt.f32.bf16 %r208, %rs57;
972
+ mov.b32 %f301, %r208;
973
+ cvt.f32.bf16 %r209, %rs59;
974
+ mov.b32 %f302, %r209;
975
+ cvt.f32.bf16 %r210, %rs61;
976
+ mov.b32 %f303, %r210;
977
+ cvt.f32.bf16 %r211, %rs63;
978
+ mov.b32 %f304, %r211;
979
+ cvt.f32.bf16 %r212, %rs65;
980
+ mov.b32 %f305, %r212;
981
+ cvt.f32.bf16 %r213, %rs67;
982
+ mov.b32 %f306, %r213;
983
+ cvt.f32.bf16 %r214, %rs69;
984
+ mov.b32 %f307, %r214;
985
+ cvt.f32.bf16 %r215, %rs71;
986
+ mov.b32 %f308, %r215;
987
+ cvt.f32.bf16 %r216, %rs73;
988
+ mov.b32 %f309, %r216;
989
+ cvt.f32.bf16 %r217, %rs75;
990
+ mov.b32 %f310, %r217;
991
+ cvt.f32.bf16 %r218, %rs77;
992
+ mov.b32 %f311, %r218;
993
+ cvt.f32.bf16 %r219, %rs79;
994
+ mov.b32 %f312, %r219;
995
+ .loc 1 65 23
996
+ mul.f32 %f234, %f297, 0f3FB8AA3B;
997
+ ex2.approx.f32 %f233, %f234;
998
+ mul.f32 %f236, %f298, 0f3FB8AA3B;
999
+ ex2.approx.f32 %f235, %f236;
1000
+ mul.f32 %f238, %f299, 0f3FB8AA3B;
1001
+ ex2.approx.f32 %f237, %f238;
1002
+ mul.f32 %f240, %f300, 0f3FB8AA3B;
1003
+ ex2.approx.f32 %f239, %f240;
1004
+ mul.f32 %f242, %f301, 0f3FB8AA3B;
1005
+ ex2.approx.f32 %f241, %f242;
1006
+ mul.f32 %f244, %f302, 0f3FB8AA3B;
1007
+ ex2.approx.f32 %f243, %f244;
1008
+ mul.f32 %f246, %f303, 0f3FB8AA3B;
1009
+ ex2.approx.f32 %f245, %f246;
1010
+ mul.f32 %f248, %f304, 0f3FB8AA3B;
1011
+ ex2.approx.f32 %f247, %f248;
1012
+ mul.f32 %f250, %f305, 0f3FB8AA3B;
1013
+ ex2.approx.f32 %f249, %f250;
1014
+ mul.f32 %f252, %f306, 0f3FB8AA3B;
1015
+ ex2.approx.f32 %f251, %f252;
1016
+ mul.f32 %f254, %f307, 0f3FB8AA3B;
1017
+ ex2.approx.f32 %f253, %f254;
1018
+ mul.f32 %f256, %f308, 0f3FB8AA3B;
1019
+ ex2.approx.f32 %f255, %f256;
1020
+ mul.f32 %f258, %f309, 0f3FB8AA3B;
1021
+ ex2.approx.f32 %f257, %f258;
1022
+ mul.f32 %f260, %f310, 0f3FB8AA3B;
1023
+ ex2.approx.f32 %f259, %f260;
1024
+ mul.f32 %f262, %f311, 0f3FB8AA3B;
1025
+ ex2.approx.f32 %f261, %f262;
1026
+ mul.f32 %f264, %f312, 0f3FB8AA3B;
1027
+ ex2.approx.f32 %f263, %f264;
1028
+ .loc 1 66 24
1029
+ mul.f32 %f313, %f57, %f233;
1030
+ mul.f32 %f314, %f57, %f235;
1031
+ mul.f32 %f315, %f58, %f237;
1032
+ mul.f32 %f316, %f58, %f239;
1033
+ mul.f32 %f317, %f59, %f241;
1034
+ mul.f32 %f318, %f59, %f243;
1035
+ mul.f32 %f319, %f60, %f245;
1036
+ mul.f32 %f320, %f60, %f247;
1037
+ mul.f32 %f321, %f61, %f249;
1038
+ mul.f32 %f322, %f61, %f251;
1039
+ mul.f32 %f323, %f62, %f253;
1040
+ mul.f32 %f324, %f62, %f255;
1041
+ mul.f32 %f325, %f63, %f257;
1042
+ mul.f32 %f326, %f63, %f259;
1043
+ mul.f32 %f327, %f64, %f261;
1044
+ mul.f32 %f328, %f64, %f263;
1045
+ .loc 1 67 24
1046
+ neg.f32 %f329, %f313;
1047
+ fma.rn.f32 %f330, %f1, %f281, %f329;
1048
+ neg.f32 %f331, %f314;
1049
+ fma.rn.f32 %f332, %f1, %f282, %f331;
1050
+ neg.f32 %f333, %f315;
1051
+ fma.rn.f32 %f334, %f2, %f283, %f333;
1052
+ neg.f32 %f335, %f316;
1053
+ fma.rn.f32 %f336, %f2, %f284, %f335;
1054
+ neg.f32 %f337, %f317;
1055
+ fma.rn.f32 %f338, %f3, %f285, %f337;
1056
+ neg.f32 %f339, %f318;
1057
+ fma.rn.f32 %f340, %f3, %f286, %f339;
1058
+ neg.f32 %f341, %f319;
1059
+ fma.rn.f32 %f342, %f4, %f287, %f341;
1060
+ neg.f32 %f343, %f320;
1061
+ fma.rn.f32 %f344, %f4, %f288, %f343;
1062
+ neg.f32 %f345, %f321;
1063
+ fma.rn.f32 %f346, %f5, %f289, %f345;
1064
+ neg.f32 %f347, %f322;
1065
+ fma.rn.f32 %f348, %f5, %f290, %f347;
1066
+ neg.f32 %f349, %f323;
1067
+ fma.rn.f32 %f350, %f6, %f291, %f349;
1068
+ neg.f32 %f351, %f324;
1069
+ fma.rn.f32 %f352, %f6, %f292, %f351;
1070
+ neg.f32 %f353, %f325;
1071
+ fma.rn.f32 %f354, %f7, %f293, %f353;
1072
+ neg.f32 %f355, %f326;
1073
+ fma.rn.f32 %f356, %f7, %f294, %f355;
1074
+ neg.f32 %f357, %f327;
1075
+ fma.rn.f32 %f358, %f8, %f295, %f357;
1076
+ neg.f32 %f359, %f328;
1077
+ fma.rn.f32 %f360, %f8, %f296, %f359;
1078
+ .loc 1 69 24
1079
+ add.f32 %f361, %f265, %f330;
1080
+ add.f32 %f362, %f266, %f332;
1081
+ add.f32 %f363, %f267, %f334;
1082
+ add.f32 %f364, %f268, %f336;
1083
+ add.f32 %f365, %f269, %f338;
1084
+ add.f32 %f366, %f270, %f340;
1085
+ add.f32 %f367, %f271, %f342;
1086
+ add.f32 %f368, %f272, %f344;
1087
+ add.f32 %f369, %f273, %f346;
1088
+ add.f32 %f370, %f274, %f348;
1089
+ add.f32 %f371, %f275, %f350;
1090
+ add.f32 %f372, %f276, %f352;
1091
+ add.f32 %f373, %f277, %f354;
1092
+ add.f32 %f374, %f278, %f356;
1093
+ add.f32 %f375, %f279, %f358;
1094
+ add.f32 %f376, %f280, %f360;
1095
+ .loc 1 70 29
1096
+ add.s64 %rd139, %rd160, %rd10;
1097
+ add.s64 %rd140, %rd164, %rd10;
1098
+ add.s64 %rd141, %rd139, 100514;
1099
+ add.s64 %rd142, %rd140, 100514;
1100
+ add.s64 %rd143, %rd139, 201028;
1101
+ add.s64 %rd144, %rd140, 201028;
1102
+ add.s64 %rd145, %rd139, 301542;
1103
+ add.s64 %rd146, %rd140, 301542;
1104
+ add.s64 %rd147, %rd139, 402056;
1105
+ add.s64 %rd148, %rd140, 402056;
1106
+ add.s64 %rd149, %rd139, 502570;
1107
+ add.s64 %rd150, %rd140, 502570;
1108
+ add.s64 %rd151, %rd139, 603084;
1109
+ add.s64 %rd152, %rd140, 603084;
1110
+ add.s64 %rd153, %rd139, 703598;
1111
+ .loc 1 70 54
1112
+ add.s64 %rd154, %rd140, 703598;
1113
+ mov.b32 %r220, %f361;
1114
+ cvt.rn.bf16.f32 %rs97, %r220;
1115
+ mov.b32 %r221, %f362;
1116
+ cvt.rn.bf16.f32 %rs98, %r221;
1117
+ mov.b32 %r222, %f363;
1118
+ cvt.rn.bf16.f32 %rs99, %r222;
1119
+ mov.b32 %r223, %f364;
1120
+ cvt.rn.bf16.f32 %rs100, %r223;
1121
+ mov.b32 %r224, %f365;
1122
+ cvt.rn.bf16.f32 %rs101, %r224;
1123
+ mov.b32 %r225, %f366;
1124
+ cvt.rn.bf16.f32 %rs102, %r225;
1125
+ mov.b32 %r226, %f367;
1126
+ cvt.rn.bf16.f32 %rs103, %r226;
1127
+ mov.b32 %r227, %f368;
1128
+ cvt.rn.bf16.f32 %rs104, %r227;
1129
+ mov.b32 %r228, %f369;
1130
+ cvt.rn.bf16.f32 %rs105, %r228;
1131
+ mov.b32 %r229, %f370;
1132
+ cvt.rn.bf16.f32 %rs106, %r229;
1133
+ mov.b32 %r230, %f371;
1134
+ cvt.rn.bf16.f32 %rs107, %r230;
1135
+ mov.b32 %r231, %f372;
1136
+ cvt.rn.bf16.f32 %rs108, %r231;
1137
+ mov.b32 %r232, %f373;
1138
+ cvt.rn.bf16.f32 %rs109, %r232;
1139
+ mov.b32 %r233, %f374;
1140
+ cvt.rn.bf16.f32 %rs110, %r233;
1141
+ mov.b32 %r234, %f375;
1142
+ cvt.rn.bf16.f32 %rs111, %r234;
1143
+ mov.b32 %r235, %f376;
1144
+ cvt.rn.bf16.f32 %rs112, %r235;
1145
+ @%p63 st.global.b16 [ %rd139 + 0 ], { %rs97 };
1146
+ @%p65 st.global.b16 [ %rd140 + 0 ], { %rs98 };
1147
+ @%p63 st.global.b16 [ %rd141 + 0 ], { %rs99 };
1148
+ @%p65 st.global.b16 [ %rd142 + 0 ], { %rs100 };
1149
+ @%p63 st.global.b16 [ %rd143 + 0 ], { %rs101 };
1150
+ @%p65 st.global.b16 [ %rd144 + 0 ], { %rs102 };
1151
+ @%p63 st.global.b16 [ %rd145 + 0 ], { %rs103 };
1152
+ @%p65 st.global.b16 [ %rd146 + 0 ], { %rs104 };
1153
+ @%p63 st.global.b16 [ %rd147 + 0 ], { %rs105 };
1154
+ @%p65 st.global.b16 [ %rd148 + 0 ], { %rs106 };
1155
+ @%p63 st.global.b16 [ %rd149 + 0 ], { %rs107 };
1156
+ @%p65 st.global.b16 [ %rd150 + 0 ], { %rs108 };
1157
+ @%p63 st.global.b16 [ %rd151 + 0 ], { %rs109 };
1158
+ @%p65 st.global.b16 [ %rd152 + 0 ], { %rs110 };
1159
+ @%p63 st.global.b16 [ %rd153 + 0 ], { %rs111 };
1160
+ @%p65 st.global.b16 [ %rd154 + 0 ], { %rs112 };
1161
+ .loc 1 51 36
1162
+ add.s64 %rd165, %rd165, 512;
1163
+ cvt.u32.u64 %r236, %rd165;
1164
+ add.s32 %r237, %r236, -512;
1165
+ add.s64 %rd164, %rd164, 1024;
1166
+ add.s64 %rd163, %rd163, 1024;
1167
+ add.s64 %rd162, %rd162, 2048;
1168
+ add.s64 %rd161, %rd161, 1024;
1169
+ add.s64 %rd160, %rd160, 1024;
1170
+ add.s64 %rd159, %rd159, 1024;
1171
+ add.s64 %rd158, %rd158, 1024;
1172
+ setp.lt.u32 %p175, %r237, 49745;
1173
+ @%p175 bra $L__BB0_3;
1174
+ .loc 1 51 4
1175
+ ret;
1176
+ $L__tmp90:
1177
+ $L__func_end0:
1178
+
1179
+ }
1180
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
1181
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
1182
+ .section .debug_abbrev
1183
+ {
1184
+ .b8 1
1185
+ .b8 17
1186
+ .b8 1
1187
+ .b8 37
1188
+ .b8 8
1189
+ .b8 19
1190
+ .b8 5
1191
+ .b8 3
1192
+ .b8 8
1193
+ .b8 16
1194
+ .b8 6
1195
+ .b8 27
1196
+ .b8 8
1197
+ .b8 180
1198
+ .b8 66
1199
+ .b8 12
1200
+ .b8 17
1201
+ .b8 1
1202
+ .b8 18
1203
+ .b8 1
1204
+ .b8 0
1205
+ .b8 0
1206
+ .b8 2
1207
+ .b8 46
1208
+ .b8 0
1209
+ .b8 135
1210
+ .b8 64
1211
+ .b8 8
1212
+ .b8 3
1213
+ .b8 8
1214
+ .b8 58
1215
+ .b8 11
1216
+ .b8 59
1217
+ .b8 11
1218
+ .b8 63
1219
+ .b8 12
1220
+ .b8 32
1221
+ .b8 11
1222
+ .b8 0
1223
+ .b8 0
1224
+ .b8 3
1225
+ .b8 46
1226
+ .b8 1
1227
+ .b8 17
1228
+ .b8 1
1229
+ .b8 18
1230
+ .b8 1
1231
+ .b8 64
1232
+ .b8 10
1233
+ .b8 49
1234
+ .b8 19
1235
+ .b8 0
1236
+ .b8 0
1237
+ .b8 4
1238
+ .b8 29
1239
+ .b8 1
1240
+ .b8 49
1241
+ .b8 19
1242
+ .b8 17
1243
+ .b8 1
1244
+ .b8 18
1245
+ .b8 1
1246
+ .b8 88
1247
+ .b8 11
1248
+ .b8 89
1249
+ .b8 11
1250
+ .b8 87
1251
+ .b8 11
1252
+ .b8 0
1253
+ .b8 0
1254
+ .b8 5
1255
+ .b8 29
1256
+ .b8 0
1257
+ .b8 49
1258
+ .b8 19
1259
+ .b8 17
1260
+ .b8 1
1261
+ .b8 18
1262
+ .b8 1
1263
+ .b8 88
1264
+ .b8 11
1265
+ .b8 89
1266
+ .b8 11
1267
+ .b8 87
1268
+ .b8 11
1269
+ .b8 0
1270
+ .b8 0
1271
+ .b8 0
1272
+ }
1273
+ .section .debug_info
1274
+ {
1275
+ .b32 278
1276
+ .b8 2
1277
+ .b8 0
1278
+ .b32 .debug_abbrev
1279
+ .b8 8
1280
+ .b8 1
1281
+ .b8 116
1282
+ .b8 114
1283
+ .b8 105
1284
+ .b8 116
1285
+ .b8 111
1286
+ .b8 110
1287
+ .b8 0
1288
+ .b8 2
1289
+ .b8 0
1290
+ .b8 99
1291
+ .b8 107
1292
+ .b8 122
1293
+ .b8 103
1294
+ .b8 108
1295
+ .b8 55
1296
+ .b8 116
1297
+ .b8 104
1298
+ .b8 98
1299
+ .b8 52
1300
+ .b8 120
1301
+ .b8 100
1302
+ .b8 102
1303
+ .b8 107
1304
+ .b8 102
1305
+ .b8 110
1306
+ .b8 100
1307
+ .b8 50
1308
+ .b8 116
1309
+ .b8 105
1310
+ .b8 100
1311
+ .b8 107
1312
+ .b8 115
1313
+ .b8 54
1314
+ .b8 109
1315
+ .b8 116
1316
+ .b8 53
1317
+ .b8 102
1318
+ .b8 51
1319
+ .b8 104
1320
+ .b8 97
1321
+ .b8 117
1322
+ .b8 119
1323
+ .b8 102
1324
+ .b8 121
1325
+ .b8 106
1326
+ .b8 102
1327
+ .b8 108
1328
+ .b8 98
1329
+ .b8 116
1330
+ .b8 122
1331
+ .b8 121
1332
+ .b8 101
1333
+ .b8 112
1334
+ .b8 111
1335
+ .b8 53
1336
+ .b8 111
1337
+ .b8 120
1338
+ .b8 107
1339
+ .b8 118
1340
+ .b8 104
1341
+ .b8 107
1342
+ .b8 46
1343
+ .b8 112
1344
+ .b8 121
1345
+ .b8 0
1346
+ .b32 .debug_line
1347
+ .b8 47
1348
+ .b8 116
1349
+ .b8 109
1350
+ .b8 112
1351
+ .b8 47
1352
+ .b8 116
1353
+ .b8 111
1354
+ .b8 114
1355
+ .b8 99
1356
+ .b8 104
1357
+ .b8 105
1358
+ .b8 110
1359
+ .b8 100
1360
+ .b8 117
1361
+ .b8 99
1362
+ .b8 116
1363
+ .b8 111
1364
+ .b8 114
1365
+ .b8 95
1366
+ .b8 114
1367
+ .b8 111
1368
+ .b8 111
1369
+ .b8 116
1370
+ .b8 47
1371
+ .b8 107
1372
+ .b8 122
1373
+ .b8 0
1374
+ .b8 1
1375
+ .b64 $L__func_begin0
1376
+ .b64 $L__func_end0
1377
+ .b8 2
1378
+ .b8 116
1379
+ .b8 114
1380
+ .b8 105
1381
+ .b8 116
1382
+ .b8 111
1383
+ .b8 110
1384
+ .b8 95
1385
+ .b8 95
1386
+ .b8 48
1387
+ .b8 100
1388
+ .b8 49
1389
+ .b8 100
1390
+ .b8 50
1391
+ .b8 100
1392
+ .b8 51
1393
+ .b8 100
1394
+ .b8 52
1395
+ .b8 100
1396
+ .b8 53
1397
+ .b8 100
1398
+ .b8 54
1399
+ .b8 100
1400
+ .b8 55
1401
+ .b8 100
1402
+ .b8 101
1403
+ .b8 56
1404
+ .b8 0
1405
+ .b8 116
1406
+ .b8 114
1407
+ .b8 105
1408
+ .b8 116
1409
+ .b8 111
1410
+ .b8 110
1411
+ .b8 95
1412
+ .b8 95
1413
+ .b8 48
1414
+ .b8 100
1415
+ .b8 49
1416
+ .b8 100
1417
+ .b8 50
1418
+ .b8 100
1419
+ .b8 51
1420
+ .b8 100
1421
+ .b8 52
1422
+ .b8 100
1423
+ .b8 53
1424
+ .b8 100
1425
+ .b8 54
1426
+ .b8 100
1427
+ .b8 55
1428
+ .b8 100
1429
+ .b8 101
1430
+ .b8 56
1431
+ .b8 0
1432
+ .b8 1
1433
+ .b8 18
1434
+ .b8 1
1435
+ .b8 1
1436
+ .b8 3
1437
+ .b64 $L__func_begin0
1438
+ .b64 $L__func_end0
1439
+ .b8 1
1440
+ .b8 156
1441
+ .b32 125
1442
+ .b8 4
1443
+ .b32 125
1444
+ .b64 $L__tmp1
1445
+ .b64 $L__tmp88
1446
+ .b8 2
1447
+ .b8 46
1448
+ .b8 27
1449
+ .b8 5
1450
+ .b32 125
1451
+ .b64 $L__tmp1
1452
+ .b64 $L__tmp88
1453
+ .b8 2
1454
+ .b8 243
1455
+ .b8 36
1456
+ .b8 0
1457
+ .b8 5
1458
+ .b32 125
1459
+ .b64 $L__tmp2
1460
+ .b64 $L__tmp89
1461
+ .b8 2
1462
+ .b8 46
1463
+ .b8 27
1464
+ .b8 0
1465
+ .b8 0
1466
+ }
1467
+ .section .debug_pubnames
1468
+ {
1469
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1470
+ $L__pubNames_start0:
1471
+ .b8 2
1472
+ .b8 0
1473
+ .b32 .debug_info
1474
+ .b32 282
1475
+ .b32 125
1476
+ .b8 116
1477
+ .b8 114
1478
+ .b8 105
1479
+ .b8 116
1480
+ .b8 111
1481
+ .b8 110
1482
+ .b8 95
1483
+ .b8 95
1484
+ .b8 48
1485
+ .b8 100
1486
+ .b8 49
1487
+ .b8 100
1488
+ .b8 50
1489
+ .b8 100
1490
+ .b8 51
1491
+ .b8 100
1492
+ .b8 52
1493
+ .b8 100
1494
+ .b8 53
1495
+ .b8 100
1496
+ .b8 54
1497
+ .b8 100
1498
+ .b8 55
1499
+ .b8 100
1500
+ .b8 101
1501
+ .b8 56
1502
+ .b8 0
1503
+ .b32 0
1504
+ $L__pubNames_end0:
1505
+ }
1506
+ .section .debug_pubtypes
1507
+ {
1508
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1509
+ $L__pubTypes_start0:
1510
+ .b8 2
1511
+ .b8 0
1512
+ .b32 .debug_info
1513
+ .b32 282
1514
+ .b32 0
1515
+ $L__pubTypes_end0:
1516
+ }
1517
+ .section .debug_loc { }
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttgir ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<8x1xf32, #blocked>
5
+ %cst_0 = arith.constant dense<50257> : tensor<8x1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<-1> : tensor<8x1xi64, #blocked>
7
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x512xf32, #blocked>
8
+ %c8_i64 = arith.constant 8 : i64
9
+ %cst_3 = arith.constant dense<50257> : tensor<1x512xi64, #blocked>
10
+ %c0_i32 = arith.constant 0 : i32
11
+ %c512_i32 = arith.constant 512 : i32
12
+ %c50257_i32 = arith.constant 50257 : i32
13
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x512xbf16, #blocked>
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.extsi %0 : i32 to i64
16
+ %2 = arith.muli %1, %c8_i64 : i64
17
+ %3 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<8x1xi32, #blocked>
19
+ %5 = arith.extsi %4 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked>
20
+ %6 = tt.splat %2 : (i64) -> tensor<8x1xi64, #blocked>
21
+ %7 = arith.addi %6, %5 : tensor<8x1xi64, #blocked>
22
+ %8 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
23
+ %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x512xi32, #blocked>
24
+ %10 = arith.extsi %9 : tensor<1x512xi32, #blocked> to tensor<1x512xi64, #blocked>
25
+ %11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<8x1x!tt.ptr<i64, 1>, #blocked>
26
+ %12 = tt.addptr %11, %7 : tensor<8x1x!tt.ptr<i64, 1>, #blocked>, tensor<8x1xi64, #blocked>
27
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x1xi64, #blocked>
28
+ %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
29
+ %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
30
+ %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
31
+ %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
32
+ %18 = arith.muli %7, %cst_0 : tensor<8x1xi64, #blocked>
33
+ %19 = tt.broadcast %18 : (tensor<8x1xi64, #blocked>) -> tensor<8x512xi64, #blocked>
34
+ %20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<8x512x!tt.ptr<f32, 1>, #blocked>
35
+ %21 = arith.cmpi ne, %13, %cst_1 : tensor<8x1xi64, #blocked>
36
+ %22 = arith.divf %15, %17 : f32
37
+ %23 = tt.splat %22 : (f32) -> tensor<8x1xf32, #blocked>
38
+ %24 = arith.select %21, %23, %cst : tensor<8x1xi1, #blocked>, tensor<8x1xf32, #blocked>
39
+ %25 = tt.broadcast %24 : (tensor<8x1xf32, #blocked>) -> tensor<8x512xf32, #blocked>
40
+ %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 iter_args(%arg10 = %cst_2) -> (tensor<8x512xf32, #blocked>) : i32 {
41
+ %33 = arith.extsi %arg9 : i32 to i64
42
+ %34 = tt.splat %33 : (i64) -> tensor<1x512xi64, #blocked>
43
+ %35 = arith.addi %34, %10 : tensor<1x512xi64, #blocked>
44
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x512xi64, #blocked>
45
+ %37 = tt.broadcast %35 : (tensor<1x512xi64, #blocked>) -> tensor<8x512xi64, #blocked>
46
+ %38 = arith.addi %37, %19 : tensor<8x512xi64, #blocked>
47
+ %39 = tt.addptr %20, %38 : tensor<8x512x!tt.ptr<f32, 1>, #blocked>, tensor<8x512xi64, #blocked>
48
+ %40 = tt.broadcast %36 : (tensor<1x512xi1, #blocked>) -> tensor<8x512xi1, #blocked>
49
+ %41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x512xf32, #blocked>
50
+ %42 = arith.mulf %41, %25 : tensor<8x512xf32, #blocked>
51
+ %43 = arith.addf %arg10, %42 : tensor<8x512xf32, #blocked>
52
+ %44 = arith.select %40, %43, %arg10 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked>
53
+ scf.yield %44 : tensor<8x512xf32, #blocked>
54
+ }
55
+ %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
56
+ ^bb0(%arg9: f32, %arg10: f32):
57
+ %33 = arith.addf %arg9, %arg10 : f32
58
+ tt.reduce.return %33 : f32
59
+ }) : (tensor<8x512xf32, #blocked>) -> tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
60
+ %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<8x1xf32, #blocked>
61
+ %29 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
62
+ %30 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
63
+ %31 = tt.broadcast %28 : (tensor<8x1xf32, #blocked>) -> tensor<8x512xf32, #blocked>
64
+ %32 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
65
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 : i32 {
66
+ %33 = arith.extsi %arg9 : i32 to i64
67
+ %34 = tt.splat %33 : (i64) -> tensor<1x512xi64, #blocked>
68
+ %35 = arith.addi %34, %10 : tensor<1x512xi64, #blocked>
69
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x512xi64, #blocked>
70
+ %37 = tt.broadcast %35 : (tensor<1x512xi64, #blocked>) -> tensor<8x512xi64, #blocked>
71
+ %38 = arith.addi %37, %19 : tensor<8x512xi64, #blocked>
72
+ %39 = tt.addptr %29, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
73
+ %40 = tt.broadcast %36 : (tensor<1x512xi1, #blocked>) -> tensor<8x512xi1, #blocked>
74
+ %41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16, #blocked>
75
+ %42 = arith.extf %41 : tensor<8x512xbf16, #blocked> to tensor<8x512xf32, #blocked>
76
+ %43 = tt.addptr %20, %38 : tensor<8x512x!tt.ptr<f32, 1>, #blocked>, tensor<8x512xi64, #blocked>
77
+ %44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xf32, #blocked>
78
+ %45 = tt.addptr %30, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
79
+ %46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16, #blocked>
80
+ %47 = arith.extf %46 : tensor<8x512xbf16, #blocked> to tensor<8x512xf32, #blocked>
81
+ %48 = arith.mulf %44, %25 : tensor<8x512xf32, #blocked>
82
+ %49 = math.exp %47 : tensor<8x512xf32, #blocked>
83
+ %50 = arith.mulf %49, %31 : tensor<8x512xf32, #blocked>
84
+ %51 = arith.subf %48, %50 : tensor<8x512xf32, #blocked>
85
+ %52 = arith.addf %42, %51 : tensor<8x512xf32, #blocked>
86
+ %53 = tt.addptr %32, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
87
+ %54 = arith.truncf %52 : tensor<8x512xf32, #blocked> to tensor<8x512xbf16, #blocked>
88
+ tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<8x512xbf16, #blocked>
89
+ }
90
+ tt.return
91
+ }
92
+ }
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 1, !dbg !8
7
+ %5 = and i32 %4, 510, !dbg !8
8
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %7 = shl i32 %6, 9, !dbg !10
10
+ %8 = or i32 %7, %5, !dbg !11
11
+ %9 = icmp slt i32 %8, 12865792, !dbg !12
12
+ %10 = sext i32 %8 to i64, !dbg !13
13
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !13
14
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %11, i1 %9) #1, !dbg !14
15
+ ret void, !dbg !15
16
+ }
17
+
18
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
19
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
20
+
21
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
22
+ attributes #1 = { nounwind }
23
+
24
+ !llvm.module.flags = !{!0}
25
+ !llvm.dbg.cu = !{!1}
26
+ !nvvm.annotations = !{!3, !4, !4, !3}
27
+
28
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
29
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
30
+ !2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y")
31
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
32
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 256}
33
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
34
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
35
+ !7 = !{}
36
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
37
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
38
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
39
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
40
+ !12 = !DILocation(line: 22, column: 21, scope: !5)
41
+ !13 = !DILocation(line: 25, column: 25, scope: !5)
42
+ !14 = !DILocation(line: 25, column: 36, scope: !5)
43
+ !15 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+
11
+ .visible .entry triton__0d1de(
12
+ .param .u64 triton__0d1de_param_0,
13
+ .param .u32 triton__0d1de_param_1
14
+ )
15
+ .maxntid 256, 1, 1
16
+ {
17
+ .reg .pred %p<2>;
18
+ .reg .b32 %r<9>;
19
+ .reg .b64 %rd<4>;
20
+ .loc 1 18 0
21
+ $L__func_begin0:
22
+ .loc 1 18 0
23
+
24
+ ld.param.u64 %rd2, [triton__0d1de_param_0];
25
+ $L__tmp0:
26
+ .loc 1 21 36
27
+ mov.u32 %r4, %tid.x;
28
+ shl.b32 %r5, %r4, 1;
29
+ and.b32 %r6, %r5, 510;
30
+ .loc 1 20 28
31
+ mov.u32 %r1, %ctaid.x;
32
+ .loc 1 20 33
33
+ shl.b32 %r7, %r1, 9;
34
+ .loc 1 21 23
35
+ or.b32 %r8, %r7, %r6;
36
+ .loc 1 22 21
37
+ setp.lt.s32 %p1, %r8, 12865792;
38
+ .loc 1 25 25
39
+ mul.wide.s32 %rd3, %r8, 4;
40
+ add.s64 %rd1, %rd2, %rd3;
41
+ mov.b32 %r2, 0;
42
+ .loc 1 25 36
43
+ @%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 };
44
+ .loc 1 25 4
45
+ ret;
46
+ $L__tmp1:
47
+ $L__func_end0:
48
+
49
+ }
50
+ .file 1 "/tmp/torchinductor_root/4y/c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py"
51
+ .section .debug_abbrev
52
+ {
53
+ .b8 1
54
+ .b8 17
55
+ .b8 1
56
+ .b8 37
57
+ .b8 8
58
+ .b8 19
59
+ .b8 5
60
+ .b8 3
61
+ .b8 8
62
+ .b8 16
63
+ .b8 6
64
+ .b8 27
65
+ .b8 8
66
+ .b8 180
67
+ .b8 66
68
+ .b8 12
69
+ .b8 17
70
+ .b8 1
71
+ .b8 18
72
+ .b8 1
73
+ .b8 0
74
+ .b8 0
75
+ .b8 2
76
+ .b8 46
77
+ .b8 0
78
+ .b8 17
79
+ .b8 1
80
+ .b8 18
81
+ .b8 1
82
+ .b8 64
83
+ .b8 10
84
+ .b8 135
85
+ .b8 64
86
+ .b8 8
87
+ .b8 3
88
+ .b8 8
89
+ .b8 58
90
+ .b8 11
91
+ .b8 59
92
+ .b8 11
93
+ .b8 63
94
+ .b8 12
95
+ .b8 0
96
+ .b8 0
97
+ .b8 0
98
+ }
99
+ .section .debug_info
100
+ {
101
+ .b32 172
102
+ .b8 2
103
+ .b8 0
104
+ .b32 .debug_abbrev
105
+ .b8 8
106
+ .b8 1
107
+ .b8 116
108
+ .b8 114
109
+ .b8 105
110
+ .b8 116
111
+ .b8 111
112
+ .b8 110
113
+ .b8 0
114
+ .b8 2
115
+ .b8 0
116
+ .b8 99
117
+ .b8 52
118
+ .b8 121
119
+ .b8 115
120
+ .b8 101
121
+ .b8 108
122
+ .b8 100
123
+ .b8 119
124
+ .b8 109
125
+ .b8 117
126
+ .b8 51
127
+ .b8 116
128
+ .b8 111
129
+ .b8 53
130
+ .b8 50
131
+ .b8 112
132
+ .b8 98
133
+ .b8 104
134
+ .b8 50
135
+ .b8 109
136
+ .b8 100
137
+ .b8 50
138
+ .b8 111
139
+ .b8 101
140
+ .b8 117
141
+ .b8 102
142
+ .b8 114
143
+ .b8 113
144
+ .b8 51
145
+ .b8 102
146
+ .b8 99
147
+ .b8 100
148
+ .b8 109
149
+ .b8 97
150
+ .b8 112
151
+ .b8 107
152
+ .b8 116
153
+ .b8 52
154
+ .b8 110
155
+ .b8 120
156
+ .b8 100
157
+ .b8 122
158
+ .b8 109
159
+ .b8 121
160
+ .b8 113
161
+ .b8 116
162
+ .b8 103
163
+ .b8 100
164
+ .b8 50
165
+ .b8 121
166
+ .b8 115
167
+ .b8 112
168
+ .b8 46
169
+ .b8 112
170
+ .b8 121
171
+ .b8 0
172
+ .b32 .debug_line
173
+ .b8 47
174
+ .b8 116
175
+ .b8 109
176
+ .b8 112
177
+ .b8 47
178
+ .b8 116
179
+ .b8 111
180
+ .b8 114
181
+ .b8 99
182
+ .b8 104
183
+ .b8 105
184
+ .b8 110
185
+ .b8 100
186
+ .b8 117
187
+ .b8 99
188
+ .b8 116
189
+ .b8 111
190
+ .b8 114
191
+ .b8 95
192
+ .b8 114
193
+ .b8 111
194
+ .b8 111
195
+ .b8 116
196
+ .b8 47
197
+ .b8 52
198
+ .b8 121
199
+ .b8 0
200
+ .b8 1
201
+ .b64 $L__func_begin0
202
+ .b64 $L__func_end0
203
+ .b8 2
204
+ .b64 $L__func_begin0
205
+ .b64 $L__func_end0
206
+ .b8 1
207
+ .b8 156
208
+ .b8 116
209
+ .b8 114
210
+ .b8 105
211
+ .b8 116
212
+ .b8 111
213
+ .b8 110
214
+ .b8 95
215
+ .b8 95
216
+ .b8 48
217
+ .b8 100
218
+ .b8 49
219
+ .b8 100
220
+ .b8 101
221
+ .b8 0
222
+ .b8 116
223
+ .b8 114
224
+ .b8 105
225
+ .b8 116
226
+ .b8 111
227
+ .b8 110
228
+ .b8 95
229
+ .b8 95
230
+ .b8 48
231
+ .b8 100
232
+ .b8 49
233
+ .b8 100
234
+ .b8 101
235
+ .b8 0
236
+ .b8 1
237
+ .b8 18
238
+ .b8 1
239
+ .b8 0
240
+ }
241
+ .section .debug_pubnames
242
+ {
243
+ .b32 $L__pubNames_end0-$L__pubNames_start0
244
+ $L__pubNames_start0:
245
+ .b8 2
246
+ .b8 0
247
+ .b32 .debug_info
248
+ .b32 176
249
+ .b32 125
250
+ .b8 116
251
+ .b8 114
252
+ .b8 105
253
+ .b8 116
254
+ .b8 111
255
+ .b8 110
256
+ .b8 95
257
+ .b8 95
258
+ .b8 48
259
+ .b8 100
260
+ .b8 49
261
+ .b8 100
262
+ .b8 101
263
+ .b8 0
264
+ .b32 0
265
+ $L__pubNames_end0:
266
+ }
267
+ .section .debug_pubtypes
268
+ {
269
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
270
+ $L__pubTypes_start0:
271
+ .b8 2
272
+ .b8 0
273
+ .b32 .debug_info
274
+ .b32 176
275
+ .b32 0
276
+ $L__pubTypes_end0:
277
+ }
278
+ .section .debug_loc { }
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<12865792> : tensor<512xi32, #blocked>
5
+ %c512_i32 = arith.constant 512 : i32
6
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
7
+ %0 = tt.get_program_id x : i32
8
+ %1 = arith.muli %0, %c512_i32 : i32
9
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
10
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
11
+ %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
12
+ %5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked>
13
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
14
+ %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
15
+ tt.store %7, %cst_0, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<512xf32>
4
+ %cst_0 = arith.constant dense<12865792> : tensor<512xi32>
5
+ %c512_i32 = arith.constant 512 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c512_i32 : i32
8
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
9
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
10
+ %4 = arith.addi %3, %2 : tensor<512xi32>
11
+ %5 = arith.cmpi slt, %4, %cst_0 : tensor<512xi32>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
13
+ %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
14
+ tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
15
+ tt.return
16
+ }
17
+ }
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin ADDED
Binary file (15 kB). View file
 
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7de8de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8) local_unnamed_addr !dbg !5 {
7
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %11 = and i32 %10, 31, !dbg !8
9
+ %12 = lshr i32 %10, 5, !dbg !8
10
+ %13 = and i32 %12, 1, !dbg !8
11
+ %urem = shl i32 %10, 2, !dbg !8
12
+ %14 = and i32 %urem, 252, !dbg !8
13
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
14
+ %16 = shl i32 %15, 8, !dbg !10
15
+ %17 = or i32 %16, %14, !dbg !11
16
+ %18 = sext i32 %17 to i64, !dbg !12
17
+ %19 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !12
18
+ %20 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %19, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
19
+ %21 = extractvalue { i32, i32 } %20, 0, !dbg !13
20
+ %22 = extractvalue { i32, i32 } %20, 1, !dbg !13
21
+ %23 = trunc i32 %21 to i16, !dbg !13
22
+ %extelt.offset = lshr i32 %21, 16, !dbg !13
23
+ %24 = trunc i32 %extelt.offset to i16, !dbg !13
24
+ %25 = trunc i32 %22 to i16, !dbg !13
25
+ %extelt.offset1 = lshr i32 %22, 16, !dbg !13
26
+ %26 = trunc i32 %extelt.offset1 to i16, !dbg !13
27
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
28
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
29
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
30
+ %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #3, !dbg !14
31
+ %31 = zext nneg i32 %14 to i64, !dbg !15
32
+ %32 = getelementptr float, ptr addrspace(1) %2, i64 %31, !dbg !15
33
+ %33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
34
+ %34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !16
35
+ %35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !16
36
+ %36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !16
37
+ %37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !16
38
+ %38 = bitcast i32 %34 to float, !dbg !16
39
+ %39 = bitcast i32 %35 to float, !dbg !16
40
+ %40 = bitcast i32 %36 to float, !dbg !16
41
+ %41 = bitcast i32 %37 to float, !dbg !16
42
+ %42 = getelementptr float, ptr addrspace(1) %3, i64 %18, !dbg !17
43
+ %43 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
44
+ %44 = extractvalue { i32, i32, i32, i32 } %43, 0, !dbg !18
45
+ %45 = extractvalue { i32, i32, i32, i32 } %43, 1, !dbg !18
46
+ %46 = extractvalue { i32, i32, i32, i32 } %43, 2, !dbg !18
47
+ %47 = extractvalue { i32, i32, i32, i32 } %43, 3, !dbg !18
48
+ %48 = bitcast i32 %44 to float, !dbg !18
49
+ %49 = bitcast i32 %45 to float, !dbg !18
50
+ %50 = bitcast i32 %46 to float, !dbg !18
51
+ %51 = bitcast i32 %47 to float, !dbg !18
52
+ %52 = sext i32 %15 to i64, !dbg !19
53
+ %53 = getelementptr float, ptr addrspace(1) %4, i64 %52, !dbg !19
54
+ %54 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
55
+ %55 = bitcast i32 %54 to float, !dbg !20
56
+ %56 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
57
+ %57 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
58
+ %58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
59
+ %59 = getelementptr float, ptr addrspace(1) %5, i64 %52, !dbg !21
60
+ %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
61
+ %61 = bitcast i32 %60 to float, !dbg !22
62
+ %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
63
+ %63 = bitcast i32 %62 to float, !dbg !22
64
+ %64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
65
+ %65 = bitcast i32 %64 to float, !dbg !22
66
+ %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
67
+ %67 = bitcast i32 %66 to float, !dbg !22
68
+ %68 = getelementptr float, ptr addrspace(1) %0, i64 %18, !dbg !23
69
+ %69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %68, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24
70
+ %70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !24
71
+ %71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !24
72
+ %72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !24
73
+ %73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !24
74
+ %74 = bitcast i32 %70 to float, !dbg !24
75
+ %75 = bitcast i32 %71 to float, !dbg !24
76
+ %76 = bitcast i32 %72 to float, !dbg !24
77
+ %77 = bitcast i32 %73 to float, !dbg !24
78
+ %78 = fmul float %27, %38, !dbg !25
79
+ %79 = fmul float %28, %39, !dbg !25
80
+ %80 = fmul float %29, %40, !dbg !25
81
+ %81 = fmul float %30, %41, !dbg !25
82
+ %82 = fadd float %78, %79, !dbg !26
83
+ %83 = fadd float %80, %82, !dbg !26
84
+ %84 = fadd float %81, %83, !dbg !26
85
+ %85 = bitcast float %84 to i32, !dbg !32
86
+ %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !32
87
+ %87 = bitcast i32 %86 to float, !dbg !32
88
+ %88 = fadd float %84, %87, !dbg !26
89
+ %89 = bitcast float %88 to i32, !dbg !32
90
+ %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !32
91
+ %91 = bitcast i32 %90 to float, !dbg !32
92
+ %92 = fadd float %88, %91, !dbg !26
93
+ %93 = bitcast float %92 to i32, !dbg !32
94
+ %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !32
95
+ %95 = bitcast i32 %94 to float, !dbg !32
96
+ %96 = fadd float %92, %95, !dbg !26
97
+ %97 = bitcast float %96 to i32, !dbg !32
98
+ %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !32
99
+ %99 = bitcast i32 %98 to float, !dbg !32
100
+ %100 = fadd float %96, %99, !dbg !26
101
+ %101 = bitcast float %100 to i32, !dbg !32
102
+ %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !32
103
+ %103 = bitcast i32 %102 to float, !dbg !32
104
+ %104 = fadd float %100, %103, !dbg !26
105
+ %105 = icmp eq i32 %11, 0, !dbg !32
106
+ %106 = zext nneg i32 %13 to i64, !dbg !32
107
+ %107 = getelementptr float, ptr addrspace(3) @global_smem, i64 %106, !dbg !32
108
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %104, i1 %105) #3, !dbg !32
109
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
110
+ %108 = icmp slt i32 %10, 2, !dbg !32
111
+ %109 = sext i32 %10 to i64, !dbg !32
112
+ %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !32
113
+ %111 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !32
114
+ %112 = bitcast float %111 to i32, !dbg !32
115
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !32
116
+ %114 = bitcast i32 %113 to float, !dbg !32
117
+ %115 = fadd float %111, %114, !dbg !26
118
+ %116 = and i32 %10, 1, !dbg !32
119
+ %117 = icmp eq i32 %116, 0, !dbg !32
120
+ %118 = and i1 %108, %117, !dbg !32
121
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %115, i1 %118) #3, !dbg !32
122
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
123
+ %119 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
124
+ %120 = fadd float %119, 0.000000e+00, !dbg !34
125
+ %121 = fsub float %48, %55, !dbg !38
126
+ %122 = fsub float %49, %55, !dbg !38
127
+ %123 = fsub float %50, %55, !dbg !38
128
+ %124 = fsub float %51, %55, !dbg !38
129
+ %125 = fmul float %121, %61, !dbg !39
130
+ %126 = fmul float %122, %61, !dbg !39
131
+ %127 = fmul float %123, %61, !dbg !39
132
+ %128 = fmul float %124, %61, !dbg !39
133
+ %129 = fmul float %78, %125, !dbg !40
134
+ %130 = fmul float %79, %126, !dbg !40
135
+ %131 = fmul float %80, %127, !dbg !40
136
+ %132 = fmul float %81, %128, !dbg !40
137
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
138
+ %133 = fadd float %129, %130, !dbg !43
139
+ %134 = fadd float %131, %133, !dbg !43
140
+ %135 = fadd float %132, %134, !dbg !43
141
+ %136 = bitcast float %135 to i32, !dbg !41
142
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !41
143
+ %138 = bitcast i32 %137 to float, !dbg !41
144
+ %139 = fadd float %135, %138, !dbg !43
145
+ %140 = bitcast float %139 to i32, !dbg !41
146
+ %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !41
147
+ %142 = bitcast i32 %141 to float, !dbg !41
148
+ %143 = fadd float %139, %142, !dbg !43
149
+ %144 = bitcast float %143 to i32, !dbg !41
150
+ %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !41
151
+ %146 = bitcast i32 %145 to float, !dbg !41
152
+ %147 = fadd float %143, %146, !dbg !43
153
+ %148 = bitcast float %147 to i32, !dbg !41
154
+ %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !41
155
+ %150 = bitcast i32 %149 to float, !dbg !41
156
+ %151 = fadd float %147, %150, !dbg !43
157
+ %152 = bitcast float %151 to i32, !dbg !41
158
+ %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !41
159
+ %154 = bitcast i32 %153 to float, !dbg !41
160
+ %155 = fadd float %151, %154, !dbg !43
161
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %155, i1 %105) #3, !dbg !41
162
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
163
+ %156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !41
164
+ %157 = bitcast float %156 to i32, !dbg !41
165
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !41
166
+ %159 = bitcast i32 %158 to float, !dbg !41
167
+ %160 = fadd float %156, %159, !dbg !43
168
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %160, i1 %118) #3, !dbg !41
169
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
170
+ %161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
171
+ %162 = fadd float %161, 0.000000e+00, !dbg !46
172
+ %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %61, float 2.560000e+02) #3, !dbg !48
173
+ %164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %63, float 2.560000e+02) #3, !dbg !48
174
+ %165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %65, float 2.560000e+02) #3, !dbg !48
175
+ %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %67, float 2.560000e+02) #3, !dbg !48
176
+ %167 = fmul float %78, 2.560000e+02, !dbg !49
177
+ %168 = fmul float %79, 2.560000e+02, !dbg !49
178
+ %169 = fmul float %80, 2.560000e+02, !dbg !49
179
+ %170 = fmul float %81, 2.560000e+02, !dbg !49
180
+ %171 = fsub float %167, %120, !dbg !50
181
+ %172 = fsub float %168, %120, !dbg !50
182
+ %173 = fsub float %169, %120, !dbg !50
183
+ %174 = fsub float %170, %120, !dbg !50
184
+ %175 = fmul float %125, %162, !dbg !51
185
+ %176 = fmul float %126, %162, !dbg !51
186
+ %177 = fmul float %127, %162, !dbg !51
187
+ %178 = fmul float %128, %162, !dbg !51
188
+ %179 = fsub float %171, %175, !dbg !52
189
+ %180 = fsub float %172, %176, !dbg !52
190
+ %181 = fsub float %173, %177, !dbg !52
191
+ %182 = fsub float %174, %178, !dbg !52
192
+ %183 = fmul float %163, %179, !dbg !53
193
+ %184 = fmul float %163, %180, !dbg !53
194
+ %185 = fmul float %163, %181, !dbg !53
195
+ %186 = fmul float %163, %182, !dbg !53
196
+ %187 = fadd float %183, %74, !dbg !54
197
+ %188 = fadd float %184, %75, !dbg !54
198
+ %189 = fadd float %185, %76, !dbg !54
199
+ %190 = fadd float %186, %77, !dbg !54
200
+ %191 = bitcast float %187 to i32, !dbg !55
201
+ %192 = bitcast float %188 to i32, !dbg !55
202
+ %193 = bitcast float %189 to i32, !dbg !55
203
+ %194 = bitcast float %190 to i32, !dbg !55
204
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %191, i32 %192, i32 %193, i32 %194, ptr addrspace(1) %68, i1 true) #3, !dbg !55
205
+ %195 = getelementptr i16, ptr addrspace(1) %6, i64 %18, !dbg !56
206
+ %196 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %187) #3, !dbg !57
207
+ %197 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %188) #3, !dbg !57
208
+ %198 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %189) #3, !dbg !57
209
+ %199 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %190) #3, !dbg !57
210
+ %200 = insertelement <2 x i16> undef, i16 %196, i64 0, !dbg !57
211
+ %201 = insertelement <2 x i16> %200, i16 %197, i64 1, !dbg !57
212
+ %202 = bitcast <2 x i16> %201 to i32, !dbg !57
213
+ %203 = insertelement <2 x i16> undef, i16 %198, i64 0, !dbg !57
214
+ %204 = insertelement <2 x i16> %203, i16 %199, i64 1, !dbg !57
215
+ %205 = bitcast <2 x i16> %204 to i32, !dbg !57
216
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %202, i32 %205, ptr addrspace(1) %195, i1 true) #3, !dbg !57
217
+ ret void, !dbg !58
218
+ }
219
+
220
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
221
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
222
+
223
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
224
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
225
+
226
+ ; Function Attrs: convergent nocallback nounwind
227
+ declare void @llvm.nvvm.barrier0() #2
228
+
229
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
230
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
231
+ attributes #2 = { convergent nocallback nounwind }
232
+ attributes #3 = { nounwind }
233
+
234
+ !llvm.module.flags = !{!0}
235
+ !llvm.dbg.cu = !{!1}
236
+ !nvvm.annotations = !{!3, !4, !4, !3}
237
+
238
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
239
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
240
+ !2 = !DIFile(filename: "csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py", directory: "/tmp/torchinductor_root/sn")
241
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"kernel", i32 1}
242
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"maxntidx", i32 64}
243
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8de", linkageName: "triton__0d1d2d3d4d5d6d7de8de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
244
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
245
+ !7 = !{}
246
+ !8 = !DILocation(line: 26, column: 26, scope: !5)
247
+ !9 = !DILocation(line: 23, column: 28, scope: !5)
248
+ !10 = !DILocation(line: 30, column: 40, scope: !5)
249
+ !11 = !DILocation(line: 30, column: 36, scope: !5)
250
+ !12 = !DILocation(line: 30, column: 30, scope: !5)
251
+ !13 = !DILocation(line: 30, column: 46, scope: !5)
252
+ !14 = !DILocation(line: 30, column: 67, scope: !5)
253
+ !15 = !DILocation(line: 31, column: 30, scope: !5)
254
+ !16 = !DILocation(line: 31, column: 35, scope: !5)
255
+ !17 = !DILocation(line: 32, column: 30, scope: !5)
256
+ !18 = !DILocation(line: 32, column: 46, scope: !5)
257
+ !19 = !DILocation(line: 33, column: 30, scope: !5)
258
+ !20 = !DILocation(line: 33, column: 35, scope: !5)
259
+ !21 = !DILocation(line: 34, column: 31, scope: !5)
260
+ !22 = !DILocation(line: 34, column: 36, scope: !5)
261
+ !23 = !DILocation(line: 35, column: 35, scope: !5)
262
+ !24 = !DILocation(line: 35, column: 51, scope: !5)
263
+ !25 = !DILocation(line: 37, column: 18, scope: !5)
264
+ !26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
265
+ !27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
266
+ !28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
267
+ !29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
268
+ !30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
269
+ !31 = !DILocation(line: 40, column: 57, scope: !27)
270
+ !32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
271
+ !33 = !DILocation(line: 40, column: 57, scope: !29)
272
+ !34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
273
+ !35 = distinct !DILexicalBlockFile(scope: !5, file: !36, discriminator: 0)
274
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
275
+ !37 = !DILocation(line: 40, column: 44, scope: !35)
276
+ !38 = !DILocation(line: 41, column: 19, scope: !5)
277
+ !39 = !DILocation(line: 42, column: 20, scope: !5)
278
+ !40 = !DILocation(line: 43, column: 19, scope: !5)
279
+ !41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
280
+ !42 = !DILocation(line: 46, column: 59, scope: !29)
281
+ !43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
282
+ !44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
283
+ !45 = !DILocation(line: 46, column: 59, scope: !27)
284
+ !46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
285
+ !47 = !DILocation(line: 46, column: 45, scope: !35)
286
+ !48 = !DILocation(line: 48, column: 20, scope: !5)
287
+ !49 = !DILocation(line: 49, column: 19, scope: !5)
288
+ !50 = !DILocation(line: 50, column: 20, scope: !5)
289
+ !51 = !DILocation(line: 51, column: 20, scope: !5)
290
+ !52 = !DILocation(line: 52, column: 20, scope: !5)
291
+ !53 = !DILocation(line: 53, column: 20, scope: !5)
292
+ !54 = !DILocation(line: 54, column: 20, scope: !5)
293
+ !55 = !DILocation(line: 56, column: 51, scope: !5)
294
+ !56 = !DILocation(line: 57, column: 25, scope: !5)
295
+ !57 = !DILocation(line: 57, column: 48, scope: !5)
296
+ !58 = !DILocation(line: 57, column: 4, scope: !5)
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
6
+ %cst_1 = arith.constant 0.000000e+00 : f32
7
+ %c256_i32 = arith.constant 256 : i32
8
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
9
+ %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
20
+ %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
21
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
22
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
27
+ %16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
28
+ %17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
29
+ %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
30
+ %19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
31
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
32
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
33
+ %22 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
34
+ %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
35
+ %24 = tt.load %23, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
36
+ %25 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
37
+ %26 = arith.select %2, %25, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
38
+ %27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({
39
+ ^bb0(%arg9: f32, %arg10: f32):
40
+ %50 = arith.addf %arg9, %arg10 : f32
41
+ tt.reduce.return %50 : f32
42
+ }) : (tensor<256xf32, #blocked>) -> f32
43
+ %28 = arith.addf %27, %cst_1 : f32
44
+ %29 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
45
+ %30 = arith.subf %15, %29 : tensor<256xf32, #blocked>
46
+ %31 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
47
+ %32 = arith.mulf %30, %31 : tensor<256xf32, #blocked>
48
+ %33 = arith.mulf %25, %32 : tensor<256xf32, #blocked>
49
+ %34 = arith.select %2, %33, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
50
+ %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
51
+ ^bb0(%arg9: f32, %arg10: f32):
52
+ %50 = arith.addf %arg9, %arg10 : f32
53
+ tt.reduce.return %50 : f32
54
+ }) : (tensor<256xf32, #blocked>) -> f32
55
+ %36 = arith.addf %35, %cst_1 : f32
56
+ %37 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked>
57
+ %38 = arith.mulf %25, %cst_3 : tensor<256xf32, #blocked>
58
+ %39 = tt.splat %28 : (f32) -> tensor<256xf32, #blocked>
59
+ %40 = arith.subf %38, %39 : tensor<256xf32, #blocked>
60
+ %41 = tt.splat %36 : (f32) -> tensor<256xf32, #blocked>
61
+ %42 = arith.mulf %32, %41 : tensor<256xf32, #blocked>
62
+ %43 = arith.subf %40, %42 : tensor<256xf32, #blocked>
63
+ %44 = tt.broadcast %37 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
64
+ %45 = arith.mulf %44, %43 : tensor<256xf32, #blocked>
65
+ %46 = arith.addf %24, %45 : tensor<256xf32, #blocked>
66
+ tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
67
+ %47 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
68
+ %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
69
+ %49 = arith.truncf %46 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
70
+ tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
71
+ tt.return
72
+ }
73
+ }
.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ptx ADDED
@@ -0,0 +1,1927 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
22
+ )
23
+ .maxntid 256, 1, 1
24
+ {
25
+ .reg .pred %p<201>;
26
+ .reg .b16 %rs<129>;
27
+ .reg .b32 %r<399>;
28
+ .reg .f32 %f<469>;
29
+ .reg .b64 %rd<150>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8_param_6];
35
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8_param_5];
36
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8_param_4];
37
+ ld.param.u64 %rd52, [triton__0d1d2d3d4d5d6d7de8_param_0];
38
+ $L__tmp0:
39
+ .loc 1 22 44
40
+ mov.u32 %r1, %tid.x;
41
+ ld.param.u64 %rd53, [triton__0d1d2d3d4d5d6d7de8_param_1];
42
+ and.b32 %r2, %r1, 31;
43
+ ld.param.u64 %rd50, [triton__0d1d2d3d4d5d6d7de8_param_2];
44
+ ld.param.u64 %rd51, [triton__0d1d2d3d4d5d6d7de8_param_3];
45
+ bfe.u32 %r30, %r1, 6, 2;
46
+ or.b32 %r3, %r30, 4;
47
+ or.b32 %r4, %r30, 8;
48
+ or.b32 %r5, %r30, 12;
49
+ or.b32 %r6, %r30, 16;
50
+ or.b32 %r7, %r30, 20;
51
+ or.b32 %r8, %r30, 24;
52
+ or.b32 %r9, %r30, 28;
53
+ or.b32 %r10, %r30, 32;
54
+ or.b32 %r11, %r30, 36;
55
+ or.b32 %r12, %r30, 40;
56
+ or.b32 %r13, %r30, 44;
57
+ or.b32 %r14, %r30, 48;
58
+ or.b32 %r15, %r30, 52;
59
+ or.b32 %r16, %r30, 56;
60
+ or.b32 %r17, %r30, 60;
61
+ .loc 1 24 33
62
+ and.b32 %r18, %r1, 63;
63
+ .loc 1 21 28
64
+ mov.u32 %r23, %ctaid.x;
65
+ .loc 1 21 34
66
+ cvt.s64.s32 %rd1, %r23;
67
+ .loc 1 21 46
68
+ mul.wide.s32 %rd54, %r23, 64;
69
+ cvt.u64.u32 %rd2, %r30;
70
+ .loc 1 22 23
71
+ or.b64 %rd55, %rd54, %rd2;
72
+ .loc 1 26 30
73
+ shl.b64 %rd56, %rd55, 3;
74
+ add.s64 %rd19, %rd53, %rd56;
75
+ add.s64 %rd21, %rd19, 32;
76
+ add.s64 %rd23, %rd19, 64;
77
+ add.s64 %rd25, %rd19, 96;
78
+ add.s64 %rd27, %rd19, 128;
79
+ add.s64 %rd29, %rd19, 160;
80
+ add.s64 %rd31, %rd19, 192;
81
+ add.s64 %rd33, %rd19, 224;
82
+ add.s64 %rd35, %rd19, 256;
83
+ add.s64 %rd37, %rd19, 288;
84
+ add.s64 %rd39, %rd19, 320;
85
+ add.s64 %rd41, %rd19, 352;
86
+ add.s64 %rd43, %rd19, 384;
87
+ add.s64 %rd45, %rd19, 416;
88
+ add.s64 %rd47, %rd19, 448;
89
+ add.s64 %rd49, %rd19, 480;
90
+ mov.pred %p1, -1;
91
+ .loc 1 26 35
92
+ mov.u64 %rd18, 0x0;
93
+ @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
94
+ mov.u64 %rd20, 0x0;
95
+ @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
96
+ mov.u64 %rd22, 0x0;
97
+ @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd23 + 0 ];
98
+ mov.u64 %rd24, 0x0;
99
+ @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd25 + 0 ];
100
+ mov.u64 %rd26, 0x0;
101
+ @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd27 + 0 ];
102
+ mov.u64 %rd28, 0x0;
103
+ @%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd29 + 0 ];
104
+ mov.u64 %rd30, 0x0;
105
+ @%p1 ld.global.L1::evict_last.b64 { %rd30 }, [ %rd31 + 0 ];
106
+ mov.u64 %rd32, 0x0;
107
+ @%p1 ld.global.L1::evict_last.b64 { %rd32 }, [ %rd33 + 0 ];
108
+ mov.u64 %rd34, 0x0;
109
+ @%p1 ld.global.L1::evict_last.b64 { %rd34 }, [ %rd35 + 0 ];
110
+ mov.u64 %rd36, 0x0;
111
+ @%p1 ld.global.L1::evict_last.b64 { %rd36 }, [ %rd37 + 0 ];
112
+ mov.u64 %rd38, 0x0;
113
+ @%p1 ld.global.L1::evict_last.b64 { %rd38 }, [ %rd39 + 0 ];
114
+ mov.u64 %rd40, 0x0;
115
+ @%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd41 + 0 ];
116
+ mov.u64 %rd42, 0x0;
117
+ @%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd43 + 0 ];
118
+ mov.u64 %rd44, 0x0;
119
+ @%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd45 + 0 ];
120
+ mov.u64 %rd46, 0x0;
121
+ @%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
122
+ mov.u64 %rd48, 0x0;
123
+ @%p1 ld.global.L1::evict_last.b64 { %rd48 }, [ %rd49 + 0 ];
124
+ .loc 1 27 19
125
+ mov.u32 %r27, 0x0;
126
+ @%p1 ld.global.b32 { %r27 }, [ %rd50 + 0 ];
127
+ .loc 1 29 19
128
+ mov.u32 %r28, 0x0;
129
+ @%p1 ld.global.b32 { %r28 }, [ %rd51 + 0 ];
130
+ .loc 1 38 23
131
+ setp.eq.s64 %p19, %rd18, -1;
132
+ setp.eq.s64 %p20, %rd20, -1;
133
+ setp.eq.s64 %p21, %rd22, -1;
134
+ setp.eq.s64 %p22, %rd24, -1;
135
+ setp.eq.s64 %p23, %rd26, -1;
136
+ setp.eq.s64 %p24, %rd28, -1;
137
+ setp.eq.s64 %p25, %rd30, -1;
138
+ setp.eq.s64 %p26, %rd32, -1;
139
+ setp.eq.s64 %p27, %rd34, -1;
140
+ setp.eq.s64 %p28, %rd36, -1;
141
+ setp.eq.s64 %p29, %rd38, -1;
142
+ setp.eq.s64 %p30, %rd40, -1;
143
+ setp.eq.s64 %p31, %rd42, -1;
144
+ setp.eq.s64 %p32, %rd44, -1;
145
+ setp.eq.s64 %p33, %rd46, -1;
146
+ setp.eq.s64 %p34, %rd48, -1;
147
+ .loc 1 39 22
148
+ div.full.f32 %r26, %r27, %r28;
149
+ mov.b32 %f97, %r26;
150
+ .loc 1 41 37
151
+ selp.f32 %f16, 0f00000000, %f97, %p34;
152
+ selp.f32 %f15, 0f00000000, %f97, %p33;
153
+ selp.f32 %f14, 0f00000000, %f97, %p32;
154
+ selp.f32 %f13, 0f00000000, %f97, %p31;
155
+ selp.f32 %f12, 0f00000000, %f97, %p30;
156
+ selp.f32 %f11, 0f00000000, %f97, %p29;
157
+ selp.f32 %f10, 0f00000000, %f97, %p28;
158
+ selp.f32 %f9, 0f00000000, %f97, %p27;
159
+ selp.f32 %f8, 0f00000000, %f97, %p26;
160
+ selp.f32 %f7, 0f00000000, %f97, %p25;
161
+ selp.f32 %f6, 0f00000000, %f97, %p24;
162
+ selp.f32 %f5, 0f00000000, %f97, %p23;
163
+ selp.f32 %f4, 0f00000000, %f97, %p22;
164
+ selp.f32 %f3, 0f00000000, %f97, %p21;
165
+ selp.f32 %f2, 0f00000000, %f97, %p20;
166
+ selp.f32 %f1, 0f00000000, %f97, %p19;
167
+ .loc 1 32 36
168
+ mul.wide.s32 %rd57, %r23, 12865792;
169
+ mul.wide.u32 %rd58, %r30, 201028;
170
+ add.s64 %rd59, %rd57, %rd58;
171
+ cvt.u64.u32 %rd60, %r1;
172
+ and.b64 %rd3, %rd60, 63;
173
+ mul.wide.u32 %rd61, %r18, 4;
174
+ add.s64 %rd62, %rd59, %rd61;
175
+ add.s64 %rd63, %rd62, %rd52;
176
+ add.s64 %rd149, %rd63, 12061680;
177
+ mov.f32 %f453, 0f00000000;
178
+ mov.b32 %r397, -64;
179
+ mov.u64 %rd147, %rd149;
180
+ mov.f32 %f454, %f453;
181
+ mov.f32 %f455, %f453;
182
+ mov.f32 %f456, %f453;
183
+ mov.f32 %f457, %f453;
184
+ mov.f32 %f458, %f453;
185
+ mov.f32 %f459, %f453;
186
+ mov.f32 %f460, %f453;
187
+ mov.f32 %f461, %f453;
188
+ mov.f32 %f462, %f453;
189
+ mov.f32 %f463, %f453;
190
+ mov.f32 %f464, %f453;
191
+ mov.f32 %f465, %f453;
192
+ mov.f32 %f466, %f453;
193
+ mov.f32 %f467, %f453;
194
+ mov.f32 %f468, %f453;
195
+ $L__BB0_1:
196
+ add.s32 %r397, %r397, 64;
197
+ .loc 1 33 27
198
+ add.s32 %r63, %r397, %r18;
199
+ .loc 1 34 25
200
+ setp.lt.u32 %p35, %r63, 50257;
201
+ .loc 1 36 34
202
+ add.s64 %rd64, %rd147, -12061680;
203
+ add.s64 %rd65, %rd147, -11257568;
204
+ add.s64 %rd66, %rd147, -10453456;
205
+ add.s64 %rd67, %rd147, -9649344;
206
+ add.s64 %rd68, %rd147, -8845232;
207
+ add.s64 %rd69, %rd147, -8041120;
208
+ add.s64 %rd70, %rd147, -7237008;
209
+ add.s64 %rd71, %rd147, -6432896;
210
+ add.s64 %rd72, %rd147, -5628784;
211
+ add.s64 %rd73, %rd147, -4824672;
212
+ add.s64 %rd74, %rd147, -4020560;
213
+ add.s64 %rd75, %rd147, -3216448;
214
+ add.s64 %rd76, %rd147, -2412336;
215
+ add.s64 %rd77, %rd147, -1608224;
216
+ add.s64 %rd78, %rd147, -804112;
217
+ mov.b32 %r333, 0;
218
+ .loc 1 36 52
219
+ mov.u32 %r31, 0x0;
220
+ @%p35 ld.global.L1::evict_last.b32 { %r31 }, [ %rd64 + 0 ];
221
+ @!%p35 mov.u32 %r31, %r333;
222
+ mov.u32 %r33, 0x0;
223
+ @%p35 ld.global.L1::evict_last.b32 { %r33 }, [ %rd65 + 0 ];
224
+ @!%p35 mov.u32 %r33, %r333;
225
+ mov.u32 %r35, 0x0;
226
+ @%p35 ld.global.L1::evict_last.b32 { %r35 }, [ %rd66 + 0 ];
227
+ @!%p35 mov.u32 %r35, %r333;
228
+ mov.u32 %r37, 0x0;
229
+ @%p35 ld.global.L1::evict_last.b32 { %r37 }, [ %rd67 + 0 ];
230
+ @!%p35 mov.u32 %r37, %r333;
231
+ mov.u32 %r39, 0x0;
232
+ @%p35 ld.global.L1::evict_last.b32 { %r39 }, [ %rd68 + 0 ];
233
+ @!%p35 mov.u32 %r39, %r333;
234
+ mov.u32 %r41, 0x0;
235
+ @%p35 ld.global.L1::evict_last.b32 { %r41 }, [ %rd69 + 0 ];
236
+ @!%p35 mov.u32 %r41, %r333;
237
+ mov.u32 %r43, 0x0;
238
+ @%p35 ld.global.L1::evict_last.b32 { %r43 }, [ %rd70 + 0 ];
239
+ @!%p35 mov.u32 %r43, %r333;
240
+ mov.u32 %r45, 0x0;
241
+ @%p35 ld.global.L1::evict_last.b32 { %r45 }, [ %rd71 + 0 ];
242
+ @!%p35 mov.u32 %r45, %r333;
243
+ mov.u32 %r47, 0x0;
244
+ @%p35 ld.global.L1::evict_last.b32 { %r47 }, [ %rd72 + 0 ];
245
+ @!%p35 mov.u32 %r47, %r333;
246
+ mov.u32 %r49, 0x0;
247
+ @%p35 ld.global.L1::evict_last.b32 { %r49 }, [ %rd73 + 0 ];
248
+ @!%p35 mov.u32 %r49, %r333;
249
+ mov.u32 %r51, 0x0;
250
+ @%p35 ld.global.L1::evict_last.b32 { %r51 }, [ %rd74 + 0 ];
251
+ @!%p35 mov.u32 %r51, %r333;
252
+ mov.u32 %r53, 0x0;
253
+ @%p35 ld.global.L1::evict_last.b32 { %r53 }, [ %rd75 + 0 ];
254
+ @!%p35 mov.u32 %r53, %r333;
255
+ mov.u32 %r55, 0x0;
256
+ @%p35 ld.global.L1::evict_last.b32 { %r55 }, [ %rd76 + 0 ];
257
+ @!%p35 mov.u32 %r55, %r333;
258
+ mov.u32 %r57, 0x0;
259
+ @%p35 ld.global.L1::evict_last.b32 { %r57 }, [ %rd77 + 0 ];
260
+ @!%p35 mov.u32 %r57, %r333;
261
+ mov.u32 %r59, 0x0;
262
+ @%p35 ld.global.L1::evict_last.b32 { %r59 }, [ %rd78 + 0 ];
263
+ @!%p35 mov.u32 %r59, %r333;
264
+ mov.u32 %r61, 0x0;
265
+ @%p35 ld.global.L1::evict_last.b32 { %r61 }, [ %rd147 + 0 ];
266
+ @!%p35 mov.u32 %r61, %r333;
267
+ mov.b32 %f98, %r61;
268
+ mov.b32 %f99, %r59;
269
+ mov.b32 %f100, %r57;
270
+ mov.b32 %f101, %r55;
271
+ mov.b32 %f102, %r53;
272
+ mov.b32 %f103, %r51;
273
+ mov.b32 %f104, %r49;
274
+ mov.b32 %f105, %r47;
275
+ mov.b32 %f106, %r45;
276
+ mov.b32 %f107, %r43;
277
+ mov.b32 %f108, %r41;
278
+ mov.b32 %f109, %r39;
279
+ mov.b32 %f110, %r37;
280
+ mov.b32 %f111, %r35;
281
+ mov.b32 %f112, %r33;
282
+ mov.b32 %f113, %r31;
283
+ .loc 1 42 23
284
+ mul.f32 %f114, %f1, %f113;
285
+ mul.f32 %f115, %f2, %f112;
286
+ mul.f32 %f116, %f3, %f111;
287
+ mul.f32 %f117, %f4, %f110;
288
+ mul.f32 %f118, %f5, %f109;
289
+ mul.f32 %f119, %f6, %f108;
290
+ mul.f32 %f120, %f7, %f107;
291
+ mul.f32 %f121, %f8, %f106;
292
+ mul.f32 %f122, %f9, %f105;
293
+ mul.f32 %f123, %f10, %f104;
294
+ mul.f32 %f124, %f11, %f103;
295
+ mul.f32 %f125, %f12, %f102;
296
+ mul.f32 %f126, %f13, %f101;
297
+ mul.f32 %f127, %f14, %f100;
298
+ mul.f32 %f128, %f15, %f99;
299
+ mul.f32 %f129, %f16, %f98;
300
+ .loc 1 45 40
301
+ selp.f32 %f130, %f129, 0f80000000, %p35;
302
+ selp.f32 %f131, %f128, 0f80000000, %p35;
303
+ selp.f32 %f132, %f127, 0f80000000, %p35;
304
+ selp.f32 %f133, %f126, 0f80000000, %p35;
305
+ selp.f32 %f134, %f125, 0f80000000, %p35;
306
+ selp.f32 %f135, %f124, 0f80000000, %p35;
307
+ selp.f32 %f136, %f123, 0f80000000, %p35;
308
+ selp.f32 %f137, %f122, 0f80000000, %p35;
309
+ selp.f32 %f138, %f121, 0f80000000, %p35;
310
+ selp.f32 %f139, %f120, 0f80000000, %p35;
311
+ selp.f32 %f140, %f119, 0f80000000, %p35;
312
+ selp.f32 %f141, %f118, 0f80000000, %p35;
313
+ selp.f32 %f142, %f117, 0f80000000, %p35;
314
+ selp.f32 %f143, %f116, 0f80000000, %p35;
315
+ selp.f32 %f144, %f115, 0f80000000, %p35;
316
+ selp.f32 %f145, %f114, 0f80000000, %p35;
317
+ add.f32 %f453, %f453, %f145;
318
+ add.f32 %f454, %f454, %f144;
319
+ add.f32 %f455, %f455, %f143;
320
+ add.f32 %f456, %f456, %f142;
321
+ add.f32 %f457, %f457, %f141;
322
+ add.f32 %f458, %f458, %f140;
323
+ add.f32 %f459, %f459, %f139;
324
+ add.f32 %f460, %f460, %f138;
325
+ add.f32 %f461, %f461, %f137;
326
+ add.f32 %f462, %f462, %f136;
327
+ add.f32 %f463, %f463, %f135;
328
+ add.f32 %f464, %f464, %f134;
329
+ add.f32 %f465, %f465, %f133;
330
+ add.f32 %f466, %f466, %f132;
331
+ add.f32 %f467, %f467, %f131;
332
+ add.f32 %f468, %f468, %f130;
333
+ .loc 1 32 36
334
+ add.s64 %rd147, %rd147, 256;
335
+ setp.lt.u32 %p67, %r397, 50193;
336
+ @%p67 bra $L__BB0_1;
337
+ .loc 1 0 36
338
+ cvt.u32.u64 %r101, %rd2;
339
+ $L__tmp1:
340
+ .loc 2 243 36
341
+ mov.b32 %r102, %f453;
342
+ shfl.sync.bfly.b32 %r103, %r102, 16, 31, -1;
343
+ mov.b32 %f146, %r103;
344
+ $L__tmp2:
345
+ .loc 2 233 15
346
+ add.f32 %f147, %f453, %f146;
347
+ $L__tmp3:
348
+ .loc 2 243 36
349
+ mov.b32 %r104, %f147;
350
+ shfl.sync.bfly.b32 %r105, %r104, 8, 31, -1;
351
+ mov.b32 %f148, %r105;
352
+ $L__tmp4:
353
+ .loc 2 233 15
354
+ add.f32 %f149, %f147, %f148;
355
+ $L__tmp5:
356
+ .loc 2 243 36
357
+ mov.b32 %r106, %f149;
358
+ shfl.sync.bfly.b32 %r107, %r106, 4, 31, -1;
359
+ mov.b32 %f150, %r107;
360
+ $L__tmp6:
361
+ .loc 2 233 15
362
+ add.f32 %f151, %f149, %f150;
363
+ $L__tmp7:
364
+ .loc 2 243 36
365
+ mov.b32 %r108, %f151;
366
+ shfl.sync.bfly.b32 %r109, %r108, 2, 31, -1;
367
+ mov.b32 %f152, %r109;
368
+ $L__tmp8:
369
+ .loc 2 233 15
370
+ add.f32 %f153, %f151, %f152;
371
+ $L__tmp9:
372
+ .loc 2 243 36
373
+ mov.b32 %r110, %f153;
374
+ shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1;
375
+ mov.b32 %f154, %r111;
376
+ $L__tmp10:
377
+ .loc 2 233 15
378
+ add.f32 %f155, %f153, %f154;
379
+ $L__tmp11:
380
+ .loc 2 243 36
381
+ mov.b32 %r112, %f454;
382
+ shfl.sync.bfly.b32 %r113, %r112, 16, 31, -1;
383
+ mov.b32 %f156, %r113;
384
+ $L__tmp12:
385
+ .loc 2 233 15
386
+ add.f32 %f157, %f454, %f156;
387
+ $L__tmp13:
388
+ .loc 2 243 36
389
+ mov.b32 %r114, %f157;
390
+ shfl.sync.bfly.b32 %r115, %r114, 8, 31, -1;
391
+ mov.b32 %f158, %r115;
392
+ $L__tmp14:
393
+ .loc 2 233 15
394
+ add.f32 %f159, %f157, %f158;
395
+ $L__tmp15:
396
+ .loc 2 243 36
397
+ mov.b32 %r116, %f159;
398
+ shfl.sync.bfly.b32 %r117, %r116, 4, 31, -1;
399
+ mov.b32 %f160, %r117;
400
+ $L__tmp16:
401
+ .loc 2 233 15
402
+ add.f32 %f161, %f159, %f160;
403
+ $L__tmp17:
404
+ .loc 2 243 36
405
+ mov.b32 %r118, %f161;
406
+ shfl.sync.bfly.b32 %r119, %r118, 2, 31, -1;
407
+ mov.b32 %f162, %r119;
408
+ $L__tmp18:
409
+ .loc 2 233 15
410
+ add.f32 %f163, %f161, %f162;
411
+ $L__tmp19:
412
+ .loc 2 243 36
413
+ mov.b32 %r120, %f163;
414
+ shfl.sync.bfly.b32 %r121, %r120, 1, 31, -1;
415
+ mov.b32 %f164, %r121;
416
+ $L__tmp20:
417
+ .loc 2 233 15
418
+ add.f32 %f165, %f163, %f164;
419
+ $L__tmp21:
420
+ .loc 2 243 36
421
+ mov.b32 %r122, %f455;
422
+ shfl.sync.bfly.b32 %r123, %r122, 16, 31, -1;
423
+ mov.b32 %f166, %r123;
424
+ $L__tmp22:
425
+ .loc 2 233 15
426
+ add.f32 %f167, %f455, %f166;
427
+ $L__tmp23:
428
+ .loc 2 243 36
429
+ mov.b32 %r124, %f167;
430
+ shfl.sync.bfly.b32 %r125, %r124, 8, 31, -1;
431
+ mov.b32 %f168, %r125;
432
+ $L__tmp24:
433
+ .loc 2 233 15
434
+ add.f32 %f169, %f167, %f168;
435
+ $L__tmp25:
436
+ .loc 2 243 36
437
+ mov.b32 %r126, %f169;
438
+ shfl.sync.bfly.b32 %r127, %r126, 4, 31, -1;
439
+ mov.b32 %f170, %r127;
440
+ $L__tmp26:
441
+ .loc 2 233 15
442
+ add.f32 %f171, %f169, %f170;
443
+ $L__tmp27:
444
+ .loc 2 243 36
445
+ mov.b32 %r128, %f171;
446
+ shfl.sync.bfly.b32 %r129, %r128, 2, 31, -1;
447
+ mov.b32 %f172, %r129;
448
+ $L__tmp28:
449
+ .loc 2 233 15
450
+ add.f32 %f173, %f171, %f172;
451
+ $L__tmp29:
452
+ .loc 2 243 36
453
+ mov.b32 %r130, %f173;
454
+ shfl.sync.bfly.b32 %r131, %r130, 1, 31, -1;
455
+ mov.b32 %f174, %r131;
456
+ $L__tmp30:
457
+ .loc 2 233 15
458
+ add.f32 %f175, %f173, %f174;
459
+ $L__tmp31:
460
+ .loc 2 243 36
461
+ mov.b32 %r132, %f456;
462
+ shfl.sync.bfly.b32 %r133, %r132, 16, 31, -1;
463
+ mov.b32 %f176, %r133;
464
+ $L__tmp32:
465
+ .loc 2 233 15
466
+ add.f32 %f177, %f456, %f176;
467
+ $L__tmp33:
468
+ .loc 2 243 36
469
+ mov.b32 %r134, %f177;
470
+ shfl.sync.bfly.b32 %r135, %r134, 8, 31, -1;
471
+ mov.b32 %f178, %r135;
472
+ $L__tmp34:
473
+ .loc 2 233 15
474
+ add.f32 %f179, %f177, %f178;
475
+ $L__tmp35:
476
+ .loc 2 243 36
477
+ mov.b32 %r136, %f179;
478
+ shfl.sync.bfly.b32 %r137, %r136, 4, 31, -1;
479
+ mov.b32 %f180, %r137;
480
+ $L__tmp36:
481
+ .loc 2 233 15
482
+ add.f32 %f181, %f179, %f180;
483
+ $L__tmp37:
484
+ .loc 2 243 36
485
+ mov.b32 %r138, %f181;
486
+ shfl.sync.bfly.b32 %r139, %r138, 2, 31, -1;
487
+ mov.b32 %f182, %r139;
488
+ $L__tmp38:
489
+ .loc 2 233 15
490
+ add.f32 %f183, %f181, %f182;
491
+ $L__tmp39:
492
+ .loc 2 243 36
493
+ mov.b32 %r140, %f183;
494
+ shfl.sync.bfly.b32 %r141, %r140, 1, 31, -1;
495
+ mov.b32 %f184, %r141;
496
+ $L__tmp40:
497
+ .loc 2 233 15
498
+ add.f32 %f185, %f183, %f184;
499
+ $L__tmp41:
500
+ .loc 2 243 36
501
+ mov.b32 %r142, %f457;
502
+ shfl.sync.bfly.b32 %r143, %r142, 16, 31, -1;
503
+ mov.b32 %f186, %r143;
504
+ $L__tmp42:
505
+ .loc 2 233 15
506
+ add.f32 %f187, %f457, %f186;
507
+ $L__tmp43:
508
+ .loc 2 243 36
509
+ mov.b32 %r144, %f187;
510
+ shfl.sync.bfly.b32 %r145, %r144, 8, 31, -1;
511
+ mov.b32 %f188, %r145;
512
+ $L__tmp44:
513
+ .loc 2 233 15
514
+ add.f32 %f189, %f187, %f188;
515
+ $L__tmp45:
516
+ .loc 2 243 36
517
+ mov.b32 %r146, %f189;
518
+ shfl.sync.bfly.b32 %r147, %r146, 4, 31, -1;
519
+ mov.b32 %f190, %r147;
520
+ $L__tmp46:
521
+ .loc 2 233 15
522
+ add.f32 %f191, %f189, %f190;
523
+ $L__tmp47:
524
+ .loc 2 243 36
525
+ mov.b32 %r148, %f191;
526
+ shfl.sync.bfly.b32 %r149, %r148, 2, 31, -1;
527
+ mov.b32 %f192, %r149;
528
+ $L__tmp48:
529
+ .loc 2 233 15
530
+ add.f32 %f193, %f191, %f192;
531
+ $L__tmp49:
532
+ .loc 2 243 36
533
+ mov.b32 %r150, %f193;
534
+ shfl.sync.bfly.b32 %r151, %r150, 1, 31, -1;
535
+ mov.b32 %f194, %r151;
536
+ $L__tmp50:
537
+ .loc 2 233 15
538
+ add.f32 %f195, %f193, %f194;
539
+ $L__tmp51:
540
+ .loc 2 243 36
541
+ mov.b32 %r152, %f458;
542
+ shfl.sync.bfly.b32 %r153, %r152, 16, 31, -1;
543
+ mov.b32 %f196, %r153;
544
+ $L__tmp52:
545
+ .loc 2 233 15
546
+ add.f32 %f197, %f458, %f196;
547
+ $L__tmp53:
548
+ .loc 2 243 36
549
+ mov.b32 %r154, %f197;
550
+ shfl.sync.bfly.b32 %r155, %r154, 8, 31, -1;
551
+ mov.b32 %f198, %r155;
552
+ $L__tmp54:
553
+ .loc 2 233 15
554
+ add.f32 %f199, %f197, %f198;
555
+ $L__tmp55:
556
+ .loc 2 243 36
557
+ mov.b32 %r156, %f199;
558
+ shfl.sync.bfly.b32 %r157, %r156, 4, 31, -1;
559
+ mov.b32 %f200, %r157;
560
+ $L__tmp56:
561
+ .loc 2 233 15
562
+ add.f32 %f201, %f199, %f200;
563
+ $L__tmp57:
564
+ .loc 2 243 36
565
+ mov.b32 %r158, %f201;
566
+ shfl.sync.bfly.b32 %r159, %r158, 2, 31, -1;
567
+ mov.b32 %f202, %r159;
568
+ $L__tmp58:
569
+ .loc 2 233 15
570
+ add.f32 %f203, %f201, %f202;
571
+ $L__tmp59:
572
+ .loc 2 243 36
573
+ mov.b32 %r160, %f203;
574
+ shfl.sync.bfly.b32 %r161, %r160, 1, 31, -1;
575
+ mov.b32 %f204, %r161;
576
+ $L__tmp60:
577
+ .loc 2 233 15
578
+ add.f32 %f205, %f203, %f204;
579
+ $L__tmp61:
580
+ .loc 2 243 36
581
+ mov.b32 %r162, %f459;
582
+ shfl.sync.bfly.b32 %r163, %r162, 16, 31, -1;
583
+ mov.b32 %f206, %r163;
584
+ $L__tmp62:
585
+ .loc 2 233 15
586
+ add.f32 %f207, %f459, %f206;
587
+ $L__tmp63:
588
+ .loc 2 243 36
589
+ mov.b32 %r164, %f207;
590
+ shfl.sync.bfly.b32 %r165, %r164, 8, 31, -1;
591
+ mov.b32 %f208, %r165;
592
+ $L__tmp64:
593
+ .loc 2 233 15
594
+ add.f32 %f209, %f207, %f208;
595
+ $L__tmp65:
596
+ .loc 2 243 36
597
+ mov.b32 %r166, %f209;
598
+ shfl.sync.bfly.b32 %r167, %r166, 4, 31, -1;
599
+ mov.b32 %f210, %r167;
600
+ $L__tmp66:
601
+ .loc 2 233 15
602
+ add.f32 %f211, %f209, %f210;
603
+ $L__tmp67:
604
+ .loc 2 243 36
605
+ mov.b32 %r168, %f211;
606
+ shfl.sync.bfly.b32 %r169, %r168, 2, 31, -1;
607
+ mov.b32 %f212, %r169;
608
+ $L__tmp68:
609
+ .loc 2 233 15
610
+ add.f32 %f213, %f211, %f212;
611
+ $L__tmp69:
612
+ .loc 2 243 36
613
+ mov.b32 %r170, %f213;
614
+ shfl.sync.bfly.b32 %r171, %r170, 1, 31, -1;
615
+ mov.b32 %f214, %r171;
616
+ $L__tmp70:
617
+ .loc 2 233 15
618
+ add.f32 %f215, %f213, %f214;
619
+ $L__tmp71:
620
+ .loc 2 243 36
621
+ mov.b32 %r172, %f460;
622
+ shfl.sync.bfly.b32 %r173, %r172, 16, 31, -1;
623
+ mov.b32 %f216, %r173;
624
+ $L__tmp72:
625
+ .loc 2 233 15
626
+ add.f32 %f217, %f460, %f216;
627
+ $L__tmp73:
628
+ .loc 2 243 36
629
+ mov.b32 %r174, %f217;
630
+ shfl.sync.bfly.b32 %r175, %r174, 8, 31, -1;
631
+ mov.b32 %f218, %r175;
632
+ $L__tmp74:
633
+ .loc 2 233 15
634
+ add.f32 %f219, %f217, %f218;
635
+ $L__tmp75:
636
+ .loc 2 243 36
637
+ mov.b32 %r176, %f219;
638
+ shfl.sync.bfly.b32 %r177, %r176, 4, 31, -1;
639
+ mov.b32 %f220, %r177;
640
+ $L__tmp76:
641
+ .loc 2 233 15
642
+ add.f32 %f221, %f219, %f220;
643
+ $L__tmp77:
644
+ .loc 2 243 36
645
+ mov.b32 %r178, %f221;
646
+ shfl.sync.bfly.b32 %r179, %r178, 2, 31, -1;
647
+ mov.b32 %f222, %r179;
648
+ $L__tmp78:
649
+ .loc 2 233 15
650
+ add.f32 %f223, %f221, %f222;
651
+ $L__tmp79:
652
+ .loc 2 243 36
653
+ mov.b32 %r180, %f223;
654
+ shfl.sync.bfly.b32 %r181, %r180, 1, 31, -1;
655
+ mov.b32 %f224, %r181;
656
+ $L__tmp80:
657
+ .loc 2 233 15
658
+ add.f32 %f225, %f223, %f224;
659
+ $L__tmp81:
660
+ .loc 2 243 36
661
+ mov.b32 %r182, %f461;
662
+ shfl.sync.bfly.b32 %r183, %r182, 16, 31, -1;
663
+ mov.b32 %f226, %r183;
664
+ $L__tmp82:
665
+ .loc 2 233 15
666
+ add.f32 %f227, %f461, %f226;
667
+ $L__tmp83:
668
+ .loc 2 243 36
669
+ mov.b32 %r184, %f227;
670
+ shfl.sync.bfly.b32 %r185, %r184, 8, 31, -1;
671
+ mov.b32 %f228, %r185;
672
+ $L__tmp84:
673
+ .loc 2 233 15
674
+ add.f32 %f229, %f227, %f228;
675
+ $L__tmp85:
676
+ .loc 2 243 36
677
+ mov.b32 %r186, %f229;
678
+ shfl.sync.bfly.b32 %r187, %r186, 4, 31, -1;
679
+ mov.b32 %f230, %r187;
680
+ $L__tmp86:
681
+ .loc 2 233 15
682
+ add.f32 %f231, %f229, %f230;
683
+ $L__tmp87:
684
+ .loc 2 243 36
685
+ mov.b32 %r188, %f231;
686
+ shfl.sync.bfly.b32 %r189, %r188, 2, 31, -1;
687
+ mov.b32 %f232, %r189;
688
+ $L__tmp88:
689
+ .loc 2 233 15
690
+ add.f32 %f233, %f231, %f232;
691
+ $L__tmp89:
692
+ .loc 2 243 36
693
+ mov.b32 %r190, %f233;
694
+ shfl.sync.bfly.b32 %r191, %r190, 1, 31, -1;
695
+ mov.b32 %f234, %r191;
696
+ $L__tmp90:
697
+ .loc 2 233 15
698
+ add.f32 %f235, %f233, %f234;
699
+ $L__tmp91:
700
+ .loc 2 243 36
701
+ mov.b32 %r192, %f462;
702
+ shfl.sync.bfly.b32 %r193, %r192, 16, 31, -1;
703
+ mov.b32 %f236, %r193;
704
+ $L__tmp92:
705
+ .loc 2 233 15
706
+ add.f32 %f237, %f462, %f236;
707
+ $L__tmp93:
708
+ .loc 2 243 36
709
+ mov.b32 %r194, %f237;
710
+ shfl.sync.bfly.b32 %r195, %r194, 8, 31, -1;
711
+ mov.b32 %f238, %r195;
712
+ $L__tmp94:
713
+ .loc 2 233 15
714
+ add.f32 %f239, %f237, %f238;
715
+ $L__tmp95:
716
+ .loc 2 243 36
717
+ mov.b32 %r196, %f239;
718
+ shfl.sync.bfly.b32 %r197, %r196, 4, 31, -1;
719
+ mov.b32 %f240, %r197;
720
+ $L__tmp96:
721
+ .loc 2 233 15
722
+ add.f32 %f241, %f239, %f240;
723
+ $L__tmp97:
724
+ .loc 2 243 36
725
+ mov.b32 %r198, %f241;
726
+ shfl.sync.bfly.b32 %r199, %r198, 2, 31, -1;
727
+ mov.b32 %f242, %r199;
728
+ $L__tmp98:
729
+ .loc 2 233 15
730
+ add.f32 %f243, %f241, %f242;
731
+ $L__tmp99:
732
+ .loc 2 243 36
733
+ mov.b32 %r200, %f243;
734
+ shfl.sync.bfly.b32 %r201, %r200, 1, 31, -1;
735
+ mov.b32 %f244, %r201;
736
+ $L__tmp100:
737
+ .loc 2 233 15
738
+ add.f32 %f245, %f243, %f244;
739
+ $L__tmp101:
740
+ .loc 2 243 36
741
+ mov.b32 %r202, %f463;
742
+ shfl.sync.bfly.b32 %r203, %r202, 16, 31, -1;
743
+ mov.b32 %f246, %r203;
744
+ $L__tmp102:
745
+ .loc 2 233 15
746
+ add.f32 %f247, %f463, %f246;
747
+ $L__tmp103:
748
+ .loc 2 243 36
749
+ mov.b32 %r204, %f247;
750
+ shfl.sync.bfly.b32 %r205, %r204, 8, 31, -1;
751
+ mov.b32 %f248, %r205;
752
+ $L__tmp104:
753
+ .loc 2 233 15
754
+ add.f32 %f249, %f247, %f248;
755
+ $L__tmp105:
756
+ .loc 2 243 36
757
+ mov.b32 %r206, %f249;
758
+ shfl.sync.bfly.b32 %r207, %r206, 4, 31, -1;
759
+ mov.b32 %f250, %r207;
760
+ $L__tmp106:
761
+ .loc 2 233 15
762
+ add.f32 %f251, %f249, %f250;
763
+ $L__tmp107:
764
+ .loc 2 243 36
765
+ mov.b32 %r208, %f251;
766
+ shfl.sync.bfly.b32 %r209, %r208, 2, 31, -1;
767
+ mov.b32 %f252, %r209;
768
+ $L__tmp108:
769
+ .loc 2 233 15
770
+ add.f32 %f253, %f251, %f252;
771
+ $L__tmp109:
772
+ .loc 2 243 36
773
+ mov.b32 %r210, %f253;
774
+ shfl.sync.bfly.b32 %r211, %r210, 1, 31, -1;
775
+ mov.b32 %f254, %r211;
776
+ $L__tmp110:
777
+ .loc 2 233 15
778
+ add.f32 %f255, %f253, %f254;
779
+ $L__tmp111:
780
+ .loc 2 243 36
781
+ mov.b32 %r212, %f464;
782
+ shfl.sync.bfly.b32 %r213, %r212, 16, 31, -1;
783
+ mov.b32 %f256, %r213;
784
+ $L__tmp112:
785
+ .loc 2 233 15
786
+ add.f32 %f257, %f464, %f256;
787
+ $L__tmp113:
788
+ .loc 2 243 36
789
+ mov.b32 %r214, %f257;
790
+ shfl.sync.bfly.b32 %r215, %r214, 8, 31, -1;
791
+ mov.b32 %f258, %r215;
792
+ $L__tmp114:
793
+ .loc 2 233 15
794
+ add.f32 %f259, %f257, %f258;
795
+ $L__tmp115:
796
+ .loc 2 243 36
797
+ mov.b32 %r216, %f259;
798
+ shfl.sync.bfly.b32 %r217, %r216, 4, 31, -1;
799
+ mov.b32 %f260, %r217;
800
+ $L__tmp116:
801
+ .loc 2 233 15
802
+ add.f32 %f261, %f259, %f260;
803
+ $L__tmp117:
804
+ .loc 2 243 36
805
+ mov.b32 %r218, %f261;
806
+ shfl.sync.bfly.b32 %r219, %r218, 2, 31, -1;
807
+ mov.b32 %f262, %r219;
808
+ $L__tmp118:
809
+ .loc 2 233 15
810
+ add.f32 %f263, %f261, %f262;
811
+ $L__tmp119:
812
+ .loc 2 243 36
813
+ mov.b32 %r220, %f263;
814
+ shfl.sync.bfly.b32 %r221, %r220, 1, 31, -1;
815
+ mov.b32 %f264, %r221;
816
+ $L__tmp120:
817
+ .loc 2 233 15
818
+ add.f32 %f265, %f263, %f264;
819
+ $L__tmp121:
820
+ .loc 2 243 36
821
+ mov.b32 %r222, %f465;
822
+ shfl.sync.bfly.b32 %r223, %r222, 16, 31, -1;
823
+ mov.b32 %f266, %r223;
824
+ $L__tmp122:
825
+ .loc 2 233 15
826
+ add.f32 %f267, %f465, %f266;
827
+ $L__tmp123:
828
+ .loc 2 243 36
829
+ mov.b32 %r224, %f267;
830
+ shfl.sync.bfly.b32 %r225, %r224, 8, 31, -1;
831
+ mov.b32 %f268, %r225;
832
+ $L__tmp124:
833
+ .loc 2 233 15
834
+ add.f32 %f269, %f267, %f268;
835
+ $L__tmp125:
836
+ .loc 2 243 36
837
+ mov.b32 %r226, %f269;
838
+ shfl.sync.bfly.b32 %r227, %r226, 4, 31, -1;
839
+ mov.b32 %f270, %r227;
840
+ $L__tmp126:
841
+ .loc 2 233 15
842
+ add.f32 %f271, %f269, %f270;
843
+ $L__tmp127:
844
+ .loc 2 243 36
845
+ mov.b32 %r228, %f271;
846
+ shfl.sync.bfly.b32 %r229, %r228, 2, 31, -1;
847
+ mov.b32 %f272, %r229;
848
+ $L__tmp128:
849
+ .loc 2 233 15
850
+ add.f32 %f273, %f271, %f272;
851
+ $L__tmp129:
852
+ .loc 2 243 36
853
+ mov.b32 %r230, %f273;
854
+ shfl.sync.bfly.b32 %r231, %r230, 1, 31, -1;
855
+ mov.b32 %f274, %r231;
856
+ $L__tmp130:
857
+ .loc 2 233 15
858
+ add.f32 %f275, %f273, %f274;
859
+ $L__tmp131:
860
+ .loc 2 243 36
861
+ mov.b32 %r232, %f466;
862
+ shfl.sync.bfly.b32 %r233, %r232, 16, 31, -1;
863
+ mov.b32 %f276, %r233;
864
+ $L__tmp132:
865
+ .loc 2 233 15
866
+ add.f32 %f277, %f466, %f276;
867
+ $L__tmp133:
868
+ .loc 2 243 36
869
+ mov.b32 %r234, %f277;
870
+ shfl.sync.bfly.b32 %r235, %r234, 8, 31, -1;
871
+ mov.b32 %f278, %r235;
872
+ $L__tmp134:
873
+ .loc 2 233 15
874
+ add.f32 %f279, %f277, %f278;
875
+ $L__tmp135:
876
+ .loc 2 243 36
877
+ mov.b32 %r236, %f279;
878
+ shfl.sync.bfly.b32 %r237, %r236, 4, 31, -1;
879
+ mov.b32 %f280, %r237;
880
+ $L__tmp136:
881
+ .loc 2 233 15
882
+ add.f32 %f281, %f279, %f280;
883
+ $L__tmp137:
884
+ .loc 2 243 36
885
+ mov.b32 %r238, %f281;
886
+ shfl.sync.bfly.b32 %r239, %r238, 2, 31, -1;
887
+ mov.b32 %f282, %r239;
888
+ $L__tmp138:
889
+ .loc 2 233 15
890
+ add.f32 %f283, %f281, %f282;
891
+ $L__tmp139:
892
+ .loc 2 243 36
893
+ mov.b32 %r240, %f283;
894
+ shfl.sync.bfly.b32 %r241, %r240, 1, 31, -1;
895
+ mov.b32 %f284, %r241;
896
+ $L__tmp140:
897
+ .loc 2 233 15
898
+ add.f32 %f285, %f283, %f284;
899
+ $L__tmp141:
900
+ .loc 2 243 36
901
+ mov.b32 %r242, %f467;
902
+ shfl.sync.bfly.b32 %r243, %r242, 16, 31, -1;
903
+ mov.b32 %f286, %r243;
904
+ $L__tmp142:
905
+ .loc 2 233 15
906
+ add.f32 %f287, %f467, %f286;
907
+ $L__tmp143:
908
+ .loc 2 243 36
909
+ mov.b32 %r244, %f287;
910
+ shfl.sync.bfly.b32 %r245, %r244, 8, 31, -1;
911
+ mov.b32 %f288, %r245;
912
+ $L__tmp144:
913
+ .loc 2 233 15
914
+ add.f32 %f289, %f287, %f288;
915
+ $L__tmp145:
916
+ .loc 2 243 36
917
+ mov.b32 %r246, %f289;
918
+ shfl.sync.bfly.b32 %r247, %r246, 4, 31, -1;
919
+ mov.b32 %f290, %r247;
920
+ $L__tmp146:
921
+ .loc 2 233 15
922
+ add.f32 %f291, %f289, %f290;
923
+ $L__tmp147:
924
+ .loc 2 243 36
925
+ mov.b32 %r248, %f291;
926
+ shfl.sync.bfly.b32 %r249, %r248, 2, 31, -1;
927
+ mov.b32 %f292, %r249;
928
+ $L__tmp148:
929
+ .loc 2 233 15
930
+ add.f32 %f293, %f291, %f292;
931
+ $L__tmp149:
932
+ .loc 2 243 36
933
+ mov.b32 %r250, %f293;
934
+ shfl.sync.bfly.b32 %r251, %r250, 1, 31, -1;
935
+ mov.b32 %f294, %r251;
936
+ $L__tmp150:
937
+ .loc 2 233 15
938
+ add.f32 %f295, %f293, %f294;
939
+ $L__tmp151:
940
+ .loc 2 243 36
941
+ mov.b32 %r252, %f468;
942
+ shfl.sync.bfly.b32 %r253, %r252, 16, 31, -1;
943
+ mov.b32 %f296, %r253;
944
+ $L__tmp152:
945
+ .loc 2 233 15
946
+ add.f32 %f297, %f468, %f296;
947
+ $L__tmp153:
948
+ .loc 2 243 36
949
+ mov.b32 %r254, %f297;
950
+ shfl.sync.bfly.b32 %r255, %r254, 8, 31, -1;
951
+ mov.b32 %f298, %r255;
952
+ $L__tmp154:
953
+ .loc 2 233 15
954
+ add.f32 %f299, %f297, %f298;
955
+ $L__tmp155:
956
+ .loc 2 243 36
957
+ mov.b32 %r256, %f299;
958
+ shfl.sync.bfly.b32 %r257, %r256, 4, 31, -1;
959
+ mov.b32 %f300, %r257;
960
+ $L__tmp156:
961
+ .loc 2 233 15
962
+ add.f32 %f301, %f299, %f300;
963
+ $L__tmp157:
964
+ .loc 2 243 36
965
+ mov.b32 %r258, %f301;
966
+ shfl.sync.bfly.b32 %r259, %r258, 2, 31, -1;
967
+ mov.b32 %f302, %r259;
968
+ $L__tmp158:
969
+ .loc 2 233 15
970
+ add.f32 %f303, %f301, %f302;
971
+ $L__tmp159:
972
+ .loc 2 243 36
973
+ mov.b32 %r260, %f303;
974
+ shfl.sync.bfly.b32 %r261, %r260, 1, 31, -1;
975
+ mov.b32 %f304, %r261;
976
+ $L__tmp160:
977
+ .loc 2 233 15
978
+ add.f32 %f305, %f303, %f304;
979
+ $L__tmp161:
980
+ .loc 2 243 36
981
+ setp.eq.s32 %p68, %r2, 0;
982
+ shr.u32 %r262, %r1, 3;
983
+ and.b32 %r263, %r262, 4;
984
+ shl.b32 %r264, %r101, 3;
985
+ or.b32 %r265, %r264, %r263;
986
+ mov.u32 %r266, global_smem;
987
+ add.s32 %r64, %r266, %r265;
988
+ mov.b32 %r65, %f155;
989
+ @%p68 st.shared.b32 [ %r64 + 0 ], %r65;
990
+ shl.b32 %r267, %r3, 3;
991
+ or.b32 %r268, %r267, %r263;
992
+ add.s32 %r66, %r266, %r268;
993
+ mov.b32 %r67, %f165;
994
+ @%p68 st.shared.b32 [ %r66 + 0 ], %r67;
995
+ shl.b32 %r269, %r4, 3;
996
+ or.b32 %r270, %r269, %r263;
997
+ add.s32 %r68, %r266, %r270;
998
+ mov.b32 %r69, %f175;
999
+ @%p68 st.shared.b32 [ %r68 + 0 ], %r69;
1000
+ shl.b32 %r271, %r5, 3;
1001
+ or.b32 %r272, %r271, %r263;
1002
+ add.s32 %r70, %r266, %r272;
1003
+ mov.b32 %r71, %f185;
1004
+ @%p68 st.shared.b32 [ %r70 + 0 ], %r71;
1005
+ shl.b32 %r273, %r6, 3;
1006
+ or.b32 %r274, %r273, %r263;
1007
+ add.s32 %r72, %r266, %r274;
1008
+ mov.b32 %r73, %f195;
1009
+ @%p68 st.shared.b32 [ %r72 + 0 ], %r73;
1010
+ shl.b32 %r275, %r7, 3;
1011
+ or.b32 %r276, %r275, %r263;
1012
+ add.s32 %r74, %r266, %r276;
1013
+ mov.b32 %r75, %f205;
1014
+ @%p68 st.shared.b32 [ %r74 + 0 ], %r75;
1015
+ shl.b32 %r277, %r8, 3;
1016
+ or.b32 %r278, %r277, %r263;
1017
+ add.s32 %r76, %r266, %r278;
1018
+ mov.b32 %r77, %f215;
1019
+ @%p68 st.shared.b32 [ %r76 + 0 ], %r77;
1020
+ shl.b32 %r279, %r9, 3;
1021
+ or.b32 %r280, %r279, %r263;
1022
+ add.s32 %r78, %r266, %r280;
1023
+ mov.b32 %r79, %f225;
1024
+ @%p68 st.shared.b32 [ %r78 + 0 ], %r79;
1025
+ shl.b32 %r281, %r10, 3;
1026
+ or.b32 %r282, %r281, %r263;
1027
+ add.s32 %r80, %r266, %r282;
1028
+ mov.b32 %r81, %f235;
1029
+ @%p68 st.shared.b32 [ %r80 + 0 ], %r81;
1030
+ shl.b32 %r283, %r11, 3;
1031
+ or.b32 %r284, %r283, %r263;
1032
+ add.s32 %r82, %r266, %r284;
1033
+ mov.b32 %r83, %f245;
1034
+ @%p68 st.shared.b32 [ %r82 + 0 ], %r83;
1035
+ shl.b32 %r285, %r12, 3;
1036
+ or.b32 %r286, %r285, %r263;
1037
+ add.s32 %r84, %r266, %r286;
1038
+ mov.b32 %r85, %f255;
1039
+ @%p68 st.shared.b32 [ %r84 + 0 ], %r85;
1040
+ shl.b32 %r287, %r13, 3;
1041
+ or.b32 %r288, %r287, %r263;
1042
+ add.s32 %r86, %r266, %r288;
1043
+ mov.b32 %r87, %f265;
1044
+ @%p68 st.shared.b32 [ %r86 + 0 ], %r87;
1045
+ shl.b32 %r289, %r14, 3;
1046
+ or.b32 %r290, %r289, %r263;
1047
+ add.s32 %r88, %r266, %r290;
1048
+ mov.b32 %r89, %f275;
1049
+ @%p68 st.shared.b32 [ %r88 + 0 ], %r89;
1050
+ shl.b32 %r291, %r15, 3;
1051
+ or.b32 %r292, %r291, %r263;
1052
+ add.s32 %r90, %r266, %r292;
1053
+ mov.b32 %r91, %f285;
1054
+ @%p68 st.shared.b32 [ %r90 + 0 ], %r91;
1055
+ shl.b32 %r293, %r16, 3;
1056
+ or.b32 %r294, %r293, %r263;
1057
+ add.s32 %r92, %r266, %r294;
1058
+ mov.b32 %r93, %f295;
1059
+ @%p68 st.shared.b32 [ %r92 + 0 ], %r93;
1060
+ shl.b32 %r295, %r17, 3;
1061
+ or.b32 %r296, %r295, %r263;
1062
+ add.s32 %r94, %r266, %r296;
1063
+ mov.b32 %r95, %f305;
1064
+ @%p68 st.shared.b32 [ %r94 + 0 ], %r95;
1065
+ bar.sync 0;
1066
+ setp.lt.s32 %p84, %r1, 128;
1067
+ shl.b32 %r297, %r1, 2;
1068
+ add.s32 %r97, %r266, %r297;
1069
+ @%p84 ld.shared.b32 %r96, [ %r97 + 0 ];
1070
+ mov.b32 %f306, %r96;
1071
+ shfl.sync.bfly.b32 %r298, %r96, 1, 31, -1;
1072
+ mov.b32 %f307, %r298;
1073
+ $L__tmp162:
1074
+ .loc 2 233 15
1075
+ add.f32 %f308, %f306, %f307;
1076
+ $L__tmp163:
1077
+ .loc 2 243 36
1078
+ and.b32 %r299, %r1, 1;
1079
+ setp.eq.b32 %p86, %r299, 1;
1080
+ not.pred %p87, %p86;
1081
+ and.pred %p85, %p84, %p87;
1082
+ mov.b32 %r99, %f308;
1083
+ @%p85 st.shared.b32 [ %r97 + 0 ], %r99;
1084
+ bar.sync 0;
1085
+ add.s32 %r300, %r266, %r264;
1086
+ ld.shared.f32 %f49, [%r300];
1087
+ add.s32 %r301, %r266, %r267;
1088
+ ld.shared.f32 %f50, [%r301];
1089
+ add.s32 %r302, %r266, %r269;
1090
+ ld.shared.f32 %f51, [%r302];
1091
+ add.s32 %r303, %r266, %r271;
1092
+ ld.shared.f32 %f52, [%r303];
1093
+ add.s32 %r304, %r266, %r273;
1094
+ ld.shared.f32 %f53, [%r304];
1095
+ add.s32 %r305, %r266, %r275;
1096
+ ld.shared.f32 %f54, [%r305];
1097
+ add.s32 %r306, %r266, %r277;
1098
+ ld.shared.f32 %f55, [%r306];
1099
+ add.s32 %r307, %r266, %r279;
1100
+ ld.shared.f32 %f56, [%r307];
1101
+ add.s32 %r308, %r266, %r281;
1102
+ ld.shared.f32 %f57, [%r308];
1103
+ add.s32 %r309, %r266, %r283;
1104
+ ld.shared.f32 %f58, [%r309];
1105
+ add.s32 %r310, %r266, %r285;
1106
+ ld.shared.f32 %f59, [%r310];
1107
+ add.s32 %r311, %r266, %r287;
1108
+ ld.shared.f32 %f60, [%r311];
1109
+ add.s32 %r312, %r266, %r289;
1110
+ ld.shared.f32 %f61, [%r312];
1111
+ add.s32 %r313, %r266, %r291;
1112
+ ld.shared.f32 %f62, [%r313];
1113
+ add.s32 %r314, %r266, %r293;
1114
+ ld.shared.f32 %f63, [%r314];
1115
+ add.s32 %r315, %r266, %r295;
1116
+ ld.shared.f32 %f64, [%r315];
1117
+ $L__tmp164:
1118
+ .loc 1 51 36
1119
+ shl.b64 %rd80, %rd3, 1;
1120
+ add.s64 %rd7, %rd17, %rd80;
1121
+ mul.lo.s64 %rd81, %rd1, 6432896;
1122
+ mul.lo.s64 %rd82, %rd2, 100514;
1123
+ add.s64 %rd148, %rd81, %rd82;
1124
+ add.s64 %rd9, %rd16, %rd80;
1125
+ add.s64 %rd10, %rd15, %rd80;
1126
+ mov.b32 %r398, -64;
1127
+ mov.u16 %rs2, 0;
1128
+ $L__BB0_3:
1129
+ add.s32 %r398, %r398, 64;
1130
+ .loc 1 52 27
1131
+ add.s32 %r396, %r398, %r18;
1132
+ .loc 1 53 25
1133
+ setp.lt.u32 %p88, %r396, 50257;
1134
+ .loc 1 55 35
1135
+ add.s64 %rd83, %rd10, %rd148;
1136
+ add.s64 %rd84, %rd83, 402056;
1137
+ add.s64 %rd85, %rd83, 804112;
1138
+ add.s64 %rd86, %rd83, 1206168;
1139
+ add.s64 %rd87, %rd83, 1608224;
1140
+ add.s64 %rd88, %rd83, 2010280;
1141
+ add.s64 %rd89, %rd83, 2412336;
1142
+ add.s64 %rd90, %rd83, 2814392;
1143
+ add.s64 %rd91, %rd83, 3216448;
1144
+ add.s64 %rd92, %rd83, 3618504;
1145
+ add.s64 %rd93, %rd83, 4020560;
1146
+ add.s64 %rd94, %rd83, 4422616;
1147
+ add.s64 %rd95, %rd83, 4824672;
1148
+ add.s64 %rd96, %rd83, 5226728;
1149
+ add.s64 %rd97, %rd83, 5628784;
1150
+ .loc 1 55 53
1151
+ add.s64 %rd98, %rd83, 6030840;
1152
+ mov.u16 %rs1, 0x0;
1153
+ @%p88 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd83 + 0 ];
1154
+ @!%p88 mov.u16 %rs1, %rs2;
1155
+ mov.u16 %rs3, 0x0;
1156
+ @%p88 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd84 + 0 ];
1157
+ @!%p88 mov.u16 %rs3, %rs2;
1158
+ mov.u16 %rs5, 0x0;
1159
+ @%p88 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd85 + 0 ];
1160
+ @!%p88 mov.u16 %rs5, %rs2;
1161
+ mov.u16 %rs7, 0x0;
1162
+ @%p88 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd86 + 0 ];
1163
+ @!%p88 mov.u16 %rs7, %rs2;
1164
+ mov.u16 %rs9, 0x0;
1165
+ @%p88 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd87 + 0 ];
1166
+ @!%p88 mov.u16 %rs9, %rs2;
1167
+ mov.u16 %rs11, 0x0;
1168
+ @%p88 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd88 + 0 ];
1169
+ @!%p88 mov.u16 %rs11, %rs2;
1170
+ mov.u16 %rs13, 0x0;
1171
+ @%p88 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd89 + 0 ];
1172
+ @!%p88 mov.u16 %rs13, %rs2;
1173
+ mov.u16 %rs15, 0x0;
1174
+ @%p88 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd90 + 0 ];
1175
+ @!%p88 mov.u16 %rs15, %rs2;
1176
+ mov.u16 %rs17, 0x0;
1177
+ @%p88 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd91 + 0 ];
1178
+ @!%p88 mov.u16 %rs17, %rs2;
1179
+ mov.u16 %rs19, 0x0;
1180
+ @%p88 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd92 + 0 ];
1181
+ @!%p88 mov.u16 %rs19, %rs2;
1182
+ mov.u16 %rs21, 0x0;
1183
+ @%p88 ld.global.L1::evict_first.b16 { %rs21 }, [ %rd93 + 0 ];
1184
+ @!%p88 mov.u16 %rs21, %rs2;
1185
+ mov.u16 %rs23, 0x0;
1186
+ @%p88 ld.global.L1::evict_first.b16 { %rs23 }, [ %rd94 + 0 ];
1187
+ @!%p88 mov.u16 %rs23, %rs2;
1188
+ mov.u16 %rs25, 0x0;
1189
+ @%p88 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd95 + 0 ];
1190
+ @!%p88 mov.u16 %rs25, %rs2;
1191
+ mov.u16 %rs27, 0x0;
1192
+ @%p88 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd96 + 0 ];
1193
+ @!%p88 mov.u16 %rs27, %rs2;
1194
+ mov.u16 %rs29, 0x0;
1195
+ @%p88 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd97 + 0 ];
1196
+ @!%p88 mov.u16 %rs29, %rs2;
1197
+ mov.u16 %rs31, 0x0;
1198
+ @%p88 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd98 + 0 ];
1199
+ @!%p88 mov.u16 %rs31, %rs2;
1200
+ .loc 1 55 105
1201
+ cvt.f32.bf16 %r316, %rs1;
1202
+ mov.b32 %f341, %r316;
1203
+ cvt.f32.bf16 %r317, %rs3;
1204
+ mov.b32 %f342, %r317;
1205
+ cvt.f32.bf16 %r318, %rs5;
1206
+ mov.b32 %f343, %r318;
1207
+ cvt.f32.bf16 %r319, %rs7;
1208
+ mov.b32 %f344, %r319;
1209
+ cvt.f32.bf16 %r320, %rs9;
1210
+ mov.b32 %f345, %r320;
1211
+ cvt.f32.bf16 %r321, %rs11;
1212
+ mov.b32 %f346, %r321;
1213
+ cvt.f32.bf16 %r322, %rs13;
1214
+ mov.b32 %f347, %r322;
1215
+ cvt.f32.bf16 %r323, %rs15;
1216
+ mov.b32 %f348, %r323;
1217
+ cvt.f32.bf16 %r324, %rs17;
1218
+ mov.b32 %f349, %r324;
1219
+ cvt.f32.bf16 %r325, %rs19;
1220
+ mov.b32 %f350, %r325;
1221
+ cvt.f32.bf16 %r326, %rs21;
1222
+ mov.b32 %f351, %r326;
1223
+ cvt.f32.bf16 %r327, %rs23;
1224
+ mov.b32 %f352, %r327;
1225
+ cvt.f32.bf16 %r328, %rs25;
1226
+ mov.b32 %f353, %r328;
1227
+ cvt.f32.bf16 %r329, %rs27;
1228
+ mov.b32 %f354, %r329;
1229
+ cvt.f32.bf16 %r330, %rs29;
1230
+ mov.b32 %f355, %r330;
1231
+ cvt.f32.bf16 %r331, %rs31;
1232
+ mov.b32 %f356, %r331;
1233
+ .loc 1 56 35
1234
+ add.s64 %rd99, %rd149, -12061680;
1235
+ add.s64 %rd100, %rd149, -11257568;
1236
+ add.s64 %rd101, %rd149, -10453456;
1237
+ add.s64 %rd102, %rd149, -9649344;
1238
+ add.s64 %rd103, %rd149, -8845232;
1239
+ add.s64 %rd104, %rd149, -8041120;
1240
+ add.s64 %rd105, %rd149, -7237008;
1241
+ add.s64 %rd106, %rd149, -6432896;
1242
+ add.s64 %rd107, %rd149, -5628784;
1243
+ add.s64 %rd108, %rd149, -4824672;
1244
+ add.s64 %rd109, %rd149, -4020560;
1245
+ add.s64 %rd110, %rd149, -3216448;
1246
+ add.s64 %rd111, %rd149, -2412336;
1247
+ add.s64 %rd112, %rd149, -1608224;
1248
+ add.s64 %rd113, %rd149, -804112;
1249
+ .loc 1 56 53
1250
+ mov.u32 %r332, 0x0;
1251
+ @%p88 ld.global.L1::evict_first.b32 { %r332 }, [ %rd99 + 0 ];
1252
+ @!%p88 mov.u32 %r332, %r333;
1253
+ mov.b32 %f357, %r332;
1254
+ mov.u32 %r334, 0x0;
1255
+ @%p88 ld.global.L1::evict_first.b32 { %r334 }, [ %rd100 + 0 ];
1256
+ @!%p88 mov.u32 %r334, %r333;
1257
+ mov.b32 %f358, %r334;
1258
+ mov.u32 %r336, 0x0;
1259
+ @%p88 ld.global.L1::evict_first.b32 { %r336 }, [ %rd101 + 0 ];
1260
+ @!%p88 mov.u32 %r336, %r333;
1261
+ mov.b32 %f359, %r336;
1262
+ mov.u32 %r338, 0x0;
1263
+ @%p88 ld.global.L1::evict_first.b32 { %r338 }, [ %rd102 + 0 ];
1264
+ @!%p88 mov.u32 %r338, %r333;
1265
+ mov.b32 %f360, %r338;
1266
+ mov.u32 %r340, 0x0;
1267
+ @%p88 ld.global.L1::evict_first.b32 { %r340 }, [ %rd103 + 0 ];
1268
+ @!%p88 mov.u32 %r340, %r333;
1269
+ mov.b32 %f361, %r340;
1270
+ mov.u32 %r342, 0x0;
1271
+ @%p88 ld.global.L1::evict_first.b32 { %r342 }, [ %rd104 + 0 ];
1272
+ @!%p88 mov.u32 %r342, %r333;
1273
+ mov.b32 %f362, %r342;
1274
+ mov.u32 %r344, 0x0;
1275
+ @%p88 ld.global.L1::evict_first.b32 { %r344 }, [ %rd105 + 0 ];
1276
+ @!%p88 mov.u32 %r344, %r333;
1277
+ mov.b32 %f363, %r344;
1278
+ mov.u32 %r346, 0x0;
1279
+ @%p88 ld.global.L1::evict_first.b32 { %r346 }, [ %rd106 + 0 ];
1280
+ @!%p88 mov.u32 %r346, %r333;
1281
+ mov.b32 %f364, %r346;
1282
+ mov.u32 %r348, 0x0;
1283
+ @%p88 ld.global.L1::evict_first.b32 { %r348 }, [ %rd107 + 0 ];
1284
+ @!%p88 mov.u32 %r348, %r333;
1285
+ mov.b32 %f365, %r348;
1286
+ mov.u32 %r350, 0x0;
1287
+ @%p88 ld.global.L1::evict_first.b32 { %r350 }, [ %rd108 + 0 ];
1288
+ @!%p88 mov.u32 %r350, %r333;
1289
+ mov.b32 %f366, %r350;
1290
+ mov.u32 %r352, 0x0;
1291
+ @%p88 ld.global.L1::evict_first.b32 { %r352 }, [ %rd109 + 0 ];
1292
+ @!%p88 mov.u32 %r352, %r333;
1293
+ mov.b32 %f367, %r352;
1294
+ mov.u32 %r354, 0x0;
1295
+ @%p88 ld.global.L1::evict_first.b32 { %r354 }, [ %rd110 + 0 ];
1296
+ @!%p88 mov.u32 %r354, %r333;
1297
+ mov.b32 %f368, %r354;
1298
+ mov.u32 %r356, 0x0;
1299
+ @%p88 ld.global.L1::evict_first.b32 { %r356 }, [ %rd111 + 0 ];
1300
+ @!%p88 mov.u32 %r356, %r333;
1301
+ mov.b32 %f369, %r356;
1302
+ mov.u32 %r358, 0x0;
1303
+ @%p88 ld.global.L1::evict_first.b32 { %r358 }, [ %rd112 + 0 ];
1304
+ @!%p88 mov.u32 %r358, %r333;
1305
+ mov.b32 %f370, %r358;
1306
+ mov.u32 %r360, 0x0;
1307
+ @%p88 ld.global.L1::evict_first.b32 { %r360 }, [ %rd113 + 0 ];
1308
+ @!%p88 mov.u32 %r360, %r333;
1309
+ mov.b32 %f371, %r360;
1310
+ mov.u32 %r362, 0x0;
1311
+ @%p88 ld.global.L1::evict_first.b32 { %r362 }, [ %rd149 + 0 ];
1312
+ @!%p88 mov.u32 %r362, %r333;
1313
+ mov.b32 %f372, %r362;
1314
+ .loc 1 57 35
1315
+ add.s64 %rd115, %rd9, %rd148;
1316
+ add.s64 %rd116, %rd115, 402056;
1317
+ add.s64 %rd117, %rd115, 804112;
1318
+ add.s64 %rd118, %rd115, 1206168;
1319
+ add.s64 %rd119, %rd115, 1608224;
1320
+ add.s64 %rd120, %rd115, 2010280;
1321
+ add.s64 %rd121, %rd115, 2412336;
1322
+ add.s64 %rd122, %rd115, 2814392;
1323
+ add.s64 %rd123, %rd115, 3216448;
1324
+ add.s64 %rd124, %rd115, 3618504;
1325
+ add.s64 %rd125, %rd115, 4020560;
1326
+ add.s64 %rd126, %rd115, 4422616;
1327
+ add.s64 %rd127, %rd115, 4824672;
1328
+ add.s64 %rd128, %rd115, 5226728;
1329
+ add.s64 %rd129, %rd115, 5628784;
1330
+ .loc 1 57 53
1331
+ add.s64 %rd130, %rd115, 6030840;
1332
+ mov.u16 %rs49, 0x0;
1333
+ @%p88 ld.global.L1::evict_first.b16 { %rs49 }, [ %rd115 + 0 ];
1334
+ @!%p88 mov.u16 %rs49, %rs2;
1335
+ mov.u16 %rs51, 0x0;
1336
+ @%p88 ld.global.L1::evict_first.b16 { %rs51 }, [ %rd116 + 0 ];
1337
+ @!%p88 mov.u16 %rs51, %rs2;
1338
+ mov.u16 %rs53, 0x0;
1339
+ @%p88 ld.global.L1::evict_first.b16 { %rs53 }, [ %rd117 + 0 ];
1340
+ @!%p88 mov.u16 %rs53, %rs2;
1341
+ mov.u16 %rs55, 0x0;
1342
+ @%p88 ld.global.L1::evict_first.b16 { %rs55 }, [ %rd118 + 0 ];
1343
+ @!%p88 mov.u16 %rs55, %rs2;
1344
+ mov.u16 %rs57, 0x0;
1345
+ @%p88 ld.global.L1::evict_first.b16 { %rs57 }, [ %rd119 + 0 ];
1346
+ @!%p88 mov.u16 %rs57, %rs2;
1347
+ mov.u16 %rs59, 0x0;
1348
+ @%p88 ld.global.L1::evict_first.b16 { %rs59 }, [ %rd120 + 0 ];
1349
+ @!%p88 mov.u16 %rs59, %rs2;
1350
+ mov.u16 %rs61, 0x0;
1351
+ @%p88 ld.global.L1::evict_first.b16 { %rs61 }, [ %rd121 + 0 ];
1352
+ @!%p88 mov.u16 %rs61, %rs2;
1353
+ mov.u16 %rs63, 0x0;
1354
+ @%p88 ld.global.L1::evict_first.b16 { %rs63 }, [ %rd122 + 0 ];
1355
+ @!%p88 mov.u16 %rs63, %rs2;
1356
+ mov.u16 %rs65, 0x0;
1357
+ @%p88 ld.global.L1::evict_first.b16 { %rs65 }, [ %rd123 + 0 ];
1358
+ @!%p88 mov.u16 %rs65, %rs2;
1359
+ mov.u16 %rs67, 0x0;
1360
+ @%p88 ld.global.L1::evict_first.b16 { %rs67 }, [ %rd124 + 0 ];
1361
+ @!%p88 mov.u16 %rs67, %rs2;
1362
+ mov.u16 %rs69, 0x0;
1363
+ @%p88 ld.global.L1::evict_first.b16 { %rs69 }, [ %rd125 + 0 ];
1364
+ @!%p88 mov.u16 %rs69, %rs2;
1365
+ mov.u16 %rs71, 0x0;
1366
+ @%p88 ld.global.L1::evict_first.b16 { %rs71 }, [ %rd126 + 0 ];
1367
+ @!%p88 mov.u16 %rs71, %rs2;
1368
+ mov.u16 %rs73, 0x0;
1369
+ @%p88 ld.global.L1::evict_first.b16 { %rs73 }, [ %rd127 + 0 ];
1370
+ @!%p88 mov.u16 %rs73, %rs2;
1371
+ mov.u16 %rs75, 0x0;
1372
+ @%p88 ld.global.L1::evict_first.b16 { %rs75 }, [ %rd128 + 0 ];
1373
+ @!%p88 mov.u16 %rs75, %rs2;
1374
+ mov.u16 %rs77, 0x0;
1375
+ @%p88 ld.global.L1::evict_first.b16 { %rs77 }, [ %rd129 + 0 ];
1376
+ @!%p88 mov.u16 %rs77, %rs2;
1377
+ mov.u16 %rs79, 0x0;
1378
+ @%p88 ld.global.L1::evict_first.b16 { %rs79 }, [ %rd130 + 0 ];
1379
+ @!%p88 mov.u16 %rs79, %rs2;
1380
+ .loc 1 57 105
1381
+ cvt.f32.bf16 %r364, %rs49;
1382
+ mov.b32 %f373, %r364;
1383
+ cvt.f32.bf16 %r365, %rs51;
1384
+ mov.b32 %f374, %r365;
1385
+ cvt.f32.bf16 %r366, %rs53;
1386
+ mov.b32 %f375, %r366;
1387
+ cvt.f32.bf16 %r367, %rs55;
1388
+ mov.b32 %f376, %r367;
1389
+ cvt.f32.bf16 %r368, %rs57;
1390
+ mov.b32 %f377, %r368;
1391
+ cvt.f32.bf16 %r369, %rs59;
1392
+ mov.b32 %f378, %r369;
1393
+ cvt.f32.bf16 %r370, %rs61;
1394
+ mov.b32 %f379, %r370;
1395
+ cvt.f32.bf16 %r371, %rs63;
1396
+ mov.b32 %f380, %r371;
1397
+ cvt.f32.bf16 %r372, %rs65;
1398
+ mov.b32 %f381, %r372;
1399
+ cvt.f32.bf16 %r373, %rs67;
1400
+ mov.b32 %f382, %r373;
1401
+ cvt.f32.bf16 %r374, %rs69;
1402
+ mov.b32 %f383, %r374;
1403
+ cvt.f32.bf16 %r375, %rs71;
1404
+ mov.b32 %f384, %r375;
1405
+ cvt.f32.bf16 %r376, %rs73;
1406
+ mov.b32 %f385, %r376;
1407
+ cvt.f32.bf16 %r377, %rs75;
1408
+ mov.b32 %f386, %r377;
1409
+ cvt.f32.bf16 %r378, %rs77;
1410
+ mov.b32 %f387, %r378;
1411
+ cvt.f32.bf16 %r379, %rs79;
1412
+ mov.b32 %f388, %r379;
1413
+ .loc 1 65 23
1414
+ mul.f32 %f310, %f373, 0f3FB8AA3B;
1415
+ ex2.approx.f32 %f309, %f310;
1416
+ mul.f32 %f312, %f374, 0f3FB8AA3B;
1417
+ ex2.approx.f32 %f311, %f312;
1418
+ mul.f32 %f314, %f375, 0f3FB8AA3B;
1419
+ ex2.approx.f32 %f313, %f314;
1420
+ mul.f32 %f316, %f376, 0f3FB8AA3B;
1421
+ ex2.approx.f32 %f315, %f316;
1422
+ mul.f32 %f318, %f377, 0f3FB8AA3B;
1423
+ ex2.approx.f32 %f317, %f318;
1424
+ mul.f32 %f320, %f378, 0f3FB8AA3B;
1425
+ ex2.approx.f32 %f319, %f320;
1426
+ mul.f32 %f322, %f379, 0f3FB8AA3B;
1427
+ ex2.approx.f32 %f321, %f322;
1428
+ mul.f32 %f324, %f380, 0f3FB8AA3B;
1429
+ ex2.approx.f32 %f323, %f324;
1430
+ mul.f32 %f326, %f381, 0f3FB8AA3B;
1431
+ ex2.approx.f32 %f325, %f326;
1432
+ mul.f32 %f328, %f382, 0f3FB8AA3B;
1433
+ ex2.approx.f32 %f327, %f328;
1434
+ mul.f32 %f330, %f383, 0f3FB8AA3B;
1435
+ ex2.approx.f32 %f329, %f330;
1436
+ mul.f32 %f332, %f384, 0f3FB8AA3B;
1437
+ ex2.approx.f32 %f331, %f332;
1438
+ mul.f32 %f334, %f385, 0f3FB8AA3B;
1439
+ ex2.approx.f32 %f333, %f334;
1440
+ mul.f32 %f336, %f386, 0f3FB8AA3B;
1441
+ ex2.approx.f32 %f335, %f336;
1442
+ mul.f32 %f338, %f387, 0f3FB8AA3B;
1443
+ ex2.approx.f32 %f337, %f338;
1444
+ mul.f32 %f340, %f388, 0f3FB8AA3B;
1445
+ ex2.approx.f32 %f339, %f340;
1446
+ .loc 1 66 24
1447
+ mul.f32 %f389, %f49, %f309;
1448
+ mul.f32 %f390, %f50, %f311;
1449
+ mul.f32 %f391, %f51, %f313;
1450
+ mul.f32 %f392, %f52, %f315;
1451
+ mul.f32 %f393, %f53, %f317;
1452
+ mul.f32 %f394, %f54, %f319;
1453
+ mul.f32 %f395, %f55, %f321;
1454
+ mul.f32 %f396, %f56, %f323;
1455
+ mul.f32 %f397, %f57, %f325;
1456
+ mul.f32 %f398, %f58, %f327;
1457
+ mul.f32 %f399, %f59, %f329;
1458
+ mul.f32 %f400, %f60, %f331;
1459
+ mul.f32 %f401, %f61, %f333;
1460
+ mul.f32 %f402, %f62, %f335;
1461
+ mul.f32 %f403, %f63, %f337;
1462
+ mul.f32 %f404, %f64, %f339;
1463
+ .loc 1 67 24
1464
+ neg.f32 %f405, %f389;
1465
+ fma.rn.f32 %f406, %f1, %f357, %f405;
1466
+ neg.f32 %f407, %f390;
1467
+ fma.rn.f32 %f408, %f2, %f358, %f407;
1468
+ neg.f32 %f409, %f391;
1469
+ fma.rn.f32 %f410, %f3, %f359, %f409;
1470
+ neg.f32 %f411, %f392;
1471
+ fma.rn.f32 %f412, %f4, %f360, %f411;
1472
+ neg.f32 %f413, %f393;
1473
+ fma.rn.f32 %f414, %f5, %f361, %f413;
1474
+ neg.f32 %f415, %f394;
1475
+ fma.rn.f32 %f416, %f6, %f362, %f415;
1476
+ neg.f32 %f417, %f395;
1477
+ fma.rn.f32 %f418, %f7, %f363, %f417;
1478
+ neg.f32 %f419, %f396;
1479
+ fma.rn.f32 %f420, %f8, %f364, %f419;
1480
+ neg.f32 %f421, %f397;
1481
+ fma.rn.f32 %f422, %f9, %f365, %f421;
1482
+ neg.f32 %f423, %f398;
1483
+ fma.rn.f32 %f424, %f10, %f366, %f423;
1484
+ neg.f32 %f425, %f399;
1485
+ fma.rn.f32 %f426, %f11, %f367, %f425;
1486
+ neg.f32 %f427, %f400;
1487
+ fma.rn.f32 %f428, %f12, %f368, %f427;
1488
+ neg.f32 %f429, %f401;
1489
+ fma.rn.f32 %f430, %f13, %f369, %f429;
1490
+ neg.f32 %f431, %f402;
1491
+ fma.rn.f32 %f432, %f14, %f370, %f431;
1492
+ neg.f32 %f433, %f403;
1493
+ fma.rn.f32 %f434, %f15, %f371, %f433;
1494
+ neg.f32 %f435, %f404;
1495
+ fma.rn.f32 %f436, %f16, %f372, %f435;
1496
+ .loc 1 69 24
1497
+ add.f32 %f437, %f341, %f406;
1498
+ add.f32 %f438, %f342, %f408;
1499
+ add.f32 %f439, %f343, %f410;
1500
+ add.f32 %f440, %f344, %f412;
1501
+ add.f32 %f441, %f345, %f414;
1502
+ add.f32 %f442, %f346, %f416;
1503
+ add.f32 %f443, %f347, %f418;
1504
+ add.f32 %f444, %f348, %f420;
1505
+ add.f32 %f445, %f349, %f422;
1506
+ add.f32 %f446, %f350, %f424;
1507
+ add.f32 %f447, %f351, %f426;
1508
+ add.f32 %f448, %f352, %f428;
1509
+ add.f32 %f449, %f353, %f430;
1510
+ add.f32 %f450, %f354, %f432;
1511
+ add.f32 %f451, %f355, %f434;
1512
+ add.f32 %f452, %f356, %f436;
1513
+ .loc 1 70 29
1514
+ add.s64 %rd131, %rd7, %rd148;
1515
+ add.s64 %rd132, %rd131, 402056;
1516
+ add.s64 %rd133, %rd131, 804112;
1517
+ add.s64 %rd134, %rd131, 1206168;
1518
+ add.s64 %rd135, %rd131, 1608224;
1519
+ add.s64 %rd136, %rd131, 2010280;
1520
+ add.s64 %rd137, %rd131, 2412336;
1521
+ add.s64 %rd138, %rd131, 2814392;
1522
+ add.s64 %rd139, %rd131, 3216448;
1523
+ add.s64 %rd140, %rd131, 3618504;
1524
+ add.s64 %rd141, %rd131, 4020560;
1525
+ add.s64 %rd142, %rd131, 4422616;
1526
+ add.s64 %rd143, %rd131, 4824672;
1527
+ add.s64 %rd144, %rd131, 5226728;
1528
+ add.s64 %rd145, %rd131, 5628784;
1529
+ .loc 1 70 54
1530
+ add.s64 %rd146, %rd131, 6030840;
1531
+ mov.b32 %r380, %f437;
1532
+ cvt.rn.bf16.f32 %rs97, %r380;
1533
+ mov.b32 %r381, %f438;
1534
+ cvt.rn.bf16.f32 %rs98, %r381;
1535
+ mov.b32 %r382, %f439;
1536
+ cvt.rn.bf16.f32 %rs99, %r382;
1537
+ mov.b32 %r383, %f440;
1538
+ cvt.rn.bf16.f32 %rs100, %r383;
1539
+ mov.b32 %r384, %f441;
1540
+ cvt.rn.bf16.f32 %rs101, %r384;
1541
+ mov.b32 %r385, %f442;
1542
+ cvt.rn.bf16.f32 %rs102, %r385;
1543
+ mov.b32 %r386, %f443;
1544
+ cvt.rn.bf16.f32 %rs103, %r386;
1545
+ mov.b32 %r387, %f444;
1546
+ cvt.rn.bf16.f32 %rs104, %r387;
1547
+ mov.b32 %r388, %f445;
1548
+ cvt.rn.bf16.f32 %rs105, %r388;
1549
+ mov.b32 %r389, %f446;
1550
+ cvt.rn.bf16.f32 %rs106, %r389;
1551
+ mov.b32 %r390, %f447;
1552
+ cvt.rn.bf16.f32 %rs107, %r390;
1553
+ mov.b32 %r391, %f448;
1554
+ cvt.rn.bf16.f32 %rs108, %r391;
1555
+ mov.b32 %r392, %f449;
1556
+ cvt.rn.bf16.f32 %rs109, %r392;
1557
+ mov.b32 %r393, %f450;
1558
+ cvt.rn.bf16.f32 %rs110, %r393;
1559
+ mov.b32 %r394, %f451;
1560
+ cvt.rn.bf16.f32 %rs111, %r394;
1561
+ mov.b32 %r395, %f452;
1562
+ cvt.rn.bf16.f32 %rs112, %r395;
1563
+ @%p88 st.global.b16 [ %rd131 + 0 ], { %rs97 };
1564
+ @%p88 st.global.b16 [ %rd132 + 0 ], { %rs98 };
1565
+ @%p88 st.global.b16 [ %rd133 + 0 ], { %rs99 };
1566
+ @%p88 st.global.b16 [ %rd134 + 0 ], { %rs100 };
1567
+ @%p88 st.global.b16 [ %rd135 + 0 ], { %rs101 };
1568
+ @%p88 st.global.b16 [ %rd136 + 0 ], { %rs102 };
1569
+ @%p88 st.global.b16 [ %rd137 + 0 ], { %rs103 };
1570
+ @%p88 st.global.b16 [ %rd138 + 0 ], { %rs104 };
1571
+ @%p88 st.global.b16 [ %rd139 + 0 ], { %rs105 };
1572
+ @%p88 st.global.b16 [ %rd140 + 0 ], { %rs106 };
1573
+ @%p88 st.global.b16 [ %rd141 + 0 ], { %rs107 };
1574
+ @%p88 st.global.b16 [ %rd142 + 0 ], { %rs108 };
1575
+ @%p88 st.global.b16 [ %rd143 + 0 ], { %rs109 };
1576
+ @%p88 st.global.b16 [ %rd144 + 0 ], { %rs110 };
1577
+ @%p88 st.global.b16 [ %rd145 + 0 ], { %rs111 };
1578
+ @%p88 st.global.b16 [ %rd146 + 0 ], { %rs112 };
1579
+ .loc 1 51 36
1580
+ add.s64 %rd149, %rd149, 256;
1581
+ add.s64 %rd148, %rd148, 128;
1582
+ setp.lt.u32 %p200, %r398, 50193;
1583
+ @%p200 bra $L__BB0_3;
1584
+ .loc 1 51 4
1585
+ ret;
1586
+ $L__tmp165:
1587
+ $L__func_end0:
1588
+
1589
+ }
1590
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
1591
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
1592
+ .section .debug_abbrev
1593
+ {
1594
+ .b8 1
1595
+ .b8 17
1596
+ .b8 1
1597
+ .b8 37
1598
+ .b8 8
1599
+ .b8 19
1600
+ .b8 5
1601
+ .b8 3
1602
+ .b8 8
1603
+ .b8 16
1604
+ .b8 6
1605
+ .b8 27
1606
+ .b8 8
1607
+ .b8 180
1608
+ .b8 66
1609
+ .b8 12
1610
+ .b8 17
1611
+ .b8 1
1612
+ .b8 18
1613
+ .b8 1
1614
+ .b8 0
1615
+ .b8 0
1616
+ .b8 2
1617
+ .b8 46
1618
+ .b8 0
1619
+ .b8 135
1620
+ .b8 64
1621
+ .b8 8
1622
+ .b8 3
1623
+ .b8 8
1624
+ .b8 58
1625
+ .b8 11
1626
+ .b8 59
1627
+ .b8 11
1628
+ .b8 63
1629
+ .b8 12
1630
+ .b8 32
1631
+ .b8 11
1632
+ .b8 0
1633
+ .b8 0
1634
+ .b8 3
1635
+ .b8 46
1636
+ .b8 1
1637
+ .b8 17
1638
+ .b8 1
1639
+ .b8 18
1640
+ .b8 1
1641
+ .b8 64
1642
+ .b8 10
1643
+ .b8 49
1644
+ .b8 19
1645
+ .b8 0
1646
+ .b8 0
1647
+ .b8 4
1648
+ .b8 29
1649
+ .b8 0
1650
+ .b8 49
1651
+ .b8 19
1652
+ .b8 17
1653
+ .b8 1
1654
+ .b8 18
1655
+ .b8 1
1656
+ .b8 88
1657
+ .b8 11
1658
+ .b8 89
1659
+ .b8 11
1660
+ .b8 87
1661
+ .b8 11
1662
+ .b8 0
1663
+ .b8 0
1664
+ .b8 5
1665
+ .b8 29
1666
+ .b8 1
1667
+ .b8 49
1668
+ .b8 19
1669
+ .b8 17
1670
+ .b8 1
1671
+ .b8 18
1672
+ .b8 1
1673
+ .b8 88
1674
+ .b8 11
1675
+ .b8 89
1676
+ .b8 11
1677
+ .b8 87
1678
+ .b8 11
1679
+ .b8 0
1680
+ .b8 0
1681
+ .b8 0
1682
+ }
1683
+ .section .debug_info
1684
+ {
1685
+ .b32 278
1686
+ .b8 2
1687
+ .b8 0
1688
+ .b32 .debug_abbrev
1689
+ .b8 8
1690
+ .b8 1
1691
+ .b8 116
1692
+ .b8 114
1693
+ .b8 105
1694
+ .b8 116
1695
+ .b8 111
1696
+ .b8 110
1697
+ .b8 0
1698
+ .b8 2
1699
+ .b8 0
1700
+ .b8 99
1701
+ .b8 107
1702
+ .b8 122
1703
+ .b8 103
1704
+ .b8 108
1705
+ .b8 55
1706
+ .b8 116
1707
+ .b8 104
1708
+ .b8 98
1709
+ .b8 52
1710
+ .b8 120
1711
+ .b8 100
1712
+ .b8 102
1713
+ .b8 107
1714
+ .b8 102
1715
+ .b8 110
1716
+ .b8 100
1717
+ .b8 50
1718
+ .b8 116
1719
+ .b8 105
1720
+ .b8 100
1721
+ .b8 107
1722
+ .b8 115
1723
+ .b8 54
1724
+ .b8 109
1725
+ .b8 116
1726
+ .b8 53
1727
+ .b8 102
1728
+ .b8 51
1729
+ .b8 104
1730
+ .b8 97
1731
+ .b8 117
1732
+ .b8 119
1733
+ .b8 102
1734
+ .b8 121
1735
+ .b8 106
1736
+ .b8 102
1737
+ .b8 108
1738
+ .b8 98
1739
+ .b8 116
1740
+ .b8 122
1741
+ .b8 121
1742
+ .b8 101
1743
+ .b8 112
1744
+ .b8 111
1745
+ .b8 53
1746
+ .b8 111
1747
+ .b8 120
1748
+ .b8 107
1749
+ .b8 118
1750
+ .b8 104
1751
+ .b8 107
1752
+ .b8 46
1753
+ .b8 112
1754
+ .b8 121
1755
+ .b8 0
1756
+ .b32 .debug_line
1757
+ .b8 47
1758
+ .b8 116
1759
+ .b8 109
1760
+ .b8 112
1761
+ .b8 47
1762
+ .b8 116
1763
+ .b8 111
1764
+ .b8 114
1765
+ .b8 99
1766
+ .b8 104
1767
+ .b8 105
1768
+ .b8 110
1769
+ .b8 100
1770
+ .b8 117
1771
+ .b8 99
1772
+ .b8 116
1773
+ .b8 111
1774
+ .b8 114
1775
+ .b8 95
1776
+ .b8 114
1777
+ .b8 111
1778
+ .b8 111
1779
+ .b8 116
1780
+ .b8 47
1781
+ .b8 107
1782
+ .b8 122
1783
+ .b8 0
1784
+ .b8 1
1785
+ .b64 $L__func_begin0
1786
+ .b64 $L__func_end0
1787
+ .b8 2
1788
+ .b8 116
1789
+ .b8 114
1790
+ .b8 105
1791
+ .b8 116
1792
+ .b8 111
1793
+ .b8 110
1794
+ .b8 95
1795
+ .b8 95
1796
+ .b8 48
1797
+ .b8 100
1798
+ .b8 49
1799
+ .b8 100
1800
+ .b8 50
1801
+ .b8 100
1802
+ .b8 51
1803
+ .b8 100
1804
+ .b8 52
1805
+ .b8 100
1806
+ .b8 53
1807
+ .b8 100
1808
+ .b8 54
1809
+ .b8 100
1810
+ .b8 55
1811
+ .b8 100
1812
+ .b8 101
1813
+ .b8 56
1814
+ .b8 0
1815
+ .b8 116
1816
+ .b8 114
1817
+ .b8 105
1818
+ .b8 116
1819
+ .b8 111
1820
+ .b8 110
1821
+ .b8 95
1822
+ .b8 95
1823
+ .b8 48
1824
+ .b8 100
1825
+ .b8 49
1826
+ .b8 100
1827
+ .b8 50
1828
+ .b8 100
1829
+ .b8 51
1830
+ .b8 100
1831
+ .b8 52
1832
+ .b8 100
1833
+ .b8 53
1834
+ .b8 100
1835
+ .b8 54
1836
+ .b8 100
1837
+ .b8 55
1838
+ .b8 100
1839
+ .b8 101
1840
+ .b8 56
1841
+ .b8 0
1842
+ .b8 1
1843
+ .b8 18
1844
+ .b8 1
1845
+ .b8 1
1846
+ .b8 3
1847
+ .b64 $L__func_begin0
1848
+ .b64 $L__func_end0
1849
+ .b8 1
1850
+ .b8 156
1851
+ .b32 125
1852
+ .b8 4
1853
+ .b32 125
1854
+ .b64 $L__tmp1
1855
+ .b64 $L__tmp164
1856
+ .b8 2
1857
+ .b8 46
1858
+ .b8 27
1859
+ .b8 5
1860
+ .b32 125
1861
+ .b64 $L__tmp2
1862
+ .b64 $L__tmp163
1863
+ .b8 2
1864
+ .b8 46
1865
+ .b8 27
1866
+ .b8 4
1867
+ .b32 125
1868
+ .b64 $L__tmp2
1869
+ .b64 $L__tmp163
1870
+ .b8 2
1871
+ .b8 243
1872
+ .b8 36
1873
+ .b8 0
1874
+ .b8 0
1875
+ .b8 0
1876
+ }
1877
+ .section .debug_pubnames
1878
+ {
1879
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1880
+ $L__pubNames_start0:
1881
+ .b8 2
1882
+ .b8 0
1883
+ .b32 .debug_info
1884
+ .b32 282
1885
+ .b32 125
1886
+ .b8 116
1887
+ .b8 114
1888
+ .b8 105
1889
+ .b8 116
1890
+ .b8 111
1891
+ .b8 110
1892
+ .b8 95
1893
+ .b8 95
1894
+ .b8 48
1895
+ .b8 100
1896
+ .b8 49
1897
+ .b8 100
1898
+ .b8 50
1899
+ .b8 100
1900
+ .b8 51
1901
+ .b8 100
1902
+ .b8 52
1903
+ .b8 100
1904
+ .b8 53
1905
+ .b8 100
1906
+ .b8 54
1907
+ .b8 100
1908
+ .b8 55
1909
+ .b8 100
1910
+ .b8 101
1911
+ .b8 56
1912
+ .b8 0
1913
+ .b32 0
1914
+ $L__pubNames_end0:
1915
+ }
1916
+ .section .debug_pubtypes
1917
+ {
1918
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1919
+ $L__pubTypes_start0:
1920
+ .b8 2
1921
+ .b8 0
1922
+ .b32 .debug_info
1923
+ .b32 282
1924
+ .b32 0
1925
+ $L__pubTypes_end0:
1926
+ }
1927
+ .section .debug_loc { }
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.llir ADDED
@@ -0,0 +1,860 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
8
+ %5 = shl i32 %4, 3, !dbg !10
9
+ %6 = and i32 %5, 1016, !dbg !10
10
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
11
+ %8 = shl i32 %7, 10, !dbg !12
12
+ %9 = or i32 %8, %6, !dbg !13
13
+ %10 = sext i32 %9 to i64, !dbg !14
14
+ %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
15
+ %12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
16
+ %13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !15
17
+ %14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !15
18
+ %15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !15
19
+ %16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !15
20
+ %17 = trunc i32 %13 to i16, !dbg !15
21
+ %extelt.offset = lshr i32 %13, 16, !dbg !15
22
+ %18 = trunc i32 %extelt.offset to i16, !dbg !15
23
+ %19 = trunc i32 %14 to i16, !dbg !15
24
+ %extelt.offset1 = lshr i32 %14, 16, !dbg !15
25
+ %20 = trunc i32 %extelt.offset1 to i16, !dbg !15
26
+ %21 = trunc i32 %15 to i16, !dbg !15
27
+ %extelt.offset2 = lshr i32 %15, 16, !dbg !15
28
+ %22 = trunc i32 %extelt.offset2 to i16, !dbg !15
29
+ %23 = trunc i32 %16 to i16, !dbg !15
30
+ %extelt.offset3 = lshr i32 %16, 16, !dbg !15
31
+ %24 = trunc i32 %extelt.offset3 to i16, !dbg !15
32
+ %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
33
+ %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
34
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
35
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
36
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
37
+ %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
38
+ %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
39
+ %32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #4, !dbg !16
40
+ %33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17
41
+ %34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17
42
+ %35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17
43
+ %36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17
44
+ %37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17
45
+ %38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17
46
+ %39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17
47
+ %40 = fmul float %32, 0x3FE6A09E60000000, !dbg !17
48
+ %41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
49
+ %.not.i = icmp eq i32 %41, 0, !dbg !18
50
+ %42 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18
51
+ %43 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18
52
+ %.0.i = select i1 %.not.i, float %43, float %42, !dbg !18
53
+ %44 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
54
+ br i1 %44, label %__nv_fabsf.exit1.i, label %46, !dbg !18
55
+
56
+ __nv_fabsf.exit1.i: ; preds = %3
57
+ %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
58
+ %.not1.i = icmp eq i32 %45, 0, !dbg !18
59
+ %.01.i = select i1 %.not1.i, float %43, float %42, !dbg !18
60
+ br label %__internal_fmad.exit.i, !dbg !18
61
+
62
+ 46: ; preds = %3
63
+ %47 = fmul float %33, %33, !dbg !18
64
+ br label %__internal_fmad.exit.i, !dbg !18
65
+
66
+ __internal_fmad.exit.i: ; preds = %46, %__nv_fabsf.exit1.i
67
+ %48 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %46 ], !dbg !18
68
+ %49 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %46 ], !dbg !18
69
+ %50 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %46 ], !dbg !18
70
+ %51 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %46 ], !dbg !18
71
+ %52 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %46 ], !dbg !18
72
+ %53 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %46 ], !dbg !18
73
+ %54 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %46 ], !dbg !18
74
+ %55 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %47, %46 ], !dbg !18
75
+ %56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
76
+ %.not2.i = icmp eq i32 %56, 0, !dbg !18
77
+ %57 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %54, float %55, float %53) #4, !dbg !18
78
+ %58 = tail call float @llvm.nvvm.fma.rn.f(float %54, float %55, float %53) #4, !dbg !18
79
+ %.02.i = select i1 %.not2.i, float %58, float %57, !dbg !18
80
+ %59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
81
+ %.not3.i = icmp eq i32 %59, 0, !dbg !18
82
+ %60 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %55, float %52) #4, !dbg !18
83
+ %61 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %55, float %52) #4, !dbg !18
84
+ %.03.i = select i1 %.not3.i, float %61, float %60, !dbg !18
85
+ %62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
86
+ %.not4.i = icmp eq i32 %62, 0, !dbg !18
87
+ %63 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %55, float %51) #4, !dbg !18
88
+ %64 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %55, float %51) #4, !dbg !18
89
+ %.04.i = select i1 %.not4.i, float %64, float %63, !dbg !18
90
+ %65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
91
+ %.not5.i = icmp eq i32 %65, 0, !dbg !18
92
+ %66 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %55, float %50) #4, !dbg !18
93
+ %67 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %55, float %50) #4, !dbg !18
94
+ %.05.i = select i1 %.not5.i, float %67, float %66, !dbg !18
95
+ %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
96
+ %.not6.i = icmp eq i32 %68, 0, !dbg !18
97
+ %69 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %55, float %49) #4, !dbg !18
98
+ %70 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %55, float %49) #4, !dbg !18
99
+ %.06.i = select i1 %.not6.i, float %70, float %69, !dbg !18
100
+ %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
101
+ %.not7.i = icmp eq i32 %71, 0, !dbg !18
102
+ %72 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %55, float %48) #4, !dbg !18
103
+ %73 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %55, float %48) #4, !dbg !18
104
+ %.07.i = select i1 %.not7.i, float %73, float %72, !dbg !18
105
+ %74 = fneg float %55, !dbg !18
106
+ %75 = select i1 %44, float %74, float %33, !dbg !18
107
+ %76 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
108
+ %.not8.i = icmp eq i32 %76, 0, !dbg !18
109
+ %77 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %75, float %75) #4, !dbg !18
110
+ %78 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %75, float %75) #4, !dbg !18
111
+ %.08.i = select i1 %.not8.i, float %78, float %77, !dbg !18
112
+ br i1 %44, label %79, label %__nv_erff.exit, !dbg !18
113
+
114
+ 79: ; preds = %__internal_fmad.exit.i
115
+ %80 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
116
+ %81 = fsub float 1.000000e+00, %80, !dbg !18
117
+ %82 = bitcast float %81 to i32, !dbg !18
118
+ %83 = bitcast float %33 to i32, !dbg !18
119
+ %84 = and i32 %83, -2147483648, !dbg !18
120
+ %85 = or i32 %84, %82, !dbg !18
121
+ %86 = bitcast i32 %85 to float, !dbg !18
122
+ br label %__nv_erff.exit, !dbg !18
123
+
124
+ __nv_erff.exit: ; preds = %__internal_fmad.exit.i, %79
125
+ %r.0.i = phi float [ %86, %79 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
126
+ %87 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
127
+ %.not.i4 = icmp eq i32 %87, 0, !dbg !18
128
+ %88 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18
129
+ %89 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18
130
+ %.0.i5 = select i1 %.not.i4, float %89, float %88, !dbg !18
131
+ %90 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18
132
+ br i1 %90, label %__nv_fabsf.exit1.i22, label %92, !dbg !18
133
+
134
+ __nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit
135
+ %91 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
136
+ %.not1.i23 = icmp eq i32 %91, 0, !dbg !18
137
+ %.01.i24 = select i1 %.not1.i23, float %89, float %88, !dbg !18
138
+ br label %__internal_fmad.exit.i6, !dbg !18
139
+
140
+ 92: ; preds = %__nv_erff.exit
141
+ %93 = fmul float %34, %34, !dbg !18
142
+ br label %__internal_fmad.exit.i6, !dbg !18
143
+
144
+ __internal_fmad.exit.i6: ; preds = %92, %__nv_fabsf.exit1.i22
145
+ %94 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %92 ], !dbg !18
146
+ %95 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %92 ], !dbg !18
147
+ %96 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %92 ], !dbg !18
148
+ %97 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %92 ], !dbg !18
149
+ %98 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %92 ], !dbg !18
150
+ %99 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %92 ], !dbg !18
151
+ %100 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %92 ], !dbg !18
152
+ %101 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %93, %92 ], !dbg !18
153
+ %102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
154
+ %.not2.i7 = icmp eq i32 %102, 0, !dbg !18
155
+ %103 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %100, float %101, float %99) #4, !dbg !18
156
+ %104 = tail call float @llvm.nvvm.fma.rn.f(float %100, float %101, float %99) #4, !dbg !18
157
+ %.02.i8 = select i1 %.not2.i7, float %104, float %103, !dbg !18
158
+ %105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
159
+ %.not3.i9 = icmp eq i32 %105, 0, !dbg !18
160
+ %106 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %101, float %98) #4, !dbg !18
161
+ %107 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %101, float %98) #4, !dbg !18
162
+ %.03.i10 = select i1 %.not3.i9, float %107, float %106, !dbg !18
163
+ %108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
164
+ %.not4.i11 = icmp eq i32 %108, 0, !dbg !18
165
+ %109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %101, float %97) #4, !dbg !18
166
+ %110 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %101, float %97) #4, !dbg !18
167
+ %.04.i12 = select i1 %.not4.i11, float %110, float %109, !dbg !18
168
+ %111 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
169
+ %.not5.i13 = icmp eq i32 %111, 0, !dbg !18
170
+ %112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %101, float %96) #4, !dbg !18
171
+ %113 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %101, float %96) #4, !dbg !18
172
+ %.05.i14 = select i1 %.not5.i13, float %113, float %112, !dbg !18
173
+ %114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
174
+ %.not6.i15 = icmp eq i32 %114, 0, !dbg !18
175
+ %115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %101, float %95) #4, !dbg !18
176
+ %116 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %101, float %95) #4, !dbg !18
177
+ %.06.i16 = select i1 %.not6.i15, float %116, float %115, !dbg !18
178
+ %117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
179
+ %.not7.i17 = icmp eq i32 %117, 0, !dbg !18
180
+ %118 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %101, float %94) #4, !dbg !18
181
+ %119 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %101, float %94) #4, !dbg !18
182
+ %.07.i18 = select i1 %.not7.i17, float %119, float %118, !dbg !18
183
+ %120 = fneg float %101, !dbg !18
184
+ %121 = select i1 %90, float %120, float %34, !dbg !18
185
+ %122 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
186
+ %.not8.i19 = icmp eq i32 %122, 0, !dbg !18
187
+ %123 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %121, float %121) #4, !dbg !18
188
+ %124 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %121, float %121) #4, !dbg !18
189
+ %.08.i20 = select i1 %.not8.i19, float %124, float %123, !dbg !18
190
+ br i1 %90, label %125, label %__nv_erff.exit25, !dbg !18
191
+
192
+ 125: ; preds = %__internal_fmad.exit.i6
193
+ %126 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18
194
+ %127 = fsub float 1.000000e+00, %126, !dbg !18
195
+ %128 = bitcast float %127 to i32, !dbg !18
196
+ %129 = bitcast float %34 to i32, !dbg !18
197
+ %130 = and i32 %129, -2147483648, !dbg !18
198
+ %131 = or i32 %130, %128, !dbg !18
199
+ %132 = bitcast i32 %131 to float, !dbg !18
200
+ br label %__nv_erff.exit25, !dbg !18
201
+
202
+ __nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %125
203
+ %r.0.i21 = phi float [ %132, %125 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18
204
+ %133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
205
+ %.not.i26 = icmp eq i32 %133, 0, !dbg !18
206
+ %134 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18
207
+ %135 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18
208
+ %.0.i27 = select i1 %.not.i26, float %135, float %134, !dbg !18
209
+ %136 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18
210
+ br i1 %136, label %__nv_fabsf.exit1.i44, label %138, !dbg !18
211
+
212
+ __nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25
213
+ %137 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
214
+ %.not1.i45 = icmp eq i32 %137, 0, !dbg !18
215
+ %.01.i46 = select i1 %.not1.i45, float %135, float %134, !dbg !18
216
+ br label %__internal_fmad.exit.i28, !dbg !18
217
+
218
+ 138: ; preds = %__nv_erff.exit25
219
+ %139 = fmul float %35, %35, !dbg !18
220
+ br label %__internal_fmad.exit.i28, !dbg !18
221
+
222
+ __internal_fmad.exit.i28: ; preds = %138, %__nv_fabsf.exit1.i44
223
+ %140 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %138 ], !dbg !18
224
+ %141 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %138 ], !dbg !18
225
+ %142 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %138 ], !dbg !18
226
+ %143 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %138 ], !dbg !18
227
+ %144 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %138 ], !dbg !18
228
+ %145 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %138 ], !dbg !18
229
+ %146 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %138 ], !dbg !18
230
+ %147 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %139, %138 ], !dbg !18
231
+ %148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
232
+ %.not2.i29 = icmp eq i32 %148, 0, !dbg !18
233
+ %149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %146, float %147, float %145) #4, !dbg !18
234
+ %150 = tail call float @llvm.nvvm.fma.rn.f(float %146, float %147, float %145) #4, !dbg !18
235
+ %.02.i30 = select i1 %.not2.i29, float %150, float %149, !dbg !18
236
+ %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
237
+ %.not3.i31 = icmp eq i32 %151, 0, !dbg !18
238
+ %152 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %147, float %144) #4, !dbg !18
239
+ %153 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %147, float %144) #4, !dbg !18
240
+ %.03.i32 = select i1 %.not3.i31, float %153, float %152, !dbg !18
241
+ %154 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
242
+ %.not4.i33 = icmp eq i32 %154, 0, !dbg !18
243
+ %155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %147, float %143) #4, !dbg !18
244
+ %156 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %147, float %143) #4, !dbg !18
245
+ %.04.i34 = select i1 %.not4.i33, float %156, float %155, !dbg !18
246
+ %157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
247
+ %.not5.i35 = icmp eq i32 %157, 0, !dbg !18
248
+ %158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %147, float %142) #4, !dbg !18
249
+ %159 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %147, float %142) #4, !dbg !18
250
+ %.05.i36 = select i1 %.not5.i35, float %159, float %158, !dbg !18
251
+ %160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
252
+ %.not6.i37 = icmp eq i32 %160, 0, !dbg !18
253
+ %161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %147, float %141) #4, !dbg !18
254
+ %162 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %147, float %141) #4, !dbg !18
255
+ %.06.i38 = select i1 %.not6.i37, float %162, float %161, !dbg !18
256
+ %163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
257
+ %.not7.i39 = icmp eq i32 %163, 0, !dbg !18
258
+ %164 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %147, float %140) #4, !dbg !18
259
+ %165 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %147, float %140) #4, !dbg !18
260
+ %.07.i40 = select i1 %.not7.i39, float %165, float %164, !dbg !18
261
+ %166 = fneg float %147, !dbg !18
262
+ %167 = select i1 %136, float %166, float %35, !dbg !18
263
+ %168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
264
+ %.not8.i41 = icmp eq i32 %168, 0, !dbg !18
265
+ %169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %167, float %167) #4, !dbg !18
266
+ %170 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %167, float %167) #4, !dbg !18
267
+ %.08.i42 = select i1 %.not8.i41, float %170, float %169, !dbg !18
268
+ br i1 %136, label %171, label %__nv_erff.exit47, !dbg !18
269
+
270
+ 171: ; preds = %__internal_fmad.exit.i28
271
+ %172 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18
272
+ %173 = fsub float 1.000000e+00, %172, !dbg !18
273
+ %174 = bitcast float %173 to i32, !dbg !18
274
+ %175 = bitcast float %35 to i32, !dbg !18
275
+ %176 = and i32 %175, -2147483648, !dbg !18
276
+ %177 = or i32 %176, %174, !dbg !18
277
+ %178 = bitcast i32 %177 to float, !dbg !18
278
+ br label %__nv_erff.exit47, !dbg !18
279
+
280
+ __nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %171
281
+ %r.0.i43 = phi float [ %178, %171 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18
282
+ %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
283
+ %.not.i48 = icmp eq i32 %179, 0, !dbg !18
284
+ %180 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18
285
+ %181 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18
286
+ %.0.i49 = select i1 %.not.i48, float %181, float %180, !dbg !18
287
+ %182 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18
288
+ br i1 %182, label %__nv_fabsf.exit1.i66, label %184, !dbg !18
289
+
290
+ __nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47
291
+ %183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
292
+ %.not1.i67 = icmp eq i32 %183, 0, !dbg !18
293
+ %.01.i68 = select i1 %.not1.i67, float %181, float %180, !dbg !18
294
+ br label %__internal_fmad.exit.i50, !dbg !18
295
+
296
+ 184: ; preds = %__nv_erff.exit47
297
+ %185 = fmul float %36, %36, !dbg !18
298
+ br label %__internal_fmad.exit.i50, !dbg !18
299
+
300
+ __internal_fmad.exit.i50: ; preds = %184, %__nv_fabsf.exit1.i66
301
+ %186 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %184 ], !dbg !18
302
+ %187 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %184 ], !dbg !18
303
+ %188 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %184 ], !dbg !18
304
+ %189 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %184 ], !dbg !18
305
+ %190 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %184 ], !dbg !18
306
+ %191 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %184 ], !dbg !18
307
+ %192 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %184 ], !dbg !18
308
+ %193 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %185, %184 ], !dbg !18
309
+ %194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
310
+ %.not2.i51 = icmp eq i32 %194, 0, !dbg !18
311
+ %195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %192, float %193, float %191) #4, !dbg !18
312
+ %196 = tail call float @llvm.nvvm.fma.rn.f(float %192, float %193, float %191) #4, !dbg !18
313
+ %.02.i52 = select i1 %.not2.i51, float %196, float %195, !dbg !18
314
+ %197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
315
+ %.not3.i53 = icmp eq i32 %197, 0, !dbg !18
316
+ %198 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %193, float %190) #4, !dbg !18
317
+ %199 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %193, float %190) #4, !dbg !18
318
+ %.03.i54 = select i1 %.not3.i53, float %199, float %198, !dbg !18
319
+ %200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
320
+ %.not4.i55 = icmp eq i32 %200, 0, !dbg !18
321
+ %201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %193, float %189) #4, !dbg !18
322
+ %202 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %193, float %189) #4, !dbg !18
323
+ %.04.i56 = select i1 %.not4.i55, float %202, float %201, !dbg !18
324
+ %203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
325
+ %.not5.i57 = icmp eq i32 %203, 0, !dbg !18
326
+ %204 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %193, float %188) #4, !dbg !18
327
+ %205 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %193, float %188) #4, !dbg !18
328
+ %.05.i58 = select i1 %.not5.i57, float %205, float %204, !dbg !18
329
+ %206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
330
+ %.not6.i59 = icmp eq i32 %206, 0, !dbg !18
331
+ %207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %193, float %187) #4, !dbg !18
332
+ %208 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %193, float %187) #4, !dbg !18
333
+ %.06.i60 = select i1 %.not6.i59, float %208, float %207, !dbg !18
334
+ %209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
335
+ %.not7.i61 = icmp eq i32 %209, 0, !dbg !18
336
+ %210 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %193, float %186) #4, !dbg !18
337
+ %211 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %193, float %186) #4, !dbg !18
338
+ %.07.i62 = select i1 %.not7.i61, float %211, float %210, !dbg !18
339
+ %212 = fneg float %193, !dbg !18
340
+ %213 = select i1 %182, float %212, float %36, !dbg !18
341
+ %214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
342
+ %.not8.i63 = icmp eq i32 %214, 0, !dbg !18
343
+ %215 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %213, float %213) #4, !dbg !18
344
+ %216 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %213, float %213) #4, !dbg !18
345
+ %.08.i64 = select i1 %.not8.i63, float %216, float %215, !dbg !18
346
+ br i1 %182, label %217, label %__nv_erff.exit69, !dbg !18
347
+
348
+ 217: ; preds = %__internal_fmad.exit.i50
349
+ %218 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18
350
+ %219 = fsub float 1.000000e+00, %218, !dbg !18
351
+ %220 = bitcast float %219 to i32, !dbg !18
352
+ %221 = bitcast float %36 to i32, !dbg !18
353
+ %222 = and i32 %221, -2147483648, !dbg !18
354
+ %223 = or i32 %222, %220, !dbg !18
355
+ %224 = bitcast i32 %223 to float, !dbg !18
356
+ br label %__nv_erff.exit69, !dbg !18
357
+
358
+ __nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %217
359
+ %r.0.i65 = phi float [ %224, %217 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18
360
+ %225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
361
+ %.not.i70 = icmp eq i32 %225, 0, !dbg !18
362
+ %226 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18
363
+ %227 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18
364
+ %.0.i71 = select i1 %.not.i70, float %227, float %226, !dbg !18
365
+ %228 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18
366
+ br i1 %228, label %__nv_fabsf.exit1.i88, label %230, !dbg !18
367
+
368
+ __nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69
369
+ %229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
370
+ %.not1.i89 = icmp eq i32 %229, 0, !dbg !18
371
+ %.01.i90 = select i1 %.not1.i89, float %227, float %226, !dbg !18
372
+ br label %__internal_fmad.exit.i72, !dbg !18
373
+
374
+ 230: ; preds = %__nv_erff.exit69
375
+ %231 = fmul float %37, %37, !dbg !18
376
+ br label %__internal_fmad.exit.i72, !dbg !18
377
+
378
+ __internal_fmad.exit.i72: ; preds = %230, %__nv_fabsf.exit1.i88
379
+ %232 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %230 ], !dbg !18
380
+ %233 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %230 ], !dbg !18
381
+ %234 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %230 ], !dbg !18
382
+ %235 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %230 ], !dbg !18
383
+ %236 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %230 ], !dbg !18
384
+ %237 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %230 ], !dbg !18
385
+ %238 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %230 ], !dbg !18
386
+ %239 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %231, %230 ], !dbg !18
387
+ %240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
388
+ %.not2.i73 = icmp eq i32 %240, 0, !dbg !18
389
+ %241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %238, float %239, float %237) #4, !dbg !18
390
+ %242 = tail call float @llvm.nvvm.fma.rn.f(float %238, float %239, float %237) #4, !dbg !18
391
+ %.02.i74 = select i1 %.not2.i73, float %242, float %241, !dbg !18
392
+ %243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
393
+ %.not3.i75 = icmp eq i32 %243, 0, !dbg !18
394
+ %244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %239, float %236) #4, !dbg !18
395
+ %245 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %239, float %236) #4, !dbg !18
396
+ %.03.i76 = select i1 %.not3.i75, float %245, float %244, !dbg !18
397
+ %246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
398
+ %.not4.i77 = icmp eq i32 %246, 0, !dbg !18
399
+ %247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %239, float %235) #4, !dbg !18
400
+ %248 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %239, float %235) #4, !dbg !18
401
+ %.04.i78 = select i1 %.not4.i77, float %248, float %247, !dbg !18
402
+ %249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
403
+ %.not5.i79 = icmp eq i32 %249, 0, !dbg !18
404
+ %250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %239, float %234) #4, !dbg !18
405
+ %251 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %239, float %234) #4, !dbg !18
406
+ %.05.i80 = select i1 %.not5.i79, float %251, float %250, !dbg !18
407
+ %252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
408
+ %.not6.i81 = icmp eq i32 %252, 0, !dbg !18
409
+ %253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %239, float %233) #4, !dbg !18
410
+ %254 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %239, float %233) #4, !dbg !18
411
+ %.06.i82 = select i1 %.not6.i81, float %254, float %253, !dbg !18
412
+ %255 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
413
+ %.not7.i83 = icmp eq i32 %255, 0, !dbg !18
414
+ %256 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %239, float %232) #4, !dbg !18
415
+ %257 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %239, float %232) #4, !dbg !18
416
+ %.07.i84 = select i1 %.not7.i83, float %257, float %256, !dbg !18
417
+ %258 = fneg float %239, !dbg !18
418
+ %259 = select i1 %228, float %258, float %37, !dbg !18
419
+ %260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
420
+ %.not8.i85 = icmp eq i32 %260, 0, !dbg !18
421
+ %261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %259, float %259) #4, !dbg !18
422
+ %262 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %259, float %259) #4, !dbg !18
423
+ %.08.i86 = select i1 %.not8.i85, float %262, float %261, !dbg !18
424
+ br i1 %228, label %263, label %__nv_erff.exit91, !dbg !18
425
+
426
+ 263: ; preds = %__internal_fmad.exit.i72
427
+ %264 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18
428
+ %265 = fsub float 1.000000e+00, %264, !dbg !18
429
+ %266 = bitcast float %265 to i32, !dbg !18
430
+ %267 = bitcast float %37 to i32, !dbg !18
431
+ %268 = and i32 %267, -2147483648, !dbg !18
432
+ %269 = or i32 %268, %266, !dbg !18
433
+ %270 = bitcast i32 %269 to float, !dbg !18
434
+ br label %__nv_erff.exit91, !dbg !18
435
+
436
+ __nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %263
437
+ %r.0.i87 = phi float [ %270, %263 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18
438
+ %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
439
+ %.not.i92 = icmp eq i32 %271, 0, !dbg !18
440
+ %272 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18
441
+ %273 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18
442
+ %.0.i93 = select i1 %.not.i92, float %273, float %272, !dbg !18
443
+ %274 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18
444
+ br i1 %274, label %__nv_fabsf.exit1.i110, label %276, !dbg !18
445
+
446
+ __nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91
447
+ %275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
448
+ %.not1.i111 = icmp eq i32 %275, 0, !dbg !18
449
+ %.01.i112 = select i1 %.not1.i111, float %273, float %272, !dbg !18
450
+ br label %__internal_fmad.exit.i94, !dbg !18
451
+
452
+ 276: ; preds = %__nv_erff.exit91
453
+ %277 = fmul float %38, %38, !dbg !18
454
+ br label %__internal_fmad.exit.i94, !dbg !18
455
+
456
+ __internal_fmad.exit.i94: ; preds = %276, %__nv_fabsf.exit1.i110
457
+ %278 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %276 ], !dbg !18
458
+ %279 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %276 ], !dbg !18
459
+ %280 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %276 ], !dbg !18
460
+ %281 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %276 ], !dbg !18
461
+ %282 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %276 ], !dbg !18
462
+ %283 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %276 ], !dbg !18
463
+ %284 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %276 ], !dbg !18
464
+ %285 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %277, %276 ], !dbg !18
465
+ %286 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
466
+ %.not2.i95 = icmp eq i32 %286, 0, !dbg !18
467
+ %287 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %284, float %285, float %283) #4, !dbg !18
468
+ %288 = tail call float @llvm.nvvm.fma.rn.f(float %284, float %285, float %283) #4, !dbg !18
469
+ %.02.i96 = select i1 %.not2.i95, float %288, float %287, !dbg !18
470
+ %289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
471
+ %.not3.i97 = icmp eq i32 %289, 0, !dbg !18
472
+ %290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %285, float %282) #4, !dbg !18
473
+ %291 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %285, float %282) #4, !dbg !18
474
+ %.03.i98 = select i1 %.not3.i97, float %291, float %290, !dbg !18
475
+ %292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
476
+ %.not4.i99 = icmp eq i32 %292, 0, !dbg !18
477
+ %293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %285, float %281) #4, !dbg !18
478
+ %294 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %285, float %281) #4, !dbg !18
479
+ %.04.i100 = select i1 %.not4.i99, float %294, float %293, !dbg !18
480
+ %295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
481
+ %.not5.i101 = icmp eq i32 %295, 0, !dbg !18
482
+ %296 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %285, float %280) #4, !dbg !18
483
+ %297 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %285, float %280) #4, !dbg !18
484
+ %.05.i102 = select i1 %.not5.i101, float %297, float %296, !dbg !18
485
+ %298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
486
+ %.not6.i103 = icmp eq i32 %298, 0, !dbg !18
487
+ %299 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %285, float %279) #4, !dbg !18
488
+ %300 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %285, float %279) #4, !dbg !18
489
+ %.06.i104 = select i1 %.not6.i103, float %300, float %299, !dbg !18
490
+ %301 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
491
+ %.not7.i105 = icmp eq i32 %301, 0, !dbg !18
492
+ %302 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %285, float %278) #4, !dbg !18
493
+ %303 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %285, float %278) #4, !dbg !18
494
+ %.07.i106 = select i1 %.not7.i105, float %303, float %302, !dbg !18
495
+ %304 = fneg float %285, !dbg !18
496
+ %305 = select i1 %274, float %304, float %38, !dbg !18
497
+ %306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
498
+ %.not8.i107 = icmp eq i32 %306, 0, !dbg !18
499
+ %307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %305, float %305) #4, !dbg !18
500
+ %308 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %305, float %305) #4, !dbg !18
501
+ %.08.i108 = select i1 %.not8.i107, float %308, float %307, !dbg !18
502
+ br i1 %274, label %309, label %__nv_erff.exit113, !dbg !18
503
+
504
+ 309: ; preds = %__internal_fmad.exit.i94
505
+ %310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18
506
+ %311 = fsub float 1.000000e+00, %310, !dbg !18
507
+ %312 = bitcast float %311 to i32, !dbg !18
508
+ %313 = bitcast float %38 to i32, !dbg !18
509
+ %314 = and i32 %313, -2147483648, !dbg !18
510
+ %315 = or i32 %314, %312, !dbg !18
511
+ %316 = bitcast i32 %315 to float, !dbg !18
512
+ br label %__nv_erff.exit113, !dbg !18
513
+
514
+ __nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %309
515
+ %r.0.i109 = phi float [ %316, %309 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18
516
+ %317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
517
+ %.not.i114 = icmp eq i32 %317, 0, !dbg !18
518
+ %318 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18
519
+ %319 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18
520
+ %.0.i115 = select i1 %.not.i114, float %319, float %318, !dbg !18
521
+ %320 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18
522
+ br i1 %320, label %__nv_fabsf.exit1.i132, label %322, !dbg !18
523
+
524
+ __nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113
525
+ %321 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
526
+ %.not1.i133 = icmp eq i32 %321, 0, !dbg !18
527
+ %.01.i134 = select i1 %.not1.i133, float %319, float %318, !dbg !18
528
+ br label %__internal_fmad.exit.i116, !dbg !18
529
+
530
+ 322: ; preds = %__nv_erff.exit113
531
+ %323 = fmul float %39, %39, !dbg !18
532
+ br label %__internal_fmad.exit.i116, !dbg !18
533
+
534
+ __internal_fmad.exit.i116: ; preds = %322, %__nv_fabsf.exit1.i132
535
+ %324 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %322 ], !dbg !18
536
+ %325 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %322 ], !dbg !18
537
+ %326 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %322 ], !dbg !18
538
+ %327 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %322 ], !dbg !18
539
+ %328 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %322 ], !dbg !18
540
+ %329 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %322 ], !dbg !18
541
+ %330 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %322 ], !dbg !18
542
+ %331 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %323, %322 ], !dbg !18
543
+ %332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
544
+ %.not2.i117 = icmp eq i32 %332, 0, !dbg !18
545
+ %333 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %330, float %331, float %329) #4, !dbg !18
546
+ %334 = tail call float @llvm.nvvm.fma.rn.f(float %330, float %331, float %329) #4, !dbg !18
547
+ %.02.i118 = select i1 %.not2.i117, float %334, float %333, !dbg !18
548
+ %335 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
549
+ %.not3.i119 = icmp eq i32 %335, 0, !dbg !18
550
+ %336 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %331, float %328) #4, !dbg !18
551
+ %337 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %331, float %328) #4, !dbg !18
552
+ %.03.i120 = select i1 %.not3.i119, float %337, float %336, !dbg !18
553
+ %338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
554
+ %.not4.i121 = icmp eq i32 %338, 0, !dbg !18
555
+ %339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %331, float %327) #4, !dbg !18
556
+ %340 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %331, float %327) #4, !dbg !18
557
+ %.04.i122 = select i1 %.not4.i121, float %340, float %339, !dbg !18
558
+ %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
559
+ %.not5.i123 = icmp eq i32 %341, 0, !dbg !18
560
+ %342 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %331, float %326) #4, !dbg !18
561
+ %343 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %331, float %326) #4, !dbg !18
562
+ %.05.i124 = select i1 %.not5.i123, float %343, float %342, !dbg !18
563
+ %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
564
+ %.not6.i125 = icmp eq i32 %344, 0, !dbg !18
565
+ %345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %331, float %325) #4, !dbg !18
566
+ %346 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %331, float %325) #4, !dbg !18
567
+ %.06.i126 = select i1 %.not6.i125, float %346, float %345, !dbg !18
568
+ %347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
569
+ %.not7.i127 = icmp eq i32 %347, 0, !dbg !18
570
+ %348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %331, float %324) #4, !dbg !18
571
+ %349 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %331, float %324) #4, !dbg !18
572
+ %.07.i128 = select i1 %.not7.i127, float %349, float %348, !dbg !18
573
+ %350 = fneg float %331, !dbg !18
574
+ %351 = select i1 %320, float %350, float %39, !dbg !18
575
+ %352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
576
+ %.not8.i129 = icmp eq i32 %352, 0, !dbg !18
577
+ %353 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %351, float %351) #4, !dbg !18
578
+ %354 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %351, float %351) #4, !dbg !18
579
+ %.08.i130 = select i1 %.not8.i129, float %354, float %353, !dbg !18
580
+ br i1 %320, label %355, label %__nv_erff.exit135, !dbg !18
581
+
582
+ 355: ; preds = %__internal_fmad.exit.i116
583
+ %356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18
584
+ %357 = fsub float 1.000000e+00, %356, !dbg !18
585
+ %358 = bitcast float %357 to i32, !dbg !18
586
+ %359 = bitcast float %39 to i32, !dbg !18
587
+ %360 = and i32 %359, -2147483648, !dbg !18
588
+ %361 = or i32 %360, %358, !dbg !18
589
+ %362 = bitcast i32 %361 to float, !dbg !18
590
+ br label %__nv_erff.exit135, !dbg !18
591
+
592
+ __nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %355
593
+ %r.0.i131 = phi float [ %362, %355 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18
594
+ %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
595
+ %.not.i136 = icmp eq i32 %363, 0, !dbg !18
596
+ %364 = tail call float @llvm.nvvm.fabs.ftz.f(float %40) #4, !dbg !18
597
+ %365 = tail call float @llvm.nvvm.fabs.f(float %40) #4, !dbg !18
598
+ %.0.i137 = select i1 %.not.i136, float %365, float %364, !dbg !18
599
+ %366 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18
600
+ br i1 %366, label %__nv_fabsf.exit1.i154, label %368, !dbg !18
601
+
602
+ __nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135
603
+ %367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
604
+ %.not1.i155 = icmp eq i32 %367, 0, !dbg !18
605
+ %.01.i156 = select i1 %.not1.i155, float %365, float %364, !dbg !18
606
+ br label %__internal_fmad.exit.i138, !dbg !18
607
+
608
+ 368: ; preds = %__nv_erff.exit135
609
+ %369 = fmul float %40, %40, !dbg !18
610
+ br label %__internal_fmad.exit.i138, !dbg !18
611
+
612
+ __internal_fmad.exit.i138: ; preds = %368, %__nv_fabsf.exit1.i154
613
+ %370 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %368 ], !dbg !18
614
+ %371 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %368 ], !dbg !18
615
+ %372 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %368 ], !dbg !18
616
+ %373 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %368 ], !dbg !18
617
+ %374 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %368 ], !dbg !18
618
+ %375 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %368 ], !dbg !18
619
+ %376 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %368 ], !dbg !18
620
+ %377 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %369, %368 ], !dbg !18
621
+ %378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
622
+ %.not2.i139 = icmp eq i32 %378, 0, !dbg !18
623
+ %379 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float %377, float %375) #4, !dbg !18
624
+ %380 = tail call float @llvm.nvvm.fma.rn.f(float %376, float %377, float %375) #4, !dbg !18
625
+ %.02.i140 = select i1 %.not2.i139, float %380, float %379, !dbg !18
626
+ %381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
627
+ %.not3.i141 = icmp eq i32 %381, 0, !dbg !18
628
+ %382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %377, float %374) #4, !dbg !18
629
+ %383 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %377, float %374) #4, !dbg !18
630
+ %.03.i142 = select i1 %.not3.i141, float %383, float %382, !dbg !18
631
+ %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
632
+ %.not4.i143 = icmp eq i32 %384, 0, !dbg !18
633
+ %385 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %377, float %373) #4, !dbg !18
634
+ %386 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %377, float %373) #4, !dbg !18
635
+ %.04.i144 = select i1 %.not4.i143, float %386, float %385, !dbg !18
636
+ %387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
637
+ %.not5.i145 = icmp eq i32 %387, 0, !dbg !18
638
+ %388 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %377, float %372) #4, !dbg !18
639
+ %389 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %377, float %372) #4, !dbg !18
640
+ %.05.i146 = select i1 %.not5.i145, float %389, float %388, !dbg !18
641
+ %390 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
642
+ %.not6.i147 = icmp eq i32 %390, 0, !dbg !18
643
+ %391 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %377, float %371) #4, !dbg !18
644
+ %392 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %377, float %371) #4, !dbg !18
645
+ %.06.i148 = select i1 %.not6.i147, float %392, float %391, !dbg !18
646
+ %393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
647
+ %.not7.i149 = icmp eq i32 %393, 0, !dbg !18
648
+ %394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %377, float %370) #4, !dbg !18
649
+ %395 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %377, float %370) #4, !dbg !18
650
+ %.07.i150 = select i1 %.not7.i149, float %395, float %394, !dbg !18
651
+ %396 = fneg float %377, !dbg !18
652
+ %397 = select i1 %366, float %396, float %40, !dbg !18
653
+ %398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
654
+ %.not8.i151 = icmp eq i32 %398, 0, !dbg !18
655
+ %399 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %397, float %397) #4, !dbg !18
656
+ %400 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %397, float %397) #4, !dbg !18
657
+ %.08.i152 = select i1 %.not8.i151, float %400, float %399, !dbg !18
658
+ br i1 %366, label %401, label %__nv_erff.exit157, !dbg !18
659
+
660
+ 401: ; preds = %__internal_fmad.exit.i138
661
+ %402 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18
662
+ %403 = fsub float 1.000000e+00, %402, !dbg !18
663
+ %404 = bitcast float %403 to i32, !dbg !18
664
+ %405 = bitcast float %40 to i32, !dbg !18
665
+ %406 = and i32 %405, -2147483648, !dbg !18
666
+ %407 = or i32 %406, %404, !dbg !18
667
+ %408 = bitcast i32 %407 to float, !dbg !18
668
+ br label %__nv_erff.exit157, !dbg !18
669
+
670
+ __nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %401
671
+ %r.0.i153 = phi float [ %408, %401 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18
672
+ %409 = fmul float %32, 5.000000e-01, !dbg !19
673
+ %410 = fmul float %31, 5.000000e-01, !dbg !19
674
+ %411 = fmul float %30, 5.000000e-01, !dbg !19
675
+ %412 = fmul float %29, 5.000000e-01, !dbg !19
676
+ %413 = fmul float %28, 5.000000e-01, !dbg !19
677
+ %414 = fmul float %27, 5.000000e-01, !dbg !19
678
+ %415 = fmul float %26, 5.000000e-01, !dbg !19
679
+ %416 = fmul float %25, 5.000000e-01, !dbg !19
680
+ %417 = fadd float %r.0.i, 1.000000e+00, !dbg !20
681
+ %418 = fadd float %r.0.i21, 1.000000e+00, !dbg !20
682
+ %419 = fadd float %r.0.i43, 1.000000e+00, !dbg !20
683
+ %420 = fadd float %r.0.i65, 1.000000e+00, !dbg !20
684
+ %421 = fadd float %r.0.i87, 1.000000e+00, !dbg !20
685
+ %422 = fadd float %r.0.i109, 1.000000e+00, !dbg !20
686
+ %423 = fadd float %r.0.i131, 1.000000e+00, !dbg !20
687
+ %424 = fadd float %r.0.i153, 1.000000e+00, !dbg !20
688
+ %425 = fmul float %416, %417, !dbg !21
689
+ %426 = fmul float %415, %418, !dbg !21
690
+ %427 = fmul float %414, %419, !dbg !21
691
+ %428 = fmul float %413, %420, !dbg !21
692
+ %429 = fmul float %412, %421, !dbg !21
693
+ %430 = fmul float %411, %422, !dbg !21
694
+ %431 = fmul float %410, %423, !dbg !21
695
+ %432 = fmul float %409, %424, !dbg !21
696
+ %433 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !22
697
+ %434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !23
698
+ %435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !23
699
+ %436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !23
700
+ %437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !23
701
+ %438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !23
702
+ %439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !23
703
+ %440 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !23
704
+ %441 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %432) #4, !dbg !23
705
+ %442 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !23
706
+ %443 = insertelement <2 x i16> %442, i16 %435, i64 1, !dbg !23
707
+ %444 = bitcast <2 x i16> %443 to i32, !dbg !23
708
+ %445 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !23
709
+ %446 = insertelement <2 x i16> %445, i16 %437, i64 1, !dbg !23
710
+ %447 = bitcast <2 x i16> %446 to i32, !dbg !23
711
+ %448 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !23
712
+ %449 = insertelement <2 x i16> %448, i16 %439, i64 1, !dbg !23
713
+ %450 = bitcast <2 x i16> %449 to i32, !dbg !23
714
+ %451 = insertelement <2 x i16> undef, i16 %440, i64 0, !dbg !23
715
+ %452 = insertelement <2 x i16> %451, i16 %441, i64 1, !dbg !23
716
+ %453 = bitcast <2 x i16> %452 to i32, !dbg !23
717
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %444, i32 %447, i32 %450, i32 %453, ptr addrspace(1) %433, i1 true) #4, !dbg !23
718
+ ret void, !dbg !24
719
+ }
720
+
721
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
722
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
723
+
724
+ ; Function Attrs: alwaysinline nounwind
725
+ define float @__nv_erff(float %a) local_unnamed_addr #1 {
726
+ __nv_fabsf.exit:
727
+ %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
728
+ %.not = icmp eq i32 %0, 0
729
+ %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
730
+ %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
731
+ %.0 = select i1 %.not, float %2, float %1
732
+ %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
733
+ br i1 %3, label %__nv_fabsf.exit1, label %5
734
+
735
+ __nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
736
+ %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
737
+ %.not1 = icmp eq i32 %4, 0
738
+ %.01 = select i1 %.not1, float %2, float %1
739
+ br label %__internal_fmad.exit
740
+
741
+ 5: ; preds = %__nv_fabsf.exit
742
+ %6 = fmul float %a, %a
743
+ br label %__internal_fmad.exit
744
+
745
+ __internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
746
+ %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
747
+ %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
748
+ %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
749
+ %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
750
+ %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
751
+ %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
752
+ %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
753
+ %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
754
+ %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
755
+ %.not2 = icmp eq i32 %15, 0
756
+ %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
757
+ %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
758
+ %.02 = select i1 %.not2, float %17, float %16
759
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
760
+ %.not3 = icmp eq i32 %18, 0
761
+ %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
762
+ %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
763
+ %.03 = select i1 %.not3, float %20, float %19
764
+ %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
765
+ %.not4 = icmp eq i32 %21, 0
766
+ %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
767
+ %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
768
+ %.04 = select i1 %.not4, float %23, float %22
769
+ %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
770
+ %.not5 = icmp eq i32 %24, 0
771
+ %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
772
+ %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
773
+ %.05 = select i1 %.not5, float %26, float %25
774
+ %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
775
+ %.not6 = icmp eq i32 %27, 0
776
+ %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
777
+ %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
778
+ %.06 = select i1 %.not6, float %29, float %28
779
+ %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
780
+ %.not7 = icmp eq i32 %30, 0
781
+ %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
782
+ %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
783
+ %.07 = select i1 %.not7, float %32, float %31
784
+ %33 = fneg float %14
785
+ %34 = select i1 %3, float %33, float %a
786
+ %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
787
+ %.not8 = icmp eq i32 %35, 0
788
+ %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
789
+ %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
790
+ %.08 = select i1 %.not8, float %37, float %36
791
+ br i1 %3, label %38, label %46
792
+
793
+ 38: ; preds = %__internal_fmad.exit
794
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
795
+ %40 = fsub float 1.000000e+00, %39
796
+ %41 = bitcast float %40 to i32
797
+ %42 = bitcast float %a to i32
798
+ %43 = and i32 %42, -2147483648
799
+ %44 = or i32 %43, %41
800
+ %45 = bitcast i32 %44 to float
801
+ br label %46
802
+
803
+ 46: ; preds = %38, %__internal_fmad.exit
804
+ %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
805
+ ret float %r.0
806
+ }
807
+
808
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
809
+
810
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
811
+ declare float @llvm.nvvm.fabs.ftz.f(float) #0
812
+
813
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
814
+ declare float @llvm.nvvm.fabs.f(float) #0
815
+
816
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
817
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
818
+
819
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
820
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
821
+
822
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
823
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
824
+
825
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
826
+ attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
827
+ attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
828
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
829
+ attributes #4 = { nounwind }
830
+
831
+ !llvm.module.flags = !{!0, !1}
832
+ !llvm.dbg.cu = !{!2}
833
+ !nvvm.annotations = !{!4, !5, !5, !4}
834
+ !llvm.ident = !{!6}
835
+
836
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
837
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
838
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
839
+ !3 = !DIFile(filename: "cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py", directory: "/tmp/torchinductor_root/jf")
840
+ !4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
841
+ !5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
842
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
843
+ !7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
844
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
845
+ !9 = !{}
846
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
847
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
848
+ !12 = !DILocation(line: 20, column: 33, scope: !7)
849
+ !13 = !DILocation(line: 21, column: 23, scope: !7)
850
+ !14 = !DILocation(line: 24, column: 30, scope: !7)
851
+ !15 = !DILocation(line: 24, column: 35, scope: !7)
852
+ !16 = !DILocation(line: 24, column: 44, scope: !7)
853
+ !17 = !DILocation(line: 29, column: 18, scope: !7)
854
+ !18 = !DILocation(line: 30, column: 23, scope: !7)
855
+ !19 = !DILocation(line: 27, column: 18, scope: !7)
856
+ !20 = !DILocation(line: 32, column: 18, scope: !7)
857
+ !21 = !DILocation(line: 33, column: 18, scope: !7)
858
+ !22 = !DILocation(line: 35, column: 25, scope: !7)
859
+ !23 = !DILocation(line: 35, column: 37, scope: !7)
860
+ !24 = !DILocation(line: 35, column: 4, scope: !7)
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
4
+ %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
6
+ %c1024_i32 = arith.constant 1024 : i32
7
+ %0 = tt.get_program_id x : i32
8
+ %1 = arith.muli %0, %c1024_i32 : i32
9
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
10
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
11
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
12
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
13
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
14
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
15
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
16
+ %9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
17
+ %10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
18
+ %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
19
+ %12 = arith.addf %11, %cst : tensor<1024xf32>
20
+ %13 = arith.mulf %9, %12 : tensor<1024xf32>
21
+ %14 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
22
+ %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
23
+ %16 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
24
+ tt.store %15, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
25
+ tt.return
26
+ }
27
+ }
.triton/dump/4993935f9a0e5939755cfb42600362cf/triton_.llir ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
5
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %5 = shl i32 %4, 1, !dbg !8
7
+ %6 = and i32 %5, 510, !dbg !8
8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %8 = shl i32 %7, 9, !dbg !10
10
+ %9 = or i32 %8, %6, !dbg !11
11
+ %10 = sext i32 %9 to i64, !dbg !12
12
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
13
+ %12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
14
+ %13 = extractvalue { i32, i32 } %12, 0, !dbg !13
15
+ %14 = extractvalue { i32, i32 } %12, 1, !dbg !13
16
+ %15 = bitcast i32 %13 to float, !dbg !13
17
+ %16 = bitcast i32 %14 to float, !dbg !13
18
+ %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
19
+ %18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15
20
+ %19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15
21
+ %20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15
22
+ %21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15
23
+ %22 = bitcast <2 x i16> %21 to i32, !dbg !15
24
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15
25
+ ret void, !dbg !16
26
+ }
27
+
28
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
29
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
30
+
31
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
32
+ attributes #1 = { nounwind }
33
+
34
+ !llvm.module.flags = !{!0}
35
+ !llvm.dbg.cu = !{!1}
36
+ !nvvm.annotations = !{!3, !4, !4, !3}
37
+
38
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
39
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
40
+ !2 = !DIFile(filename: "czjxjqxojsyyr4zmce6q6twysnucw6p4l5ujgp6ts2ecrm3ue3ex.py", directory: "/tmp/torchinductor_root/zj")
41
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
42
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
43
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
44
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
45
+ !7 = !{}
46
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
47
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
48
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
49
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
50
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
51
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
52
+ !14 = !DILocation(line: 26, column: 25, scope: !5)
53
+ !15 = !DILocation(line: 26, column: 36, scope: !5)
54
+ !16 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin ADDED
Binary file (13.7 kB). View file
 
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.cubin ADDED
Binary file (19.5 kB). View file
 
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c512_i32 = arith.constant 512 : i32
4
+ %c256_i32 = arith.constant 256 : i32
5
+ %cst = arith.constant 0.000000e+00 : f32
6
+ %cst_0 = arith.constant 2.560000e+02 : f32
7
+ %cst_1 = arith.constant 9.99999974E-6 : f32
8
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_3 = arith.constant dense<256> : tensor<1xi64>
10
+ %cst_4 = arith.constant dense<50257> : tensor<1xi64>
11
+ %cst_5 = arith.constant dense<0> : tensor<1xi64>
12
+ %cst_6 = arith.constant dense<256> : tensor<256xi32>
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
15
+ %2 = arith.cmpi slt, %1, %cst_6 : tensor<256xi32>
16
+ %3 = arith.remsi %0, %c512_i32 : i32
17
+ %4 = tt.addptr %arg1, %0 : !tt.ptr<i64, 1>, i32
18
+ %5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>>
19
+ %6 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64>
20
+ %7 = arith.muli %3, %c256_i32 : i32
21
+ %8 = tt.splat %7 : (i32) -> tensor<256xi32>
22
+ %9 = arith.addi %1, %8 : tensor<256xi32>
23
+ %10 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
24
+ %11 = tt.addptr %10, %9 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
25
+ %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
26
+ %13 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
27
+ %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
28
+ %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
29
+ %16 = arith.addi %6, %cst_4 : tensor<1xi64>
30
+ %17 = arith.cmpi slt, %6, %cst_5 : tensor<1xi64>
31
+ %18 = arith.select %17, %16, %6 : tensor<1xi1>, tensor<1xi64>
32
+ %19 = arith.cmpi sge, %18, %cst_5 : tensor<1xi64>
33
+ %20 = arith.cmpi slt, %18, %cst_4 : tensor<1xi64>
34
+ %21 = arith.andi %19, %20 : tensor<1xi1>
35
+ tt.assert %21, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1>
36
+ %22 = arith.muli %18, %cst_3 : tensor<1xi64>
37
+ %23 = tt.broadcast %22 : (tensor<1xi64>) -> tensor<256xi64>
38
+ %24 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64>
39
+ %25 = arith.addi %24, %23 : tensor<256xi64>
40
+ %26 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
41
+ %27 = tt.addptr %26, %25 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi64>
42
+ %28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
43
+ %29 = arith.addf %28, %12 : tensor<256xf32>
44
+ %30 = arith.select %2, %29, %cst_2 : tensor<256xi1>, tensor<256xf32>
45
+ %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg10: f32, %arg11: f32):
47
+ %60 = arith.addf %arg10, %arg11 : f32
48
+ tt.reduce.return %60 : f32
49
+ }) : (tensor<256xf32>) -> f32
50
+ %32 = arith.addf %31, %cst : f32
51
+ %33 = arith.divf %32, %cst_0 : f32
52
+ %34 = tt.splat %33 : (f32) -> tensor<1xf32>
53
+ %35 = tt.splat %33 : (f32) -> tensor<256xf32>
54
+ %36 = arith.subf %29, %35 : tensor<256xf32>
55
+ %37 = arith.mulf %36, %36 : tensor<256xf32>
56
+ %38 = arith.select %2, %37, %cst_2 : tensor<256xi1>, tensor<256xf32>
57
+ %39 = "tt.reduce"(%38) <{axis = 0 : i32}> ({
58
+ ^bb0(%arg10: f32, %arg11: f32):
59
+ %60 = arith.addf %arg10, %arg11 : f32
60
+ tt.reduce.return %60 : f32
61
+ }) : (tensor<256xf32>) -> f32
62
+ %40 = arith.addf %39, %cst : f32
63
+ %41 = arith.divf %40, %cst_0 : f32
64
+ %42 = arith.addf %41, %cst_1 : f32
65
+ %43 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
66
+ %44 = tt.splat %43 : (f32) -> tensor<1xf32>
67
+ %45 = tt.splat %43 : (f32) -> tensor<256xf32>
68
+ %46 = arith.mulf %36, %45 : tensor<256xf32>
69
+ %47 = arith.mulf %46, %15 : tensor<256xf32>
70
+ %48 = arith.muli %0, %c256_i32 : i32
71
+ %49 = tt.splat %48 : (i32) -> tensor<256xi32>
72
+ %50 = arith.addi %1, %49 : tensor<256xi32>
73
+ %51 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
74
+ %52 = tt.addptr %51, %50 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
75
+ tt.store %52, %29, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
76
+ gpu.barrier
77
+ %53 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
78
+ %54 = tt.splat %53 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
79
+ tt.store %54, %44 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
80
+ %55 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
81
+ %56 = tt.addptr %55, %50 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
82
+ %57 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16>
83
+ tt.store %56, %57, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
84
+ %58 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
85
+ %59 = tt.splat %58 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
86
+ tt.store %59, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
87
+ tt.return
88
+ }
89
+ }
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin ADDED
Binary file (5.16 kB). View file
 
.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
5
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %5 = shl i32 %4, 1, !dbg !8
7
+ %6 = and i32 %5, 510, !dbg !8
8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %8 = shl i32 %7, 9, !dbg !10
10
+ %9 = or i32 %8, %6, !dbg !11
11
+ %10 = icmp slt i32 %9, 12865792, !dbg !12
12
+ %11 = sext i32 %9 to i64, !dbg !13
13
+ %12 = getelementptr i16, ptr addrspace(1) %0, i64 %11, !dbg !13
14
+ %13 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %12, i1 %10) #1, !dbg !14
15
+ %14 = trunc i32 %13 to i16, !dbg !14
16
+ %extelt.offset = lshr i32 %13, 16, !dbg !14
17
+ %15 = trunc i32 %extelt.offset to i16, !dbg !14
18
+ %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !15
19
+ %17 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %15) #1, !dbg !15
20
+ %18 = getelementptr float, ptr addrspace(1) %1, i64 %11, !dbg !16
21
+ %19 = bitcast float %16 to i32, !dbg !17
22
+ %20 = bitcast float %17 to i32, !dbg !17
23
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %19, i32 %20, ptr addrspace(1) %18, i1 %10) #1, !dbg !17
24
+ ret void, !dbg !18
25
+ }
26
+
27
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
28
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
29
+
30
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
31
+ attributes #1 = { nounwind }
32
+
33
+ !llvm.module.flags = !{!0}
34
+ !llvm.dbg.cu = !{!1}
35
+ !nvvm.annotations = !{!3, !4, !4, !3}
36
+
37
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
38
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
39
+ !2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
40
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
41
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
42
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
43
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
44
+ !7 = !{}
45
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
46
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
47
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
48
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
49
+ !12 = !DILocation(line: 22, column: 21, scope: !5)
50
+ !13 = !DILocation(line: 24, column: 30, scope: !5)
51
+ !14 = !DILocation(line: 24, column: 35, scope: !5)
52
+ !15 = !DILocation(line: 24, column: 45, scope: !5)
53
+ !16 = !DILocation(line: 26, column: 25, scope: !5)
54
+ !17 = !DILocation(line: 26, column: 36, scope: !5)
55
+ !18 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 2, !dbg !8
7
+ %5 = and i32 %4, 508, !dbg !8
8
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %7 = shl i32 %6, 10, !dbg !10
10
+ %8 = or i32 %7, %5, !dbg !11
11
+ %9 = or i32 %8, 512, !dbg !11
12
+ %10 = sext i32 %8 to i64, !dbg !12
13
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
14
+ %12 = sext i32 %9 to i64, !dbg !12
15
+ %13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !12
16
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %11, i1 true) #1, !dbg !13
17
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 true) #1, !dbg !13
18
+ ret void, !dbg !14
19
+ }
20
+
21
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
22
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
23
+
24
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
25
+ attributes #1 = { nounwind }
26
+
27
+ !llvm.module.flags = !{!0}
28
+ !llvm.dbg.cu = !{!1}
29
+ !nvvm.annotations = !{!3, !4, !4, !3}
30
+
31
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
32
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
33
+ !2 = !DIFile(filename: "c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py", directory: "/tmp/torchinductor_root/7w")
34
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
35
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
36
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
37
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
38
+ !7 = !{}
39
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
40
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
41
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
42
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
43
+ !12 = !DILocation(line: 25, column: 25, scope: !5)
44
+ !13 = !DILocation(line: 25, column: 36, scope: !5)
45
+ !14 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<512> : tensor<128xi32, #blocked>
5
+ %c128_i32 = arith.constant 128 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c128_i32 : i32
8
+ %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<128xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<128xi32, #blocked>
11
+ %5 = arith.cmpi slt, %4, %cst : tensor<128xi32, #blocked>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<128x!tt.ptr<i64, 1>, #blocked>
13
+ %7 = tt.addptr %6, %4 : tensor<128x!tt.ptr<i64, 1>, #blocked>, tensor<128xi32, #blocked>
14
+ %8 = arith.extsi %4 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked>
15
+ tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i64 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 2, !dbg !8
7
+ %5 = and i32 %4, 508, !dbg !8
8
+ %6 = or i32 %5, 512, !dbg !8
9
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
10
+ %8 = sext i32 %7 to i64, !dbg !10
11
+ %9 = shl nsw i64 %8, 10, !dbg !11
12
+ %10 = zext nneg i32 %5 to i64
13
+ %11 = zext nneg i32 %6 to i64
14
+ %12 = or i64 %9, %10, !dbg !12
15
+ %13 = or i64 %9, %11, !dbg !12
16
+ %14 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
17
+ %15 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !13
18
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %14, i1 true) #1, !dbg !14
19
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 true) #1, !dbg !14
20
+ ret void, !dbg !15
21
+ }
22
+
23
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
24
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
25
+
26
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
27
+ attributes #1 = { nounwind }
28
+
29
+ !llvm.module.flags = !{!0}
30
+ !llvm.dbg.cu = !{!1}
31
+ !nvvm.annotations = !{!3, !4, !4, !3}
32
+
33
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
34
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
35
+ !2 = !DIFile(filename: "cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py", directory: "/tmp/torchinductor_root/pk")
36
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
37
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
38
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
39
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
40
+ !7 = !{}
41
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
42
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
43
+ !10 = !DILocation(line: 20, column: 34, scope: !5)
44
+ !11 = !DILocation(line: 20, column: 46, scope: !5)
45
+ !12 = !DILocation(line: 21, column: 23, scope: !5)
46
+ !13 = !DILocation(line: 25, column: 25, scope: !5)
47
+ !14 = !DILocation(line: 25, column: 36, scope: !5)
48
+ !15 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
5
+ %c1024_i64 = arith.constant 1024 : i64
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.extsi %0 : i32 to i64
8
+ %2 = arith.muli %1, %c1024_i64 : i64
9
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
10
+ %4 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked>
11
+ %5 = tt.splat %2 : (i64) -> tensor<1024xi64, #blocked>
12
+ %6 = arith.addi %5, %4 : tensor<1024xi64, #blocked>
13
+ %7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
14
+ %8 = tt.addptr %7, %6 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi64, #blocked>
15
+ tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin ADDED
Binary file (4.78 kB). View file
 
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<512> : tensor<256xi32, #blocked>
5
+ %c256_i32 = arith.constant 256 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c256_i32 : i32
8
+ %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<256xi32, #blocked>
11
+ %5 = arith.cmpi slt, %4, %cst : tensor<256xi32, #blocked>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<256x!tt.ptr<i64, 1>, #blocked>
13
+ %7 = tt.addptr %6, %4 : tensor<256x!tt.ptr<i64, 1>, #blocked>, tensor<256xi32, #blocked>
14
+ %8 = arith.extsi %4 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
15
+ tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
6
+ %cst_0 = arith.constant dense<12865792> : tensor<1024xi32, #blocked1>
7
+ %c1024_i32 = arith.constant 1024 : i32
8
+ %0 = tt.get_program_id x : i32
9
+ %1 = arith.muli %0, %c1024_i32 : i32
10
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
11
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
12
+ %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
13
+ %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
14
+ %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
15
+ %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
16
+ %8 = arith.cmpi slt, %6, %cst : tensor<1024xi32, #blocked>
17
+ %9 = arith.cmpi slt, %7, %cst_0 : tensor<1024xi32, #blocked1>
18
+ %10 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
19
+ %11 = tt.addptr %10, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
20
+ %12 = tt.load %11, %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
21
+ %13 = triton_gpu.convert_layout %12 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
22
+ %14 = arith.extf %13 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
23
+ %15 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
24
+ %16 = tt.addptr %15, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
25
+ tt.store %16, %14, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
26
+ tt.return
27
+ }
28
+ }
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<12865792> : tensor<1024xi32>
4
+ %c1024_i32 = arith.constant 1024 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c1024_i32 : i32
7
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
8
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
9
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
10
+ %5 = arith.cmpi slt, %4, %cst : tensor<1024xi32>
11
+ %6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
12
+ %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
13
+ %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
14
+ %9 = arith.extf %8 : tensor<1024xbf16> to tensor<1024xf32>
15
+ %10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
16
+ %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
17
+ tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
18
+ tt.return
19
+ }
20
+ }
.triton/dump/962d1809855a53123762906133b1d960/triton_.llir ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 2, !dbg !8
7
+ %5 = and i32 %4, 508, !dbg !8
8
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %7 = shl i32 %6, 10, !dbg !10
10
+ %8 = or i32 %7, %5, !dbg !11
11
+ %9 = or i32 %8, 512, !dbg !11
12
+ %10 = icmp slt i32 %8, 12865792, !dbg !12
13
+ %11 = icmp slt i32 %9, 12865792, !dbg !12
14
+ %12 = sext i32 %8 to i64, !dbg !13
15
+ %13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
16
+ %14 = sext i32 %9 to i64, !dbg !13
17
+ %15 = getelementptr float, ptr addrspace(1) %0, i64 %14, !dbg !13
18
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 %10) #1, !dbg !14
19
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 %11) #1, !dbg !14
20
+ ret void, !dbg !15
21
+ }
22
+
23
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
24
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
25
+
26
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
27
+ attributes #1 = { nounwind }
28
+
29
+ !llvm.module.flags = !{!0}
30
+ !llvm.dbg.cu = !{!1}
31
+ !nvvm.annotations = !{!3, !4, !4, !3}
32
+
33
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
34
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
35
+ !2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y")
36
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
37
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
38
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
39
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
40
+ !7 = !{}
41
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
42
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
43
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
44
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
45
+ !12 = !DILocation(line: 22, column: 21, scope: !5)
46
+ !13 = !DILocation(line: 25, column: 25, scope: !5)
47
+ !14 = !DILocation(line: 25, column: 36, scope: !5)
48
+ !15 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin ADDED
Binary file (49.4 kB). View file
 
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx ADDED
@@ -0,0 +1,771 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7d8d9d10de11de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8,
22
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9,
23
+ .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10,
24
+ .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11
25
+ )
26
+ .maxntid 128, 1, 1
27
+ {
28
+ .reg .pred %p<38>;
29
+ .reg .b16 %rs<13>;
30
+ .reg .b32 %r<135>;
31
+ .reg .f32 %f<103>;
32
+ .reg .b64 %rd<41>;
33
+ .loc 1 18 0
34
+ $L__func_begin0:
35
+ .loc 1 18 0
36
+
37
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9];
38
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8];
39
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5];
40
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2];
41
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1];
42
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0];
43
+ $L__tmp0:
44
+ .loc 1 22 44
45
+ mov.u32 %r1, %tid.x;
46
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3];
47
+ shl.b32 %r17, %r1, 2;
48
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4];
49
+ and.b32 %r18, %r17, 60;
50
+ bfe.u32 %r19, %r1, 5, 2;
51
+ ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6];
52
+ bfe.u32 %r20, %r1, 1, 4;
53
+ ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7];
54
+ shl.b32 %r21, %r19, 4;
55
+ or.b32 %r2, %r21, %r20;
56
+ .loc 1 24 33
57
+ and.b32 %r22, %r17, 4;
58
+ bfe.u32 %r23, %r1, 4, 1;
59
+ shl.b32 %r24, %r19, 1;
60
+ or.b32 %r25, %r24, %r23;
61
+ .loc 1 21 28
62
+ mov.u32 %r15, %ctaid.x;
63
+ .loc 1 21 33
64
+ shl.b32 %r3, %r15, 6;
65
+ .loc 1 22 23
66
+ or.b32 %r26, %r3, %r18;
67
+ or.b32 %r27, %r3, %r2;
68
+ .loc 1 26 20
69
+ shr.s32 %r29, %r26, 31;
70
+ shr.u32 %r30, %r29, 24;
71
+ add.s32 %r31, %r26, %r30;
72
+ shr.s32 %r32, %r31, 8;
73
+ bfe.s32 %r33, %r15, 25, 1;
74
+ shr.u32 %r34, %r33, 24;
75
+ add.s32 %r35, %r27, %r34;
76
+ shr.s32 %r36, %r35, 8;
77
+ .loc 1 37 44
78
+ shl.b32 %r37, %r36, 7;
79
+ mul.lo.s32 %r38, %r18, 12;
80
+ or.b32 %r39, %r25, %r38;
81
+ shl.b32 %r40, %r39, 1;
82
+ mov.u32 %r41, global_smem;
83
+ add.s32 %r4, %r41, %r40;
84
+ mad.lo.s32 %r42, %r2, 12, %r22;
85
+ shl.b32 %r43, %r42, 1;
86
+ add.s32 %r6, %r41, %r43;
87
+ shl.b32 %r44, %r39, 2;
88
+ add.s32 %r7, %r41, %r44;
89
+ shl.b32 %r45, %r42, 2;
90
+ add.s32 %r9, %r41, %r45;
91
+ .loc 1 30 36
92
+ mad.lo.s32 %r46, %r32, 32512, %r26;
93
+ shl.b32 %r47, %r19, 9;
94
+ add.s32 %r48, %r46, %r47;
95
+ shl.b32 %r49, %r23, 8;
96
+ add.s32 %r133, %r48, %r49;
97
+ or.b32 %r50, %r37, %r22;
98
+ mul.wide.s32 %rd23, %r50, 4;
99
+ add.s64 %rd40, %rd22, %rd23;
100
+ add.s64 %rd39, %rd21, %rd23;
101
+ add.s64 %rd38, %rd20, %rd23;
102
+ add.s64 %rd37, %rd19, %rd23;
103
+ mov.f32 %f95, 0f00000000;
104
+ mov.b32 %r134, -8;
105
+ mov.pred %p1, -1;
106
+ mov.f32 %f96, %f95;
107
+ mov.f32 %f97, %f95;
108
+ mov.f32 %f98, %f95;
109
+ mov.f32 %f99, %f95;
110
+ mov.f32 %f100, %f95;
111
+ mov.f32 %f101, %f95;
112
+ mov.f32 %f102, %f95;
113
+ $L__BB0_1:
114
+ .loc 1 34 34
115
+ mul.wide.s32 %rd32, %r133, 2;
116
+ add.s64 %rd24, %rd13, %rd32;
117
+ mov.b32 %r53, 0;
118
+ .loc 1 34 63
119
+ mov.u32 %r51, 0x0;
120
+ mov.u32 %r52, 0x0;
121
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r51, %r52 }, [ %rd24 + 0 ];
122
+ @!%p1 mov.u32 %r51, %r53;
123
+ @!%p1 mov.u32 %r52, %r53;
124
+ shr.u32 %r115, %r51, 16;
125
+ shr.u32 %r116, %r52, 16;
126
+ .loc 1 34 115
127
+ bar.sync 0;
128
+ st.shared.u16 [%r4], %r51;
129
+ st.shared.u16 [%r4+24], %r115;
130
+ st.shared.u16 [%r4+48], %r52;
131
+ st.shared.u16 [%r4+72], %r116;
132
+ bar.sync 0;
133
+ ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%r6];
134
+ cvt.f32.bf16 %r55, %rs1;
135
+ mov.b32 %f25, %r55;
136
+ cvt.f32.bf16 %r56, %rs2;
137
+ mov.b32 %f26, %r56;
138
+ cvt.f32.bf16 %r57, %rs3;
139
+ mov.b32 %f27, %r57;
140
+ cvt.f32.bf16 %r58, %rs4;
141
+ mov.b32 %f28, %r58;
142
+ .loc 1 35 34
143
+ mul.wide.s32 %rd33, %r133, 4;
144
+ add.s64 %rd25, %rd14, %rd33;
145
+ .loc 1 35 63
146
+ mov.u32 %r59, 0x0;
147
+ mov.u32 %r60, 0x0;
148
+ mov.u32 %r61, 0x0;
149
+ mov.u32 %r62, 0x0;
150
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r59, %r60, %r61, %r62 }, [ %rd25 + 0 ];
151
+ @!%p1 mov.u32 %r59, %r53;
152
+ @!%p1 mov.u32 %r60, %r53;
153
+ @!%p1 mov.u32 %r61, %r53;
154
+ @!%p1 mov.u32 %r62, %r53;
155
+ mov.b32 %f29, %r59;
156
+ mov.b32 %f30, %r60;
157
+ mov.b32 %f31, %r61;
158
+ mov.b32 %f32, %r62;
159
+ bar.sync 0;
160
+ st.shared.u32 [%r7], %r59;
161
+ st.shared.u32 [%r7+48], %r60;
162
+ st.shared.u32 [%r7+96], %r61;
163
+ st.shared.u32 [%r7+144], %r62;
164
+ bar.sync 0;
165
+ ld.shared.v4.f32 {%f33, %f34, %f35, %f36}, [%r9];
166
+ .loc 1 36 34
167
+ add.s64 %rd26, %rd15, %rd32;
168
+ .loc 1 36 63
169
+ mov.u32 %r67, 0x0;
170
+ mov.u32 %r68, 0x0;
171
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r67, %r68 }, [ %rd26 + 0 ];
172
+ @!%p1 mov.u32 %r67, %r53;
173
+ @!%p1 mov.u32 %r68, %r53;
174
+ cvt.u16.u32 %rs5, %r67;
175
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r67; }
176
+ cvt.u16.u32 %rs7, %r68;
177
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r68; }
178
+ .loc 1 36 115
179
+ cvt.f32.bf16 %r71, %rs5;
180
+ mov.b32 %f37, %r71;
181
+ cvt.f32.bf16 %r72, %rs6;
182
+ mov.b32 %f38, %r72;
183
+ cvt.f32.bf16 %r73, %rs7;
184
+ mov.b32 %f39, %r73;
185
+ cvt.f32.bf16 %r74, %rs8;
186
+ mov.b32 %f40, %r74;
187
+ .loc 1 37 50
188
+ mov.u32 %r75, 0x0;
189
+ mov.u32 %r76, 0x0;
190
+ mov.u32 %r77, 0x0;
191
+ mov.u32 %r78, 0x0;
192
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r75, %r76, %r77, %r78 }, [ %rd37 + 0 ];
193
+ @!%p1 mov.u32 %r75, %r53;
194
+ @!%p1 mov.u32 %r76, %r53;
195
+ @!%p1 mov.u32 %r77, %r53;
196
+ @!%p1 mov.u32 %r78, %r53;
197
+ .loc 1 38 50
198
+ mov.u32 %r83, 0x0;
199
+ mov.u32 %r84, 0x0;
200
+ mov.u32 %r85, 0x0;
201
+ mov.u32 %r86, 0x0;
202
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r83, %r84, %r85, %r86 }, [ %rd38 + 0 ];
203
+ @!%p1 mov.u32 %r83, %r53;
204
+ @!%p1 mov.u32 %r84, %r53;
205
+ @!%p1 mov.u32 %r85, %r53;
206
+ @!%p1 mov.u32 %r86, %r53;
207
+ .loc 1 39 35
208
+ add.s64 %rd29, %rd16, %rd32;
209
+ .loc 1 39 64
210
+ mov.u32 %r91, 0x0;
211
+ mov.u32 %r92, 0x0;
212
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r91, %r92 }, [ %rd29 + 0 ];
213
+ @!%p1 mov.u32 %r91, %r53;
214
+ @!%p1 mov.u32 %r92, %r53;
215
+ shr.u32 %r117, %r91, 16;
216
+ shr.u32 %r118, %r92, 16;
217
+ .loc 1 39 116
218
+ bar.sync 0;
219
+ st.shared.u16 [%r4], %r91;
220
+ st.shared.u16 [%r4+24], %r117;
221
+ st.shared.u16 [%r4+48], %r92;
222
+ st.shared.u16 [%r4+72], %r118;
223
+ bar.sync 0;
224
+ ld.shared.v4.u16 {%rs9, %rs10, %rs11, %rs12}, [%r6];
225
+ cvt.f32.bf16 %r95, %rs9;
226
+ mov.b32 %f41, %r95;
227
+ cvt.f32.bf16 %r96, %rs10;
228
+ mov.b32 %f42, %r96;
229
+ cvt.f32.bf16 %r97, %rs11;
230
+ mov.b32 %f43, %r97;
231
+ cvt.f32.bf16 %r98, %rs12;
232
+ mov.b32 %f44, %r98;
233
+ .loc 1 40 51
234
+ mov.u32 %r99, 0x0;
235
+ mov.u32 %r100, 0x0;
236
+ mov.u32 %r101, 0x0;
237
+ mov.u32 %r102, 0x0;
238
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd39 + 0 ];
239
+ @!%p1 mov.u32 %r99, %r53;
240
+ @!%p1 mov.u32 %r100, %r53;
241
+ @!%p1 mov.u32 %r101, %r53;
242
+ @!%p1 mov.u32 %r102, %r53;
243
+ .loc 1 41 51
244
+ mov.u32 %r107, 0x0;
245
+ mov.u32 %r108, 0x0;
246
+ mov.u32 %r109, 0x0;
247
+ mov.u32 %r110, 0x0;
248
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd40 + 0 ];
249
+ @!%p1 mov.u32 %r107, %r53;
250
+ @!%p1 mov.u32 %r108, %r53;
251
+ @!%p1 mov.u32 %r109, %r53;
252
+ @!%p1 mov.u32 %r110, %r53;
253
+ .loc 1 44 22
254
+ add.f32 %f45, %f37, %f29;
255
+ add.f32 %f46, %f38, %f30;
256
+ add.f32 %f47, %f39, %f31;
257
+ add.f32 %f48, %f40, %f32;
258
+ bar.sync 0;
259
+ st.shared.f32 [%r7], %f45;
260
+ st.shared.f32 [%r7+48], %f46;
261
+ st.shared.f32 [%r7+96], %f47;
262
+ st.shared.f32 [%r7+144], %f48;
263
+ bar.sync 0;
264
+ ld.shared.v4.f32 {%f49, %f50, %f51, %f52}, [%r9];
265
+ .loc 1 40 51
266
+ mov.b32 %f53, %r75;
267
+ mov.b32 %f54, %r76;
268
+ mov.b32 %f55, %r77;
269
+ mov.b32 %f56, %r78;
270
+ mov.b32 %f57, %r99;
271
+ mov.b32 %f58, %r100;
272
+ mov.b32 %f59, %r101;
273
+ mov.b32 %f60, %r102;
274
+ .loc 1 41 51
275
+ mov.b32 %f61, %r110;
276
+ mov.b32 %f62, %r109;
277
+ mov.b32 %f63, %r108;
278
+ mov.b32 %f64, %r107;
279
+ mov.b32 %f65, %r86;
280
+ mov.b32 %f66, %r85;
281
+ mov.b32 %f67, %r84;
282
+ mov.b32 %f68, %r83;
283
+ .loc 1 52 23
284
+ sub.f32 %f69, %f36, %f60;
285
+ sub.f32 %f70, %f35, %f59;
286
+ sub.f32 %f71, %f34, %f58;
287
+ sub.f32 %f72, %f33, %f57;
288
+ sub.f32 %f73, %f52, %f56;
289
+ sub.f32 %f74, %f51, %f55;
290
+ sub.f32 %f75, %f50, %f54;
291
+ sub.f32 %f76, %f49, %f53;
292
+ .loc 1 53 24
293
+ mul.f32 %f77, %f76, %f68;
294
+ mul.f32 %f78, %f75, %f67;
295
+ mul.f32 %f79, %f74, %f66;
296
+ mul.f32 %f80, %f73, %f65;
297
+ mul.f32 %f81, %f72, %f64;
298
+ mul.f32 %f82, %f71, %f63;
299
+ mul.f32 %f83, %f70, %f62;
300
+ mul.f32 %f84, %f69, %f61;
301
+ .loc 1 57 40
302
+ fma.rn.f32 %f98, %f44, %f84, %f98;
303
+ fma.rn.f32 %f97, %f43, %f83, %f97;
304
+ fma.rn.f32 %f96, %f42, %f82, %f96;
305
+ fma.rn.f32 %f95, %f41, %f81, %f95;
306
+ fma.rn.f32 %f102, %f28, %f80, %f102;
307
+ fma.rn.f32 %f101, %f27, %f79, %f101;
308
+ fma.rn.f32 %f100, %f26, %f78, %f100;
309
+ fma.rn.f32 %f99, %f25, %f77, %f99;
310
+ .loc 1 30 36
311
+ add.s32 %r134, %r134, 8;
312
+ add.s32 %r133, %r133, 2048;
313
+ add.s64 %rd40, %rd40, 32;
314
+ add.s64 %rd39, %rd39, 32;
315
+ add.s64 %rd38, %rd38, 32;
316
+ add.s64 %rd37, %rd37, 32;
317
+ setp.lt.u32 %p35, %r134, 120;
318
+ @%p35 bra $L__BB0_1;
319
+ .loc 1 22 44
320
+ and.b32 %r121, %r1, 63;
321
+ .loc 1 22 23
322
+ or.b32 %r122, %r3, %r121;
323
+ $L__tmp1:
324
+ .loc 2 233 15
325
+ add.f32 %f85, %f99, %f100;
326
+ add.f32 %f86, %f101, %f85;
327
+ add.f32 %f87, %f102, %f86;
328
+ $L__tmp2:
329
+ .loc 2 243 36
330
+ mov.b32 %r123, %f87;
331
+ shfl.sync.bfly.b32 %r124, %r123, 1, 31, -1;
332
+ mov.b32 %f88, %r124;
333
+ $L__tmp3:
334
+ .loc 2 233 15
335
+ add.f32 %f89, %f87, %f88;
336
+ $L__tmp4:
337
+ .loc 1 58 30
338
+ bar.sync 0;
339
+ shl.b32 %r125, %r2, 2;
340
+ add.s32 %r127, %r41, %r125;
341
+ st.shared.f32 [%r127], %f89;
342
+ bar.sync 0;
343
+ shl.b32 %r128, %r121, 2;
344
+ add.s32 %r129, %r41, %r128;
345
+ ld.shared.u32 %r119, [%r129];
346
+ .loc 1 59 25
347
+ mul.wide.s32 %rd36, %r122, 4;
348
+ add.s64 %rd34, %rd17, %rd36;
349
+ .loc 1 59 37
350
+ and.b32 %r130, %r1, 64;
351
+ setp.eq.s32 %p36, %r130, 0;
352
+ @%p36 st.global.b32 [ %rd34 + 0 ], { %r119 };
353
+ $L__tmp5:
354
+ .loc 2 233 15
355
+ add.f32 %f90, %f95, %f96;
356
+ add.f32 %f91, %f97, %f90;
357
+ add.f32 %f92, %f98, %f91;
358
+ $L__tmp6:
359
+ .loc 2 243 36
360
+ mov.b32 %r131, %f92;
361
+ shfl.sync.bfly.b32 %r132, %r131, 1, 31, -1;
362
+ mov.b32 %f93, %r132;
363
+ $L__tmp7:
364
+ .loc 2 233 15
365
+ add.f32 %f94, %f92, %f93;
366
+ $L__tmp8:
367
+ .loc 1 60 30
368
+ bar.sync 0;
369
+ st.shared.f32 [%r127], %f94;
370
+ bar.sync 0;
371
+ ld.shared.u32 %r120, [%r129];
372
+ .loc 1 61 25
373
+ add.s64 %rd35, %rd18, %rd36;
374
+ .loc 1 61 37
375
+ @%p36 st.global.b32 [ %rd35 + 0 ], { %r120 };
376
+ .loc 1 61 4
377
+ ret;
378
+ $L__tmp9:
379
+ $L__func_end0:
380
+
381
+ }
382
+ .file 1 "/tmp/torchinductor_root/3x/c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py"
383
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
384
+ .section .debug_abbrev
385
+ {
386
+ .b8 1
387
+ .b8 17
388
+ .b8 1
389
+ .b8 37
390
+ .b8 8
391
+ .b8 19
392
+ .b8 5
393
+ .b8 3
394
+ .b8 8
395
+ .b8 16
396
+ .b8 6
397
+ .b8 27
398
+ .b8 8
399
+ .b8 180
400
+ .b8 66
401
+ .b8 12
402
+ .b8 17
403
+ .b8 1
404
+ .b8 18
405
+ .b8 1
406
+ .b8 0
407
+ .b8 0
408
+ .b8 2
409
+ .b8 46
410
+ .b8 0
411
+ .b8 135
412
+ .b8 64
413
+ .b8 8
414
+ .b8 3
415
+ .b8 8
416
+ .b8 58
417
+ .b8 11
418
+ .b8 59
419
+ .b8 11
420
+ .b8 63
421
+ .b8 12
422
+ .b8 32
423
+ .b8 11
424
+ .b8 0
425
+ .b8 0
426
+ .b8 3
427
+ .b8 46
428
+ .b8 1
429
+ .b8 17
430
+ .b8 1
431
+ .b8 18
432
+ .b8 1
433
+ .b8 64
434
+ .b8 10
435
+ .b8 49
436
+ .b8 19
437
+ .b8 0
438
+ .b8 0
439
+ .b8 4
440
+ .b8 29
441
+ .b8 1
442
+ .b8 49
443
+ .b8 19
444
+ .b8 17
445
+ .b8 1
446
+ .b8 18
447
+ .b8 1
448
+ .b8 88
449
+ .b8 11
450
+ .b8 89
451
+ .b8 11
452
+ .b8 87
453
+ .b8 11
454
+ .b8 0
455
+ .b8 0
456
+ .b8 5
457
+ .b8 29
458
+ .b8 0
459
+ .b8 49
460
+ .b8 19
461
+ .b8 17
462
+ .b8 1
463
+ .b8 18
464
+ .b8 1
465
+ .b8 88
466
+ .b8 11
467
+ .b8 89
468
+ .b8 11
469
+ .b8 87
470
+ .b8 11
471
+ .b8 0
472
+ .b8 0
473
+ .b8 0
474
+ }
475
+ .section .debug_info
476
+ {
477
+ .b32 371
478
+ .b8 2
479
+ .b8 0
480
+ .b32 .debug_abbrev
481
+ .b8 8
482
+ .b8 1
483
+ .b8 116
484
+ .b8 114
485
+ .b8 105
486
+ .b8 116
487
+ .b8 111
488
+ .b8 110
489
+ .b8 0
490
+ .b8 2
491
+ .b8 0
492
+ .b8 99
493
+ .b8 51
494
+ .b8 120
495
+ .b8 120
496
+ .b8 115
497
+ .b8 122
498
+ .b8 118
499
+ .b8 103
500
+ .b8 116
501
+ .b8 102
502
+ .b8 110
503
+ .b8 106
504
+ .b8 98
505
+ .b8 55
506
+ .b8 119
507
+ .b8 101
508
+ .b8 108
509
+ .b8 113
510
+ .b8 118
511
+ .b8 114
512
+ .b8 51
513
+ .b8 51
514
+ .b8 122
515
+ .b8 52
516
+ .b8 99
517
+ .b8 113
518
+ .b8 111
519
+ .b8 117
520
+ .b8 120
521
+ .b8 104
522
+ .b8 113
523
+ .b8 106
524
+ .b8 121
525
+ .b8 51
526
+ .b8 100
527
+ .b8 112
528
+ .b8 119
529
+ .b8 97
530
+ .b8 50
531
+ .b8 113
532
+ .b8 109
533
+ .b8 109
534
+ .b8 120
535
+ .b8 50
536
+ .b8 120
537
+ .b8 116
538
+ .b8 111
539
+ .b8 54
540
+ .b8 115
541
+ .b8 103
542
+ .b8 118
543
+ .b8 122
544
+ .b8 46
545
+ .b8 112
546
+ .b8 121
547
+ .b8 0
548
+ .b32 .debug_line
549
+ .b8 47
550
+ .b8 116
551
+ .b8 109
552
+ .b8 112
553
+ .b8 47
554
+ .b8 116
555
+ .b8 111
556
+ .b8 114
557
+ .b8 99
558
+ .b8 104
559
+ .b8 105
560
+ .b8 110
561
+ .b8 100
562
+ .b8 117
563
+ .b8 99
564
+ .b8 116
565
+ .b8 111
566
+ .b8 114
567
+ .b8 95
568
+ .b8 114
569
+ .b8 111
570
+ .b8 111
571
+ .b8 116
572
+ .b8 47
573
+ .b8 51
574
+ .b8 120
575
+ .b8 0
576
+ .b8 1
577
+ .b64 $L__func_begin0
578
+ .b64 $L__func_end0
579
+ .b8 2
580
+ .b8 116
581
+ .b8 114
582
+ .b8 105
583
+ .b8 116
584
+ .b8 111
585
+ .b8 110
586
+ .b8 95
587
+ .b8 95
588
+ .b8 48
589
+ .b8 100
590
+ .b8 49
591
+ .b8 100
592
+ .b8 50
593
+ .b8 100
594
+ .b8 51
595
+ .b8 100
596
+ .b8 52
597
+ .b8 100
598
+ .b8 53
599
+ .b8 100
600
+ .b8 54
601
+ .b8 100
602
+ .b8 55
603
+ .b8 100
604
+ .b8 56
605
+ .b8 100
606
+ .b8 57
607
+ .b8 100
608
+ .b8 49
609
+ .b8 48
610
+ .b8 100
611
+ .b8 101
612
+ .b8 49
613
+ .b8 49
614
+ .b8 100
615
+ .b8 101
616
+ .b8 0
617
+ .b8 116
618
+ .b8 114
619
+ .b8 105
620
+ .b8 116
621
+ .b8 111
622
+ .b8 110
623
+ .b8 95
624
+ .b8 95
625
+ .b8 48
626
+ .b8 100
627
+ .b8 49
628
+ .b8 100
629
+ .b8 50
630
+ .b8 100
631
+ .b8 51
632
+ .b8 100
633
+ .b8 52
634
+ .b8 100
635
+ .b8 53
636
+ .b8 100
637
+ .b8 54
638
+ .b8 100
639
+ .b8 55
640
+ .b8 100
641
+ .b8 56
642
+ .b8 100
643
+ .b8 57
644
+ .b8 100
645
+ .b8 49
646
+ .b8 48
647
+ .b8 100
648
+ .b8 101
649
+ .b8 49
650
+ .b8 49
651
+ .b8 100
652
+ .b8 101
653
+ .b8 0
654
+ .b8 1
655
+ .b8 18
656
+ .b8 1
657
+ .b8 1
658
+ .b8 3
659
+ .b64 $L__func_begin0
660
+ .b64 $L__func_end0
661
+ .b8 1
662
+ .b8 156
663
+ .b32 125
664
+ .b8 4
665
+ .b32 125
666
+ .b64 $L__tmp1
667
+ .b64 $L__tmp4
668
+ .b8 2
669
+ .b8 58
670
+ .b8 27
671
+ .b8 5
672
+ .b32 125
673
+ .b64 $L__tmp1
674
+ .b64 $L__tmp4
675
+ .b8 2
676
+ .b8 243
677
+ .b8 36
678
+ .b8 0
679
+ .b8 5
680
+ .b32 125
681
+ .b64 $L__tmp2
682
+ .b64 $L__tmp3
683
+ .b8 2
684
+ .b8 58
685
+ .b8 27
686
+ .b8 4
687
+ .b32 125
688
+ .b64 $L__tmp5
689
+ .b64 $L__tmp8
690
+ .b8 2
691
+ .b8 60
692
+ .b8 27
693
+ .b8 5
694
+ .b32 125
695
+ .b64 $L__tmp5
696
+ .b64 $L__tmp8
697
+ .b8 2
698
+ .b8 243
699
+ .b8 36
700
+ .b8 0
701
+ .b8 5
702
+ .b32 125
703
+ .b64 $L__tmp6
704
+ .b64 $L__tmp7
705
+ .b8 2
706
+ .b8 60
707
+ .b8 27
708
+ .b8 0
709
+ .b8 0
710
+ }
711
+ .section .debug_pubnames
712
+ {
713
+ .b32 $L__pubNames_end0-$L__pubNames_start0
714
+ $L__pubNames_start0:
715
+ .b8 2
716
+ .b8 0
717
+ .b32 .debug_info
718
+ .b32 375
719
+ .b32 125
720
+ .b8 116
721
+ .b8 114
722
+ .b8 105
723
+ .b8 116
724
+ .b8 111
725
+ .b8 110
726
+ .b8 95
727
+ .b8 95
728
+ .b8 48
729
+ .b8 100
730
+ .b8 49
731
+ .b8 100
732
+ .b8 50
733
+ .b8 100
734
+ .b8 51
735
+ .b8 100
736
+ .b8 52
737
+ .b8 100
738
+ .b8 53
739
+ .b8 100
740
+ .b8 54
741
+ .b8 100
742
+ .b8 55
743
+ .b8 100
744
+ .b8 56
745
+ .b8 100
746
+ .b8 57
747
+ .b8 100
748
+ .b8 49
749
+ .b8 48
750
+ .b8 100
751
+ .b8 101
752
+ .b8 49
753
+ .b8 49
754
+ .b8 100
755
+ .b8 101
756
+ .b8 0
757
+ .b32 0
758
+ $L__pubNames_end0:
759
+ }
760
+ .section .debug_pubtypes
761
+ {
762
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
763
+ $L__pubTypes_start0:
764
+ .b8 2
765
+ .b8 0
766
+ .b32 .debug_info
767
+ .b32 375
768
+ .b32 0
769
+ $L__pubTypes_end0:
770
+ }
771
+ .section .debug_loc { }
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
4
+ %c8_i32 = arith.constant 8 : i32
5
+ %c128_i32 = arith.constant 128 : i32
6
+ %c0_i32 = arith.constant 0 : i32
7
+ %cst_0 = arith.constant dense<128> : tensor<64x1xi32>
8
+ %cst_1 = arith.constant dense<32768> : tensor<64x1xi32>
9
+ %cst_2 = arith.constant dense<256> : tensor<1x8xi32>
10
+ %cst_3 = arith.constant dense<128> : tensor<1x8xi32>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
12
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi32>
13
+ %c64_i32 = arith.constant 64 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c64_i32 : i32
16
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
17
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
18
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
19
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
20
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
21
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
22
+ %8 = arith.remsi %5, %cst_5 : tensor<64x1xi32>
23
+ %9 = arith.divsi %5, %cst_5 : tensor<64x1xi32>
24
+ %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
25
+ %11 = arith.muli %9, %cst_1 : tensor<64x1xi32>
26
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
27
+ %13 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
28
+ %14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
29
+ %15 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
30
+ %16 = arith.muli %9, %cst_0 : tensor<64x1xi32>
31
+ %17 = tt.broadcast %16 : (tensor<64x1xi32>) -> tensor<64x8xi32>
32
+ %18 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
33
+ %19 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
34
+ %20 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
35
+ %21 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
36
+ %22 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
37
+ %23:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_4, %arg14 = %cst_4) -> (tensor<64x8xf32>, tensor<64x8xf32>) : i32 {
38
+ %32 = tt.splat %arg12 : (i32) -> tensor<1x8xi32>
39
+ %33 = arith.addi %32, %7 : tensor<1x8xi32>
40
+ %34 = arith.cmpi slt, %33, %cst_3 : tensor<1x8xi32>
41
+ %35 = arith.muli %33, %cst_2 : tensor<1x8xi32>
42
+ %36 = tt.broadcast %35 : (tensor<1x8xi32>) -> tensor<64x8xi32>
43
+ %37 = arith.addi %10, %36 : tensor<64x8xi32>
44
+ %38 = arith.addi %37, %12 : tensor<64x8xi32>
45
+ %39 = tt.addptr %13, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
46
+ %40 = tt.broadcast %34 : (tensor<1x8xi1>) -> tensor<64x8xi1>
47
+ %41 = tt.load %39, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
48
+ %42 = arith.extf %41 : tensor<64x8xbf16> to tensor<64x8xf32>
49
+ %43 = tt.addptr %14, %38 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
50
+ %44 = tt.load %43, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
51
+ %45 = tt.addptr %15, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
52
+ %46 = tt.load %45, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
53
+ %47 = arith.extf %46 : tensor<64x8xbf16> to tensor<64x8xf32>
54
+ %48 = tt.broadcast %33 : (tensor<1x8xi32>) -> tensor<64x8xi32>
55
+ %49 = arith.addi %48, %17 : tensor<64x8xi32>
56
+ %50 = tt.addptr %18, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
57
+ %51 = tt.load %50, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
58
+ %52 = tt.addptr %19, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
59
+ %53 = tt.load %52, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
60
+ %54 = tt.addptr %20, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
61
+ %55 = tt.load %54, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
62
+ %56 = arith.extf %55 : tensor<64x8xbf16> to tensor<64x8xf32>
63
+ %57 = tt.addptr %21, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
64
+ %58 = tt.load %57, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
65
+ %59 = tt.addptr %22, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
66
+ %60 = tt.load %59, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
67
+ %61 = arith.addf %44, %47 : tensor<64x8xf32>
68
+ %62 = arith.subf %61, %51 : tensor<64x8xf32>
69
+ %63 = arith.mulf %62, %53 : tensor<64x8xf32>
70
+ %64 = arith.mulf %42, %63 : tensor<64x8xf32>
71
+ %65 = arith.addf %arg13, %64 : tensor<64x8xf32>
72
+ %66 = arith.select %40, %65, %arg13 : tensor<64x8xi1>, tensor<64x8xf32>
73
+ %67 = arith.subf %44, %58 : tensor<64x8xf32>
74
+ %68 = arith.mulf %67, %60 : tensor<64x8xf32>
75
+ %69 = arith.mulf %56, %68 : tensor<64x8xf32>
76
+ %70 = arith.addf %arg14, %69 : tensor<64x8xf32>
77
+ %71 = arith.select %40, %70, %arg14 : tensor<64x8xi1>, tensor<64x8xf32>
78
+ scf.yield %66, %71 : tensor<64x8xf32>, tensor<64x8xf32>
79
+ }
80
+ %24 = "tt.reduce"(%23#0) <{axis = 1 : i32}> ({
81
+ ^bb0(%arg12: f32, %arg13: f32):
82
+ %32 = arith.addf %arg12, %arg13 : f32
83
+ tt.reduce.return %32 : f32
84
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
85
+ %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
86
+ %26 = tt.splat %arg8 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
87
+ %27 = tt.addptr %26, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
88
+ tt.store %27, %25 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
89
+ %28 = "tt.reduce"(%23#1) <{axis = 1 : i32}> ({
90
+ ^bb0(%arg12: f32, %arg13: f32):
91
+ %32 = arith.addf %arg12, %arg13 : f32
92
+ tt.reduce.return %32 : f32
93
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
94
+ %29 = tt.expand_dims %28 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
95
+ %30 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
96
+ %31 = tt.addptr %30, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
97
+ tt.store %31, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
98
+ tt.return
99
+ }
100
+ }
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ptx ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+
11
+ .visible .entry triton__0d1d2de(
12
+ .param .u64 triton__0d1d2de_param_0,
13
+ .param .u64 triton__0d1d2de_param_1,
14
+ .param .u32 triton__0d1d2de_param_2
15
+ )
16
+ .maxntid 256, 1, 1
17
+ {
18
+ .reg .pred %p<3>;
19
+ .reg .b16 %rs<3>;
20
+ .reg .b32 %r<13>;
21
+ .reg .b64 %rd<7>;
22
+ .loc 1 18 0
23
+ $L__func_begin0:
24
+ .loc 1 18 0
25
+
26
+ ld.param.u64 %rd3, [triton__0d1d2de_param_0];
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_1];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r7, %tid.x;
31
+ shl.b32 %r8, %r7, 1;
32
+ and.b32 %r9, %r8, 510;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r10, %r1, 9;
37
+ .loc 1 21 23
38
+ or.b32 %r11, %r10, %r9;
39
+ .loc 1 24 30
40
+ mul.wide.s32 %rd5, %r11, 4;
41
+ add.s64 %rd1, %rd3, %rd5;
42
+ mov.pred %p1, -1;
43
+ .loc 1 24 35
44
+ mov.u32 %r4, 0x0;
45
+ mov.u32 %r5, 0x0;
46
+ @%p1 ld.global.v2.b32 { %r4, %r5 }, [ %rd1 + 0 ];
47
+ .loc 1 26 25
48
+ mul.wide.s32 %rd6, %r11, 2;
49
+ add.s64 %rd2, %rd4, %rd6;
50
+ .loc 1 26 36
51
+ cvt.rn.bf16.f32 %rs1, %r4;
52
+ cvt.rn.bf16.f32 %rs2, %r5;
53
+ mov.b32 %r12, {%rs1, %rs2};
54
+ @%p1 st.global.b32 [ %rd2 + 0 ], { %r12 };
55
+ .loc 1 26 4
56
+ ret;
57
+ $L__tmp1:
58
+ $L__func_end0:
59
+
60
+ }
61
+ .file 1 "/tmp/torchinductor_root/5t/c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py"
62
+ .section .debug_abbrev
63
+ {
64
+ .b8 1
65
+ .b8 17
66
+ .b8 1
67
+ .b8 37
68
+ .b8 8
69
+ .b8 19
70
+ .b8 5
71
+ .b8 3
72
+ .b8 8
73
+ .b8 16
74
+ .b8 6
75
+ .b8 27
76
+ .b8 8
77
+ .b8 180
78
+ .b8 66
79
+ .b8 12
80
+ .b8 17
81
+ .b8 1
82
+ .b8 18
83
+ .b8 1
84
+ .b8 0
85
+ .b8 0
86
+ .b8 2
87
+ .b8 46
88
+ .b8 0
89
+ .b8 17
90
+ .b8 1
91
+ .b8 18
92
+ .b8 1
93
+ .b8 64
94
+ .b8 10
95
+ .b8 135
96
+ .b8 64
97
+ .b8 8
98
+ .b8 3
99
+ .b8 8
100
+ .b8 58
101
+ .b8 11
102
+ .b8 59
103
+ .b8 11
104
+ .b8 63
105
+ .b8 12
106
+ .b8 0
107
+ .b8 0
108
+ .b8 0
109
+ }
110
+ .section .debug_info
111
+ {
112
+ .b32 176
113
+ .b8 2
114
+ .b8 0
115
+ .b32 .debug_abbrev
116
+ .b8 8
117
+ .b8 1
118
+ .b8 116
119
+ .b8 114
120
+ .b8 105
121
+ .b8 116
122
+ .b8 111
123
+ .b8 110
124
+ .b8 0
125
+ .b8 2
126
+ .b8 0
127
+ .b8 99
128
+ .b8 53
129
+ .b8 116
130
+ .b8 114
131
+ .b8 121
132
+ .b8 112
133
+ .b8 53
134
+ .b8 113
135
+ .b8 119
136
+ .b8 107
137
+ .b8 104
138
+ .b8 114
139
+ .b8 101
140
+ .b8 105
141
+ .b8 106
142
+ .b8 107
143
+ .b8 55
144
+ .b8 115
145
+ .b8 53
146
+ .b8 120
147
+ .b8 51
148
+ .b8 50
149
+ .b8 55
150
+ .b8 119
151
+ .b8 111
152
+ .b8 102
153
+ .b8 122
154
+ .b8 53
155
+ .b8 52
156
+ .b8 108
157
+ .b8 119
158
+ .b8 106
159
+ .b8 52
160
+ .b8 107
161
+ .b8 118
162
+ .b8 99
163
+ .b8 116
164
+ .b8 117
165
+ .b8 113
166
+ .b8 100
167
+ .b8 122
168
+ .b8 118
169
+ .b8 50
170
+ .b8 118
171
+ .b8 114
172
+ .b8 102
173
+ .b8 50
174
+ .b8 120
175
+ .b8 121
176
+ .b8 111
177
+ .b8 110
178
+ .b8 115
179
+ .b8 46
180
+ .b8 112
181
+ .b8 121
182
+ .b8 0
183
+ .b32 .debug_line
184
+ .b8 47
185
+ .b8 116
186
+ .b8 109
187
+ .b8 112
188
+ .b8 47
189
+ .b8 116
190
+ .b8 111
191
+ .b8 114
192
+ .b8 99
193
+ .b8 104
194
+ .b8 105
195
+ .b8 110
196
+ .b8 100
197
+ .b8 117
198
+ .b8 99
199
+ .b8 116
200
+ .b8 111
201
+ .b8 114
202
+ .b8 95
203
+ .b8 114
204
+ .b8 111
205
+ .b8 111
206
+ .b8 116
207
+ .b8 47
208
+ .b8 53
209
+ .b8 116
210
+ .b8 0
211
+ .b8 1
212
+ .b64 $L__func_begin0
213
+ .b64 $L__func_end0
214
+ .b8 2
215
+ .b64 $L__func_begin0
216
+ .b64 $L__func_end0
217
+ .b8 1
218
+ .b8 156
219
+ .b8 116
220
+ .b8 114
221
+ .b8 105
222
+ .b8 116
223
+ .b8 111
224
+ .b8 110
225
+ .b8 95
226
+ .b8 95
227
+ .b8 48
228
+ .b8 100
229
+ .b8 49
230
+ .b8 100
231
+ .b8 50
232
+ .b8 100
233
+ .b8 101
234
+ .b8 0
235
+ .b8 116
236
+ .b8 114
237
+ .b8 105
238
+ .b8 116
239
+ .b8 111
240
+ .b8 110
241
+ .b8 95
242
+ .b8 95
243
+ .b8 48
244
+ .b8 100
245
+ .b8 49
246
+ .b8 100
247
+ .b8 50
248
+ .b8 100
249
+ .b8 101
250
+ .b8 0
251
+ .b8 1
252
+ .b8 18
253
+ .b8 1
254
+ .b8 0
255
+ }
256
+ .section .debug_pubnames
257
+ {
258
+ .b32 $L__pubNames_end0-$L__pubNames_start0
259
+ $L__pubNames_start0:
260
+ .b8 2
261
+ .b8 0
262
+ .b32 .debug_info
263
+ .b32 180
264
+ .b32 125
265
+ .b8 116
266
+ .b8 114
267
+ .b8 105
268
+ .b8 116
269
+ .b8 111
270
+ .b8 110
271
+ .b8 95
272
+ .b8 95
273
+ .b8 48
274
+ .b8 100
275
+ .b8 49
276
+ .b8 100
277
+ .b8 50
278
+ .b8 100
279
+ .b8 101
280
+ .b8 0
281
+ .b32 0
282
+ $L__pubNames_end0:
283
+ }
284
+ .section .debug_pubtypes
285
+ {
286
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
287
+ $L__pubTypes_start0:
288
+ .b8 2
289
+ .b8 0
290
+ .b32 .debug_info
291
+ .b32 180
292
+ .b32 0
293
+ $L__pubTypes_end0:
294
+ }
295
+ .section .debug_loc { }
.triton/dump/9a2fb05196b13393bea452d08e9aaca8/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c512_i32 = arith.constant 512 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c512_i32 : i32
6
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
8
+ %4 = arith.addi %3, %2 : tensor<512xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32>
12
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
13
+ %9 = tt.addptr %8, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
14
+ %10 = arith.truncf %7 : tensor<512xf32> to tensor<512xbf16>
15
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ptx ADDED
@@ -0,0 +1,565 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4e
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4e(
13
+ .param .u64 triton__0d1d2d3de4e_param_0,
14
+ .param .u64 triton__0d1d2d3de4e_param_1,
15
+ .param .u64 triton__0d1d2d3de4e_param_2,
16
+ .param .u32 triton__0d1d2d3de4e_param_3,
17
+ .param .u32 triton__0d1d2d3de4e_param_4
18
+ )
19
+ .maxntid 128, 1, 1
20
+ {
21
+ .reg .pred %p<18>;
22
+ .reg .b32 %r<92>;
23
+ .reg .f32 %f<43>;
24
+ .reg .b64 %rd<16>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2];
30
+ ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1];
31
+ ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0];
32
+ $L__tmp0:
33
+ .loc 1 22 44
34
+ mov.u32 %r1, %tid.x;
35
+ and.b32 %r2, %r1, 31;
36
+ shl.b32 %r13, %r1, 2;
37
+ and.b32 %r3, %r13, 60;
38
+ .loc 1 24 33
39
+ bfe.u32 %r4, %r1, 5, 2;
40
+ .loc 1 21 28
41
+ mov.u32 %r11, %ctaid.x;
42
+ .loc 1 21 33
43
+ shl.b32 %r5, %r11, 6;
44
+ .loc 1 27 36
45
+ shl.b32 %r14, %r4, 18;
46
+ shl.b32 %r15, %r1, 13;
47
+ and.b32 %r16, %r15, 131072;
48
+ or.b32 %r17, %r14, %r16;
49
+ add.s32 %r18, %r17, %r5;
50
+ or.b32 %r90, %r18, %r3;
51
+ mov.f32 %f39, 0f00000000;
52
+ mov.b32 %r91, -8;
53
+ mov.pred %p1, -1;
54
+ mov.f32 %f40, %f39;
55
+ mov.f32 %f41, %f39;
56
+ mov.f32 %f42, %f39;
57
+ $L__BB0_1:
58
+ .loc 1 31 34
59
+ mul.wide.s32 %rd5, %r90, 4;
60
+ add.s64 %rd4, %rd1, %rd5;
61
+ mov.b32 %r23, 0;
62
+ .loc 1 31 53
63
+ mov.u32 %r19, 0x0;
64
+ mov.u32 %r20, 0x0;
65
+ mov.u32 %r21, 0x0;
66
+ mov.u32 %r22, 0x0;
67
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r19, %r20, %r21, %r22 }, [ %rd4 + 0 ];
68
+ @!%p1 mov.u32 %r19, %r23;
69
+ @!%p1 mov.u32 %r20, %r23;
70
+ @!%p1 mov.u32 %r21, %r23;
71
+ @!%p1 mov.u32 %r22, %r23;
72
+ mov.b32 %f13, %r19;
73
+ mov.b32 %f14, %r20;
74
+ mov.b32 %f15, %r21;
75
+ mov.b32 %f16, %r22;
76
+ .loc 1 34 38
77
+ add.f32 %f42, %f42, %f16;
78
+ add.f32 %f41, %f41, %f15;
79
+ add.f32 %f40, %f40, %f14;
80
+ add.f32 %f39, %f39, %f13;
81
+ .loc 1 27 36
82
+ add.s32 %r91, %r91, 8;
83
+ add.s32 %r90, %r90, 1048576;
84
+ setp.lt.u32 %p6, %r91, 112;
85
+ @%p6 bra $L__BB0_1;
86
+ .loc 1 22 44
87
+ and.b32 %r45, %r1, 63;
88
+ .loc 1 22 23
89
+ or.b32 %r46, %r5, %r45;
90
+ $L__tmp1:
91
+ .loc 2 243 36
92
+ mov.b32 %r47, %f39;
93
+ shfl.sync.bfly.b32 %r48, %r47, 16, 31, -1;
94
+ mov.b32 %f17, %r48;
95
+ $L__tmp2:
96
+ .loc 2 233 15
97
+ add.f32 %f18, %f39, %f17;
98
+ $L__tmp3:
99
+ .loc 2 243 36
100
+ mov.b32 %r49, %f40;
101
+ shfl.sync.bfly.b32 %r50, %r49, 16, 31, -1;
102
+ mov.b32 %f19, %r50;
103
+ $L__tmp4:
104
+ .loc 2 233 15
105
+ add.f32 %f20, %f40, %f19;
106
+ $L__tmp5:
107
+ .loc 2 243 36
108
+ mov.b32 %r51, %f41;
109
+ shfl.sync.bfly.b32 %r52, %r51, 16, 31, -1;
110
+ mov.b32 %f21, %r52;
111
+ $L__tmp6:
112
+ .loc 2 233 15
113
+ add.f32 %f22, %f41, %f21;
114
+ $L__tmp7:
115
+ .loc 2 243 36
116
+ mov.b32 %r53, %f42;
117
+ shfl.sync.bfly.b32 %r54, %r53, 16, 31, -1;
118
+ mov.b32 %f23, %r54;
119
+ $L__tmp8:
120
+ .loc 2 233 15
121
+ add.f32 %f24, %f42, %f23;
122
+ $L__tmp9:
123
+ .loc 2 243 36
124
+ setp.lt.u32 %p7, %r2, 16;
125
+ shl.b32 %r55, %r3, 2;
126
+ or.b32 %r56, %r55, %r4;
127
+ shl.b32 %r57, %r56, 2;
128
+ mov.u32 %r58, global_smem;
129
+ add.s32 %r27, %r58, %r57;
130
+ mov.b32 %r28, %f18;
131
+ @%p7 st.shared.b32 [ %r27 + 0 ], %r28;
132
+ shl.b32 %r59, %r4, 2;
133
+ shl.b32 %r60, %r3, 4;
134
+ or.b32 %r61, %r60, 16;
135
+ or.b32 %r62, %r61, %r59;
136
+ add.s32 %r29, %r58, %r62;
137
+ mov.b32 %r30, %f20;
138
+ @%p7 st.shared.b32 [ %r29 + 0 ], %r30;
139
+ or.b32 %r63, %r60, 32;
140
+ or.b32 %r64, %r63, %r59;
141
+ add.s32 %r31, %r58, %r64;
142
+ mov.b32 %r32, %f22;
143
+ @%p7 st.shared.b32 [ %r31 + 0 ], %r32;
144
+ or.b32 %r65, %r60, 48;
145
+ or.b32 %r66, %r65, %r59;
146
+ add.s32 %r33, %r58, %r66;
147
+ mov.b32 %r34, %f24;
148
+ @%p7 st.shared.b32 [ %r33 + 0 ], %r34;
149
+ bar.sync 0;
150
+ setp.lt.s32 %p11, %r1, 256;
151
+ add.s32 %r36, %r58, %r13;
152
+ @%p11 ld.shared.b32 %r35, [ %r36 + 0 ];
153
+ mov.b32 %f25, %r35;
154
+ shfl.sync.bfly.b32 %r68, %r35, 2, 31, -1;
155
+ mov.b32 %f26, %r68;
156
+ $L__tmp10:
157
+ .loc 2 233 15
158
+ add.f32 %f27, %f25, %f26;
159
+ $L__tmp11:
160
+ .loc 2 243 36
161
+ mov.b32 %r69, %f27;
162
+ shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1;
163
+ mov.b32 %f28, %r70;
164
+ $L__tmp12:
165
+ .loc 2 233 15
166
+ add.f32 %f29, %f27, %f28;
167
+ $L__tmp13:
168
+ .loc 2 243 36
169
+ and.b32 %r71, %r1, 3;
170
+ setp.eq.s32 %p17, %r71, 0;
171
+ and.pred %p12, %p11, %p17;
172
+ mov.b32 %r38, %f29;
173
+ @%p12 st.shared.b32 [ %r36 + 0 ], %r38;
174
+ add.s32 %r40, %r36, 512;
175
+ @%p11 ld.shared.b32 %r39, [ %r40 + 0 ];
176
+ mov.b32 %f30, %r39;
177
+ shfl.sync.bfly.b32 %r72, %r39, 2, 31, -1;
178
+ mov.b32 %f31, %r72;
179
+ $L__tmp14:
180
+ .loc 2 233 15
181
+ add.f32 %f32, %f30, %f31;
182
+ $L__tmp15:
183
+ .loc 2 243 36
184
+ mov.b32 %r73, %f32;
185
+ shfl.sync.bfly.b32 %r74, %r73, 1, 31, -1;
186
+ mov.b32 %f33, %r74;
187
+ $L__tmp16:
188
+ .loc 2 233 15
189
+ add.f32 %f34, %f32, %f33;
190
+ $L__tmp17:
191
+ .loc 2 243 36
192
+ mov.b32 %r42, %f34;
193
+ @%p12 st.shared.b32 [ %r40 + 0 ], %r42;
194
+ bar.sync 0;
195
+ add.s32 %r75, %r58, %r60;
196
+ ld.shared.f32 %f35, [%r75];
197
+ add.s32 %r76, %r58, %r61;
198
+ ld.shared.f32 %f36, [%r76];
199
+ add.s32 %r77, %r58, %r63;
200
+ ld.shared.f32 %f37, [%r77];
201
+ add.s32 %r78, %r58, %r65;
202
+ ld.shared.f32 %f38, [%r78];
203
+ $L__tmp18:
204
+ .loc 1 35 28
205
+ bar.sync 0;
206
+ add.s32 %r79, %r58, %r55;
207
+ st.shared.f32 [%r79], %f35;
208
+ st.shared.f32 [%r79+4], %f36;
209
+ st.shared.f32 [%r79+8], %f37;
210
+ st.shared.f32 [%r79+12], %f38;
211
+ bar.sync 0;
212
+ shl.b32 %r80, %r45, 2;
213
+ add.s32 %r81, %r58, %r80;
214
+ .loc 1 36 20
215
+ shr.s32 %r83, %r46, 31;
216
+ shr.u32 %r84, %r83, 24;
217
+ add.s32 %r85, %r46, %r84;
218
+ shr.s32 %r86, %r85, 8;
219
+ and.b32 %r87, %r85, -256;
220
+ sub.s32 %r88, %r46, %r87;
221
+ .loc 1 38 30
222
+ mul.wide.s32 %rd9, %r86, 8;
223
+ add.s64 %rd7, %rd2, %rd9;
224
+ .loc 1 45 55
225
+ ld.shared.u32 %r44, [%r81];
226
+ .loc 1 38 35
227
+ mov.u64 %rd6, 0x0;
228
+ @%p1 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ];
229
+ .loc 1 41 32
230
+ shr.u64 %rd10, %rd6, 54;
231
+ and.b64 %rd11, %rd10, 512;
232
+ add.s64 %rd12, %rd11, %rd6;
233
+ .loc 1 45 30
234
+ shl.b64 %rd13, %rd12, 10;
235
+ add.s64 %rd14, %rd3, %rd13;
236
+ mul.wide.s32 %rd15, %r88, 4;
237
+ add.s64 %rd8, %rd14, %rd15;
238
+ .loc 1 45 55
239
+ and.b32 %r89, %r1, 64;
240
+ setp.eq.s32 %p16, %r89, 0;
241
+ mov.u32 %r43, 0x0;
242
+ @%p16 atom.global.gpu.acq_rel.add.f32 %r43, [ %rd8 + 0 ], %r44;
243
+ .loc 1 45 4
244
+ ret;
245
+ $L__tmp19:
246
+ $L__func_end0:
247
+
248
+ }
249
+ .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
250
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
251
+ .section .debug_abbrev
252
+ {
253
+ .b8 1
254
+ .b8 17
255
+ .b8 1
256
+ .b8 37
257
+ .b8 8
258
+ .b8 19
259
+ .b8 5
260
+ .b8 3
261
+ .b8 8
262
+ .b8 16
263
+ .b8 6
264
+ .b8 27
265
+ .b8 8
266
+ .b8 180
267
+ .b8 66
268
+ .b8 12
269
+ .b8 17
270
+ .b8 1
271
+ .b8 18
272
+ .b8 1
273
+ .b8 0
274
+ .b8 0
275
+ .b8 2
276
+ .b8 46
277
+ .b8 0
278
+ .b8 135
279
+ .b8 64
280
+ .b8 8
281
+ .b8 3
282
+ .b8 8
283
+ .b8 58
284
+ .b8 11
285
+ .b8 59
286
+ .b8 11
287
+ .b8 63
288
+ .b8 12
289
+ .b8 32
290
+ .b8 11
291
+ .b8 0
292
+ .b8 0
293
+ .b8 3
294
+ .b8 46
295
+ .b8 1
296
+ .b8 17
297
+ .b8 1
298
+ .b8 18
299
+ .b8 1
300
+ .b8 64
301
+ .b8 10
302
+ .b8 49
303
+ .b8 19
304
+ .b8 0
305
+ .b8 0
306
+ .b8 4
307
+ .b8 29
308
+ .b8 0
309
+ .b8 49
310
+ .b8 19
311
+ .b8 17
312
+ .b8 1
313
+ .b8 18
314
+ .b8 1
315
+ .b8 88
316
+ .b8 11
317
+ .b8 89
318
+ .b8 11
319
+ .b8 87
320
+ .b8 11
321
+ .b8 0
322
+ .b8 0
323
+ .b8 5
324
+ .b8 29
325
+ .b8 1
326
+ .b8 49
327
+ .b8 19
328
+ .b8 17
329
+ .b8 1
330
+ .b8 18
331
+ .b8 1
332
+ .b8 88
333
+ .b8 11
334
+ .b8 89
335
+ .b8 11
336
+ .b8 87
337
+ .b8 11
338
+ .b8 0
339
+ .b8 0
340
+ .b8 0
341
+ }
342
+ .section .debug_info
343
+ {
344
+ .b32 264
345
+ .b8 2
346
+ .b8 0
347
+ .b32 .debug_abbrev
348
+ .b8 8
349
+ .b8 1
350
+ .b8 116
351
+ .b8 114
352
+ .b8 105
353
+ .b8 116
354
+ .b8 111
355
+ .b8 110
356
+ .b8 0
357
+ .b8 2
358
+ .b8 0
359
+ .b8 99
360
+ .b8 54
361
+ .b8 105
362
+ .b8 107
363
+ .b8 53
364
+ .b8 118
365
+ .b8 120
366
+ .b8 55
367
+ .b8 112
368
+ .b8 50
369
+ .b8 50
370
+ .b8 102
371
+ .b8 112
372
+ .b8 107
373
+ .b8 52
374
+ .b8 100
375
+ .b8 99
376
+ .b8 118
377
+ .b8 104
378
+ .b8 53
379
+ .b8 53
380
+ .b8 122
381
+ .b8 105
382
+ .b8 109
383
+ .b8 119
384
+ .b8 52
385
+ .b8 116
386
+ .b8 53
387
+ .b8 110
388
+ .b8 114
389
+ .b8 53
390
+ .b8 122
391
+ .b8 110
392
+ .b8 50
393
+ .b8 98
394
+ .b8 55
395
+ .b8 105
396
+ .b8 110
397
+ .b8 117
398
+ .b8 106
399
+ .b8 120
400
+ .b8 106
401
+ .b8 97
402
+ .b8 117
403
+ .b8 120
404
+ .b8 115
405
+ .b8 104
406
+ .b8 108
407
+ .b8 106
408
+ .b8 117
409
+ .b8 109
410
+ .b8 109
411
+ .b8 46
412
+ .b8 112
413
+ .b8 121
414
+ .b8 0
415
+ .b32 .debug_line
416
+ .b8 47
417
+ .b8 116
418
+ .b8 109
419
+ .b8 112
420
+ .b8 47
421
+ .b8 116
422
+ .b8 111
423
+ .b8 114
424
+ .b8 99
425
+ .b8 104
426
+ .b8 105
427
+ .b8 110
428
+ .b8 100
429
+ .b8 117
430
+ .b8 99
431
+ .b8 116
432
+ .b8 111
433
+ .b8 114
434
+ .b8 95
435
+ .b8 114
436
+ .b8 111
437
+ .b8 111
438
+ .b8 116
439
+ .b8 47
440
+ .b8 54
441
+ .b8 105
442
+ .b8 0
443
+ .b8 1
444
+ .b64 $L__func_begin0
445
+ .b64 $L__func_end0
446
+ .b8 2
447
+ .b8 116
448
+ .b8 114
449
+ .b8 105
450
+ .b8 116
451
+ .b8 111
452
+ .b8 110
453
+ .b8 95
454
+ .b8 95
455
+ .b8 48
456
+ .b8 100
457
+ .b8 49
458
+ .b8 100
459
+ .b8 50
460
+ .b8 100
461
+ .b8 51
462
+ .b8 100
463
+ .b8 101
464
+ .b8 52
465
+ .b8 101
466
+ .b8 0
467
+ .b8 116
468
+ .b8 114
469
+ .b8 105
470
+ .b8 116
471
+ .b8 111
472
+ .b8 110
473
+ .b8 95
474
+ .b8 95
475
+ .b8 48
476
+ .b8 100
477
+ .b8 49
478
+ .b8 100
479
+ .b8 50
480
+ .b8 100
481
+ .b8 51
482
+ .b8 100
483
+ .b8 101
484
+ .b8 52
485
+ .b8 101
486
+ .b8 0
487
+ .b8 1
488
+ .b8 18
489
+ .b8 1
490
+ .b8 1
491
+ .b8 3
492
+ .b64 $L__func_begin0
493
+ .b64 $L__func_end0
494
+ .b8 1
495
+ .b8 156
496
+ .b32 125
497
+ .b8 4
498
+ .b32 125
499
+ .b64 $L__tmp1
500
+ .b64 $L__tmp18
501
+ .b8 2
502
+ .b8 35
503
+ .b8 25
504
+ .b8 5
505
+ .b32 125
506
+ .b64 $L__tmp2
507
+ .b64 $L__tmp17
508
+ .b8 2
509
+ .b8 35
510
+ .b8 25
511
+ .b8 4
512
+ .b32 125
513
+ .b64 $L__tmp2
514
+ .b64 $L__tmp17
515
+ .b8 2
516
+ .b8 243
517
+ .b8 36
518
+ .b8 0
519
+ .b8 0
520
+ .b8 0
521
+ }
522
+ .section .debug_pubnames
523
+ {
524
+ .b32 $L__pubNames_end0-$L__pubNames_start0
525
+ $L__pubNames_start0:
526
+ .b8 2
527
+ .b8 0
528
+ .b32 .debug_info
529
+ .b32 268
530
+ .b32 125
531
+ .b8 116
532
+ .b8 114
533
+ .b8 105
534
+ .b8 116
535
+ .b8 111
536
+ .b8 110
537
+ .b8 95
538
+ .b8 95
539
+ .b8 48
540
+ .b8 100
541
+ .b8 49
542
+ .b8 100
543
+ .b8 50
544
+ .b8 100
545
+ .b8 51
546
+ .b8 100
547
+ .b8 101
548
+ .b8 52
549
+ .b8 101
550
+ .b8 0
551
+ .b32 0
552
+ $L__pubNames_end0:
553
+ }
554
+ .section .debug_pubtypes
555
+ {
556
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
557
+ $L__pubTypes_start0:
558
+ .b8 2
559
+ .b8 0
560
+ .b32 .debug_info
561
+ .b32 268
562
+ .b32 0
563
+ $L__pubTypes_end0:
564
+ }
565
+ .section .debug_loc { }
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttgir ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<64x1xi64, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
7
+ %cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
9
+ %cst_3 = arith.constant dense<131072> : tensor<1x8xi32, #blocked1>
10
+ %cst_4 = arith.constant dense<120> : tensor<1x8xi32, #blocked1>
11
+ %c0_i32 = arith.constant 0 : i32
12
+ %c120_i32 = arith.constant 120 : i32
13
+ %c8_i32 = arith.constant 8 : i32
14
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1>
15
+ %cst_6 = arith.constant dense<true> : tensor<64x1xi1, #blocked>
16
+ %c64_i32 = arith.constant 64 : i32
17
+ %0 = tt.get_program_id x : i32
18
+ %1 = arith.muli %0, %c64_i32 : i32
19
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
20
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
21
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
22
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
23
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
24
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
25
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked1>
26
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked>
27
+ %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
28
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x8xi32, #blocked1>
29
+ %12 = tt.broadcast %8 : (tensor<64x1xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
30
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
31
+ %14 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c8_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x8xf32, #blocked1>) : i32 {
32
+ %32 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked1>
33
+ %33 = arith.addi %32, %11 : tensor<1x8xi32, #blocked1>
34
+ %34 = arith.cmpi slt, %33, %cst_4 : tensor<1x8xi32, #blocked1>
35
+ %35 = arith.muli %33, %cst_3 : tensor<1x8xi32, #blocked1>
36
+ %36 = tt.broadcast %35 : (tensor<1x8xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
37
+ %37 = arith.addi %12, %36 : tensor<64x8xi32, #blocked1>
38
+ %38 = tt.addptr %13, %37 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
39
+ %39 = tt.broadcast %34 : (tensor<1x8xi1, #blocked1>) -> tensor<64x8xi1, #blocked1>
40
+ %40 = tt.load %38, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
41
+ %41 = arith.addf %arg6, %40 : tensor<64x8xf32, #blocked1>
42
+ %42 = arith.select %39, %41, %arg6 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
43
+ scf.yield %42 : tensor<64x8xf32, #blocked1>
44
+ }
45
+ %15 = "tt.reduce"(%14) <{axis = 1 : i32}> ({
46
+ ^bb0(%arg5: f32, %arg6: f32):
47
+ %32 = arith.addf %arg5, %arg6 : f32
48
+ tt.reduce.return %32 : f32
49
+ }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
50
+ %16 = triton_gpu.convert_layout %15 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
51
+ %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
52
+ %18 = arith.divsi %9, %cst_2 : tensor<64x1xi32, #blocked>
53
+ %19 = arith.remsi %9, %cst_2 : tensor<64x1xi32, #blocked>
54
+ %20 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
55
+ %21 = tt.addptr %20, %18 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
56
+ %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
57
+ %23 = arith.addi %22, %cst_1 : tensor<64x1xi64, #blocked>
58
+ %24 = arith.cmpi slt, %22, %cst_0 : tensor<64x1xi64, #blocked>
59
+ %25 = arith.select %24, %23, %22 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
60
+ %26 = arith.muli %25, %cst : tensor<64x1xi64, #blocked>
61
+ %27 = arith.extsi %19 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
62
+ %28 = arith.addi %27, %26 : tensor<64x1xi64, #blocked>
63
+ %29 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked>
64
+ %30 = tt.addptr %29, %28 : tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xi64, #blocked>
65
+ %31 = "tt.atomic_rmw"(%30, %17, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked>
66
+ tt.return
67
+ }
68
+ }
.triton/dump/9aec2dd769dc1991d76fa64c70ec0e92/triton_.ttir ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<256> : tensor<64x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<64x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<64x1xi64>
6
+ %c8_i32 = arith.constant 8 : i32
7
+ %c120_i32 = arith.constant 120 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<true> : tensor<64x1xi1>
10
+ %cst_3 = arith.constant dense<256> : tensor<64x1xi32>
11
+ %cst_4 = arith.constant dense<131072> : tensor<1x8xi32>
12
+ %cst_5 = arith.constant dense<120> : tensor<1x8xi32>
13
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
14
+ %c64_i32 = arith.constant 64 : i32
15
+ %0 = tt.get_program_id x : i32
16
+ %1 = arith.muli %0, %c64_i32 : i32
17
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
18
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
19
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
20
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
21
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
22
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
23
+ %8 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x8xi32>
24
+ %9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
25
+ %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c8_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x8xf32>) : i32 {
26
+ %27 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
27
+ %28 = arith.addi %27, %7 : tensor<1x8xi32>
28
+ %29 = arith.cmpi slt, %28, %cst_5 : tensor<1x8xi32>
29
+ %30 = arith.muli %28, %cst_4 : tensor<1x8xi32>
30
+ %31 = tt.broadcast %30 : (tensor<1x8xi32>) -> tensor<64x8xi32>
31
+ %32 = arith.addi %8, %31 : tensor<64x8xi32>
32
+ %33 = tt.addptr %9, %32 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
33
+ %34 = tt.broadcast %29 : (tensor<1x8xi1>) -> tensor<64x8xi1>
34
+ %35 = tt.load %33, %34, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
35
+ %36 = arith.addf %arg6, %35 : tensor<64x8xf32>
36
+ %37 = arith.select %34, %36, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
37
+ scf.yield %37 : tensor<64x8xf32>
38
+ }
39
+ %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
40
+ ^bb0(%arg5: f32, %arg6: f32):
41
+ %27 = arith.addf %arg5, %arg6 : f32
42
+ tt.reduce.return %27 : f32
43
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
44
+ %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
45
+ %13 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
46
+ %14 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
47
+ %15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
48
+ %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
49
+ %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
50
+ %18 = arith.addi %17, %cst_1 : tensor<64x1xi64>
51
+ %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64>
52
+ %20 = arith.select %19, %18, %17 : tensor<64x1xi1>, tensor<64x1xi64>
53
+ %21 = arith.muli %20, %cst : tensor<64x1xi64>
54
+ %22 = arith.extsi %14 : tensor<64x1xi32> to tensor<64x1xi64>
55
+ %23 = arith.addi %22, %21 : tensor<64x1xi64>
56
+ %24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
57
+ %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi64>
58
+ %26 = "tt.atomic_rmw"(%25, %12, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32>
59
+ tt.return
60
+ }
61
+ }
.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ptx ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+
11
+ .visible .entry triton__0d1d2de(
12
+ .param .u64 triton__0d1d2de_param_0,
13
+ .param .u64 triton__0d1d2de_param_1,
14
+ .param .u32 triton__0d1d2de_param_2
15
+ )
16
+ .maxntid 256, 1, 1
17
+ {
18
+ .reg .pred %p<3>;
19
+ .reg .b16 %rs<3>;
20
+ .reg .b32 %r<12>;
21
+ .reg .b64 %rd<7>;
22
+ .loc 1 18 0
23
+ $L__func_begin0:
24
+ .loc 1 18 0
25
+
26
+ ld.param.u64 %rd3, [triton__0d1d2de_param_0];
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_1];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r7, %tid.x;
31
+ shl.b32 %r8, %r7, 1;
32
+ and.b32 %r9, %r8, 510;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r10, %r1, 9;
37
+ .loc 1 21 23
38
+ or.b32 %r11, %r10, %r9;
39
+ .loc 1 24 30
40
+ mul.wide.s32 %rd5, %r11, 2;
41
+ add.s64 %rd1, %rd3, %rd5;
42
+ mov.pred %p1, -1;
43
+ .loc 1 24 35
44
+ mov.u32 %r2, 0x0;
45
+ @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ];
46
+ cvt.u16.u32 %rs1, %r2;
47
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
48
+ .loc 1 24 44
49
+ cvt.f32.bf16 %r5, %rs1;
50
+ cvt.f32.bf16 %r6, %rs2;
51
+ .loc 1 26 25
52
+ mul.wide.s32 %rd6, %r11, 4;
53
+ add.s64 %rd2, %rd4, %rd6;
54
+ .loc 1 26 36
55
+ @%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 };
56
+ .loc 1 26 4
57
+ ret;
58
+ $L__tmp1:
59
+ $L__func_end0:
60
+
61
+ }
62
+ .file 1 "/tmp/torchinductor_root/k6/ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py"
63
+ .section .debug_abbrev
64
+ {
65
+ .b8 1
66
+ .b8 17
67
+ .b8 1
68
+ .b8 37
69
+ .b8 8
70
+ .b8 19
71
+ .b8 5
72
+ .b8 3
73
+ .b8 8
74
+ .b8 16
75
+ .b8 6
76
+ .b8 27
77
+ .b8 8
78
+ .b8 180
79
+ .b8 66
80
+ .b8 12
81
+ .b8 17
82
+ .b8 1
83
+ .b8 18
84
+ .b8 1
85
+ .b8 0
86
+ .b8 0
87
+ .b8 2
88
+ .b8 46
89
+ .b8 0
90
+ .b8 17
91
+ .b8 1
92
+ .b8 18
93
+ .b8 1
94
+ .b8 64
95
+ .b8 10
96
+ .b8 135
97
+ .b8 64
98
+ .b8 8
99
+ .b8 3
100
+ .b8 8
101
+ .b8 58
102
+ .b8 11
103
+ .b8 59
104
+ .b8 11
105
+ .b8 63
106
+ .b8 12
107
+ .b8 0
108
+ .b8 0
109
+ .b8 0
110
+ }
111
+ .section .debug_info
112
+ {
113
+ .b32 176
114
+ .b8 2
115
+ .b8 0
116
+ .b32 .debug_abbrev
117
+ .b8 8
118
+ .b8 1
119
+ .b8 116
120
+ .b8 114
121
+ .b8 105
122
+ .b8 116
123
+ .b8 111
124
+ .b8 110
125
+ .b8 0
126
+ .b8 2
127
+ .b8 0
128
+ .b8 99
129
+ .b8 107
130
+ .b8 54
131
+ .b8 50
132
+ .b8 107
133
+ .b8 50
134
+ .b8 120
135
+ .b8 122
136
+ .b8 98
137
+ .b8 98
138
+ .b8 54
139
+ .b8 53
140
+ .b8 55
141
+ .b8 115
142
+ .b8 110
143
+ .b8 102
144
+ .b8 100
145
+ .b8 111
146
+ .b8 119
147
+ .b8 119
148
+ .b8 97
149
+ .b8 110
150
+ .b8 122
151
+ .b8 115
152
+ .b8 122
153
+ .b8 97
154
+ .b8 105
155
+ .b8 106
156
+ .b8 54
157
+ .b8 113
158
+ .b8 122
159
+ .b8 119
160
+ .b8 54
161
+ .b8 118
162
+ .b8 117
163
+ .b8 99
164
+ .b8 55
165
+ .b8 99
166
+ .b8 102
167
+ .b8 105
168
+ .b8 100
169
+ .b8 111
170
+ .b8 109
171
+ .b8 106
172
+ .b8 112
173
+ .b8 107
174
+ .b8 107
175
+ .b8 54
176
+ .b8 105
177
+ .b8 103
178
+ .b8 99
179
+ .b8 109
180
+ .b8 46
181
+ .b8 112
182
+ .b8 121
183
+ .b8 0
184
+ .b32 .debug_line
185
+ .b8 47
186
+ .b8 116
187
+ .b8 109
188
+ .b8 112
189
+ .b8 47
190
+ .b8 116
191
+ .b8 111
192
+ .b8 114
193
+ .b8 99
194
+ .b8 104
195
+ .b8 105
196
+ .b8 110
197
+ .b8 100
198
+ .b8 117
199
+ .b8 99
200
+ .b8 116
201
+ .b8 111
202
+ .b8 114
203
+ .b8 95
204
+ .b8 114
205
+ .b8 111
206
+ .b8 111
207
+ .b8 116
208
+ .b8 47
209
+ .b8 107
210
+ .b8 54
211
+ .b8 0
212
+ .b8 1
213
+ .b64 $L__func_begin0
214
+ .b64 $L__func_end0
215
+ .b8 2
216
+ .b64 $L__func_begin0
217
+ .b64 $L__func_end0
218
+ .b8 1
219
+ .b8 156
220
+ .b8 116
221
+ .b8 114
222
+ .b8 105
223
+ .b8 116
224
+ .b8 111
225
+ .b8 110
226
+ .b8 95
227
+ .b8 95
228
+ .b8 48
229
+ .b8 100
230
+ .b8 49
231
+ .b8 100
232
+ .b8 50
233
+ .b8 100
234
+ .b8 101
235
+ .b8 0
236
+ .b8 116
237
+ .b8 114
238
+ .b8 105
239
+ .b8 116
240
+ .b8 111
241
+ .b8 110
242
+ .b8 95
243
+ .b8 95
244
+ .b8 48
245
+ .b8 100
246
+ .b8 49
247
+ .b8 100
248
+ .b8 50
249
+ .b8 100
250
+ .b8 101
251
+ .b8 0
252
+ .b8 1
253
+ .b8 18
254
+ .b8 1
255
+ .b8 0
256
+ }
257
+ .section .debug_pubnames
258
+ {
259
+ .b32 $L__pubNames_end0-$L__pubNames_start0
260
+ $L__pubNames_start0:
261
+ .b8 2
262
+ .b8 0
263
+ .b32 .debug_info
264
+ .b32 180
265
+ .b32 125
266
+ .b8 116
267
+ .b8 114
268
+ .b8 105
269
+ .b8 116
270
+ .b8 111
271
+ .b8 110
272
+ .b8 95
273
+ .b8 95
274
+ .b8 48
275
+ .b8 100
276
+ .b8 49
277
+ .b8 100
278
+ .b8 50
279
+ .b8 100
280
+ .b8 101
281
+ .b8 0
282
+ .b32 0
283
+ $L__pubNames_end0:
284
+ }
285
+ .section .debug_pubtypes
286
+ {
287
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
288
+ $L__pubTypes_start0:
289
+ .b8 2
290
+ .b8 0
291
+ .b32 .debug_info
292
+ .b32 180
293
+ .b32 0
294
+ $L__pubTypes_end0:
295
+ }
296
+ .section .debug_loc { }
.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c512_i32 = arith.constant 512 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c512_i32 : i32
6
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
8
+ %4 = arith.addi %3, %2 : tensor<512xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
12
+ %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
13
+ %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
14
+ %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
15
+ tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/a69784da01a97187168f22847465505f/triton_.ttir ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
26
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
27
+ %17 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
28
+ %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
29
+ %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
30
+ %20 = arith.addf %8, %12 : tensor<256xf32>
31
+ %21 = arith.addf %20, %16 : tensor<256xf32>
32
+ %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
33
+ %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
34
+ ^bb0(%arg9: f32, %arg10: f32):
35
+ %47 = arith.addf %arg9, %arg10 : f32
36
+ tt.reduce.return %47 : f32
37
+ }) : (tensor<256xf32>) -> f32
38
+ %24 = arith.addf %23, %cst_0 : f32
39
+ %25 = arith.divf %24, %cst_1 : f32
40
+ %26 = tt.splat %25 : (f32) -> tensor<1xf32>
41
+ %27 = tt.splat %25 : (f32) -> tensor<256xf32>
42
+ %28 = arith.subf %21, %27 : tensor<256xf32>
43
+ %29 = arith.mulf %28, %28 : tensor<256xf32>
44
+ %30 = arith.select %2, %29, %cst_3 : tensor<256xi1>, tensor<256xf32>
45
+ %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg9: f32, %arg10: f32):
47
+ %47 = arith.addf %arg9, %arg10 : f32
48
+ tt.reduce.return %47 : f32
49
+ }) : (tensor<256xf32>) -> f32
50
+ %32 = arith.addf %31, %cst_0 : f32
51
+ %33 = arith.divf %32, %cst_1 : f32
52
+ %34 = arith.addf %33, %cst_2 : f32
53
+ %35 = tt.extern_elementwise %34 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
54
+ %36 = tt.splat %35 : (f32) -> tensor<1xf32>
55
+ %37 = tt.splat %35 : (f32) -> tensor<256xf32>
56
+ %38 = arith.mulf %28, %37 : tensor<256xf32>
57
+ %39 = arith.mulf %38, %19 : tensor<256xf32>
58
+ gpu.barrier
59
+ %40 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
60
+ %41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
61
+ tt.store %41, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
62
+ %42 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
63
+ %43 = tt.addptr %42, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
64
+ %44 = arith.truncf %39 : tensor<256xf32> to tensor<256xbf16>
65
+ tt.store %43, %44, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
66
+ %45 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
67
+ %46 = tt.splat %45 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
68
+ tt.store %46, %26 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
69
+ tt.return
70
+ }
71
+ }
.triton/dump/a75e14a8d2d1ec8471f1c7b615552f8c/triton_.llir ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
5
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %5 = shl i32 %4, 1, !dbg !8
7
+ %6 = and i32 %5, 254, !dbg !8
8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %8 = shl i32 %7, 8, !dbg !10
10
+ %9 = or i32 %8, %6, !dbg !11
11
+ %10 = sext i32 %9 to i64, !dbg !12
12
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
13
+ %12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
14
+ %13 = extractvalue { i32, i32 } %12, 0, !dbg !13
15
+ %14 = extractvalue { i32, i32 } %12, 1, !dbg !13
16
+ %15 = bitcast i32 %13 to float, !dbg !13
17
+ %16 = bitcast i32 %14 to float, !dbg !13
18
+ %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
19
+ %18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15
20
+ %19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15
21
+ %20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15
22
+ %21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15
23
+ %22 = bitcast <2 x i16> %21 to i32, !dbg !15
24
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15
25
+ ret void, !dbg !16
26
+ }
27
+
28
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
29
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
30
+
31
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
32
+ attributes #1 = { nounwind }
33
+
34
+ !llvm.module.flags = !{!0}
35
+ !llvm.dbg.cu = !{!1}
36
+ !nvvm.annotations = !{!3, !4, !4, !3}
37
+
38
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
39
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
40
+ !2 = !DIFile(filename: "cpqhcwm5bfrhuwddh4c4qks6bh7sovfbpfnmqhnm4h4w23icqnu6.py", directory: "/tmp/torchinductor_root/pq")
41
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
42
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
43
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
44
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
45
+ !7 = !{}
46
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
47
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
48
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
49
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
50
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
51
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
52
+ !14 = !DILocation(line: 26, column: 25, scope: !5)
53
+ !15 = !DILocation(line: 26, column: 36, scope: !5)
54
+ !16 = !DILocation(line: 26, column: 4, scope: !5)