0-hero commited on
Commit
d742687
·
verified ·
1 Parent(s): 8c1fe04

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.cubin +0 -0
  2. .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir +523 -0
  3. .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ptx +951 -0
  4. .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir +165 -0
  5. .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttir +153 -0
  6. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin +0 -0
  7. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir +75 -0
  8. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin +0 -0
  9. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir +283 -0
  10. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx +687 -0
  11. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir +58 -0
  12. .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.llir +1121 -0
  13. .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx +1854 -0
  14. .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir +134 -0
  15. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir +245 -0
  16. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir +858 -0
  17. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir +18 -0
  18. .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.llir +330 -0
  19. .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx +756 -0
  20. .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir +141 -0
  21. .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir +139 -0
  22. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir +235 -0
  23. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir +56 -0
  24. .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin +0 -0
  25. .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir +503 -0
  26. .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir +125 -0
  27. .triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttgir +125 -0
  28. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx +456 -0
  29. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir +62 -0
  30. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir +61 -0
  31. .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin +0 -0
  32. .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.llir +355 -0
  33. .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir +153 -0
  34. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx +717 -0
  35. .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir +109 -0
  36. .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx +1054 -0
  37. .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir +134 -0
  38. .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir +113 -0
  39. .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.cubin +0 -0
  40. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.cubin +0 -0
  41. .triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir +37 -0
  42. .triton/dump/510522bb05917b836ed253751364fcad/triton_.llir +1211 -0
  43. .triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir +137 -0
  44. .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin +0 -0
  45. .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx +1154 -0
  46. .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin +0 -0
  47. .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx +758 -0
  48. .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir +68 -0
  49. .triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttir +67 -0
  50. .triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin +0 -0
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.cubin ADDED
Binary file (28.5 kB). View file
 
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = and i32 %9, 31, !dbg !10
18
+ %11 = lshr i32 %9, 5, !dbg !10
19
+ %12 = and i32 %11, 3, !dbg !10
20
+ %13 = lshr i32 %10, 1, !dbg !10
21
+ %14 = shl nuw nsw i32 %12, 4, !dbg !10
22
+ %15 = or i32 %14, %13, !dbg !10
23
+ %16 = and i32 %9, 63, !dbg !10
24
+ %17 = shl i32 %9, 2, !dbg !11
25
+ %18 = and i32 %17, 4, !dbg !11
26
+ %19 = and i32 %9, 7, !dbg !11
27
+ %20 = shl nuw nsw i32 %12, 2, !dbg !12
28
+ %21 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
29
+ %22 = shl i32 %21, 6, !dbg !14
30
+ %23 = or i32 %22, %15, !dbg !15
31
+ %24 = or i32 %22, %16, !dbg !15
32
+ %25 = sext i32 %23 to i64, !dbg !16
33
+ %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16
34
+ %27 = sext i32 %24 to i64, !dbg !16
35
+ %28 = getelementptr i64, ptr addrspace(1) %0, i64 %27, !dbg !16
36
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
37
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
38
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
39
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
40
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #6, !dbg !17
41
+ %34 = srem i32 %23, 512, !dbg !18
42
+ %35 = shl nsw i32 %34, 8, !dbg !19
43
+ %36 = shl i32 %23, 8, !dbg !20
44
+ %37 = add i64 %33, 50257, !dbg !21
45
+ %38 = icmp slt i64 %29, 0, !dbg !22
46
+ %39 = icmp slt i64 %33, 0, !dbg !22
47
+ %40 = select i1 %39, i64 %37, i64 %33, !dbg !23
48
+ %41 = icmp ugt i64 %40, 50256, !dbg !24
49
+ %42 = shl i64 %29, 8, !dbg !25
50
+ %43 = add i64 %42, 12865792, !dbg !25
51
+ %44 = select i1 %38, i64 %43, i64 %42, !dbg !25
52
+ %45 = getelementptr float, ptr addrspace(1) %1, i64 %44
53
+ br label %46, !dbg !12
54
+
55
+ 46: ; preds = %8, %92
56
+ %47 = phi float [ 0.000000e+00, %8 ], [ %116, %92 ]
57
+ %48 = phi float [ 0.000000e+00, %8 ], [ %117, %92 ]
58
+ %49 = phi float [ 0.000000e+00, %8 ], [ %118, %92 ]
59
+ %50 = phi float [ 0.000000e+00, %8 ], [ %119, %92 ]
60
+ %51 = phi float [ 0.000000e+00, %8 ], [ %120, %92 ]
61
+ %52 = phi float [ 0.000000e+00, %8 ], [ %121, %92 ]
62
+ %53 = phi float [ 0.000000e+00, %8 ], [ %122, %92 ]
63
+ %54 = phi float [ 0.000000e+00, %8 ], [ %123, %92 ]
64
+ %55 = phi float [ 0.000000e+00, %8 ], [ %140, %92 ]
65
+ %56 = phi float [ 0.000000e+00, %8 ], [ %141, %92 ]
66
+ %57 = phi float [ 0.000000e+00, %8 ], [ %142, %92 ]
67
+ %58 = phi float [ 0.000000e+00, %8 ], [ %143, %92 ]
68
+ %59 = phi float [ 0.000000e+00, %8 ], [ %128, %92 ]
69
+ %60 = phi float [ 0.000000e+00, %8 ], [ %129, %92 ]
70
+ %61 = phi float [ 0.000000e+00, %8 ], [ %130, %92 ]
71
+ %62 = phi float [ 0.000000e+00, %8 ], [ %131, %92 ]
72
+ %63 = phi i32 [ 0, %8 ], [ %144, %92 ]
73
+ %64 = or i32 %63, %18, !dbg !26
74
+ %65 = add i32 %64, %35, !dbg !27
75
+ %66 = sext i32 %65 to i64, !dbg !28
76
+ %67 = getelementptr float, ptr addrspace(1) %2, i64 %66, !dbg !28
77
+ %68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %67, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
78
+ %69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !29
79
+ %70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !29
80
+ %71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !29
81
+ %72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !29
82
+ %73 = bitcast i32 %69 to float, !dbg !29
83
+ %74 = bitcast i32 %70 to float, !dbg !29
84
+ %75 = bitcast i32 %71 to float, !dbg !29
85
+ %76 = bitcast i32 %72 to float, !dbg !29
86
+ %77 = add i32 %64, %36, !dbg !30
87
+ %78 = sext i32 %77 to i64, !dbg !31
88
+ %79 = getelementptr i16, ptr addrspace(1) %3, i64 %78, !dbg !31
89
+ %80 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
90
+ %81 = extractvalue { i32, i32 } %80, 0, !dbg !32
91
+ %82 = extractvalue { i32, i32 } %80, 1, !dbg !32
92
+ %83 = trunc i32 %81 to i16, !dbg !32
93
+ %extelt.offset3 = lshr i32 %81, 16, !dbg !32
94
+ %84 = trunc i32 %extelt.offset3 to i16, !dbg !32
95
+ %85 = trunc i32 %82 to i16, !dbg !32
96
+ %extelt.offset4 = lshr i32 %82, 16, !dbg !32
97
+ %86 = trunc i32 %extelt.offset4 to i16, !dbg !32
98
+ %87 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %83) #6, !dbg !33
99
+ %88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #6, !dbg !33
100
+ %89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #6, !dbg !33
101
+ %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #6, !dbg !33
102
+ br i1 %41, label %91, label %92, !dbg !34
103
+
104
+ 91: ; preds = %46
105
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34
106
+ br label %92, !dbg !34
107
+
108
+ 92: ; preds = %91, %46
109
+ %93 = zext nneg i32 %64 to i64, !dbg !35
110
+ %94 = getelementptr float, ptr addrspace(1) %45, i64 %93, !dbg !36
111
+ %95 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %94, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
112
+ %96 = extractvalue { i32, i32, i32, i32 } %95, 0, !dbg !37
113
+ %97 = extractvalue { i32, i32, i32, i32 } %95, 1, !dbg !37
114
+ %98 = extractvalue { i32, i32, i32, i32 } %95, 2, !dbg !37
115
+ %99 = extractvalue { i32, i32, i32, i32 } %95, 3, !dbg !37
116
+ %100 = bitcast i32 %96 to float, !dbg !37
117
+ %101 = bitcast i32 %97 to float, !dbg !37
118
+ %102 = bitcast i32 %98 to float, !dbg !37
119
+ %103 = bitcast i32 %99 to float, !dbg !37
120
+ %104 = fadd float %73, %100, !dbg !38
121
+ %105 = fadd float %74, %101, !dbg !38
122
+ %106 = fadd float %75, %102, !dbg !38
123
+ %107 = fadd float %76, %103, !dbg !38
124
+ %108 = fadd float %87, %104, !dbg !39
125
+ %109 = fadd float %88, %105, !dbg !39
126
+ %110 = fadd float %89, %106, !dbg !39
127
+ %111 = fadd float %90, %107, !dbg !39
128
+ %112 = fsub float %108, %59, !dbg !40
129
+ %113 = fsub float %109, %60, !dbg !40
130
+ %114 = fsub float %110, %61, !dbg !40
131
+ %115 = fsub float %111, %62, !dbg !40
132
+ %116 = fadd float %47, 1.000000e+00, !dbg !44
133
+ %117 = fadd float %48, 1.000000e+00, !dbg !44
134
+ %118 = fadd float %49, 1.000000e+00, !dbg !44
135
+ %119 = fadd float %50, 1.000000e+00, !dbg !44
136
+ %120 = fadd float %51, 1.000000e+00, !dbg !44
137
+ %121 = fadd float %52, 1.000000e+00, !dbg !44
138
+ %122 = fadd float %53, 1.000000e+00, !dbg !44
139
+ %123 = fadd float %54, 1.000000e+00, !dbg !44
140
+ %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float %116) #6, !dbg !45
141
+ %125 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %113, float %117) #6, !dbg !45
142
+ %126 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float %118) #6, !dbg !45
143
+ %127 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %115, float %119) #6, !dbg !45
144
+ %128 = fadd float %59, %124, !dbg !46
145
+ %129 = fadd float %60, %125, !dbg !46
146
+ %130 = fadd float %61, %126, !dbg !46
147
+ %131 = fadd float %62, %127, !dbg !46
148
+ %132 = fsub float %108, %128, !dbg !47
149
+ %133 = fsub float %109, %129, !dbg !47
150
+ %134 = fsub float %110, %130, !dbg !47
151
+ %135 = fsub float %111, %131, !dbg !47
152
+ %136 = fmul float %112, %132, !dbg !48
153
+ %137 = fmul float %113, %133, !dbg !48
154
+ %138 = fmul float %114, %134, !dbg !48
155
+ %139 = fmul float %115, %135, !dbg !48
156
+ %140 = fadd float %55, %136, !dbg !49
157
+ %141 = fadd float %56, %137, !dbg !49
158
+ %142 = fadd float %57, %138, !dbg !49
159
+ %143 = fadd float %58, %139, !dbg !49
160
+ %144 = add nuw nsw i32 %63, 8, !dbg !12
161
+ %145 = icmp ult i32 %63, 248, !dbg !12
162
+ br i1 %145, label %46, label %146, !dbg !12
163
+
164
+ 146: ; preds = %92
165
+ %147 = lshr i32 %10, 3, !dbg !12
166
+ %148 = or i32 %20, %147, !dbg !12
167
+ %149 = mul nuw nsw i32 %148, 12, !dbg !12
168
+ %150 = add nuw nsw i32 %149, %19, !dbg !12
169
+ %151 = zext nneg i32 %150 to i64, !dbg !12
170
+ %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12
171
+ %153 = insertelement <1 x float> undef, float %120, i64 0, !dbg !12
172
+ store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !12
173
+ %154 = or i32 %19, 192, !dbg !12
174
+ %155 = add nuw nsw i32 %154, %149, !dbg !12
175
+ %156 = zext nneg i32 %155 to i64, !dbg !12
176
+ %157 = getelementptr float, ptr addrspace(3) @global_smem, i64 %156, !dbg !12
177
+ %158 = insertelement <1 x float> undef, float %121, i64 0, !dbg !12
178
+ store <1 x float> %158, ptr addrspace(3) %157, align 4, !dbg !12
179
+ %159 = or i32 %19, 384, !dbg !12
180
+ %160 = add nuw nsw i32 %159, %149, !dbg !12
181
+ %161 = zext nneg i32 %160 to i64, !dbg !12
182
+ %162 = getelementptr float, ptr addrspace(3) @global_smem, i64 %161, !dbg !12
183
+ %163 = insertelement <1 x float> undef, float %122, i64 0, !dbg !12
184
+ store <1 x float> %163, ptr addrspace(3) %162, align 4, !dbg !12
185
+ %164 = or i32 %19, 576, !dbg !12
186
+ %165 = add nuw nsw i32 %164, %149, !dbg !12
187
+ %166 = zext nneg i32 %165 to i64, !dbg !12
188
+ %167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !12
189
+ %168 = insertelement <1 x float> undef, float %123, i64 0, !dbg !12
190
+ store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !12
191
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
192
+ %169 = mul nuw nsw i32 %15, 12, !dbg !12
193
+ %170 = add nuw nsw i32 %169, %18, !dbg !12
194
+ %171 = zext nneg i32 %170 to i64, !dbg !12
195
+ %172 = getelementptr float, ptr addrspace(3) @global_smem, i64 %171, !dbg !12
196
+ %173 = load float, ptr addrspace(3) %172, align 16, !dbg !12
197
+ %174 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 1, !dbg !12
198
+ %175 = load float, ptr addrspace(3) %174, align 4, !dbg !12
199
+ %176 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 2, !dbg !12
200
+ %177 = load float, ptr addrspace(3) %176, align 8, !dbg !12
201
+ %178 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 3, !dbg !12
202
+ %179 = load float, ptr addrspace(3) %178, align 4, !dbg !12
203
+ %180 = fsub float %129, %128, !dbg !50
204
+ %181 = fadd float %173, %175, !dbg !54
205
+ %182 = fcmp oeq float %181, 0.000000e+00, !dbg !55
206
+ %183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %181) #6, !dbg !56
207
+ %184 = select i1 %182, float 0.000000e+00, float %183, !dbg !57
208
+ %185 = fmul float %180, %184, !dbg !58
209
+ %186 = fadd float %128, %185, !dbg !59
210
+ %187 = fadd float %140, %141, !dbg !60
211
+ %188 = fmul float %180, %180, !dbg !61
212
+ %189 = fmul float %188, %173, !dbg !62
213
+ %190 = fmul float %189, %184, !dbg !63
214
+ %191 = fadd float %187, %190, !dbg !64
215
+ %192 = fsub float %130, %186, !dbg !50
216
+ %193 = fadd float %177, %181, !dbg !54
217
+ %194 = fcmp oeq float %193, 0.000000e+00, !dbg !55
218
+ %195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %193) #6, !dbg !56
219
+ %196 = select i1 %194, float 0.000000e+00, float %195, !dbg !57
220
+ %197 = fmul float %196, %192, !dbg !58
221
+ %198 = fadd float %186, %197, !dbg !59
222
+ %199 = fadd float %142, %191, !dbg !60
223
+ %200 = fmul float %192, %192, !dbg !61
224
+ %201 = fmul float %181, %200, !dbg !62
225
+ %202 = fmul float %196, %201, !dbg !63
226
+ %203 = fadd float %199, %202, !dbg !64
227
+ %204 = fsub float %131, %198, !dbg !50
228
+ %205 = fadd float %179, %193, !dbg !54
229
+ %206 = fcmp oeq float %205, 0.000000e+00, !dbg !55
230
+ %207 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %179, float %205) #6, !dbg !56
231
+ %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !57
232
+ %209 = fmul float %208, %204, !dbg !58
233
+ %210 = fadd float %198, %209, !dbg !59
234
+ %211 = fadd float %143, %203, !dbg !60
235
+ %212 = fmul float %204, %204, !dbg !61
236
+ %213 = fmul float %193, %212, !dbg !62
237
+ %214 = fmul float %208, %213, !dbg !63
238
+ %215 = fadd float %211, %214, !dbg !64
239
+ %216 = bitcast float %210 to i32, !dbg !65
240
+ %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !65
241
+ %218 = bitcast i32 %217 to float, !dbg !65
242
+ %219 = bitcast float %215 to i32, !dbg !65
243
+ %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !65
244
+ %221 = bitcast i32 %220 to float, !dbg !65
245
+ %222 = bitcast float %205 to i32, !dbg !65
246
+ %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !65
247
+ %224 = bitcast i32 %223 to float, !dbg !65
248
+ %225 = fsub float %218, %210, !dbg !50
249
+ %226 = fadd float %205, %224, !dbg !54
250
+ %227 = fcmp oeq float %226, 0.000000e+00, !dbg !55
251
+ %228 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %224, float %226) #6, !dbg !56
252
+ %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !57
253
+ %230 = fmul float %229, %225, !dbg !58
254
+ %231 = fadd float %210, %230, !dbg !59
255
+ %232 = fadd float %215, %221, !dbg !60
256
+ %233 = fmul float %225, %225, !dbg !61
257
+ %234 = fmul float %205, %233, !dbg !62
258
+ %235 = fmul float %229, %234, !dbg !63
259
+ %236 = fadd float %232, %235, !dbg !64
260
+ %237 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
261
+ %238 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
262
+ %239 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
263
+ %240 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
264
+ %241 = fadd float %237, 0x3EE4F8B580000000, !dbg !68
265
+ br label %242, !dbg !69
266
+
267
+ 242: ; preds = %146, %__nv_rsqrtf.exit
268
+ %243 = phi i32 [ 0, %146 ], [ %333, %__nv_rsqrtf.exit ]
269
+ %244 = or i32 %243, %18, !dbg !70
270
+ %245 = add i32 %244, %35, !dbg !71
271
+ %246 = sext i32 %245 to i64, !dbg !72
272
+ %247 = getelementptr float, ptr addrspace(1) %2, i64 %246, !dbg !72
273
+ %248 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %247, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
274
+ %249 = extractvalue { i32, i32, i32, i32 } %248, 0, !dbg !73
275
+ %250 = extractvalue { i32, i32, i32, i32 } %248, 1, !dbg !73
276
+ %251 = extractvalue { i32, i32, i32, i32 } %248, 2, !dbg !73
277
+ %252 = extractvalue { i32, i32, i32, i32 } %248, 3, !dbg !73
278
+ %253 = bitcast i32 %249 to float, !dbg !73
279
+ %254 = bitcast i32 %250 to float, !dbg !73
280
+ %255 = bitcast i32 %251 to float, !dbg !73
281
+ %256 = bitcast i32 %252 to float, !dbg !73
282
+ %257 = add i32 %244, %36, !dbg !74
283
+ %258 = sext i32 %257 to i64, !dbg !75
284
+ %259 = getelementptr i16, ptr addrspace(1) %3, i64 %258, !dbg !75
285
+ %260 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %259, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
286
+ %261 = extractvalue { i32, i32 } %260, 0, !dbg !76
287
+ %262 = extractvalue { i32, i32 } %260, 1, !dbg !76
288
+ %263 = trunc i32 %261 to i16, !dbg !76
289
+ %extelt.offset = lshr i32 %261, 16, !dbg !76
290
+ %264 = trunc i32 %extelt.offset to i16, !dbg !76
291
+ %265 = trunc i32 %262 to i16, !dbg !76
292
+ %extelt.offset2 = lshr i32 %262, 16, !dbg !76
293
+ %266 = trunc i32 %extelt.offset2 to i16, !dbg !76
294
+ %267 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %263) #6, !dbg !77
295
+ %268 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %264) #6, !dbg !77
296
+ %269 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %265) #6, !dbg !77
297
+ %270 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %266) #6, !dbg !77
298
+ %271 = zext nneg i32 %244 to i64, !dbg !78
299
+ %272 = getelementptr float, ptr addrspace(1) %4, i64 %271, !dbg !78
300
+ %273 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %272, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
301
+ %274 = extractvalue { i32, i32, i32, i32 } %273, 0, !dbg !79
302
+ %275 = extractvalue { i32, i32, i32, i32 } %273, 1, !dbg !79
303
+ %276 = extractvalue { i32, i32, i32, i32 } %273, 2, !dbg !79
304
+ %277 = extractvalue { i32, i32, i32, i32 } %273, 3, !dbg !79
305
+ %278 = bitcast i32 %274 to float, !dbg !79
306
+ %279 = bitcast i32 %275 to float, !dbg !79
307
+ %280 = bitcast i32 %276 to float, !dbg !79
308
+ %281 = bitcast i32 %277 to float, !dbg !79
309
+ br i1 %41, label %282, label %283, !dbg !80
310
+
311
+ 282: ; preds = %242
312
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
313
+ br label %283, !dbg !80
314
+
315
+ 283: ; preds = %282, %242
316
+ %284 = getelementptr float, ptr addrspace(1) %45, i64 %271, !dbg !81
317
+ %285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %284, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
318
+ %286 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !82
319
+ %287 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !82
320
+ %288 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !82
321
+ %289 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !82
322
+ %290 = bitcast i32 %286 to float, !dbg !82
323
+ %291 = bitcast i32 %287 to float, !dbg !82
324
+ %292 = bitcast i32 %288 to float, !dbg !82
325
+ %293 = bitcast i32 %289 to float, !dbg !82
326
+ %294 = fadd float %253, %290, !dbg !83
327
+ %295 = fadd float %254, %291, !dbg !83
328
+ %296 = fadd float %255, %292, !dbg !83
329
+ %297 = fadd float %256, %293, !dbg !83
330
+ %298 = fadd float %267, %294, !dbg !84
331
+ %299 = fadd float %268, %295, !dbg !84
332
+ %300 = fadd float %269, %296, !dbg !84
333
+ %301 = fadd float %270, %297, !dbg !84
334
+ %302 = fsub float %298, %231, !dbg !85
335
+ %303 = fsub float %299, %231, !dbg !85
336
+ %304 = fsub float %300, %231, !dbg !85
337
+ %305 = fsub float %301, %231, !dbg !85
338
+ %306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
339
+ %.not.i = icmp eq i32 %306, 0, !dbg !86
340
+ br i1 %.not.i, label %309, label %307, !dbg !86
341
+
342
+ 307: ; preds = %283
343
+ %308 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %241), !dbg !86
344
+ br label %__nv_rsqrtf.exit, !dbg !86
345
+
346
+ 309: ; preds = %283
347
+ %310 = tail call float @llvm.nvvm.rsqrt.approx.f(float %241), !dbg !86
348
+ br label %__nv_rsqrtf.exit, !dbg !86
349
+
350
+ __nv_rsqrtf.exit: ; preds = %307, %309
351
+ %.0.i = phi float [ %308, %307 ], [ %310, %309 ], !dbg !86
352
+ %311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
353
+ %312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
354
+ %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
355
+ %314 = fmul float %302, %.0.i, !dbg !87
356
+ %315 = fmul float %303, %.0.i, !dbg !87
357
+ %316 = fmul float %304, %.0.i, !dbg !87
358
+ %317 = fmul float %305, %.0.i, !dbg !87
359
+ %318 = fmul float %314, %278, !dbg !88
360
+ %319 = fmul float %315, %279, !dbg !88
361
+ %320 = fmul float %316, %280, !dbg !88
362
+ %321 = fmul float %317, %281, !dbg !88
363
+ %322 = getelementptr i16, ptr addrspace(1) %5, i64 %258, !dbg !89
364
+ %323 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %318) #6, !dbg !90
365
+ %324 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %319) #6, !dbg !90
366
+ %325 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %320) #6, !dbg !90
367
+ %326 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !90
368
+ %327 = insertelement <2 x i16> undef, i16 %323, i64 0, !dbg !90
369
+ %328 = insertelement <2 x i16> %327, i16 %324, i64 1, !dbg !90
370
+ %329 = bitcast <2 x i16> %328 to i32, !dbg !90
371
+ %330 = insertelement <2 x i16> undef, i16 %325, i64 0, !dbg !90
372
+ %331 = insertelement <2 x i16> %330, i16 %326, i64 1, !dbg !90
373
+ %332 = bitcast <2 x i16> %331 to i32, !dbg !90
374
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %329, i32 %332, ptr addrspace(1) %322, i1 true) #6, !dbg !90
375
+ %333 = add nuw nsw i32 %243, 8, !dbg !69
376
+ %334 = icmp ult i32 %243, 248, !dbg !69
377
+ br i1 %334, label %242, label %335, !dbg !69
378
+
379
+ 335: ; preds = %__nv_rsqrtf.exit
380
+ ret void, !dbg !91
381
+ }
382
+
383
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
384
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
385
+
386
+ ; Function Attrs: convergent nocallback nounwind
387
+ declare void @llvm.nvvm.barrier0() #1
388
+
389
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
390
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
391
+
392
+ ; Function Attrs: alwaysinline nounwind
393
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
394
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
395
+ %.not = icmp eq i32 %1, 0
396
+ br i1 %.not, label %4, label %2
397
+
398
+ 2: ; preds = %0
399
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
400
+ br label %6
401
+
402
+ 4: ; preds = %0
403
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
404
+ br label %6
405
+
406
+ 6: ; preds = %4, %2
407
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
408
+ ret float %.0
409
+ }
410
+
411
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
412
+
413
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
414
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
415
+
416
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
417
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
418
+
419
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
420
+ attributes #1 = { convergent nocallback nounwind }
421
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
422
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
423
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
424
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
425
+ attributes #6 = { nounwind }
426
+
427
+ !llvm.module.flags = !{!0, !1}
428
+ !llvm.dbg.cu = !{!2}
429
+ !nvvm.annotations = !{!4, !5, !5, !4}
430
+ !llvm.ident = !{!6}
431
+
432
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
433
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
434
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
435
+ !3 = !DIFile(filename: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", directory: "/tmp/torchinductor_root/ci")
436
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
437
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
438
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
439
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
440
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
441
+ !9 = !{}
442
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
443
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
444
+ !12 = !DILocation(line: 31, column: 36, scope: !7)
445
+ !13 = !DILocation(line: 21, column: 28, scope: !7)
446
+ !14 = !DILocation(line: 21, column: 33, scope: !7)
447
+ !15 = !DILocation(line: 22, column: 23, scope: !7)
448
+ !16 = !DILocation(line: 26, column: 30, scope: !7)
449
+ !17 = !DILocation(line: 26, column: 35, scope: !7)
450
+ !18 = !DILocation(line: 27, column: 18, scope: !7)
451
+ !19 = !DILocation(line: 35, column: 44, scope: !7)
452
+ !20 = !DILocation(line: 36, column: 44, scope: !7)
453
+ !21 = !DILocation(line: 37, column: 22, scope: !7)
454
+ !22 = !DILocation(line: 38, column: 22, scope: !7)
455
+ !23 = !DILocation(line: 39, column: 36, scope: !7)
456
+ !24 = !DILocation(line: 40, column: 40, scope: !7)
457
+ !25 = !DILocation(line: 41, column: 44, scope: !7)
458
+ !26 = !DILocation(line: 32, column: 27, scope: !7)
459
+ !27 = !DILocation(line: 35, column: 40, scope: !7)
460
+ !28 = !DILocation(line: 35, column: 34, scope: !7)
461
+ !29 = !DILocation(line: 35, column: 50, scope: !7)
462
+ !30 = !DILocation(line: 36, column: 40, scope: !7)
463
+ !31 = !DILocation(line: 36, column: 34, scope: !7)
464
+ !32 = !DILocation(line: 36, column: 50, scope: !7)
465
+ !33 = !DILocation(line: 36, column: 101, scope: !7)
466
+ !34 = !DILocation(line: 40, column: 55, scope: !7)
467
+ !35 = !DILocation(line: 41, column: 40, scope: !7)
468
+ !36 = !DILocation(line: 41, column: 34, scope: !7)
469
+ !37 = !DILocation(line: 41, column: 52, scope: !7)
470
+ !38 = !DILocation(line: 42, column: 22, scope: !7)
471
+ !39 = !DILocation(line: 44, column: 22, scope: !7)
472
+ !40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
473
+ !41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
474
+ !42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
475
+ !43 = !DILocation(line: 47, column: 41, scope: !41)
476
+ !44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
477
+ !45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
478
+ !46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
479
+ !47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
480
+ !48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
481
+ !49 = !DILocation(line: 50, column: 50, scope: !7)
482
+ !50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
483
+ !51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
484
+ !52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
485
+ !53 = !DILocation(line: 53, column: 44, scope: !51)
486
+ !54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
487
+ !55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
488
+ !56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
489
+ !57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
490
+ !58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
491
+ !59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
492
+ !60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
493
+ !61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
494
+ !62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
495
+ !63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
496
+ !64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
497
+ !65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
498
+ !66 = !DILocation(line: 53, column: 44, scope: !41)
499
+ !67 = !DILocation(line: 75, column: 24, scope: !7)
500
+ !68 = !DILocation(line: 77, column: 24, scope: !7)
501
+ !69 = !DILocation(line: 58, column: 36, scope: !7)
502
+ !70 = !DILocation(line: 59, column: 27, scope: !7)
503
+ !71 = !DILocation(line: 62, column: 41, scope: !7)
504
+ !72 = !DILocation(line: 62, column: 35, scope: !7)
505
+ !73 = !DILocation(line: 62, column: 51, scope: !7)
506
+ !74 = !DILocation(line: 63, column: 41, scope: !7)
507
+ !75 = !DILocation(line: 63, column: 35, scope: !7)
508
+ !76 = !DILocation(line: 63, column: 51, scope: !7)
509
+ !77 = !DILocation(line: 63, column: 103, scope: !7)
510
+ !78 = !DILocation(line: 64, column: 35, scope: !7)
511
+ !79 = !DILocation(line: 64, column: 40, scope: !7)
512
+ !80 = !DILocation(line: 68, column: 57, scope: !7)
513
+ !81 = !DILocation(line: 69, column: 35, scope: !7)
514
+ !82 = !DILocation(line: 69, column: 54, scope: !7)
515
+ !83 = !DILocation(line: 70, column: 24, scope: !7)
516
+ !84 = !DILocation(line: 72, column: 24, scope: !7)
517
+ !85 = !DILocation(line: 73, column: 24, scope: !7)
518
+ !86 = !DILocation(line: 78, column: 30, scope: !7)
519
+ !87 = !DILocation(line: 79, column: 24, scope: !7)
520
+ !88 = !DILocation(line: 80, column: 24, scope: !7)
521
+ !89 = !DILocation(line: 82, column: 29, scope: !7)
522
+ !90 = !DILocation(line: 82, column: 52, scope: !7)
523
+ !91 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ptx ADDED
@@ -0,0 +1,951 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 128, 1, 1
39
+ {
40
+ .reg .pred %p<48>;
41
+ .reg .b16 %rs<13>;
42
+ .reg .b32 %r<158>;
43
+ .reg .f32 %f<164>;
44
+ .reg .b64 %rd<73>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6de7de_param_5];
50
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_4];
51
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_3];
52
+ ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6de7de_param_0];
53
+ ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6de7de_param_1];
54
+ $L__tmp0:
55
+ .loc 1 22 44
56
+ mov.u32 %r13, %tid.x;
57
+ and.b32 %r1, %r13, 31;
58
+ ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6de7de_param_2];
59
+ bfe.u32 %r14, %r13, 5, 2;
60
+ bfe.u32 %r15, %r13, 1, 4;
61
+ shl.b32 %r16, %r14, 4;
62
+ or.b32 %r2, %r16, %r15;
63
+ and.b32 %r17, %r13, 63;
64
+ .loc 1 24 33
65
+ shl.b32 %r18, %r13, 2;
66
+ and.b32 %r3, %r18, 4;
67
+ and.b32 %r4, %r13, 7;
68
+ .loc 1 31 36
69
+ shl.b32 %r5, %r14, 2;
70
+ .loc 1 21 28
71
+ mov.u32 %r11, %ctaid.x;
72
+ .loc 1 21 33
73
+ shl.b32 %r19, %r11, 6;
74
+ .loc 1 22 23
75
+ or.b32 %r20, %r19, %r2;
76
+ or.b32 %r21, %r19, %r17;
77
+ .loc 1 26 30
78
+ mul.wide.s32 %rd33, %r20, 8;
79
+ add.s64 %rd21, %rd30, %rd33;
80
+ mul.wide.s32 %rd34, %r21, 8;
81
+ add.s64 %rd29, %rd30, %rd34;
82
+ mov.pred %p1, -1;
83
+ .loc 1 26 35
84
+ mov.u64 %rd20, 0x0;
85
+ @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
86
+ mov.u64 %rd22, 0x0;
87
+ @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd21 + 0 ];
88
+ mov.u64 %rd24, 0x0;
89
+ @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd21 + 0 ];
90
+ mov.u64 %rd26, 0x0;
91
+ @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd21 + 0 ];
92
+ mov.u64 %rd28, 0x0;
93
+ @%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd29 + 0 ];
94
+ .loc 1 27 18
95
+ bfe.s32 %r22, %r11, 25, 1;
96
+ shr.u32 %r23, %r22, 23;
97
+ add.s32 %r24, %r20, %r23;
98
+ and.b32 %r25, %r24, 16776704;
99
+ sub.s32 %r26, %r20, %r25;
100
+ .loc 1 35 44
101
+ shl.b32 %r27, %r26, 8;
102
+ .loc 1 37 22
103
+ add.s64 %rd35, %rd28, 50257;
104
+ .loc 1 38 22
105
+ setp.lt.s64 %p6, %rd20, 0;
106
+ setp.lt.s64 %p7, %rd28, 0;
107
+ .loc 1 39 36
108
+ selp.b64 %rd1, %rd35, %rd28, %p7;
109
+ .loc 1 41 44
110
+ shl.b64 %rd36, %rd20, 8;
111
+ add.s64 %rd37, %rd36, 12865792;
112
+ selp.b64 %rd38, %rd37, %rd36, %p6;
113
+ .loc 1 31 36
114
+ and.b32 %r28, %r13, 1;
115
+ mul.wide.u32 %rd2, %r28, 16;
116
+ shl.b64 %rd39, %rd38, 2;
117
+ or.b64 %rd40, %rd2, %rd39;
118
+ add.s64 %rd72, %rd31, %rd40;
119
+ shl.b32 %r29, %r11, 14;
120
+ shl.b32 %r30, %r14, 12;
121
+ or.b32 %r31, %r29, %r30;
122
+ shl.b32 %r32, %r15, 8;
123
+ or.b32 %r33, %r31, %r32;
124
+ or.b32 %r6, %r33, %r3;
125
+ or.b32 %r34, %r27, %r3;
126
+ mul.wide.s32 %rd41, %r34, 4;
127
+ add.s64 %rd70, %rd32, %rd41;
128
+ mov.f32 %f148, 0f00000000;
129
+ mov.b32 %r156, -8;
130
+ mov.u64 %rd68, %rd70;
131
+ mov.u64 %rd69, %rd72;
132
+ mov.f32 %f149, %f148;
133
+ mov.f32 %f150, %f148;
134
+ mov.f32 %f151, %f148;
135
+ mov.f32 %f152, %f148;
136
+ mov.f32 %f153, %f148;
137
+ mov.f32 %f154, %f148;
138
+ mov.f32 %f155, %f148;
139
+ mov.f32 %f156, %f148;
140
+ mov.f32 %f157, %f148;
141
+ mov.f32 %f158, %f148;
142
+ mov.f32 %f159, %f148;
143
+ mov.f32 %f160, %f148;
144
+ mov.f32 %f161, %f148;
145
+ mov.f32 %f162, %f148;
146
+ mov.f32 %f163, %f148;
147
+ bra.uni $L__BB0_1;
148
+ $L__BB0_3:
149
+ .loc 1 0 0
150
+ mov.b32 %f17, %r35;
151
+ mov.b32 %f18, %r36;
152
+ mov.b32 %f19, %r37;
153
+ mov.b32 %f20, %r38;
154
+ cvt.u16.u32 %rs1, %r43;
155
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r43; }
156
+ cvt.u16.u32 %rs3, %r44;
157
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r44; }
158
+ cvt.f32.bf16 %r47, %rs1;
159
+ mov.b32 %f21, %r47;
160
+ cvt.f32.bf16 %r48, %rs2;
161
+ mov.b32 %f22, %r48;
162
+ cvt.f32.bf16 %r49, %rs3;
163
+ mov.b32 %f23, %r49;
164
+ cvt.f32.bf16 %r50, %rs4;
165
+ mov.b32 %f24, %r50;
166
+ .loc 1 41 52
167
+ mov.u32 %r54, 0x0;
168
+ mov.u32 %r55, 0x0;
169
+ mov.u32 %r56, 0x0;
170
+ mov.u32 %r57, 0x0;
171
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd69 + 0 ];
172
+ @!%p1 mov.u32 %r54, %r143;
173
+ @!%p1 mov.u32 %r55, %r143;
174
+ @!%p1 mov.u32 %r56, %r143;
175
+ @!%p1 mov.u32 %r57, %r143;
176
+ mov.b32 %f56, %r54;
177
+ mov.b32 %f57, %r55;
178
+ mov.b32 %f58, %r56;
179
+ mov.b32 %f59, %r57;
180
+ .loc 1 42 22
181
+ add.f32 %f60, %f17, %f56;
182
+ add.f32 %f61, %f18, %f57;
183
+ add.f32 %f62, %f19, %f58;
184
+ add.f32 %f63, %f20, %f59;
185
+ .loc 1 44 22
186
+ add.f32 %f64, %f21, %f60;
187
+ add.f32 %f65, %f22, %f61;
188
+ add.f32 %f66, %f23, %f62;
189
+ add.f32 %f67, %f24, %f63;
190
+ $L__tmp1:
191
+ .loc 2 96 20
192
+ sub.f32 %f68, %f64, %f160;
193
+ sub.f32 %f69, %f65, %f161;
194
+ sub.f32 %f70, %f66, %f162;
195
+ sub.f32 %f71, %f67, %f163;
196
+ .loc 2 97 26
197
+ add.f32 %f148, %f148, 0f3F800000;
198
+ add.f32 %f149, %f149, 0f3F800000;
199
+ add.f32 %f150, %f150, 0f3F800000;
200
+ add.f32 %f151, %f151, 0f3F800000;
201
+ add.f32 %f152, %f152, 0f3F800000;
202
+ add.f32 %f153, %f153, 0f3F800000;
203
+ add.f32 %f154, %f154, 0f3F800000;
204
+ add.f32 %f155, %f155, 0f3F800000;
205
+ .loc 2 98 30
206
+ mov.b32 %r63, %f68;
207
+ mov.b32 %r64, %f148;
208
+ div.full.f32 %r62, %r63, %r64;
209
+ mov.b32 %f72, %r62;
210
+ mov.b32 %r66, %f69;
211
+ mov.b32 %r67, %f149;
212
+ div.full.f32 %r65, %r66, %r67;
213
+ mov.b32 %f73, %r65;
214
+ mov.b32 %r69, %f70;
215
+ mov.b32 %r70, %f150;
216
+ div.full.f32 %r68, %r69, %r70;
217
+ mov.b32 %f74, %r68;
218
+ mov.b32 %r72, %f71;
219
+ mov.b32 %r73, %f151;
220
+ div.full.f32 %r71, %r72, %r73;
221
+ mov.b32 %f75, %r71;
222
+ .loc 2 98 22
223
+ add.f32 %f160, %f160, %f72;
224
+ add.f32 %f161, %f161, %f73;
225
+ add.f32 %f162, %f162, %f74;
226
+ add.f32 %f163, %f163, %f75;
227
+ .loc 2 101 30
228
+ sub.f32 %f76, %f64, %f160;
229
+ sub.f32 %f77, %f65, %f161;
230
+ sub.f32 %f78, %f66, %f162;
231
+ sub.f32 %f79, %f67, %f163;
232
+ $L__tmp2:
233
+ .loc 1 50 50
234
+ fma.rn.f32 %f156, %f68, %f76, %f156;
235
+ fma.rn.f32 %f157, %f69, %f77, %f157;
236
+ fma.rn.f32 %f158, %f70, %f78, %f158;
237
+ fma.rn.f32 %f159, %f71, %f79, %f159;
238
+ .loc 1 31 36
239
+ add.s32 %r156, %r156, 8;
240
+ add.s64 %rd69, %rd69, 32;
241
+ add.s64 %rd68, %rd68, 32;
242
+ setp.lt.u32 %p22, %r156, 248;
243
+ @%p22 bra $L__BB0_1;
244
+ bra.uni $L__BB0_4;
245
+ $L__BB0_1:
246
+ .loc 1 40 40
247
+ setp.lt.u64 %p16, %rd1, 50257;
248
+ mov.b32 %r143, 0;
249
+ .loc 1 35 50
250
+ mov.u32 %r35, 0x0;
251
+ mov.u32 %r36, 0x0;
252
+ mov.u32 %r37, 0x0;
253
+ mov.u32 %r38, 0x0;
254
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd68 + 0 ];
255
+ @!%p1 mov.u32 %r35, %r143;
256
+ @!%p1 mov.u32 %r36, %r143;
257
+ @!%p1 mov.u32 %r37, %r143;
258
+ @!%p1 mov.u32 %r38, %r143;
259
+ .loc 1 36 34
260
+ add.s32 %r51, %r6, %r156;
261
+ add.s32 %r52, %r51, 8;
262
+ mul.wide.s32 %rd44, %r52, 2;
263
+ add.s64 %rd43, %rd17, %rd44;
264
+ .loc 1 36 50
265
+ mov.u32 %r43, 0x0;
266
+ mov.u32 %r44, 0x0;
267
+ @%p1 ld.global.L1::evict_last.v2.b32 { %r43, %r44 }, [ %rd43 + 0 ];
268
+ @!%p1 mov.u32 %r43, %r143;
269
+ @!%p1 mov.u32 %r44, %r143;
270
+ mov.b32 %r155, 883;
271
+ mov.u64 %rd67, 1;
272
+ .loc 1 40 55
273
+ @%p16 bra $L__BB0_3;
274
+ mov.u64 %rd45, assertMessage_0;
275
+ cvta.global.u64 %rd46, %rd45;
276
+ mov.u64 %rd47, assertFile_0;
277
+ cvta.global.u64 %rd48, %rd47;
278
+ mov.u64 %rd49, assertFunc_0;
279
+ cvta.global.u64 %rd50, %rd49;
280
+ { // callseq 2, 0
281
+ .reg .b32 temp_param_reg;
282
+ .param .b64 param0;
283
+ st.param.b64 [param0+0], %rd46;
284
+ .param .b64 param1;
285
+ st.param.b64 [param1+0], %rd48;
286
+ .param .b32 param2;
287
+ st.param.b32 [param2+0], %r155;
288
+ .param .b64 param3;
289
+ st.param.b64 [param3+0], %rd50;
290
+ .param .b64 param4;
291
+ st.param.b64 [param4+0], %rd67;
292
+ call.uni
293
+ __assertfail,
294
+ (
295
+ param0,
296
+ param1,
297
+ param2,
298
+ param3,
299
+ param4
300
+ );
301
+ } // callseq 2
302
+ bra.uni $L__BB0_3;
303
+ $L__BB0_4:
304
+ .loc 1 31 36
305
+ shr.u32 %r99, %r1, 3;
306
+ or.b32 %r100, %r5, %r99;
307
+ mad.lo.s32 %r101, %r100, 12, %r4;
308
+ shl.b32 %r102, %r101, 2;
309
+ mov.u32 %r103, global_smem;
310
+ add.s32 %r104, %r103, %r102;
311
+ st.shared.f32 [%r104], %f152;
312
+ st.shared.f32 [%r104+768], %f153;
313
+ st.shared.f32 [%r104+1536], %f154;
314
+ st.shared.f32 [%r104+2304], %f155;
315
+ bar.sync 0;
316
+ mad.lo.s32 %r105, %r2, 12, %r3;
317
+ shl.b32 %r106, %r105, 2;
318
+ add.s32 %r107, %r103, %r106;
319
+ ld.shared.v4.f32 {%f80, %f81, %f82, %f83}, [%r107];
320
+ $L__tmp3:
321
+ .loc 2 108 21
322
+ sub.f32 %f84, %f161, %f160;
323
+ .loc 2 109 28
324
+ add.f32 %f85, %f80, %f81;
325
+ .loc 2 110 39
326
+ setp.eq.f32 %p23, %f85, 0f00000000;
327
+ .loc 2 110 60
328
+ mov.b32 %r75, %f81;
329
+ mov.b32 %r76, %f85;
330
+ div.full.f32 %r74, %r75, %r76;
331
+ mov.b32 %f86, %r74;
332
+ .loc 2 110 49
333
+ selp.f32 %f87, 0f00000000, %f86, %p23;
334
+ .loc 2 112 17
335
+ fma.rn.f32 %f88, %f84, %f87, %f160;
336
+ .loc 2 113 15
337
+ add.f32 %f89, %f156, %f157;
338
+ .loc 2 113 30
339
+ mul.f32 %f90, %f84, %f84;
340
+ .loc 2 113 38
341
+ mul.f32 %f91, %f90, %f80;
342
+ .loc 2 113 22
343
+ fma.rn.f32 %f92, %f91, %f87, %f89;
344
+ .loc 2 108 21
345
+ sub.f32 %f93, %f162, %f88;
346
+ .loc 2 109 28
347
+ add.f32 %f94, %f82, %f85;
348
+ .loc 2 110 39
349
+ setp.eq.f32 %p24, %f94, 0f00000000;
350
+ .loc 2 110 60
351
+ mov.b32 %r79, %f94;
352
+ mov.b32 %r78, %f82;
353
+ div.full.f32 %r77, %r78, %r79;
354
+ mov.b32 %f95, %r77;
355
+ .loc 2 110 49
356
+ selp.f32 %f96, 0f00000000, %f95, %p24;
357
+ .loc 2 112 17
358
+ fma.rn.f32 %f97, %f96, %f93, %f88;
359
+ .loc 2 113 15
360
+ add.f32 %f98, %f158, %f92;
361
+ .loc 2 113 30
362
+ mul.f32 %f99, %f93, %f93;
363
+ .loc 2 113 38
364
+ mul.f32 %f100, %f85, %f99;
365
+ .loc 2 113 22
366
+ fma.rn.f32 %f101, %f96, %f100, %f98;
367
+ .loc 2 108 21
368
+ sub.f32 %f102, %f163, %f97;
369
+ .loc 2 109 28
370
+ add.f32 %f103, %f83, %f94;
371
+ .loc 2 110 39
372
+ setp.eq.f32 %p25, %f103, 0f00000000;
373
+ .loc 2 110 60
374
+ mov.b32 %r82, %f103;
375
+ mov.b32 %r81, %f83;
376
+ div.full.f32 %r80, %r81, %r82;
377
+ mov.b32 %f104, %r80;
378
+ .loc 2 110 49
379
+ selp.f32 %f105, 0f00000000, %f104, %p25;
380
+ .loc 2 112 17
381
+ fma.rn.f32 %f106, %f105, %f102, %f97;
382
+ .loc 2 113 15
383
+ add.f32 %f107, %f159, %f101;
384
+ .loc 2 113 30
385
+ mul.f32 %f108, %f102, %f102;
386
+ .loc 2 113 38
387
+ mul.f32 %f109, %f94, %f108;
388
+ .loc 2 113 22
389
+ fma.rn.f32 %f110, %f105, %f109, %f107;
390
+ $L__tmp4:
391
+ .loc 2 120 46
392
+ mov.b32 %r108, %f106;
393
+ shfl.sync.bfly.b32 %r109, %r108, 1, 31, -1;
394
+ mov.b32 %f111, %r109;
395
+ mov.b32 %r110, %f110;
396
+ shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1;
397
+ mov.b32 %f112, %r111;
398
+ shfl.sync.bfly.b32 %r84, %r82, 1, 31, -1;
399
+ mov.b32 %f113, %r84;
400
+ $L__tmp5:
401
+ .loc 2 108 21
402
+ sub.f32 %f114, %f111, %f106;
403
+ .loc 2 109 28
404
+ add.f32 %f115, %f103, %f113;
405
+ .loc 2 110 39
406
+ setp.eq.f32 %p26, %f115, 0f00000000;
407
+ .loc 2 110 60
408
+ mov.b32 %r85, %f115;
409
+ div.full.f32 %r83, %r84, %r85;
410
+ mov.b32 %f116, %r83;
411
+ .loc 2 110 49
412
+ selp.f32 %f117, 0f00000000, %f116, %p26;
413
+ .loc 2 112 17
414
+ fma.rn.f32 %f41, %f117, %f114, %f106;
415
+ .loc 2 113 15
416
+ add.f32 %f118, %f110, %f112;
417
+ .loc 2 113 30
418
+ mul.f32 %f119, %f114, %f114;
419
+ .loc 2 113 38
420
+ mul.f32 %f120, %f103, %f119;
421
+ .loc 2 113 22
422
+ fma.rn.f32 %f121, %f117, %f120, %f118;
423
+ $L__tmp6:
424
+ .loc 1 75 24
425
+ mov.b32 %r87, %f121;
426
+ mov.b32 %r88, 1132462080;
427
+ div.full.f32 %r86, %r87, %r88;
428
+ mov.b32 %f122, %r86;
429
+ .loc 1 77 24
430
+ add.f32 %f42, %f122, 0f3727C5AC;
431
+ .loc 1 58 36
432
+ add.s64 %rd71, %rd18, %rd2;
433
+ mov.b32 %r157, -8;
434
+ rsqrt.approx.ftz.f32 %f139, %f42;
435
+ bra.uni $L__BB0_5;
436
+ $L__BB0_7:
437
+ .loc 1 0 0
438
+ mov.b32 %f43, %r112;
439
+ mov.b32 %f44, %r113;
440
+ mov.b32 %f45, %r114;
441
+ mov.b32 %f46, %r115;
442
+ cvt.s64.s32 %rd13, %r137;
443
+ mov.b32 %f47, %r124;
444
+ mov.b32 %f48, %r125;
445
+ mov.b32 %f49, %r126;
446
+ mov.b32 %f50, %r127;
447
+ mov.b32 %f51, %r128;
448
+ mov.b32 %f52, %r129;
449
+ mov.b32 %f53, %r130;
450
+ mov.b32 %f54, %r131;
451
+ .loc 1 69 54
452
+ mov.u32 %r139, 0x0;
453
+ mov.u32 %r140, 0x0;
454
+ mov.u32 %r141, 0x0;
455
+ mov.u32 %r142, 0x0;
456
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r139, %r140, %r141, %r142 }, [ %rd72 + 0 ];
457
+ @!%p1 mov.u32 %r139, %r143;
458
+ @!%p1 mov.u32 %r140, %r143;
459
+ @!%p1 mov.u32 %r141, %r143;
460
+ @!%p1 mov.u32 %r142, %r143;
461
+ mov.b32 %f123, %r139;
462
+ mov.b32 %f124, %r140;
463
+ mov.b32 %f125, %r141;
464
+ mov.b32 %f126, %r142;
465
+ .loc 1 70 24
466
+ add.f32 %f127, %f43, %f123;
467
+ add.f32 %f128, %f44, %f124;
468
+ add.f32 %f129, %f45, %f125;
469
+ add.f32 %f130, %f46, %f126;
470
+ .loc 1 72 24
471
+ add.f32 %f131, %f47, %f127;
472
+ add.f32 %f132, %f48, %f128;
473
+ add.f32 %f133, %f49, %f129;
474
+ add.f32 %f134, %f50, %f130;
475
+ .loc 1 73 24
476
+ sub.f32 %f135, %f131, %f41;
477
+ sub.f32 %f136, %f132, %f41;
478
+ sub.f32 %f137, %f133, %f41;
479
+ sub.f32 %f138, %f134, %f41;
480
+ .loc 1 79 24
481
+ mul.f32 %f140, %f135, %f139;
482
+ mul.f32 %f141, %f136, %f139;
483
+ mul.f32 %f142, %f137, %f139;
484
+ mul.f32 %f143, %f138, %f139;
485
+ .loc 1 80 24
486
+ mul.f32 %f144, %f140, %f51;
487
+ mul.f32 %f145, %f141, %f52;
488
+ mul.f32 %f146, %f142, %f53;
489
+ mul.f32 %f147, %f143, %f54;
490
+ .loc 1 82 29
491
+ shl.b64 %rd66, %rd13, 1;
492
+ add.s64 %rd65, %rd19, %rd66;
493
+ .loc 1 82 52
494
+ mov.b32 %r147, %f144;
495
+ cvt.rn.bf16.f32 %rs9, %r147;
496
+ mov.b32 %r148, %f145;
497
+ cvt.rn.bf16.f32 %rs10, %r148;
498
+ mov.b32 %r149, %f146;
499
+ cvt.rn.bf16.f32 %rs11, %r149;
500
+ mov.b32 %r150, %f147;
501
+ cvt.rn.bf16.f32 %rs12, %r150;
502
+ mov.b32 %r153, {%rs9, %rs10};
503
+ mov.b32 %r154, {%rs11, %rs12};
504
+ @%p1 st.global.v2.b32 [ %rd65 + 0 ], { %r153, %r154 };
505
+ .loc 1 58 36
506
+ add.s32 %r157, %r157, 8;
507
+ add.s64 %rd72, %rd72, 32;
508
+ add.s64 %rd71, %rd71, 32;
509
+ add.s64 %rd70, %rd70, 32;
510
+ setp.lt.u32 %p47, %r157, 248;
511
+ @%p47 bra $L__BB0_5;
512
+ bra.uni $L__BB0_8;
513
+ $L__BB0_5:
514
+ .loc 1 62 51
515
+ mov.u32 %r112, 0x0;
516
+ mov.u32 %r113, 0x0;
517
+ mov.u32 %r114, 0x0;
518
+ mov.u32 %r115, 0x0;
519
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r112, %r113, %r114, %r115 }, [ %rd70 + 0 ];
520
+ @!%p1 mov.u32 %r112, %r143;
521
+ @!%p1 mov.u32 %r113, %r143;
522
+ @!%p1 mov.u32 %r114, %r143;
523
+ @!%p1 mov.u32 %r115, %r143;
524
+ .loc 1 63 35
525
+ add.s32 %r136, %r6, %r157;
526
+ add.s32 %r137, %r136, 8;
527
+ mul.wide.s32 %rd56, %r137, 2;
528
+ add.s64 %rd54, %rd17, %rd56;
529
+ .loc 1 63 51
530
+ mov.u32 %r120, 0x0;
531
+ mov.u32 %r121, 0x0;
532
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r120, %r121 }, [ %rd54 + 0 ];
533
+ @!%p1 mov.u32 %r120, %r143;
534
+ @!%p1 mov.u32 %r121, %r143;
535
+ cvt.u16.u32 %rs5, %r120;
536
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r120; }
537
+ cvt.u16.u32 %rs7, %r121;
538
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r121; }
539
+ .loc 1 63 103
540
+ cvt.f32.bf16 %r124, %rs5;
541
+ cvt.f32.bf16 %r125, %rs6;
542
+ cvt.f32.bf16 %r126, %rs7;
543
+ cvt.f32.bf16 %r127, %rs8;
544
+ .loc 1 64 40
545
+ mov.u32 %r128, 0x0;
546
+ mov.u32 %r129, 0x0;
547
+ mov.u32 %r130, 0x0;
548
+ mov.u32 %r131, 0x0;
549
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r128, %r129, %r130, %r131 }, [ %rd71 + 0 ];
550
+ @!%p1 mov.u32 %r128, %r143;
551
+ @!%p1 mov.u32 %r129, %r143;
552
+ @!%p1 mov.u32 %r130, %r143;
553
+ @!%p1 mov.u32 %r131, %r143;
554
+ .loc 1 68 57
555
+ @%p16 bra $L__BB0_7;
556
+ mov.u64 %rd57, assertMessage_1;
557
+ cvta.global.u64 %rd58, %rd57;
558
+ mov.u64 %rd59, assertFile_1;
559
+ cvta.global.u64 %rd60, %rd59;
560
+ mov.u64 %rd61, assertFunc_1;
561
+ cvta.global.u64 %rd62, %rd61;
562
+ { // callseq 3, 0
563
+ .reg .b32 temp_param_reg;
564
+ .param .b64 param0;
565
+ st.param.b64 [param0+0], %rd58;
566
+ .param .b64 param1;
567
+ st.param.b64 [param1+0], %rd60;
568
+ .param .b32 param2;
569
+ st.param.b32 [param2+0], %r155;
570
+ .param .b64 param3;
571
+ st.param.b64 [param3+0], %rd62;
572
+ .param .b64 param4;
573
+ st.param.b64 [param4+0], %rd67;
574
+ call.uni
575
+ __assertfail,
576
+ (
577
+ param0,
578
+ param1,
579
+ param2,
580
+ param3,
581
+ param4
582
+ );
583
+ } // callseq 3
584
+ bra.uni $L__BB0_7;
585
+ $L__BB0_8:
586
+ .loc 1 58 4
587
+ ret;
588
+ $L__tmp7:
589
+ $L__func_end0:
590
+
591
+ }
592
+ // .globl __nv_rsqrtf
593
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
594
+ .param .b32 __nv_rsqrtf_param_0
595
+ )
596
+ {
597
+ .reg .f32 %f<3>;
598
+ $L__func_begin1:
599
+
600
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
601
+ rsqrt.approx.ftz.f32 %f2, %f1;
602
+ st.param.f32 [func_retval0+0], %f2;
603
+ ret;
604
+ $L__func_end1:
605
+
606
+ }
607
+ .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
608
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
609
+ .section .debug_abbrev
610
+ {
611
+ .b8 1
612
+ .b8 17
613
+ .b8 1
614
+ .b8 37
615
+ .b8 8
616
+ .b8 19
617
+ .b8 5
618
+ .b8 3
619
+ .b8 8
620
+ .b8 16
621
+ .b8 6
622
+ .b8 27
623
+ .b8 8
624
+ .b8 180
625
+ .b8 66
626
+ .b8 12
627
+ .b8 17
628
+ .b8 1
629
+ .b8 18
630
+ .b8 1
631
+ .b8 0
632
+ .b8 0
633
+ .b8 2
634
+ .b8 46
635
+ .b8 0
636
+ .b8 135
637
+ .b8 64
638
+ .b8 8
639
+ .b8 3
640
+ .b8 8
641
+ .b8 58
642
+ .b8 11
643
+ .b8 59
644
+ .b8 11
645
+ .b8 63
646
+ .b8 12
647
+ .b8 32
648
+ .b8 11
649
+ .b8 0
650
+ .b8 0
651
+ .b8 3
652
+ .b8 46
653
+ .b8 1
654
+ .b8 17
655
+ .b8 1
656
+ .b8 18
657
+ .b8 1
658
+ .b8 64
659
+ .b8 10
660
+ .b8 49
661
+ .b8 19
662
+ .b8 0
663
+ .b8 0
664
+ .b8 4
665
+ .b8 29
666
+ .b8 0
667
+ .b8 49
668
+ .b8 19
669
+ .b8 17
670
+ .b8 1
671
+ .b8 18
672
+ .b8 1
673
+ .b8 88
674
+ .b8 11
675
+ .b8 89
676
+ .b8 11
677
+ .b8 87
678
+ .b8 11
679
+ .b8 0
680
+ .b8 0
681
+ .b8 5
682
+ .b8 29
683
+ .b8 1
684
+ .b8 49
685
+ .b8 19
686
+ .b8 17
687
+ .b8 1
688
+ .b8 18
689
+ .b8 1
690
+ .b8 88
691
+ .b8 11
692
+ .b8 89
693
+ .b8 11
694
+ .b8 87
695
+ .b8 11
696
+ .b8 0
697
+ .b8 0
698
+ .b8 0
699
+ }
700
+ .section .debug_info
701
+ {
702
+ .b32 302
703
+ .b8 2
704
+ .b8 0
705
+ .b32 .debug_abbrev
706
+ .b8 8
707
+ .b8 1
708
+ .b8 116
709
+ .b8 114
710
+ .b8 105
711
+ .b8 116
712
+ .b8 111
713
+ .b8 110
714
+ .b8 0
715
+ .b8 2
716
+ .b8 0
717
+ .b8 99
718
+ .b8 99
719
+ .b8 105
720
+ .b8 103
721
+ .b8 54
722
+ .b8 102
723
+ .b8 107
724
+ .b8 105
725
+ .b8 54
726
+ .b8 112
727
+ .b8 52
728
+ .b8 108
729
+ .b8 120
730
+ .b8 114
731
+ .b8 100
732
+ .b8 109
733
+ .b8 103
734
+ .b8 103
735
+ .b8 54
736
+ .b8 101
737
+ .b8 117
738
+ .b8 100
739
+ .b8 97
740
+ .b8 104
741
+ .b8 105
742
+ .b8 101
743
+ .b8 120
744
+ .b8 99
745
+ .b8 118
746
+ .b8 117
747
+ .b8 101
748
+ .b8 101
749
+ .b8 111
750
+ .b8 108
751
+ .b8 50
752
+ .b8 112
753
+ .b8 52
754
+ .b8 113
755
+ .b8 112
756
+ .b8 53
757
+ .b8 51
758
+ .b8 50
759
+ .b8 112
760
+ .b8 118
761
+ .b8 118
762
+ .b8 101
763
+ .b8 50
764
+ .b8 121
765
+ .b8 52
766
+ .b8 54
767
+ .b8 51
768
+ .b8 121
769
+ .b8 46
770
+ .b8 112
771
+ .b8 121
772
+ .b8 0
773
+ .b32 .debug_line
774
+ .b8 47
775
+ .b8 116
776
+ .b8 109
777
+ .b8 112
778
+ .b8 47
779
+ .b8 116
780
+ .b8 111
781
+ .b8 114
782
+ .b8 99
783
+ .b8 104
784
+ .b8 105
785
+ .b8 110
786
+ .b8 100
787
+ .b8 117
788
+ .b8 99
789
+ .b8 116
790
+ .b8 111
791
+ .b8 114
792
+ .b8 95
793
+ .b8 114
794
+ .b8 111
795
+ .b8 111
796
+ .b8 116
797
+ .b8 47
798
+ .b8 99
799
+ .b8 105
800
+ .b8 0
801
+ .b8 1
802
+ .b64 $L__func_begin0
803
+ .b64 $L__func_end0
804
+ .b8 2
805
+ .b8 116
806
+ .b8 114
807
+ .b8 105
808
+ .b8 116
809
+ .b8 111
810
+ .b8 110
811
+ .b8 95
812
+ .b8 95
813
+ .b8 48
814
+ .b8 100
815
+ .b8 49
816
+ .b8 100
817
+ .b8 50
818
+ .b8 100
819
+ .b8 51
820
+ .b8 100
821
+ .b8 52
822
+ .b8 100
823
+ .b8 53
824
+ .b8 100
825
+ .b8 54
826
+ .b8 100
827
+ .b8 101
828
+ .b8 55
829
+ .b8 100
830
+ .b8 101
831
+ .b8 0
832
+ .b8 116
833
+ .b8 114
834
+ .b8 105
835
+ .b8 116
836
+ .b8 111
837
+ .b8 110
838
+ .b8 95
839
+ .b8 95
840
+ .b8 48
841
+ .b8 100
842
+ .b8 49
843
+ .b8 100
844
+ .b8 50
845
+ .b8 100
846
+ .b8 51
847
+ .b8 100
848
+ .b8 52
849
+ .b8 100
850
+ .b8 53
851
+ .b8 100
852
+ .b8 54
853
+ .b8 100
854
+ .b8 101
855
+ .b8 55
856
+ .b8 100
857
+ .b8 101
858
+ .b8 0
859
+ .b8 1
860
+ .b8 18
861
+ .b8 1
862
+ .b8 1
863
+ .b8 3
864
+ .b64 $L__func_begin0
865
+ .b64 $L__func_end0
866
+ .b8 1
867
+ .b8 156
868
+ .b32 125
869
+ .b8 4
870
+ .b32 125
871
+ .b64 $L__tmp1
872
+ .b64 $L__tmp2
873
+ .b8 2
874
+ .b8 47
875
+ .b8 41
876
+ .b8 5
877
+ .b32 125
878
+ .b64 $L__tmp3
879
+ .b64 $L__tmp6
880
+ .b8 2
881
+ .b8 53
882
+ .b8 44
883
+ .b8 4
884
+ .b32 125
885
+ .b64 $L__tmp3
886
+ .b64 $L__tmp6
887
+ .b8 2
888
+ .b8 120
889
+ .b8 46
890
+ .b8 0
891
+ .b8 4
892
+ .b32 125
893
+ .b64 $L__tmp4
894
+ .b64 $L__tmp5
895
+ .b8 2
896
+ .b8 53
897
+ .b8 44
898
+ .b8 0
899
+ .b8 0
900
+ }
901
+ .section .debug_pubnames
902
+ {
903
+ .b32 $L__pubNames_end0-$L__pubNames_start0
904
+ $L__pubNames_start0:
905
+ .b8 2
906
+ .b8 0
907
+ .b32 .debug_info
908
+ .b32 306
909
+ .b32 125
910
+ .b8 116
911
+ .b8 114
912
+ .b8 105
913
+ .b8 116
914
+ .b8 111
915
+ .b8 110
916
+ .b8 95
917
+ .b8 95
918
+ .b8 48
919
+ .b8 100
920
+ .b8 49
921
+ .b8 100
922
+ .b8 50
923
+ .b8 100
924
+ .b8 51
925
+ .b8 100
926
+ .b8 52
927
+ .b8 100
928
+ .b8 53
929
+ .b8 100
930
+ .b8 54
931
+ .b8 100
932
+ .b8 101
933
+ .b8 55
934
+ .b8 100
935
+ .b8 101
936
+ .b8 0
937
+ .b32 0
938
+ $L__pubNames_end0:
939
+ }
940
+ .section .debug_pubtypes
941
+ {
942
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
943
+ $L__pubTypes_start0:
944
+ .b8 2
945
+ .b8 0
946
+ .b32 .debug_info
947
+ .b32 306
948
+ .b32 0
949
+ $L__pubTypes_end0:
950
+ }
951
+ .section .debug_loc { }
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
11
+ %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
16
+ %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
17
+ %c0_i32 = arith.constant 0 : i32
18
+ %c8_i32 = arith.constant 8 : i32
19
+ %c256_i32 = arith.constant 256 : i32
20
+ %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked2>
21
+ %cst_11 = arith.constant 0.000000e+00 : f32
22
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked2>
23
+ %cst_13 = arith.constant dense<256> : tensor<1x8xi32, #blocked2>
24
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
25
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
26
+ %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
27
+ %c64_i32 = arith.constant 64 : i32
28
+ %0 = tt.get_program_id x : i32
29
+ %1 = arith.muli %0, %c64_i32 : i32
30
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
31
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
32
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
33
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
34
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
35
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
36
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
37
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
38
+ %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
39
+ %11 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
40
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
41
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x8xi32, #blocked2>
42
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
43
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
44
+ %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
45
+ %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
46
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
47
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
48
+ %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
49
+ %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
50
+ %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
51
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
52
+ %24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
53
+ %25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
54
+ %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
55
+ %27 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
56
+ %28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
57
+ %29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
58
+ %30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
59
+ %31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
60
+ %32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
61
+ %33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1>
62
+ %34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1>
63
+ %35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1>
64
+ %36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked>
65
+ %37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x8xi64, #blocked>
66
+ %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
67
+ %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>) : i32 {
68
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked>
69
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked2>
70
+ %51 = arith.addi %49, %12 : tensor<1x8xi32, #blocked>
71
+ %52 = arith.addi %50, %13 : tensor<1x8xi32, #blocked2>
72
+ %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x8xi32, #blocked>
73
+ %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x8xi32, #blocked2>
74
+ %55 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
75
+ %56 = arith.addi %55, %22 : tensor<64x8xi32, #blocked>
76
+ %57 = tt.addptr %23, %56 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
77
+ %58 = tt.broadcast %53 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
78
+ %59 = tt.broadcast %54 : (tensor<1x8xi1, #blocked2>) -> tensor<64x8xi1, #blocked2>
79
+ %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
80
+ %61 = arith.addi %55, %25 : tensor<64x8xi32, #blocked>
81
+ %62 = tt.addptr %26, %61 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
82
+ %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
83
+ %64 = arith.extf %63 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
84
+ tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
85
+ %65 = arith.extsi %51 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
86
+ %66 = tt.broadcast %65 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
87
+ %67 = arith.addi %66, %37 : tensor<64x8xi64, #blocked>
88
+ %68 = tt.addptr %38, %67 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
89
+ %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
90
+ %70 = arith.addf %69, %60 : tensor<64x8xf32, #blocked>
91
+ %71 = arith.addf %70, %64 : tensor<64x8xf32, #blocked>
92
+ %72 = arith.subf %71, %arg9 : tensor<64x8xf32, #blocked>
93
+ %73 = arith.addf %arg12, %cst_4 : tensor<64x8xf32, #blocked>
94
+ %74 = arith.addf %arg11, %cst_10 : tensor<64x8xf32, #blocked2>
95
+ %75 = arith.divf %72, %73 : tensor<64x8xf32, #blocked>
96
+ %76 = arith.addf %arg9, %75 : tensor<64x8xf32, #blocked>
97
+ %77 = arith.subf %71, %76 : tensor<64x8xf32, #blocked>
98
+ %78 = arith.mulf %72, %77 : tensor<64x8xf32, #blocked>
99
+ %79 = arith.addf %arg10, %78 : tensor<64x8xf32, #blocked>
100
+ %80 = arith.select %58, %76, %arg9 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
101
+ %81 = arith.select %58, %79, %arg10 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
102
+ %82 = arith.select %58, %73, %arg12 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
103
+ %83 = arith.select %59, %74, %arg11 : tensor<64x8xi1, #blocked2>, tensor<64x8xf32, #blocked2>
104
+ scf.yield %80, %81, %83, %82 : tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>
105
+ }
106
+ %40 = triton_gpu.convert_layout %39#2 : (tensor<64x8xf32, #blocked2>) -> tensor<64x8xf32, #blocked>
107
+ %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
108
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
109
+ %49 = arith.subf %arg11, %arg8 : f32
110
+ %50 = arith.addf %arg10, %arg13 : f32
111
+ %51 = arith.cmpf oeq, %50, %cst_11 : f32
112
+ %52 = arith.divf %arg13, %50 : f32
113
+ %53 = arith.select %51, %cst_11, %52 : f32
114
+ %54 = arith.mulf %49, %53 : f32
115
+ %55 = arith.addf %arg8, %54 : f32
116
+ %56 = arith.addf %arg9, %arg12 : f32
117
+ %57 = arith.mulf %49, %49 : f32
118
+ %58 = arith.mulf %57, %arg10 : f32
119
+ %59 = arith.mulf %58, %53 : f32
120
+ %60 = arith.addf %56, %59 : f32
121
+ tt.reduce.return %55, %60, %50 : f32, f32, f32
122
+ }) : (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
123
+ %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
124
+ %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
125
+ %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
126
+ %45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
127
+ %46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked>
128
+ %47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked>
129
+ %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
130
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 : i32 {
131
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked>
132
+ %50 = arith.addi %49, %12 : tensor<1x8xi32, #blocked>
133
+ %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x8xi32, #blocked>
134
+ %52 = tt.broadcast %50 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
135
+ %53 = arith.addi %52, %22 : tensor<64x8xi32, #blocked>
136
+ %54 = tt.addptr %23, %53 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
137
+ %55 = tt.broadcast %51 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
138
+ %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
139
+ %57 = arith.addi %52, %25 : tensor<64x8xi32, #blocked>
140
+ %58 = tt.addptr %26, %57 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
141
+ %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
142
+ %60 = arith.extf %59 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
143
+ %61 = tt.addptr %44, %50 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
144
+ %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32, #blocked>
145
+ tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
146
+ %63 = arith.extsi %50 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
147
+ %64 = tt.broadcast %63 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
148
+ %65 = arith.addi %64, %37 : tensor<64x8xi64, #blocked>
149
+ %66 = tt.addptr %38, %65 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
150
+ %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
151
+ %68 = arith.addf %67, %56 : tensor<64x8xf32, #blocked>
152
+ %69 = arith.addf %68, %60 : tensor<64x8xf32, #blocked>
153
+ %70 = arith.subf %69, %45 : tensor<64x8xf32, #blocked>
154
+ %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
155
+ %72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
156
+ %73 = arith.mulf %70, %72 : tensor<64x8xf32, #blocked>
157
+ %74 = tt.broadcast %62 : (tensor<1x8xf32, #blocked>) -> tensor<64x8xf32, #blocked>
158
+ %75 = arith.mulf %73, %74 : tensor<64x8xf32, #blocked>
159
+ %76 = tt.addptr %48, %57 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
160
+ %77 = arith.truncf %75 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked>
161
+ tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16, #blocked>
162
+ }
163
+ tt.return
164
+ }
165
+ }
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttir ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
4
+ %cst_0 = arith.constant 0.000000e+00 : f32
5
+ %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x8xf32>
6
+ %c256_i32 = arith.constant 256 : i32
7
+ %c8_i32 = arith.constant 8 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<0> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
12
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
15
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
16
+ %cst_9 = arith.constant dense<256> : tensor<64x1xi32>
17
+ %cst_10 = arith.constant dense<256> : tensor<1x8xi32>
18
+ %cst_11 = arith.constant dense<512> : tensor<64x1xi32>
19
+ %c64_i32 = arith.constant 64 : i32
20
+ %0 = tt.get_program_id x : i32
21
+ %1 = arith.muli %0, %c64_i32 : i32
22
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
23
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
24
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
25
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
26
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
27
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
28
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
29
+ %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
30
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
31
+ %11 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
32
+ %12 = arith.muli %11, %cst_9 : tensor<64x1xi32>
33
+ %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x8xi32>
34
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
35
+ %15 = arith.muli %5, %cst_9 : tensor<64x1xi32>
36
+ %16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x8xi32>
37
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
38
+ %18 = arith.addi %10, %cst_4 : tensor<64x1xi64>
39
+ %19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
40
+ %20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64>
41
+ %21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64>
42
+ %22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64>
43
+ %23 = arith.andi %21, %22 : tensor<64x1xi1>
44
+ %24 = arith.muli %20, %cst_2 : tensor<64x1xi64>
45
+ %25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x8xi64>
46
+ %26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
47
+ %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) : i32 {
48
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x8xi32>
49
+ %52 = arith.addi %51, %7 : tensor<1x8xi32>
50
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x8xi32>
51
+ %54 = tt.broadcast %52 : (tensor<1x8xi32>) -> tensor<64x8xi32>
52
+ %55 = arith.addi %54, %13 : tensor<64x8xi32>
53
+ %56 = tt.addptr %14, %55 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
54
+ %57 = tt.broadcast %53 : (tensor<1x8xi1>) -> tensor<64x8xi1>
55
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
56
+ %59 = arith.addi %54, %16 : tensor<64x8xi32>
57
+ %60 = tt.addptr %17, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
58
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16>
59
+ %62 = arith.extf %61 : tensor<64x8xbf16> to tensor<64x8xf32>
60
+ tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
61
+ %63 = arith.extsi %52 : tensor<1x8xi32> to tensor<1x8xi64>
62
+ %64 = tt.broadcast %63 : (tensor<1x8xi64>) -> tensor<64x8xi64>
63
+ %65 = arith.addi %64, %25 : tensor<64x8xi64>
64
+ %66 = tt.addptr %26, %65 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
65
+ %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
66
+ %68 = arith.addf %67, %58 : tensor<64x8xf32>
67
+ %69 = arith.addf %68, %62 : tensor<64x8xf32>
68
+ %70 = arith.subf %69, %arg9 : tensor<64x8xf32>
69
+ %71 = arith.addf %arg11, %cst_1 : tensor<64x8xf32>
70
+ %72 = arith.divf %70, %71 : tensor<64x8xf32>
71
+ %73 = arith.addf %arg9, %72 : tensor<64x8xf32>
72
+ %74 = arith.subf %69, %73 : tensor<64x8xf32>
73
+ %75 = arith.mulf %70, %74 : tensor<64x8xf32>
74
+ %76 = arith.addf %arg10, %75 : tensor<64x8xf32>
75
+ %77 = arith.select %57, %73, %arg9 : tensor<64x8xi1>, tensor<64x8xf32>
76
+ %78 = arith.select %57, %76, %arg10 : tensor<64x8xi1>, tensor<64x8xf32>
77
+ %79 = arith.select %57, %71, %arg11 : tensor<64x8xi1>, tensor<64x8xf32>
78
+ scf.yield %77, %78, %79 : tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>
79
+ }
80
+ %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
81
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
82
+ %51 = arith.subf %arg11, %arg8 : f32
83
+ %52 = arith.addf %arg10, %arg13 : f32
84
+ %53 = arith.cmpf oeq, %52, %cst_0 : f32
85
+ %54 = arith.divf %arg13, %52 : f32
86
+ %55 = arith.select %53, %cst_0, %54 : f32
87
+ %56 = arith.mulf %51, %55 : f32
88
+ %57 = arith.addf %arg8, %56 : f32
89
+ %58 = arith.addf %arg9, %arg12 : f32
90
+ %59 = arith.mulf %51, %51 : f32
91
+ %60 = arith.mulf %59, %arg10 : f32
92
+ %61 = arith.mulf %60, %55 : f32
93
+ %62 = arith.addf %58, %61 : f32
94
+ tt.reduce.return %57, %62, %52 : f32, f32, f32
95
+ }) : (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
96
+ %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
97
+ %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
98
+ %31 = arith.muli %11, %cst_9 : tensor<64x1xi32>
99
+ %32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x8xi32>
100
+ %33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
101
+ %34 = arith.muli %5, %cst_9 : tensor<64x1xi32>
102
+ %35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x8xi32>
103
+ %36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
104
+ %37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>>
105
+ %38 = arith.addi %10, %cst_4 : tensor<64x1xi64>
106
+ %39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
107
+ %40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64>
108
+ %41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64>
109
+ %42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64>
110
+ %43 = arith.andi %41, %42 : tensor<64x1xi1>
111
+ %44 = arith.muli %40, %cst_2 : tensor<64x1xi64>
112
+ %45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x8xi64>
113
+ %46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
114
+ %47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x8xf32>
115
+ %48 = arith.divf %30, %cst_6 : tensor<64x1xf32>
116
+ %49 = arith.addf %48, %cst_5 : tensor<64x1xf32>
117
+ %50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
118
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 : i32 {
119
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x8xi32>
120
+ %52 = arith.addi %51, %7 : tensor<1x8xi32>
121
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x8xi32>
122
+ %54 = tt.broadcast %52 : (tensor<1x8xi32>) -> tensor<64x8xi32>
123
+ %55 = arith.addi %54, %32 : tensor<64x8xi32>
124
+ %56 = tt.addptr %33, %55 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
125
+ %57 = tt.broadcast %53 : (tensor<1x8xi1>) -> tensor<64x8xi1>
126
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
127
+ %59 = arith.addi %54, %35 : tensor<64x8xi32>
128
+ %60 = tt.addptr %36, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
129
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
130
+ %62 = arith.extf %61 : tensor<64x8xbf16> to tensor<64x8xf32>
131
+ %63 = tt.addptr %37, %52 : tensor<1x8x!tt.ptr<f32, 1>>, tensor<1x8xi32>
132
+ %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32>
133
+ tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
134
+ %65 = arith.extsi %52 : tensor<1x8xi32> to tensor<1x8xi64>
135
+ %66 = tt.broadcast %65 : (tensor<1x8xi64>) -> tensor<64x8xi64>
136
+ %67 = arith.addi %66, %45 : tensor<64x8xi64>
137
+ %68 = tt.addptr %46, %67 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
138
+ %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
139
+ %70 = arith.addf %69, %58 : tensor<64x8xf32>
140
+ %71 = arith.addf %70, %62 : tensor<64x8xf32>
141
+ %72 = arith.subf %71, %47 : tensor<64x8xf32>
142
+ %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
143
+ %74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x8xf32>
144
+ %75 = arith.mulf %72, %74 : tensor<64x8xf32>
145
+ %76 = tt.broadcast %64 : (tensor<1x8xf32>) -> tensor<64x8xf32>
146
+ %77 = arith.mulf %75, %76 : tensor<64x8xf32>
147
+ %78 = tt.addptr %50, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
148
+ %79 = arith.truncf %77 : tensor<64x8xf32> to tensor<64x8xbf16>
149
+ tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16>
150
+ }
151
+ tt.return
152
+ }
153
+ }
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin ADDED
Binary file (16.9 kB). View file
 
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
26
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
27
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
28
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
29
+ %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
30
+ %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
31
+ %21 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
32
+ %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
33
+ %23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
34
+ %24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32>
35
+ %25 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
36
+ %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
37
+ %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
38
+ %28 = arith.addf %8, %12 : tensor<256xf32>
39
+ %29 = arith.addf %28, %16 : tensor<256xf32>
40
+ %30 = arith.addf %29, %20 : tensor<256xf32>
41
+ %31 = arith.addf %30, %24 : tensor<256xf32>
42
+ %32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32>
43
+ %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
44
+ ^bb0(%arg10: f32, %arg11: f32):
45
+ %53 = arith.addf %arg10, %arg11 : f32
46
+ tt.reduce.return %53 : f32
47
+ }) : (tensor<256xf32>) -> f32
48
+ %34 = arith.addf %33, %cst_0 : f32
49
+ %35 = arith.divf %34, %cst_1 : f32
50
+ %36 = tt.splat %35 : (f32) -> tensor<256xf32>
51
+ %37 = arith.subf %31, %36 : tensor<256xf32>
52
+ %38 = arith.mulf %37, %37 : tensor<256xf32>
53
+ %39 = arith.select %2, %38, %cst_3 : tensor<256xi1>, tensor<256xf32>
54
+ %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
55
+ ^bb0(%arg10: f32, %arg11: f32):
56
+ %53 = arith.addf %arg10, %arg11 : f32
57
+ tt.reduce.return %53 : f32
58
+ }) : (tensor<256xf32>) -> f32
59
+ %41 = arith.addf %40, %cst_0 : f32
60
+ %42 = arith.divf %41, %cst_1 : f32
61
+ %43 = arith.addf %42, %cst_2 : f32
62
+ %44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
63
+ %45 = tt.splat %44 : (f32) -> tensor<256xf32>
64
+ %46 = arith.mulf %37, %45 : tensor<256xf32>
65
+ %47 = arith.mulf %46, %27 : tensor<256xf32>
66
+ %48 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
67
+ %49 = tt.addptr %48, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
68
+ tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
69
+ %50 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
70
+ %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
71
+ %52 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16>
72
+ tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
73
+ tt.return
74
+ }
75
+ }
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin ADDED
Binary file (13.2 kB). View file
 
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4de5de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5) local_unnamed_addr !dbg !7 {
8
+ %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %8 = and i32 %7, 31, !dbg !10
10
+ %9 = lshr i32 %7, 5, !dbg !10
11
+ %10 = and i32 %9, 1, !dbg !10
12
+ %urem = shl i32 %7, 2, !dbg !10
13
+ %11 = and i32 %urem, 252, !dbg !10
14
+ %12 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %13 = shl i32 %12, 8, !dbg !12
16
+ %14 = or i32 %13, %11, !dbg !13
17
+ %15 = sext i32 %14 to i64, !dbg !14
18
+ %16 = getelementptr float, ptr addrspace(1) %0, i64 %15, !dbg !14
19
+ %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %16, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !15
21
+ %19 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !15
22
+ %20 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !15
23
+ %21 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !15
24
+ %22 = bitcast i32 %18 to float, !dbg !15
25
+ %23 = bitcast i32 %19 to float, !dbg !15
26
+ %24 = bitcast i32 %20 to float, !dbg !15
27
+ %25 = bitcast i32 %21 to float, !dbg !15
28
+ %26 = getelementptr i16, ptr addrspace(1) %1, i64 %15, !dbg !16
29
+ %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
30
+ %28 = extractvalue { i32, i32 } %27, 0, !dbg !17
31
+ %29 = extractvalue { i32, i32 } %27, 1, !dbg !17
32
+ %30 = trunc i32 %28 to i16, !dbg !17
33
+ %extelt.offset = lshr i32 %28, 16, !dbg !17
34
+ %31 = trunc i32 %extelt.offset to i16, !dbg !17
35
+ %32 = trunc i32 %29 to i16, !dbg !17
36
+ %extelt.offset1 = lshr i32 %29, 16, !dbg !17
37
+ %33 = trunc i32 %extelt.offset1 to i16, !dbg !17
38
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
39
+ %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
40
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
41
+ %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
42
+ %38 = zext nneg i32 %11 to i64, !dbg !19
43
+ %39 = getelementptr float, ptr addrspace(1) %2, i64 %38, !dbg !19
44
+ %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
45
+ %41 = fadd float %34, %22, !dbg !21
46
+ %42 = fadd float %35, %23, !dbg !21
47
+ %43 = fadd float %36, %24, !dbg !21
48
+ %44 = fadd float %37, %25, !dbg !21
49
+ %45 = fadd float %41, %42, !dbg !22
50
+ %46 = fadd float %45, %43, !dbg !22
51
+ %47 = fadd float %46, %44, !dbg !22
52
+ %48 = bitcast float %47 to i32, !dbg !28
53
+ %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 16, i32 31), !dbg !28
54
+ %50 = bitcast i32 %49 to float, !dbg !28
55
+ %51 = fadd float %47, %50, !dbg !22
56
+ %52 = bitcast float %51 to i32, !dbg !28
57
+ %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 8, i32 31), !dbg !28
58
+ %54 = bitcast i32 %53 to float, !dbg !28
59
+ %55 = fadd float %51, %54, !dbg !22
60
+ %56 = bitcast float %55 to i32, !dbg !28
61
+ %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !28
62
+ %58 = bitcast i32 %57 to float, !dbg !28
63
+ %59 = fadd float %55, %58, !dbg !22
64
+ %60 = bitcast float %59 to i32, !dbg !28
65
+ %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !28
66
+ %62 = bitcast i32 %61 to float, !dbg !28
67
+ %63 = fadd float %59, %62, !dbg !22
68
+ %64 = bitcast float %63 to i32, !dbg !28
69
+ %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 1, i32 31), !dbg !28
70
+ %66 = bitcast i32 %65 to float, !dbg !28
71
+ %67 = fadd float %63, %66, !dbg !22
72
+ %68 = icmp eq i32 %8, 0, !dbg !28
73
+ %69 = zext nneg i32 %10 to i64, !dbg !28
74
+ %70 = getelementptr float, ptr addrspace(3) @global_smem, i64 %69, !dbg !28
75
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %67, i1 %68) #6, !dbg !28
76
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
77
+ %71 = icmp slt i32 %7, 2, !dbg !28
78
+ %72 = sext i32 %7 to i64, !dbg !28
79
+ %73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !28
80
+ %74 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !28
81
+ %75 = bitcast float %74 to i32, !dbg !28
82
+ %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 1, i32 31), !dbg !28
83
+ %77 = bitcast i32 %76 to float, !dbg !28
84
+ %78 = fadd float %74, %77, !dbg !22
85
+ %79 = and i32 %7, 1, !dbg !28
86
+ %80 = icmp eq i32 %79, 0, !dbg !28
87
+ %81 = and i1 %71, %80, !dbg !28
88
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %78, i1 %81) #6, !dbg !28
89
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
90
+ %82 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !28
91
+ %83 = fadd float %82, 0.000000e+00, !dbg !30
92
+ %84 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %83, float 2.560000e+02) #6, !dbg !34
93
+ %85 = fsub float %41, %84, !dbg !35
94
+ %86 = fsub float %42, %84, !dbg !35
95
+ %87 = fsub float %43, %84, !dbg !35
96
+ %88 = fsub float %44, %84, !dbg !35
97
+ %89 = fmul float %85, %85, !dbg !36
98
+ %90 = fmul float %86, %86, !dbg !36
99
+ %91 = fmul float %87, %87, !dbg !36
100
+ %92 = fmul float %88, %88, !dbg !36
101
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
102
+ %93 = fadd float %89, %90, !dbg !39
103
+ %94 = fadd float %91, %93, !dbg !39
104
+ %95 = fadd float %92, %94, !dbg !39
105
+ %96 = bitcast float %95 to i32, !dbg !37
106
+ %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !37
107
+ %98 = bitcast i32 %97 to float, !dbg !37
108
+ %99 = fadd float %95, %98, !dbg !39
109
+ %100 = bitcast float %99 to i32, !dbg !37
110
+ %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !37
111
+ %102 = bitcast i32 %101 to float, !dbg !37
112
+ %103 = fadd float %99, %102, !dbg !39
113
+ %104 = bitcast float %103 to i32, !dbg !37
114
+ %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !37
115
+ %106 = bitcast i32 %105 to float, !dbg !37
116
+ %107 = fadd float %103, %106, !dbg !39
117
+ %108 = bitcast float %107 to i32, !dbg !37
118
+ %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !37
119
+ %110 = bitcast i32 %109 to float, !dbg !37
120
+ %111 = fadd float %107, %110, !dbg !39
121
+ %112 = bitcast float %111 to i32, !dbg !37
122
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !37
123
+ %114 = bitcast i32 %113 to float, !dbg !37
124
+ %115 = fadd float %111, %114, !dbg !39
125
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %115, i1 %68) #6, !dbg !37
126
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
127
+ %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !37
128
+ %117 = bitcast float %116 to i32, !dbg !37
129
+ %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37
130
+ %119 = bitcast i32 %118 to float, !dbg !37
131
+ %120 = fadd float %116, %119, !dbg !39
132
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %120, i1 %81) #6, !dbg !37
133
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
134
+ %121 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
135
+ %122 = fadd float %121, 0.000000e+00, !dbg !42
136
+ %123 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %122, float 2.560000e+02) #6, !dbg !44
137
+ %124 = fadd float %123, 0x3EE4F8B580000000, !dbg !45
138
+ %125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46
139
+ %.not.i = icmp eq i32 %125, 0, !dbg !46
140
+ br i1 %.not.i, label %128, label %126, !dbg !46
141
+
142
+ 126: ; preds = %6
143
+ %127 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %124), !dbg !46
144
+ br label %__nv_rsqrtf.exit, !dbg !46
145
+
146
+ 128: ; preds = %6
147
+ %129 = tail call float @llvm.nvvm.rsqrt.approx.f(float %124), !dbg !46
148
+ br label %__nv_rsqrtf.exit, !dbg !46
149
+
150
+ __nv_rsqrtf.exit: ; preds = %126, %128
151
+ %.0.i = phi float [ %127, %126 ], [ %129, %128 ], !dbg !46
152
+ %130 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !20
153
+ %131 = bitcast i32 %130 to float, !dbg !20
154
+ %132 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !20
155
+ %133 = bitcast i32 %132 to float, !dbg !20
156
+ %134 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !20
157
+ %135 = bitcast i32 %134 to float, !dbg !20
158
+ %136 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !20
159
+ %137 = bitcast i32 %136 to float, !dbg !20
160
+ %138 = fmul float %85, %.0.i, !dbg !47
161
+ %139 = fmul float %86, %.0.i, !dbg !47
162
+ %140 = fmul float %87, %.0.i, !dbg !47
163
+ %141 = fmul float %88, %.0.i, !dbg !47
164
+ %142 = fmul float %138, %137, !dbg !48
165
+ %143 = fmul float %139, %135, !dbg !48
166
+ %144 = fmul float %140, %133, !dbg !48
167
+ %145 = fmul float %141, %131, !dbg !48
168
+ %146 = getelementptr i16, ptr addrspace(1) %3, i64 %15, !dbg !49
169
+ %147 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %142) #6, !dbg !50
170
+ %148 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %143) #6, !dbg !50
171
+ %149 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %144) #6, !dbg !50
172
+ %150 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %145) #6, !dbg !50
173
+ %151 = insertelement <2 x i16> undef, i16 %147, i64 0, !dbg !50
174
+ %152 = insertelement <2 x i16> %151, i16 %148, i64 1, !dbg !50
175
+ %153 = bitcast <2 x i16> %152 to i32, !dbg !50
176
+ %154 = insertelement <2 x i16> undef, i16 %149, i64 0, !dbg !50
177
+ %155 = insertelement <2 x i16> %154, i16 %150, i64 1, !dbg !50
178
+ %156 = bitcast <2 x i16> %155 to i32, !dbg !50
179
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %153, i32 %156, ptr addrspace(1) %146, i1 true) #6, !dbg !50
180
+ ret void, !dbg !51
181
+ }
182
+
183
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
184
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
185
+
186
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
187
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
188
+
189
+ ; Function Attrs: convergent nocallback nounwind
190
+ declare void @llvm.nvvm.barrier0() #2
191
+
192
+ ; Function Attrs: alwaysinline nounwind
193
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
194
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
195
+ %.not = icmp eq i32 %1, 0
196
+ br i1 %.not, label %4, label %2
197
+
198
+ 2: ; preds = %0
199
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
200
+ br label %6
201
+
202
+ 4: ; preds = %0
203
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
204
+ br label %6
205
+
206
+ 6: ; preds = %4, %2
207
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
208
+ ret float %.0
209
+ }
210
+
211
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
212
+
213
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
214
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
215
+
216
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
217
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
218
+
219
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
220
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
221
+ attributes #2 = { convergent nocallback nounwind }
222
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
223
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
224
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
225
+ attributes #6 = { nounwind }
226
+
227
+ !llvm.module.flags = !{!0, !1}
228
+ !llvm.dbg.cu = !{!2}
229
+ !nvvm.annotations = !{!4, !5, !5, !4}
230
+ !llvm.ident = !{!6}
231
+
232
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
233
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
234
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
235
+ !3 = !DIFile(filename: "ce5cemaf763zop7tgmdl7oghweh4i2o3g632qnkrhju2cthbxnfd.py", directory: "/tmp/torchinductor_root/e5")
236
+ !4 = !{ptr @triton__0d1d2d3d4de5de, !"kernel", i32 1}
237
+ !5 = !{ptr @triton__0d1d2d3d4de5de, !"maxntidx", i32 64}
238
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
239
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4de5de", linkageName: "triton__0d1d2d3d4de5de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
240
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
241
+ !9 = !{}
242
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
243
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
244
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
245
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
246
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
247
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
248
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
249
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
250
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
251
+ !19 = !DILocation(line: 32, column: 31, scope: !7)
252
+ !20 = !DILocation(line: 32, column: 36, scope: !7)
253
+ !21 = !DILocation(line: 34, column: 18, scope: !7)
254
+ !22 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !26)
255
+ !23 = distinct !DILexicalBlockFile(scope: !25, file: !24, discriminator: 0)
256
+ !24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
257
+ !25 = distinct !DILexicalBlockFile(scope: !7, file: !24, discriminator: 0)
258
+ !26 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !27)
259
+ !27 = !DILocation(line: 39, column: 58, scope: !23)
260
+ !28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
261
+ !29 = !DILocation(line: 39, column: 58, scope: !25)
262
+ !30 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !33)
263
+ !31 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
264
+ !32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
265
+ !33 = !DILocation(line: 39, column: 45, scope: !31)
266
+ !34 = !DILocation(line: 42, column: 20, scope: !7)
267
+ !35 = !DILocation(line: 43, column: 19, scope: !7)
268
+ !36 = !DILocation(line: 44, column: 20, scope: !7)
269
+ !37 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !38)
270
+ !38 = !DILocation(line: 47, column: 59, scope: !25)
271
+ !39 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !40)
272
+ !40 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !41)
273
+ !41 = !DILocation(line: 47, column: 59, scope: !23)
274
+ !42 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !43)
275
+ !43 = !DILocation(line: 47, column: 45, scope: !31)
276
+ !44 = !DILocation(line: 50, column: 20, scope: !7)
277
+ !45 = !DILocation(line: 52, column: 20, scope: !7)
278
+ !46 = !DILocation(line: 53, column: 26, scope: !7)
279
+ !47 = !DILocation(line: 54, column: 20, scope: !7)
280
+ !48 = !DILocation(line: 55, column: 20, scope: !7)
281
+ !49 = !DILocation(line: 57, column: 25, scope: !7)
282
+ !50 = !DILocation(line: 57, column: 48, scope: !7)
283
+ !51 = !DILocation(line: 57, column: 4, scope: !7)
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx ADDED
@@ -0,0 +1,687 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4de5de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4de5de(
14
+ .param .u64 triton__0d1d2d3d4de5de_param_0,
15
+ .param .u64 triton__0d1d2d3d4de5de_param_1,
16
+ .param .u64 triton__0d1d2d3d4de5de_param_2,
17
+ .param .u64 triton__0d1d2d3d4de5de_param_3,
18
+ .param .u32 triton__0d1d2d3d4de5de_param_4,
19
+ .param .u32 triton__0d1d2d3d4de5de_param_5
20
+ )
21
+ .maxntid 64, 1, 1
22
+ {
23
+ .reg .pred %p<23>;
24
+ .reg .b16 %rs<9>;
25
+ .reg .b32 %r<84>;
26
+ .reg .f32 %f<70>;
27
+ .reg .b64 %rd<12>;
28
+ .loc 1 18 0
29
+ $L__func_begin0:
30
+ .loc 1 18 0
31
+
32
+ ld.param.u64 %rd5, [triton__0d1d2d3d4de5de_param_0];
33
+ ld.param.u64 %rd6, [triton__0d1d2d3d4de5de_param_1];
34
+ $L__tmp0:
35
+ .loc 1 26 26
36
+ mov.u32 %r50, %tid.x;
37
+ and.b32 %r51, %r50, 31;
38
+ ld.param.u64 %rd7, [triton__0d1d2d3d4de5de_param_2];
39
+ ld.param.u64 %rd8, [triton__0d1d2d3d4de5de_param_3];
40
+ shl.b32 %r52, %r50, 2;
41
+ and.b32 %r53, %r52, 252;
42
+ .loc 1 23 28
43
+ mov.u32 %r1, %ctaid.x;
44
+ .loc 1 30 40
45
+ shl.b32 %r54, %r1, 8;
46
+ .loc 1 30 36
47
+ or.b32 %r55, %r54, %r53;
48
+ .loc 1 30 30
49
+ mul.wide.s32 %rd9, %r55, 4;
50
+ add.s64 %rd1, %rd5, %rd9;
51
+ mov.b32 %r6, 0;
52
+ mov.pred %p1, -1;
53
+ .loc 1 30 46
54
+ mov.u32 %r2, 0x0;
55
+ mov.u32 %r3, 0x0;
56
+ mov.u32 %r4, 0x0;
57
+ mov.u32 %r5, 0x0;
58
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
59
+ @!%p1 mov.u32 %r2, %r6;
60
+ @!%p1 mov.u32 %r3, %r6;
61
+ @!%p1 mov.u32 %r4, %r6;
62
+ @!%p1 mov.u32 %r5, %r6;
63
+ mov.b32 %f1, %r2;
64
+ mov.b32 %f2, %r3;
65
+ mov.b32 %f3, %r4;
66
+ mov.b32 %f4, %r5;
67
+ .loc 1 31 30
68
+ mul.wide.s32 %rd10, %r55, 2;
69
+ add.s64 %rd2, %rd6, %rd10;
70
+ .loc 1 31 46
71
+ mov.u32 %r10, 0x0;
72
+ mov.u32 %r11, 0x0;
73
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
74
+ @!%p1 mov.u32 %r10, %r6;
75
+ @!%p1 mov.u32 %r11, %r6;
76
+ cvt.u16.u32 %rs1, %r10;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
78
+ cvt.u16.u32 %rs3, %r11;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
80
+ .loc 1 31 67
81
+ cvt.f32.bf16 %r14, %rs1;
82
+ mov.b32 %f5, %r14;
83
+ cvt.f32.bf16 %r15, %rs2;
84
+ mov.b32 %f6, %r15;
85
+ cvt.f32.bf16 %r16, %rs3;
86
+ mov.b32 %f7, %r16;
87
+ cvt.f32.bf16 %r17, %rs4;
88
+ mov.b32 %f8, %r17;
89
+ .loc 1 32 31
90
+ mul.wide.u32 %rd11, %r53, 4;
91
+ add.s64 %rd3, %rd7, %rd11;
92
+ .loc 1 32 36
93
+ mov.u32 %r18, 0x0;
94
+ mov.u32 %r19, 0x0;
95
+ mov.u32 %r20, 0x0;
96
+ mov.u32 %r21, 0x0;
97
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
98
+ @!%p1 mov.u32 %r18, %r6;
99
+ @!%p1 mov.u32 %r19, %r6;
100
+ @!%p1 mov.u32 %r20, %r6;
101
+ @!%p1 mov.u32 %r21, %r6;
102
+ .loc 1 34 18
103
+ add.f32 %f9, %f5, %f1;
104
+ add.f32 %f10, %f6, %f2;
105
+ add.f32 %f11, %f7, %f3;
106
+ add.f32 %f12, %f8, %f4;
107
+ $L__tmp1:
108
+ .loc 2 233 15
109
+ add.f32 %f13, %f9, %f10;
110
+ add.f32 %f14, %f13, %f11;
111
+ add.f32 %f15, %f14, %f12;
112
+ $L__tmp2:
113
+ .loc 2 243 36
114
+ mov.b32 %r56, %f15;
115
+ shfl.sync.bfly.b32 %r57, %r56, 16, 31, -1;
116
+ mov.b32 %f16, %r57;
117
+ $L__tmp3:
118
+ .loc 2 233 15
119
+ add.f32 %f17, %f15, %f16;
120
+ $L__tmp4:
121
+ .loc 2 243 36
122
+ mov.b32 %r58, %f17;
123
+ shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1;
124
+ mov.b32 %f18, %r59;
125
+ $L__tmp5:
126
+ .loc 2 233 15
127
+ add.f32 %f19, %f17, %f18;
128
+ $L__tmp6:
129
+ .loc 2 243 36
130
+ mov.b32 %r60, %f19;
131
+ shfl.sync.bfly.b32 %r61, %r60, 4, 31, -1;
132
+ mov.b32 %f20, %r61;
133
+ $L__tmp7:
134
+ .loc 2 233 15
135
+ add.f32 %f21, %f19, %f20;
136
+ $L__tmp8:
137
+ .loc 2 243 36
138
+ mov.b32 %r62, %f21;
139
+ shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1;
140
+ mov.b32 %f22, %r63;
141
+ $L__tmp9:
142
+ .loc 2 233 15
143
+ add.f32 %f23, %f21, %f22;
144
+ $L__tmp10:
145
+ .loc 2 243 36
146
+ mov.b32 %r64, %f23;
147
+ shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1;
148
+ mov.b32 %f24, %r65;
149
+ $L__tmp11:
150
+ .loc 2 233 15
151
+ add.f32 %f25, %f23, %f24;
152
+ $L__tmp12:
153
+ .loc 2 243 36
154
+ setp.eq.s32 %p14, %r51, 0;
155
+ shr.u32 %r66, %r50, 3;
156
+ and.b32 %r67, %r66, 4;
157
+ mov.u32 %r68, global_smem;
158
+ add.s32 %r26, %r68, %r67;
159
+ mov.b32 %r27, %f25;
160
+ @%p14 st.shared.b32 [ %r26 + 0 ], %r27;
161
+ bar.sync 0;
162
+ setp.lt.s32 %p15, %r50, 2;
163
+ add.s32 %r29, %r68, %r52;
164
+ @%p15 ld.shared.b32 %r28, [ %r29 + 0 ];
165
+ mov.b32 %f26, %r28;
166
+ shfl.sync.bfly.b32 %r69, %r28, 1, 31, -1;
167
+ mov.b32 %f27, %r69;
168
+ $L__tmp13:
169
+ .loc 2 233 15
170
+ add.f32 %f28, %f26, %f27;
171
+ $L__tmp14:
172
+ .loc 2 243 36
173
+ and.b32 %r70, %r50, 1;
174
+ setp.eq.b32 %p21, %r70, 1;
175
+ not.pred %p22, %p21;
176
+ and.pred %p16, %p15, %p22;
177
+ mov.b32 %r31, %f28;
178
+ @%p16 st.shared.b32 [ %r29 + 0 ], %r31;
179
+ bar.sync 0;
180
+ ld.shared.f32 %f29, [global_smem];
181
+ $L__tmp15:
182
+ .loc 3 8 15
183
+ add.f32 %f30, %f29, 0f00000000;
184
+ $L__tmp16:
185
+ .loc 1 42 20
186
+ mov.b32 %r33, %f30;
187
+ mov.b32 %r34, 1132462080;
188
+ div.full.f32 %r32, %r33, %r34;
189
+ mov.b32 %f31, %r32;
190
+ .loc 1 43 19
191
+ sub.f32 %f32, %f9, %f31;
192
+ sub.f32 %f33, %f10, %f31;
193
+ sub.f32 %f34, %f11, %f31;
194
+ sub.f32 %f35, %f12, %f31;
195
+ .loc 1 44 20
196
+ mul.f32 %f36, %f33, %f33;
197
+ $L__tmp17:
198
+ .loc 2 243 36
199
+ bar.sync 0;
200
+ $L__tmp18:
201
+ .loc 2 233 15
202
+ fma.rn.f32 %f37, %f32, %f32, %f36;
203
+ fma.rn.f32 %f38, %f34, %f34, %f37;
204
+ fma.rn.f32 %f39, %f35, %f35, %f38;
205
+ $L__tmp19:
206
+ .loc 2 243 36
207
+ mov.b32 %r71, %f39;
208
+ shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1;
209
+ mov.b32 %f40, %r72;
210
+ $L__tmp20:
211
+ .loc 2 233 15
212
+ add.f32 %f41, %f39, %f40;
213
+ $L__tmp21:
214
+ .loc 2 243 36
215
+ mov.b32 %r73, %f41;
216
+ shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1;
217
+ mov.b32 %f42, %r74;
218
+ $L__tmp22:
219
+ .loc 2 233 15
220
+ add.f32 %f43, %f41, %f42;
221
+ $L__tmp23:
222
+ .loc 2 243 36
223
+ mov.b32 %r75, %f43;
224
+ shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1;
225
+ mov.b32 %f44, %r76;
226
+ $L__tmp24:
227
+ .loc 2 233 15
228
+ add.f32 %f45, %f43, %f44;
229
+ $L__tmp25:
230
+ .loc 2 243 36
231
+ mov.b32 %r77, %f45;
232
+ shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1;
233
+ mov.b32 %f46, %r78;
234
+ $L__tmp26:
235
+ .loc 2 233 15
236
+ add.f32 %f47, %f45, %f46;
237
+ $L__tmp27:
238
+ .loc 2 243 36
239
+ mov.b32 %r79, %f47;
240
+ shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1;
241
+ mov.b32 %f48, %r80;
242
+ $L__tmp28:
243
+ .loc 2 233 15
244
+ add.f32 %f49, %f47, %f48;
245
+ $L__tmp29:
246
+ .loc 2 243 36
247
+ mov.b32 %r36, %f49;
248
+ @%p14 st.shared.b32 [ %r26 + 0 ], %r36;
249
+ bar.sync 0;
250
+ @%p15 ld.shared.b32 %r37, [ %r29 + 0 ];
251
+ mov.b32 %f50, %r37;
252
+ shfl.sync.bfly.b32 %r81, %r37, 1, 31, -1;
253
+ mov.b32 %f51, %r81;
254
+ $L__tmp30:
255
+ .loc 2 233 15
256
+ add.f32 %f52, %f50, %f51;
257
+ $L__tmp31:
258
+ .loc 2 243 36
259
+ mov.b32 %r40, %f52;
260
+ @%p16 st.shared.b32 [ %r29 + 0 ], %r40;
261
+ bar.sync 0;
262
+ ld.shared.f32 %f53, [global_smem];
263
+ $L__tmp32:
264
+ .loc 3 8 15
265
+ add.f32 %f54, %f53, 0f00000000;
266
+ $L__tmp33:
267
+ .loc 1 50 20
268
+ mov.b32 %r42, %f54;
269
+ div.full.f32 %r41, %r42, %r34;
270
+ mov.b32 %f55, %r41;
271
+ .loc 1 52 20
272
+ add.f32 %f56, %f55, 0f3727C5AC;
273
+ .loc 1 53 26
274
+ rsqrt.approx.ftz.f32 %f57, %f56;
275
+ .loc 1 32 36
276
+ mov.b32 %f58, %r21;
277
+ mov.b32 %f59, %r20;
278
+ mov.b32 %f60, %r19;
279
+ mov.b32 %f61, %r18;
280
+ .loc 1 54 20
281
+ mul.f32 %f62, %f32, %f57;
282
+ mul.f32 %f63, %f33, %f57;
283
+ mul.f32 %f64, %f34, %f57;
284
+ mul.f32 %f65, %f35, %f57;
285
+ .loc 1 55 20
286
+ mul.f32 %f66, %f62, %f61;
287
+ mul.f32 %f67, %f63, %f60;
288
+ mul.f32 %f68, %f64, %f59;
289
+ mul.f32 %f69, %f65, %f58;
290
+ .loc 1 57 25
291
+ add.s64 %rd4, %rd8, %rd10;
292
+ .loc 1 57 48
293
+ mov.b32 %r44, %f66;
294
+ cvt.rn.bf16.f32 %rs5, %r44;
295
+ mov.b32 %r45, %f67;
296
+ cvt.rn.bf16.f32 %rs6, %r45;
297
+ mov.b32 %r46, %f68;
298
+ cvt.rn.bf16.f32 %rs7, %r46;
299
+ mov.b32 %r47, %f69;
300
+ cvt.rn.bf16.f32 %rs8, %r47;
301
+ mov.b32 %r82, {%rs5, %rs6};
302
+ mov.b32 %r83, {%rs7, %rs8};
303
+ @%p1 st.global.v2.b32 [ %rd4 + 0 ], { %r82, %r83 };
304
+ .loc 1 57 4
305
+ ret;
306
+ $L__tmp34:
307
+ $L__func_end0:
308
+
309
+ }
310
+ // .globl __nv_rsqrtf
311
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
312
+ .param .b32 __nv_rsqrtf_param_0
313
+ )
314
+ {
315
+ .reg .f32 %f<3>;
316
+ $L__func_begin1:
317
+
318
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
319
+ rsqrt.approx.ftz.f32 %f2, %f1;
320
+ st.param.f32 [func_retval0+0], %f2;
321
+ ret;
322
+ $L__func_end1:
323
+
324
+ }
325
+ .file 1 "/tmp/torchinductor_root/e5/ce5cemaf763zop7tgmdl7oghweh4i2o3g632qnkrhju2cthbxnfd.py"
326
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
327
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
328
+ .section .debug_abbrev
329
+ {
330
+ .b8 1
331
+ .b8 17
332
+ .b8 1
333
+ .b8 37
334
+ .b8 8
335
+ .b8 19
336
+ .b8 5
337
+ .b8 3
338
+ .b8 8
339
+ .b8 16
340
+ .b8 6
341
+ .b8 27
342
+ .b8 8
343
+ .b8 180
344
+ .b8 66
345
+ .b8 12
346
+ .b8 17
347
+ .b8 1
348
+ .b8 18
349
+ .b8 1
350
+ .b8 0
351
+ .b8 0
352
+ .b8 2
353
+ .b8 46
354
+ .b8 0
355
+ .b8 135
356
+ .b8 64
357
+ .b8 8
358
+ .b8 3
359
+ .b8 8
360
+ .b8 58
361
+ .b8 11
362
+ .b8 59
363
+ .b8 11
364
+ .b8 63
365
+ .b8 12
366
+ .b8 32
367
+ .b8 11
368
+ .b8 0
369
+ .b8 0
370
+ .b8 3
371
+ .b8 46
372
+ .b8 1
373
+ .b8 17
374
+ .b8 1
375
+ .b8 18
376
+ .b8 1
377
+ .b8 64
378
+ .b8 10
379
+ .b8 49
380
+ .b8 19
381
+ .b8 0
382
+ .b8 0
383
+ .b8 4
384
+ .b8 29
385
+ .b8 1
386
+ .b8 49
387
+ .b8 19
388
+ .b8 17
389
+ .b8 1
390
+ .b8 18
391
+ .b8 1
392
+ .b8 88
393
+ .b8 11
394
+ .b8 89
395
+ .b8 11
396
+ .b8 87
397
+ .b8 11
398
+ .b8 0
399
+ .b8 0
400
+ .b8 5
401
+ .b8 29
402
+ .b8 0
403
+ .b8 49
404
+ .b8 19
405
+ .b8 17
406
+ .b8 1
407
+ .b8 18
408
+ .b8 1
409
+ .b8 88
410
+ .b8 11
411
+ .b8 89
412
+ .b8 11
413
+ .b8 87
414
+ .b8 11
415
+ .b8 0
416
+ .b8 0
417
+ .b8 0
418
+ }
419
+ .section .debug_info
420
+ {
421
+ .b32 391
422
+ .b8 2
423
+ .b8 0
424
+ .b32 .debug_abbrev
425
+ .b8 8
426
+ .b8 1
427
+ .b8 116
428
+ .b8 114
429
+ .b8 105
430
+ .b8 116
431
+ .b8 111
432
+ .b8 110
433
+ .b8 0
434
+ .b8 2
435
+ .b8 0
436
+ .b8 99
437
+ .b8 101
438
+ .b8 53
439
+ .b8 99
440
+ .b8 101
441
+ .b8 109
442
+ .b8 97
443
+ .b8 102
444
+ .b8 55
445
+ .b8 54
446
+ .b8 51
447
+ .b8 122
448
+ .b8 111
449
+ .b8 112
450
+ .b8 55
451
+ .b8 116
452
+ .b8 103
453
+ .b8 109
454
+ .b8 100
455
+ .b8 108
456
+ .b8 55
457
+ .b8 111
458
+ .b8 103
459
+ .b8 104
460
+ .b8 119
461
+ .b8 101
462
+ .b8 104
463
+ .b8 52
464
+ .b8 105
465
+ .b8 50
466
+ .b8 111
467
+ .b8 51
468
+ .b8 103
469
+ .b8 54
470
+ .b8 51
471
+ .b8 50
472
+ .b8 113
473
+ .b8 110
474
+ .b8 107
475
+ .b8 114
476
+ .b8 104
477
+ .b8 106
478
+ .b8 117
479
+ .b8 50
480
+ .b8 99
481
+ .b8 116
482
+ .b8 104
483
+ .b8 98
484
+ .b8 120
485
+ .b8 110
486
+ .b8 102
487
+ .b8 100
488
+ .b8 46
489
+ .b8 112
490
+ .b8 121
491
+ .b8 0
492
+ .b32 .debug_line
493
+ .b8 47
494
+ .b8 116
495
+ .b8 109
496
+ .b8 112
497
+ .b8 47
498
+ .b8 116
499
+ .b8 111
500
+ .b8 114
501
+ .b8 99
502
+ .b8 104
503
+ .b8 105
504
+ .b8 110
505
+ .b8 100
506
+ .b8 117
507
+ .b8 99
508
+ .b8 116
509
+ .b8 111
510
+ .b8 114
511
+ .b8 95
512
+ .b8 114
513
+ .b8 111
514
+ .b8 111
515
+ .b8 116
516
+ .b8 47
517
+ .b8 101
518
+ .b8 53
519
+ .b8 0
520
+ .b8 1
521
+ .b64 $L__func_begin0
522
+ .b64 $L__func_end0
523
+ .b8 2
524
+ .b8 116
525
+ .b8 114
526
+ .b8 105
527
+ .b8 116
528
+ .b8 111
529
+ .b8 110
530
+ .b8 95
531
+ .b8 95
532
+ .b8 48
533
+ .b8 100
534
+ .b8 49
535
+ .b8 100
536
+ .b8 50
537
+ .b8 100
538
+ .b8 51
539
+ .b8 100
540
+ .b8 52
541
+ .b8 100
542
+ .b8 101
543
+ .b8 53
544
+ .b8 100
545
+ .b8 101
546
+ .b8 0
547
+ .b8 116
548
+ .b8 114
549
+ .b8 105
550
+ .b8 116
551
+ .b8 111
552
+ .b8 110
553
+ .b8 95
554
+ .b8 95
555
+ .b8 48
556
+ .b8 100
557
+ .b8 49
558
+ .b8 100
559
+ .b8 50
560
+ .b8 100
561
+ .b8 51
562
+ .b8 100
563
+ .b8 52
564
+ .b8 100
565
+ .b8 101
566
+ .b8 53
567
+ .b8 100
568
+ .b8 101
569
+ .b8 0
570
+ .b8 1
571
+ .b8 18
572
+ .b8 1
573
+ .b8 1
574
+ .b8 3
575
+ .b64 $L__func_begin0
576
+ .b64 $L__func_end0
577
+ .b8 1
578
+ .b8 156
579
+ .b32 125
580
+ .b8 4
581
+ .b32 125
582
+ .b64 $L__tmp1
583
+ .b64 $L__tmp14
584
+ .b8 2
585
+ .b8 39
586
+ .b8 58
587
+ .b8 5
588
+ .b32 125
589
+ .b64 $L__tmp1
590
+ .b64 $L__tmp14
591
+ .b8 2
592
+ .b8 243
593
+ .b8 36
594
+ .b8 0
595
+ .b8 5
596
+ .b32 125
597
+ .b64 $L__tmp2
598
+ .b64 $L__tmp15
599
+ .b8 2
600
+ .b8 39
601
+ .b8 58
602
+ .b8 5
603
+ .b32 125
604
+ .b64 $L__tmp15
605
+ .b64 $L__tmp16
606
+ .b8 3
607
+ .b8 39
608
+ .b8 45
609
+ .b8 5
610
+ .b32 125
611
+ .b64 $L__tmp17
612
+ .b64 $L__tmp32
613
+ .b8 2
614
+ .b8 47
615
+ .b8 59
616
+ .b8 4
617
+ .b32 125
618
+ .b64 $L__tmp18
619
+ .b64 $L__tmp31
620
+ .b8 2
621
+ .b8 47
622
+ .b8 59
623
+ .b8 5
624
+ .b32 125
625
+ .b64 $L__tmp18
626
+ .b64 $L__tmp31
627
+ .b8 2
628
+ .b8 243
629
+ .b8 36
630
+ .b8 0
631
+ .b8 5
632
+ .b32 125
633
+ .b64 $L__tmp32
634
+ .b64 $L__tmp33
635
+ .b8 3
636
+ .b8 47
637
+ .b8 45
638
+ .b8 0
639
+ .b8 0
640
+ }
641
+ .section .debug_pubnames
642
+ {
643
+ .b32 $L__pubNames_end0-$L__pubNames_start0
644
+ $L__pubNames_start0:
645
+ .b8 2
646
+ .b8 0
647
+ .b32 .debug_info
648
+ .b32 395
649
+ .b32 125
650
+ .b8 116
651
+ .b8 114
652
+ .b8 105
653
+ .b8 116
654
+ .b8 111
655
+ .b8 110
656
+ .b8 95
657
+ .b8 95
658
+ .b8 48
659
+ .b8 100
660
+ .b8 49
661
+ .b8 100
662
+ .b8 50
663
+ .b8 100
664
+ .b8 51
665
+ .b8 100
666
+ .b8 52
667
+ .b8 100
668
+ .b8 101
669
+ .b8 53
670
+ .b8 100
671
+ .b8 101
672
+ .b8 0
673
+ .b32 0
674
+ $L__pubNames_end0:
675
+ }
676
+ .section .debug_pubtypes
677
+ {
678
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
679
+ $L__pubTypes_start0:
680
+ .b8 2
681
+ .b8 0
682
+ .b32 .debug_info
683
+ .b32 395
684
+ .b32 0
685
+ $L__pubTypes_end0:
686
+ }
687
+ .section .debug_loc { }
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant 9.99999974E-6 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 0.000000e+00 : f32
8
+ %c256_i32 = arith.constant 256 : i32
9
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
20
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
21
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
22
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
23
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
25
+ %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
27
+ %16 = arith.addf %8, %12 : tensor<256xf32, #blocked>
28
+ %17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
29
+ %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
30
+ ^bb0(%arg6: f32, %arg7: f32):
31
+ %36 = arith.addf %arg6, %arg7 : f32
32
+ tt.reduce.return %36 : f32
33
+ }) : (tensor<256xf32, #blocked>) -> f32
34
+ %19 = arith.addf %18, %cst_2 : f32
35
+ %20 = arith.divf %19, %cst_1 : f32
36
+ %21 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked>
37
+ %22 = arith.subf %16, %21 : tensor<256xf32, #blocked>
38
+ %23 = arith.mulf %22, %22 : tensor<256xf32, #blocked>
39
+ %24 = arith.select %2, %23, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
40
+ %25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({
41
+ ^bb0(%arg6: f32, %arg7: f32):
42
+ %36 = arith.addf %arg6, %arg7 : f32
43
+ tt.reduce.return %36 : f32
44
+ }) : (tensor<256xf32, #blocked>) -> f32
45
+ %26 = arith.addf %25, %cst_2 : f32
46
+ %27 = arith.divf %26, %cst_1 : f32
47
+ %28 = arith.addf %27, %cst_0 : f32
48
+ %29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
49
+ %30 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked>
50
+ %31 = arith.mulf %22, %30 : tensor<256xf32, #blocked>
51
+ %32 = arith.mulf %31, %15 : tensor<256xf32, #blocked>
52
+ %33 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
53
+ %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
54
+ %35 = arith.truncf %32 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
55
+ tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
56
+ tt.return
57
+ }
58
+ }
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.llir ADDED
@@ -0,0 +1,1121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = lshr i32 %9, 5, !dbg !10
18
+ %11 = and i32 %10, 7, !dbg !10
19
+ %12 = and i32 %9, 15, !dbg !10
20
+ %13 = shl i32 %9, 3, !dbg !11
21
+ %14 = and i32 %13, 248, !dbg !11
22
+ %15 = or i32 %14, 4, !dbg !11
23
+ %urem = and i32 %9, 255, !dbg !11
24
+ %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
25
+ %17 = shl i32 %16, 4, !dbg !13
26
+ %18 = or i32 %17, %11, !dbg !14
27
+ %19 = or i32 %18, 8, !dbg !14
28
+ %20 = or i32 %17, %12, !dbg !14
29
+ %21 = sext i32 %18 to i64, !dbg !15
30
+ %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
31
+ %23 = sext i32 %19 to i64, !dbg !15
32
+ %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15
33
+ %25 = sext i32 %20 to i64, !dbg !15
34
+ %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15
35
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
36
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
37
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
38
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
39
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
40
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
41
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
42
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
43
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
44
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
45
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
46
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
47
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
48
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
49
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
50
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
51
+ %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16
52
+ %44 = srem i32 %18, 512, !dbg !17
53
+ %45 = srem i32 %19, 512, !dbg !17
54
+ %46 = shl nsw i32 %44, 8, !dbg !18
55
+ %47 = shl nsw i32 %45, 8, !dbg !18
56
+ %48 = or i32 %46, %14, !dbg !19
57
+ %49 = or i32 %46, %15, !dbg !19
58
+ %50 = or i32 %47, %14, !dbg !19
59
+ %51 = or i32 %47, %15, !dbg !19
60
+ %52 = sext i32 %48 to i64, !dbg !20
61
+ %53 = getelementptr float, ptr addrspace(1) %2, i64 %52, !dbg !20
62
+ %54 = sext i32 %49 to i64, !dbg !20
63
+ %55 = getelementptr float, ptr addrspace(1) %2, i64 %54, !dbg !20
64
+ %56 = sext i32 %50 to i64, !dbg !20
65
+ %57 = getelementptr float, ptr addrspace(1) %2, i64 %56, !dbg !20
66
+ %58 = sext i32 %51 to i64, !dbg !20
67
+ %59 = getelementptr float, ptr addrspace(1) %2, i64 %58, !dbg !20
68
+ %60 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
69
+ %61 = extractvalue { i32, i32, i32, i32 } %60, 0, !dbg !21
70
+ %62 = extractvalue { i32, i32, i32, i32 } %60, 1, !dbg !21
71
+ %63 = extractvalue { i32, i32, i32, i32 } %60, 2, !dbg !21
72
+ %64 = extractvalue { i32, i32, i32, i32 } %60, 3, !dbg !21
73
+ %65 = bitcast i32 %61 to float, !dbg !21
74
+ %66 = bitcast i32 %62 to float, !dbg !21
75
+ %67 = bitcast i32 %63 to float, !dbg !21
76
+ %68 = bitcast i32 %64 to float, !dbg !21
77
+ %69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
78
+ %70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !21
79
+ %71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !21
80
+ %72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !21
81
+ %73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !21
82
+ %74 = bitcast i32 %70 to float, !dbg !21
83
+ %75 = bitcast i32 %71 to float, !dbg !21
84
+ %76 = bitcast i32 %72 to float, !dbg !21
85
+ %77 = bitcast i32 %73 to float, !dbg !21
86
+ %78 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
87
+ %79 = extractvalue { i32, i32, i32, i32 } %78, 0, !dbg !21
88
+ %80 = extractvalue { i32, i32, i32, i32 } %78, 1, !dbg !21
89
+ %81 = extractvalue { i32, i32, i32, i32 } %78, 2, !dbg !21
90
+ %82 = extractvalue { i32, i32, i32, i32 } %78, 3, !dbg !21
91
+ %83 = bitcast i32 %79 to float, !dbg !21
92
+ %84 = bitcast i32 %80 to float, !dbg !21
93
+ %85 = bitcast i32 %81 to float, !dbg !21
94
+ %86 = bitcast i32 %82 to float, !dbg !21
95
+ %87 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
96
+ %88 = extractvalue { i32, i32, i32, i32 } %87, 0, !dbg !21
97
+ %89 = extractvalue { i32, i32, i32, i32 } %87, 1, !dbg !21
98
+ %90 = extractvalue { i32, i32, i32, i32 } %87, 2, !dbg !21
99
+ %91 = extractvalue { i32, i32, i32, i32 } %87, 3, !dbg !21
100
+ %92 = bitcast i32 %88 to float, !dbg !21
101
+ %93 = bitcast i32 %89 to float, !dbg !21
102
+ %94 = bitcast i32 %90 to float, !dbg !21
103
+ %95 = bitcast i32 %91 to float, !dbg !21
104
+ %96 = shl i32 %18, 8, !dbg !22
105
+ %97 = shl i32 %19, 8, !dbg !22
106
+ %98 = or i32 %96, %14, !dbg !23
107
+ %99 = or i32 %97, %14, !dbg !23
108
+ %100 = sext i32 %98 to i64, !dbg !24
109
+ %101 = getelementptr i16, ptr addrspace(1) %3, i64 %100, !dbg !24
110
+ %102 = sext i32 %99 to i64, !dbg !24
111
+ %103 = getelementptr i16, ptr addrspace(1) %3, i64 %102, !dbg !24
112
+ %104 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
113
+ %105 = extractvalue { i32, i32, i32, i32 } %104, 0, !dbg !25
114
+ %106 = extractvalue { i32, i32, i32, i32 } %104, 1, !dbg !25
115
+ %107 = extractvalue { i32, i32, i32, i32 } %104, 2, !dbg !25
116
+ %108 = extractvalue { i32, i32, i32, i32 } %104, 3, !dbg !25
117
+ %109 = trunc i32 %105 to i16, !dbg !25
118
+ %extelt.offset = lshr i32 %105, 16, !dbg !25
119
+ %110 = trunc i32 %extelt.offset to i16, !dbg !25
120
+ %111 = trunc i32 %106 to i16, !dbg !25
121
+ %extelt.offset1 = lshr i32 %106, 16, !dbg !25
122
+ %112 = trunc i32 %extelt.offset1 to i16, !dbg !25
123
+ %113 = trunc i32 %107 to i16, !dbg !25
124
+ %extelt.offset2 = lshr i32 %107, 16, !dbg !25
125
+ %114 = trunc i32 %extelt.offset2 to i16, !dbg !25
126
+ %115 = trunc i32 %108 to i16, !dbg !25
127
+ %extelt.offset3 = lshr i32 %108, 16, !dbg !25
128
+ %116 = trunc i32 %extelt.offset3 to i16, !dbg !25
129
+ %117 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
130
+ %118 = extractvalue { i32, i32, i32, i32 } %117, 0, !dbg !25
131
+ %119 = extractvalue { i32, i32, i32, i32 } %117, 1, !dbg !25
132
+ %120 = extractvalue { i32, i32, i32, i32 } %117, 2, !dbg !25
133
+ %121 = extractvalue { i32, i32, i32, i32 } %117, 3, !dbg !25
134
+ %122 = trunc i32 %118 to i16, !dbg !25
135
+ %extelt.offset4 = lshr i32 %118, 16, !dbg !25
136
+ %123 = trunc i32 %extelt.offset4 to i16, !dbg !25
137
+ %124 = trunc i32 %119 to i16, !dbg !25
138
+ %extelt.offset5 = lshr i32 %119, 16, !dbg !25
139
+ %125 = trunc i32 %extelt.offset5 to i16, !dbg !25
140
+ %126 = trunc i32 %120 to i16, !dbg !25
141
+ %extelt.offset6 = lshr i32 %120, 16, !dbg !25
142
+ %127 = trunc i32 %extelt.offset6 to i16, !dbg !25
143
+ %128 = trunc i32 %121 to i16, !dbg !25
144
+ %extelt.offset7 = lshr i32 %121, 16, !dbg !25
145
+ %129 = trunc i32 %extelt.offset7 to i16, !dbg !25
146
+ %130 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %109) #6, !dbg !26
147
+ %131 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %110) #6, !dbg !26
148
+ %132 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #6, !dbg !26
149
+ %133 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #6, !dbg !26
150
+ %134 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #6, !dbg !26
151
+ %135 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #6, !dbg !26
152
+ %136 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %115) #6, !dbg !26
153
+ %137 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #6, !dbg !26
154
+ %138 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %122) #6, !dbg !26
155
+ %139 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %123) #6, !dbg !26
156
+ %140 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %124) #6, !dbg !26
157
+ %141 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %125) #6, !dbg !26
158
+ %142 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %126) #6, !dbg !26
159
+ %143 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %127) #6, !dbg !26
160
+ %144 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %128) #6, !dbg !26
161
+ %145 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %129) #6, !dbg !26
162
+ %146 = add i64 %43, 50257, !dbg !27
163
+ %147 = icmp slt i64 %27, 0, !dbg !28
164
+ %148 = icmp slt i64 %35, 0, !dbg !28
165
+ %149 = icmp slt i64 %43, 0, !dbg !28
166
+ %150 = select i1 %149, i64 %146, i64 %43, !dbg !29
167
+ %151 = icmp ugt i64 %150, 50256, !dbg !30
168
+ br i1 %151, label %152, label %153, !dbg !31
169
+
170
+ 152: ; preds = %8
171
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31
172
+ br label %153, !dbg !31
173
+
174
+ 153: ; preds = %152, %8
175
+ %154 = shl i64 %27, 8, !dbg !32
176
+ %155 = add i64 %154, 12865792, !dbg !32
177
+ %156 = select i1 %147, i64 %155, i64 %154, !dbg !32
178
+ %157 = shl i64 %35, 8, !dbg !32
179
+ %158 = add i64 %157, 12865792, !dbg !32
180
+ %159 = select i1 %148, i64 %158, i64 %157, !dbg !32
181
+ %160 = zext nneg i32 %14 to i64
182
+ %161 = zext nneg i32 %15 to i64
183
+ %162 = or i64 %156, %160, !dbg !33
184
+ %163 = or i64 %156, %161, !dbg !33
185
+ %164 = or i64 %159, %160, !dbg !33
186
+ %165 = or i64 %159, %161, !dbg !33
187
+ %166 = getelementptr float, ptr addrspace(1) %1, i64 %162, !dbg !34
188
+ %167 = getelementptr float, ptr addrspace(1) %1, i64 %163, !dbg !34
189
+ %168 = getelementptr float, ptr addrspace(1) %1, i64 %164, !dbg !34
190
+ %169 = getelementptr float, ptr addrspace(1) %1, i64 %165, !dbg !34
191
+ %170 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
192
+ %171 = extractvalue { i32, i32, i32, i32 } %170, 0, !dbg !35
193
+ %172 = extractvalue { i32, i32, i32, i32 } %170, 1, !dbg !35
194
+ %173 = extractvalue { i32, i32, i32, i32 } %170, 2, !dbg !35
195
+ %174 = extractvalue { i32, i32, i32, i32 } %170, 3, !dbg !35
196
+ %175 = bitcast i32 %171 to float, !dbg !35
197
+ %176 = bitcast i32 %172 to float, !dbg !35
198
+ %177 = bitcast i32 %173 to float, !dbg !35
199
+ %178 = bitcast i32 %174 to float, !dbg !35
200
+ %179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
201
+ %180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !35
202
+ %181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !35
203
+ %182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !35
204
+ %183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !35
205
+ %184 = bitcast i32 %180 to float, !dbg !35
206
+ %185 = bitcast i32 %181 to float, !dbg !35
207
+ %186 = bitcast i32 %182 to float, !dbg !35
208
+ %187 = bitcast i32 %183 to float, !dbg !35
209
+ %188 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
210
+ %189 = extractvalue { i32, i32, i32, i32 } %188, 0, !dbg !35
211
+ %190 = extractvalue { i32, i32, i32, i32 } %188, 1, !dbg !35
212
+ %191 = extractvalue { i32, i32, i32, i32 } %188, 2, !dbg !35
213
+ %192 = extractvalue { i32, i32, i32, i32 } %188, 3, !dbg !35
214
+ %193 = bitcast i32 %189 to float, !dbg !35
215
+ %194 = bitcast i32 %190 to float, !dbg !35
216
+ %195 = bitcast i32 %191 to float, !dbg !35
217
+ %196 = bitcast i32 %192 to float, !dbg !35
218
+ %197 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
219
+ %198 = extractvalue { i32, i32, i32, i32 } %197, 0, !dbg !35
220
+ %199 = extractvalue { i32, i32, i32, i32 } %197, 1, !dbg !35
221
+ %200 = extractvalue { i32, i32, i32, i32 } %197, 2, !dbg !35
222
+ %201 = extractvalue { i32, i32, i32, i32 } %197, 3, !dbg !35
223
+ %202 = bitcast i32 %198 to float, !dbg !35
224
+ %203 = bitcast i32 %199 to float, !dbg !35
225
+ %204 = bitcast i32 %200 to float, !dbg !35
226
+ %205 = bitcast i32 %201 to float, !dbg !35
227
+ %206 = fadd float %65, %175, !dbg !36
228
+ %207 = fadd float %66, %176, !dbg !36
229
+ %208 = fadd float %67, %177, !dbg !36
230
+ %209 = fadd float %68, %178, !dbg !36
231
+ %210 = fadd float %74, %184, !dbg !36
232
+ %211 = fadd float %75, %185, !dbg !36
233
+ %212 = fadd float %76, %186, !dbg !36
234
+ %213 = fadd float %77, %187, !dbg !36
235
+ %214 = fadd float %83, %193, !dbg !36
236
+ %215 = fadd float %84, %194, !dbg !36
237
+ %216 = fadd float %85, %195, !dbg !36
238
+ %217 = fadd float %86, %196, !dbg !36
239
+ %218 = fadd float %92, %202, !dbg !36
240
+ %219 = fadd float %93, %203, !dbg !36
241
+ %220 = fadd float %94, %204, !dbg !36
242
+ %221 = fadd float %95, %205, !dbg !36
243
+ %222 = fadd float %130, %206, !dbg !37
244
+ %223 = fadd float %131, %207, !dbg !37
245
+ %224 = fadd float %132, %208, !dbg !37
246
+ %225 = fadd float %133, %209, !dbg !37
247
+ %226 = fadd float %134, %210, !dbg !37
248
+ %227 = fadd float %135, %211, !dbg !37
249
+ %228 = fadd float %136, %212, !dbg !37
250
+ %229 = fadd float %137, %213, !dbg !37
251
+ %230 = fadd float %138, %214, !dbg !37
252
+ %231 = fadd float %139, %215, !dbg !37
253
+ %232 = fadd float %140, %216, !dbg !37
254
+ %233 = fadd float %141, %217, !dbg !37
255
+ %234 = fadd float %142, %218, !dbg !37
256
+ %235 = fadd float %143, %219, !dbg !37
257
+ %236 = fadd float %144, %220, !dbg !37
258
+ %237 = fadd float %145, %221, !dbg !37
259
+ %238 = fadd float %222, 0.000000e+00, !dbg !38
260
+ %239 = fadd float %223, 0.000000e+00, !dbg !38
261
+ %240 = fadd float %224, 0.000000e+00, !dbg !38
262
+ %241 = fadd float %225, 0.000000e+00, !dbg !38
263
+ %242 = fadd float %226, 0.000000e+00, !dbg !38
264
+ %243 = fadd float %227, 0.000000e+00, !dbg !38
265
+ %244 = fadd float %228, 0.000000e+00, !dbg !38
266
+ %245 = fadd float %229, 0.000000e+00, !dbg !38
267
+ %246 = fadd float %230, 0.000000e+00, !dbg !38
268
+ %247 = fadd float %231, 0.000000e+00, !dbg !38
269
+ %248 = fadd float %232, 0.000000e+00, !dbg !38
270
+ %249 = fadd float %233, 0.000000e+00, !dbg !38
271
+ %250 = fadd float %234, 0.000000e+00, !dbg !38
272
+ %251 = fadd float %235, 0.000000e+00, !dbg !38
273
+ %252 = fadd float %236, 0.000000e+00, !dbg !38
274
+ %253 = fadd float %237, 0.000000e+00, !dbg !38
275
+ %254 = fsub float %222, %238, !dbg !42
276
+ %255 = fsub float %223, %239, !dbg !42
277
+ %256 = fsub float %224, %240, !dbg !42
278
+ %257 = fsub float %225, %241, !dbg !42
279
+ %258 = fsub float %226, %242, !dbg !42
280
+ %259 = fsub float %227, %243, !dbg !42
281
+ %260 = fsub float %228, %244, !dbg !42
282
+ %261 = fsub float %229, %245, !dbg !42
283
+ %262 = fsub float %230, %246, !dbg !42
284
+ %263 = fsub float %231, %247, !dbg !42
285
+ %264 = fsub float %232, %248, !dbg !42
286
+ %265 = fsub float %233, %249, !dbg !42
287
+ %266 = fsub float %234, %250, !dbg !42
288
+ %267 = fsub float %235, %251, !dbg !42
289
+ %268 = fsub float %236, %252, !dbg !42
290
+ %269 = fsub float %237, %253, !dbg !42
291
+ %270 = fmul float %222, %254, !dbg !43
292
+ %271 = fmul float %223, %255, !dbg !43
293
+ %272 = fmul float %224, %256, !dbg !43
294
+ %273 = fmul float %225, %257, !dbg !43
295
+ %274 = fmul float %226, %258, !dbg !43
296
+ %275 = fmul float %227, %259, !dbg !43
297
+ %276 = fmul float %228, %260, !dbg !43
298
+ %277 = fmul float %229, %261, !dbg !43
299
+ %278 = fmul float %230, %262, !dbg !43
300
+ %279 = fmul float %231, %263, !dbg !43
301
+ %280 = fmul float %232, %264, !dbg !43
302
+ %281 = fmul float %233, %265, !dbg !43
303
+ %282 = fmul float %234, %266, !dbg !43
304
+ %283 = fmul float %235, %267, !dbg !43
305
+ %284 = fmul float %236, %268, !dbg !43
306
+ %285 = fmul float %237, %269, !dbg !43
307
+ %286 = fadd float %270, 0.000000e+00, !dbg !44
308
+ %287 = fadd float %271, 0.000000e+00, !dbg !44
309
+ %288 = fadd float %272, 0.000000e+00, !dbg !44
310
+ %289 = fadd float %273, 0.000000e+00, !dbg !44
311
+ %290 = fadd float %274, 0.000000e+00, !dbg !44
312
+ %291 = fadd float %275, 0.000000e+00, !dbg !44
313
+ %292 = fadd float %276, 0.000000e+00, !dbg !44
314
+ %293 = fadd float %277, 0.000000e+00, !dbg !44
315
+ %294 = fadd float %278, 0.000000e+00, !dbg !44
316
+ %295 = fadd float %279, 0.000000e+00, !dbg !44
317
+ %296 = fadd float %280, 0.000000e+00, !dbg !44
318
+ %297 = fadd float %281, 0.000000e+00, !dbg !44
319
+ %298 = fadd float %282, 0.000000e+00, !dbg !44
320
+ %299 = fadd float %283, 0.000000e+00, !dbg !44
321
+ %300 = fadd float %284, 0.000000e+00, !dbg !44
322
+ %301 = fadd float %285, 0.000000e+00, !dbg !44
323
+ %302 = fsub float %239, %238, !dbg !45
324
+ %303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
325
+ %304 = fmul float %303, %302, !dbg !50
326
+ %305 = fadd float %238, %304, !dbg !51
327
+ %306 = fadd float %286, %287, !dbg !52
328
+ %307 = fmul float %302, %302, !dbg !53
329
+ %308 = fmul float %303, %307, !dbg !54
330
+ %309 = fadd float %308, %306, !dbg !55
331
+ %310 = fsub float %240, %305, !dbg !45
332
+ %311 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
333
+ %312 = fmul float %311, %310, !dbg !50
334
+ %313 = fadd float %305, %312, !dbg !51
335
+ %314 = fadd float %288, %309, !dbg !52
336
+ %315 = fmul float %310, %310, !dbg !53
337
+ %316 = fmul float %315, 2.000000e+00, !dbg !56
338
+ %317 = fmul float %311, %316, !dbg !54
339
+ %318 = fadd float %314, %317, !dbg !55
340
+ %319 = fsub float %241, %313, !dbg !45
341
+ %320 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
342
+ %321 = fmul float %320, %319, !dbg !50
343
+ %322 = fadd float %313, %321, !dbg !51
344
+ %323 = fadd float %289, %318, !dbg !52
345
+ %324 = fmul float %319, %319, !dbg !53
346
+ %325 = fmul float %324, 3.000000e+00, !dbg !56
347
+ %326 = fmul float %320, %325, !dbg !54
348
+ %327 = fadd float %323, %326, !dbg !55
349
+ %328 = fsub float %242, %322, !dbg !45
350
+ %329 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49
351
+ %330 = fmul float %329, %328, !dbg !50
352
+ %331 = fadd float %322, %330, !dbg !51
353
+ %332 = fadd float %290, %327, !dbg !52
354
+ %333 = fmul float %328, %328, !dbg !53
355
+ %334 = fmul float %333, 4.000000e+00, !dbg !56
356
+ %335 = fmul float %329, %334, !dbg !54
357
+ %336 = fadd float %332, %335, !dbg !55
358
+ %337 = fsub float %243, %331, !dbg !45
359
+ %338 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49
360
+ %339 = fmul float %338, %337, !dbg !50
361
+ %340 = fadd float %331, %339, !dbg !51
362
+ %341 = fadd float %291, %336, !dbg !52
363
+ %342 = fmul float %337, %337, !dbg !53
364
+ %343 = fmul float %342, 5.000000e+00, !dbg !56
365
+ %344 = fmul float %338, %343, !dbg !54
366
+ %345 = fadd float %341, %344, !dbg !55
367
+ %346 = fsub float %244, %340, !dbg !45
368
+ %347 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49
369
+ %348 = fmul float %347, %346, !dbg !50
370
+ %349 = fadd float %340, %348, !dbg !51
371
+ %350 = fadd float %292, %345, !dbg !52
372
+ %351 = fmul float %346, %346, !dbg !53
373
+ %352 = fmul float %351, 6.000000e+00, !dbg !56
374
+ %353 = fmul float %347, %352, !dbg !54
375
+ %354 = fadd float %350, %353, !dbg !55
376
+ %355 = fsub float %245, %349, !dbg !45
377
+ %356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49
378
+ %357 = fmul float %356, %355, !dbg !50
379
+ %358 = fadd float %349, %357, !dbg !51
380
+ %359 = fadd float %293, %354, !dbg !52
381
+ %360 = fmul float %355, %355, !dbg !53
382
+ %361 = fmul float %360, 7.000000e+00, !dbg !56
383
+ %362 = fmul float %356, %361, !dbg !54
384
+ %363 = fadd float %359, %362, !dbg !55
385
+ %364 = fsub float %247, %246, !dbg !45
386
+ %365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
387
+ %366 = fmul float %364, %365, !dbg !50
388
+ %367 = fadd float %246, %366, !dbg !51
389
+ %368 = fadd float %294, %295, !dbg !52
390
+ %369 = fmul float %364, %364, !dbg !53
391
+ %370 = fmul float %369, %365, !dbg !54
392
+ %371 = fadd float %368, %370, !dbg !55
393
+ %372 = fsub float %248, %367, !dbg !45
394
+ %373 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
395
+ %374 = fmul float %373, %372, !dbg !50
396
+ %375 = fadd float %367, %374, !dbg !51
397
+ %376 = fadd float %296, %371, !dbg !52
398
+ %377 = fmul float %372, %372, !dbg !53
399
+ %378 = fmul float %377, 2.000000e+00, !dbg !56
400
+ %379 = fmul float %373, %378, !dbg !54
401
+ %380 = fadd float %376, %379, !dbg !55
402
+ %381 = fsub float %249, %375, !dbg !45
403
+ %382 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
404
+ %383 = fmul float %382, %381, !dbg !50
405
+ %384 = fadd float %375, %383, !dbg !51
406
+ %385 = fadd float %297, %380, !dbg !52
407
+ %386 = fmul float %381, %381, !dbg !53
408
+ %387 = fmul float %386, 3.000000e+00, !dbg !56
409
+ %388 = fmul float %382, %387, !dbg !54
410
+ %389 = fadd float %385, %388, !dbg !55
411
+ %390 = fsub float %250, %384, !dbg !45
412
+ %391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49
413
+ %392 = fmul float %391, %390, !dbg !50
414
+ %393 = fadd float %384, %392, !dbg !51
415
+ %394 = fadd float %298, %389, !dbg !52
416
+ %395 = fmul float %390, %390, !dbg !53
417
+ %396 = fmul float %395, 4.000000e+00, !dbg !56
418
+ %397 = fmul float %391, %396, !dbg !54
419
+ %398 = fadd float %394, %397, !dbg !55
420
+ %399 = fsub float %251, %393, !dbg !45
421
+ %400 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49
422
+ %401 = fmul float %400, %399, !dbg !50
423
+ %402 = fadd float %393, %401, !dbg !51
424
+ %403 = fadd float %299, %398, !dbg !52
425
+ %404 = fmul float %399, %399, !dbg !53
426
+ %405 = fmul float %404, 5.000000e+00, !dbg !56
427
+ %406 = fmul float %400, %405, !dbg !54
428
+ %407 = fadd float %403, %406, !dbg !55
429
+ %408 = fsub float %252, %402, !dbg !45
430
+ %409 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49
431
+ %410 = fmul float %409, %408, !dbg !50
432
+ %411 = fadd float %402, %410, !dbg !51
433
+ %412 = fadd float %300, %407, !dbg !52
434
+ %413 = fmul float %408, %408, !dbg !53
435
+ %414 = fmul float %413, 6.000000e+00, !dbg !56
436
+ %415 = fmul float %409, %414, !dbg !54
437
+ %416 = fadd float %412, %415, !dbg !55
438
+ %417 = fsub float %253, %411, !dbg !45
439
+ %418 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49
440
+ %419 = fmul float %418, %417, !dbg !50
441
+ %420 = fadd float %411, %419, !dbg !51
442
+ %421 = fadd float %301, %416, !dbg !52
443
+ %422 = fmul float %417, %417, !dbg !53
444
+ %423 = fmul float %422, 7.000000e+00, !dbg !56
445
+ %424 = fmul float %418, %423, !dbg !54
446
+ %425 = fadd float %421, %424, !dbg !55
447
+ %426 = bitcast float %358 to i32, !dbg !57
448
+ %427 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %426, i32 16, i32 31), !dbg !57
449
+ %428 = bitcast i32 %427 to float, !dbg !57
450
+ %429 = bitcast float %363 to i32, !dbg !57
451
+ %430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 16, i32 31), !dbg !57
452
+ %431 = bitcast i32 %430 to float, !dbg !57
453
+ %432 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57
454
+ %433 = bitcast i32 %432 to float, !dbg !57
455
+ %434 = fsub float %428, %358, !dbg !45
456
+ %435 = fadd float %433, 8.000000e+00, !dbg !59
457
+ %436 = fcmp oeq float %435, 0.000000e+00, !dbg !60
458
+ %437 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %433, float %435) #6, !dbg !49
459
+ %438 = select i1 %436, float 0.000000e+00, float %437, !dbg !61
460
+ %439 = fmul float %438, %434, !dbg !50
461
+ %440 = fadd float %358, %439, !dbg !51
462
+ %441 = fadd float %363, %431, !dbg !52
463
+ %442 = fmul float %434, %434, !dbg !53
464
+ %443 = fmul float %442, 8.000000e+00, !dbg !56
465
+ %444 = fmul float %438, %443, !dbg !54
466
+ %445 = fadd float %441, %444, !dbg !55
467
+ %446 = bitcast float %440 to i32, !dbg !57
468
+ %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 8, i32 31), !dbg !57
469
+ %448 = bitcast i32 %447 to float, !dbg !57
470
+ %449 = bitcast float %445 to i32, !dbg !57
471
+ %450 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 8, i32 31), !dbg !57
472
+ %451 = bitcast i32 %450 to float, !dbg !57
473
+ %452 = bitcast float %435 to i32, !dbg !57
474
+ %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 8, i32 31), !dbg !57
475
+ %454 = bitcast i32 %453 to float, !dbg !57
476
+ %455 = fsub float %448, %440, !dbg !45
477
+ %456 = fadd float %435, %454, !dbg !59
478
+ %457 = fcmp oeq float %456, 0.000000e+00, !dbg !60
479
+ %458 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %454, float %456) #6, !dbg !49
480
+ %459 = select i1 %457, float 0.000000e+00, float %458, !dbg !61
481
+ %460 = fmul float %459, %455, !dbg !50
482
+ %461 = fadd float %440, %460, !dbg !51
483
+ %462 = fadd float %445, %451, !dbg !52
484
+ %463 = fmul float %455, %455, !dbg !53
485
+ %464 = fmul float %435, %463, !dbg !56
486
+ %465 = fmul float %459, %464, !dbg !54
487
+ %466 = fadd float %462, %465, !dbg !55
488
+ %467 = bitcast float %461 to i32, !dbg !57
489
+ %468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %467, i32 4, i32 31), !dbg !57
490
+ %469 = bitcast i32 %468 to float, !dbg !57
491
+ %470 = bitcast float %466 to i32, !dbg !57
492
+ %471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 4, i32 31), !dbg !57
493
+ %472 = bitcast i32 %471 to float, !dbg !57
494
+ %473 = bitcast float %456 to i32, !dbg !57
495
+ %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 4, i32 31), !dbg !57
496
+ %475 = bitcast i32 %474 to float, !dbg !57
497
+ %476 = fsub float %469, %461, !dbg !45
498
+ %477 = fadd float %456, %475, !dbg !59
499
+ %478 = fcmp oeq float %477, 0.000000e+00, !dbg !60
500
+ %479 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %475, float %477) #6, !dbg !49
501
+ %480 = select i1 %478, float 0.000000e+00, float %479, !dbg !61
502
+ %481 = fmul float %480, %476, !dbg !50
503
+ %482 = fadd float %461, %481, !dbg !51
504
+ %483 = fadd float %466, %472, !dbg !52
505
+ %484 = fmul float %476, %476, !dbg !53
506
+ %485 = fmul float %456, %484, !dbg !56
507
+ %486 = fmul float %480, %485, !dbg !54
508
+ %487 = fadd float %483, %486, !dbg !55
509
+ %488 = bitcast float %482 to i32, !dbg !57
510
+ %489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 2, i32 31), !dbg !57
511
+ %490 = bitcast i32 %489 to float, !dbg !57
512
+ %491 = bitcast float %487 to i32, !dbg !57
513
+ %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 2, i32 31), !dbg !57
514
+ %493 = bitcast i32 %492 to float, !dbg !57
515
+ %494 = bitcast float %477 to i32, !dbg !57
516
+ %495 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 2, i32 31), !dbg !57
517
+ %496 = bitcast i32 %495 to float, !dbg !57
518
+ %497 = fsub float %490, %482, !dbg !45
519
+ %498 = fadd float %477, %496, !dbg !59
520
+ %499 = fcmp oeq float %498, 0.000000e+00, !dbg !60
521
+ %500 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %496, float %498) #6, !dbg !49
522
+ %501 = select i1 %499, float 0.000000e+00, float %500, !dbg !61
523
+ %502 = fmul float %497, %501, !dbg !50
524
+ %503 = fadd float %482, %502, !dbg !51
525
+ %504 = fadd float %487, %493, !dbg !52
526
+ %505 = fmul float %497, %497, !dbg !53
527
+ %506 = fmul float %477, %505, !dbg !56
528
+ %507 = fmul float %501, %506, !dbg !54
529
+ %508 = fadd float %504, %507, !dbg !55
530
+ %509 = bitcast float %503 to i32, !dbg !57
531
+ %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 1, i32 31), !dbg !57
532
+ %511 = bitcast float %508 to i32, !dbg !57
533
+ %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 1, i32 31), !dbg !57
534
+ %513 = bitcast float %498 to i32, !dbg !57
535
+ %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 1, i32 31), !dbg !57
536
+ %515 = bitcast i32 %514 to float, !dbg !57
537
+ %516 = fadd float %498, %515, !dbg !59
538
+ %517 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %515, float %516) #6, !dbg !49
539
+ %518 = bitcast float %420 to i32, !dbg !57
540
+ %519 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %518, i32 16, i32 31), !dbg !57
541
+ %520 = bitcast i32 %519 to float, !dbg !57
542
+ %521 = bitcast float %425 to i32, !dbg !57
543
+ %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 16, i32 31), !dbg !57
544
+ %523 = bitcast i32 %522 to float, !dbg !57
545
+ %524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57
546
+ %525 = bitcast i32 %524 to float, !dbg !57
547
+ %526 = fsub float %520, %420, !dbg !45
548
+ %527 = fadd float %525, 8.000000e+00, !dbg !59
549
+ %528 = fcmp oeq float %527, 0.000000e+00, !dbg !60
550
+ %529 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %525, float %527) #6, !dbg !49
551
+ %530 = select i1 %528, float 0.000000e+00, float %529, !dbg !61
552
+ %531 = fmul float %526, %530, !dbg !50
553
+ %532 = fadd float %420, %531, !dbg !51
554
+ %533 = fadd float %425, %523, !dbg !52
555
+ %534 = fmul float %526, %526, !dbg !53
556
+ %535 = fmul float %534, 8.000000e+00, !dbg !56
557
+ %536 = fmul float %535, %530, !dbg !54
558
+ %537 = fadd float %533, %536, !dbg !55
559
+ %538 = bitcast float %532 to i32, !dbg !57
560
+ %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 8, i32 31), !dbg !57
561
+ %540 = bitcast i32 %539 to float, !dbg !57
562
+ %541 = bitcast float %537 to i32, !dbg !57
563
+ %542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 8, i32 31), !dbg !57
564
+ %543 = bitcast i32 %542 to float, !dbg !57
565
+ %544 = bitcast float %527 to i32, !dbg !57
566
+ %545 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %544, i32 8, i32 31), !dbg !57
567
+ %546 = bitcast i32 %545 to float, !dbg !57
568
+ %547 = fsub float %540, %532, !dbg !45
569
+ %548 = fadd float %527, %546, !dbg !59
570
+ %549 = fcmp oeq float %548, 0.000000e+00, !dbg !60
571
+ %550 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %546, float %548) #6, !dbg !49
572
+ %551 = select i1 %549, float 0.000000e+00, float %550, !dbg !61
573
+ %552 = fmul float %547, %551, !dbg !50
574
+ %553 = fadd float %532, %552, !dbg !51
575
+ %554 = fadd float %537, %543, !dbg !52
576
+ %555 = fmul float %547, %547, !dbg !53
577
+ %556 = fmul float %527, %555, !dbg !56
578
+ %557 = fmul float %551, %556, !dbg !54
579
+ %558 = fadd float %554, %557, !dbg !55
580
+ %559 = bitcast float %553 to i32, !dbg !57
581
+ %560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %559, i32 4, i32 31), !dbg !57
582
+ %561 = bitcast i32 %560 to float, !dbg !57
583
+ %562 = bitcast float %558 to i32, !dbg !57
584
+ %563 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %562, i32 4, i32 31), !dbg !57
585
+ %564 = bitcast i32 %563 to float, !dbg !57
586
+ %565 = bitcast float %548 to i32, !dbg !57
587
+ %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 4, i32 31), !dbg !57
588
+ %567 = bitcast i32 %566 to float, !dbg !57
589
+ %568 = fsub float %561, %553, !dbg !45
590
+ %569 = fadd float %548, %567, !dbg !59
591
+ %570 = fcmp oeq float %569, 0.000000e+00, !dbg !60
592
+ %571 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %567, float %569) #6, !dbg !49
593
+ %572 = select i1 %570, float 0.000000e+00, float %571, !dbg !61
594
+ %573 = fmul float %568, %572, !dbg !50
595
+ %574 = fadd float %553, %573, !dbg !51
596
+ %575 = fadd float %558, %564, !dbg !52
597
+ %576 = fmul float %568, %568, !dbg !53
598
+ %577 = fmul float %548, %576, !dbg !56
599
+ %578 = fmul float %572, %577, !dbg !54
600
+ %579 = fadd float %575, %578, !dbg !55
601
+ %580 = bitcast float %574 to i32, !dbg !57
602
+ %581 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %580, i32 2, i32 31), !dbg !57
603
+ %582 = bitcast i32 %581 to float, !dbg !57
604
+ %583 = bitcast float %579 to i32, !dbg !57
605
+ %584 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %583, i32 2, i32 31), !dbg !57
606
+ %585 = bitcast i32 %584 to float, !dbg !57
607
+ %586 = bitcast float %569 to i32, !dbg !57
608
+ %587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 2, i32 31), !dbg !57
609
+ %588 = bitcast i32 %587 to float, !dbg !57
610
+ %589 = fsub float %582, %574, !dbg !45
611
+ %590 = fadd float %569, %588, !dbg !59
612
+ %591 = fcmp oeq float %590, 0.000000e+00, !dbg !60
613
+ %592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %588, float %590) #6, !dbg !49
614
+ %593 = select i1 %591, float 0.000000e+00, float %592, !dbg !61
615
+ %594 = fmul float %589, %593, !dbg !50
616
+ %595 = fadd float %574, %594, !dbg !51
617
+ %596 = fadd float %579, %585, !dbg !52
618
+ %597 = fmul float %589, %589, !dbg !53
619
+ %598 = fmul float %569, %597, !dbg !56
620
+ %599 = fmul float %593, %598, !dbg !54
621
+ %600 = fadd float %596, %599, !dbg !55
622
+ %601 = bitcast float %595 to i32, !dbg !57
623
+ %602 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !57
624
+ %603 = bitcast float %600 to i32, !dbg !57
625
+ %604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %603, i32 1, i32 31), !dbg !57
626
+ %605 = bitcast float %590 to i32, !dbg !57
627
+ %606 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %605, i32 1, i32 31), !dbg !57
628
+ %607 = bitcast i32 %606 to float, !dbg !57
629
+ %608 = fadd float %590, %607, !dbg !59
630
+ %609 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %607, float %608) #6, !dbg !49
631
+ %610 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
632
+ %611 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
633
+ %612 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
634
+ %613 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
635
+ %614 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
636
+ %615 = extractvalue { i32, i32, i32, i32 } %614, 0, !dbg !63
637
+ %616 = extractvalue { i32, i32, i32, i32 } %614, 1, !dbg !63
638
+ %617 = extractvalue { i32, i32, i32, i32 } %614, 2, !dbg !63
639
+ %618 = extractvalue { i32, i32, i32, i32 } %614, 3, !dbg !63
640
+ %619 = trunc i32 %615 to i16, !dbg !63
641
+ %extelt.offset8 = lshr i32 %615, 16, !dbg !63
642
+ %620 = trunc i32 %extelt.offset8 to i16, !dbg !63
643
+ %621 = trunc i32 %616 to i16, !dbg !63
644
+ %extelt.offset9 = lshr i32 %616, 16, !dbg !63
645
+ %622 = trunc i32 %extelt.offset9 to i16, !dbg !63
646
+ %623 = trunc i32 %617 to i16, !dbg !63
647
+ %extelt.offset10 = lshr i32 %617, 16, !dbg !63
648
+ %624 = trunc i32 %extelt.offset10 to i16, !dbg !63
649
+ %625 = trunc i32 %618 to i16, !dbg !63
650
+ %extelt.offset11 = lshr i32 %618, 16, !dbg !63
651
+ %626 = trunc i32 %extelt.offset11 to i16, !dbg !63
652
+ %627 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
653
+ %628 = extractvalue { i32, i32, i32, i32 } %627, 0, !dbg !63
654
+ %629 = extractvalue { i32, i32, i32, i32 } %627, 1, !dbg !63
655
+ %630 = extractvalue { i32, i32, i32, i32 } %627, 2, !dbg !63
656
+ %631 = extractvalue { i32, i32, i32, i32 } %627, 3, !dbg !63
657
+ %632 = trunc i32 %628 to i16, !dbg !63
658
+ %extelt.offset12 = lshr i32 %628, 16, !dbg !63
659
+ %633 = trunc i32 %extelt.offset12 to i16, !dbg !63
660
+ %634 = trunc i32 %629 to i16, !dbg !63
661
+ %extelt.offset13 = lshr i32 %629, 16, !dbg !63
662
+ %635 = trunc i32 %extelt.offset13 to i16, !dbg !63
663
+ %636 = trunc i32 %630 to i16, !dbg !63
664
+ %extelt.offset14 = lshr i32 %630, 16, !dbg !63
665
+ %637 = trunc i32 %extelt.offset14 to i16, !dbg !63
666
+ %638 = trunc i32 %631 to i16, !dbg !63
667
+ %extelt.offset15 = lshr i32 %631, 16, !dbg !63
668
+ %639 = trunc i32 %extelt.offset15 to i16, !dbg !63
669
+ %640 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %619) #6, !dbg !64
670
+ %641 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %620) #6, !dbg !64
671
+ %642 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %621) #6, !dbg !64
672
+ %643 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %622) #6, !dbg !64
673
+ %644 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %623) #6, !dbg !64
674
+ %645 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %624) #6, !dbg !64
675
+ %646 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %625) #6, !dbg !64
676
+ %647 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %626) #6, !dbg !64
677
+ %648 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %632) #6, !dbg !64
678
+ %649 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %633) #6, !dbg !64
679
+ %650 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %634) #6, !dbg !64
680
+ %651 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %635) #6, !dbg !64
681
+ %652 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %636) #6, !dbg !64
682
+ %653 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %637) #6, !dbg !64
683
+ %654 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %638) #6, !dbg !64
684
+ %655 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %639) #6, !dbg !64
685
+ %656 = zext nneg i32 %urem to i64, !dbg !65
686
+ %657 = getelementptr float, ptr addrspace(1) %4, i64 %656, !dbg !65
687
+ %658 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %657, i1 true, i32 0, i1 true) #6, !dbg !66
688
+ br i1 %151, label %659, label %660, !dbg !67
689
+
690
+ 659: ; preds = %153
691
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67
692
+ br label %660, !dbg !67
693
+
694
+ 660: ; preds = %659, %153
695
+ %661 = bitcast i32 %604 to float, !dbg !57
696
+ %662 = fadd float %600, %661, !dbg !52
697
+ %663 = bitcast i32 %602 to float, !dbg !57
698
+ %664 = fsub float %663, %595, !dbg !45
699
+ %665 = fmul float %664, %664, !dbg !53
700
+ %666 = fmul float %590, %665, !dbg !56
701
+ %667 = fcmp oeq float %608, 0.000000e+00, !dbg !60
702
+ %668 = select i1 %667, float 0.000000e+00, float %609, !dbg !61
703
+ %669 = fmul float %668, %666, !dbg !54
704
+ %670 = fadd float %662, %669, !dbg !55
705
+ %671 = bitcast i32 %512 to float, !dbg !57
706
+ %672 = fadd float %508, %671, !dbg !52
707
+ %673 = bitcast i32 %510 to float, !dbg !57
708
+ %674 = fsub float %673, %503, !dbg !45
709
+ %675 = fmul float %674, %674, !dbg !53
710
+ %676 = fmul float %498, %675, !dbg !56
711
+ %677 = fcmp oeq float %516, 0.000000e+00, !dbg !60
712
+ %678 = select i1 %677, float 0.000000e+00, float %517, !dbg !61
713
+ %679 = fmul float %678, %676, !dbg !54
714
+ %680 = fadd float %672, %679, !dbg !55
715
+ %681 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
716
+ %682 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
717
+ %683 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
718
+ %684 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
719
+ %685 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
720
+ %686 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
721
+ %687 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
722
+ %688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
723
+ %689 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
724
+ %690 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
725
+ %691 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
726
+ %692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69
727
+ %693 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
728
+ %694 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
729
+ %695 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
730
+ %696 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
731
+ %697 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
732
+ %698 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
733
+ %699 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
734
+ %700 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69
735
+ %701 = fadd float %685, 0x3EE4F8B580000000, !dbg !70
736
+ %702 = fadd float %693, 0x3EE4F8B580000000, !dbg !70
737
+ %703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
738
+ %.not.i = icmp eq i32 %703, 0, !dbg !71
739
+ br i1 %.not.i, label %706, label %704, !dbg !71
740
+
741
+ 704: ; preds = %660
742
+ %705 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %701), !dbg !71
743
+ br label %__nv_rsqrtf.exit, !dbg !71
744
+
745
+ 706: ; preds = %660
746
+ %707 = tail call float @llvm.nvvm.rsqrt.approx.f(float %701), !dbg !71
747
+ br label %__nv_rsqrtf.exit, !dbg !71
748
+
749
+ __nv_rsqrtf.exit: ; preds = %704, %706
750
+ %.0.i = phi float [ %705, %704 ], [ %707, %706 ], !dbg !71
751
+ %708 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
752
+ %709 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
753
+ %710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
754
+ %711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
755
+ %712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
756
+ %713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
757
+ %714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
758
+ %715 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
759
+ %.not.i37 = icmp eq i32 %715, 0, !dbg !71
760
+ br i1 %.not.i37, label %718, label %716, !dbg !71
761
+
762
+ 716: ; preds = %__nv_rsqrtf.exit
763
+ %717 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %702), !dbg !71
764
+ br label %__nv_rsqrtf.exit39, !dbg !71
765
+
766
+ 718: ; preds = %__nv_rsqrtf.exit
767
+ %719 = tail call float @llvm.nvvm.rsqrt.approx.f(float %702), !dbg !71
768
+ br label %__nv_rsqrtf.exit39, !dbg !71
769
+
770
+ __nv_rsqrtf.exit39: ; preds = %716, %718
771
+ %.0.i38 = phi float [ %717, %716 ], [ %719, %718 ], !dbg !71
772
+ %720 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
773
+ %721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
774
+ %722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
775
+ %723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
776
+ %724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
777
+ %725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
778
+ %726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
779
+ %727 = extractvalue { i32, i32, i32, i32 } %684, 3, !dbg !68
780
+ %728 = bitcast i32 %727 to float, !dbg !68
781
+ %729 = extractvalue { i32, i32, i32, i32 } %613, 3, !dbg !62
782
+ %730 = bitcast i32 %729 to float, !dbg !62
783
+ %731 = fadd float %730, %728, !dbg !72
784
+ %732 = fadd float %655, %731, !dbg !73
785
+ %733 = fmul float %664, %668, !dbg !50
786
+ %734 = fadd float %595, %733, !dbg !51
787
+ %735 = fsub float %732, %734, !dbg !74
788
+ %736 = extractvalue { i32, i32, i32, i32 } %684, 2, !dbg !68
789
+ %737 = bitcast i32 %736 to float, !dbg !68
790
+ %738 = extractvalue { i32, i32, i32, i32 } %613, 2, !dbg !62
791
+ %739 = bitcast i32 %738 to float, !dbg !62
792
+ %740 = fadd float %739, %737, !dbg !72
793
+ %741 = fadd float %654, %740, !dbg !73
794
+ %742 = fsub float %741, %734, !dbg !74
795
+ %743 = extractvalue { i32, i32, i32, i32 } %684, 1, !dbg !68
796
+ %744 = bitcast i32 %743 to float, !dbg !68
797
+ %745 = extractvalue { i32, i32, i32, i32 } %613, 1, !dbg !62
798
+ %746 = bitcast i32 %745 to float, !dbg !62
799
+ %747 = fadd float %746, %744, !dbg !72
800
+ %748 = fadd float %653, %747, !dbg !73
801
+ %749 = fsub float %748, %734, !dbg !74
802
+ %750 = extractvalue { i32, i32, i32, i32 } %684, 0, !dbg !68
803
+ %751 = bitcast i32 %750 to float, !dbg !68
804
+ %752 = extractvalue { i32, i32, i32, i32 } %613, 0, !dbg !62
805
+ %753 = bitcast i32 %752 to float, !dbg !62
806
+ %754 = fadd float %753, %751, !dbg !72
807
+ %755 = fadd float %652, %754, !dbg !73
808
+ %756 = fsub float %755, %734, !dbg !74
809
+ %757 = extractvalue { i32, i32, i32, i32 } %683, 3, !dbg !68
810
+ %758 = bitcast i32 %757 to float, !dbg !68
811
+ %759 = extractvalue { i32, i32, i32, i32 } %612, 3, !dbg !62
812
+ %760 = bitcast i32 %759 to float, !dbg !62
813
+ %761 = fadd float %760, %758, !dbg !72
814
+ %762 = fadd float %651, %761, !dbg !73
815
+ %763 = fsub float %762, %734, !dbg !74
816
+ %764 = extractvalue { i32, i32, i32, i32 } %683, 2, !dbg !68
817
+ %765 = bitcast i32 %764 to float, !dbg !68
818
+ %766 = extractvalue { i32, i32, i32, i32 } %612, 2, !dbg !62
819
+ %767 = bitcast i32 %766 to float, !dbg !62
820
+ %768 = fadd float %767, %765, !dbg !72
821
+ %769 = fadd float %650, %768, !dbg !73
822
+ %770 = fsub float %769, %734, !dbg !74
823
+ %771 = extractvalue { i32, i32, i32, i32 } %683, 1, !dbg !68
824
+ %772 = bitcast i32 %771 to float, !dbg !68
825
+ %773 = extractvalue { i32, i32, i32, i32 } %612, 1, !dbg !62
826
+ %774 = bitcast i32 %773 to float, !dbg !62
827
+ %775 = fadd float %774, %772, !dbg !72
828
+ %776 = fadd float %649, %775, !dbg !73
829
+ %777 = fsub float %776, %734, !dbg !74
830
+ %778 = extractvalue { i32, i32, i32, i32 } %683, 0, !dbg !68
831
+ %779 = bitcast i32 %778 to float, !dbg !68
832
+ %780 = extractvalue { i32, i32, i32, i32 } %612, 0, !dbg !62
833
+ %781 = bitcast i32 %780 to float, !dbg !62
834
+ %782 = fadd float %781, %779, !dbg !72
835
+ %783 = fadd float %648, %782, !dbg !73
836
+ %784 = fsub float %783, %734, !dbg !74
837
+ %785 = extractvalue { i32, i32, i32, i32 } %682, 3, !dbg !68
838
+ %786 = bitcast i32 %785 to float, !dbg !68
839
+ %787 = extractvalue { i32, i32, i32, i32 } %611, 3, !dbg !62
840
+ %788 = bitcast i32 %787 to float, !dbg !62
841
+ %789 = fadd float %788, %786, !dbg !72
842
+ %790 = fadd float %647, %789, !dbg !73
843
+ %791 = fmul float %674, %678, !dbg !50
844
+ %792 = fadd float %503, %791, !dbg !51
845
+ %793 = fsub float %790, %792, !dbg !74
846
+ %794 = extractvalue { i32, i32, i32, i32 } %682, 2, !dbg !68
847
+ %795 = bitcast i32 %794 to float, !dbg !68
848
+ %796 = extractvalue { i32, i32, i32, i32 } %611, 2, !dbg !62
849
+ %797 = bitcast i32 %796 to float, !dbg !62
850
+ %798 = fadd float %797, %795, !dbg !72
851
+ %799 = fadd float %646, %798, !dbg !73
852
+ %800 = fsub float %799, %792, !dbg !74
853
+ %801 = extractvalue { i32, i32, i32, i32 } %682, 1, !dbg !68
854
+ %802 = bitcast i32 %801 to float, !dbg !68
855
+ %803 = extractvalue { i32, i32, i32, i32 } %611, 1, !dbg !62
856
+ %804 = bitcast i32 %803 to float, !dbg !62
857
+ %805 = fadd float %804, %802, !dbg !72
858
+ %806 = fadd float %645, %805, !dbg !73
859
+ %807 = fsub float %806, %792, !dbg !74
860
+ %808 = extractvalue { i32, i32, i32, i32 } %682, 0, !dbg !68
861
+ %809 = bitcast i32 %808 to float, !dbg !68
862
+ %810 = extractvalue { i32, i32, i32, i32 } %611, 0, !dbg !62
863
+ %811 = bitcast i32 %810 to float, !dbg !62
864
+ %812 = fadd float %811, %809, !dbg !72
865
+ %813 = fadd float %644, %812, !dbg !73
866
+ %814 = fsub float %813, %792, !dbg !74
867
+ %815 = extractvalue { i32, i32, i32, i32 } %681, 3, !dbg !68
868
+ %816 = bitcast i32 %815 to float, !dbg !68
869
+ %817 = extractvalue { i32, i32, i32, i32 } %610, 3, !dbg !62
870
+ %818 = bitcast i32 %817 to float, !dbg !62
871
+ %819 = fadd float %818, %816, !dbg !72
872
+ %820 = fadd float %643, %819, !dbg !73
873
+ %821 = fsub float %820, %792, !dbg !74
874
+ %822 = extractvalue { i32, i32, i32, i32 } %681, 2, !dbg !68
875
+ %823 = bitcast i32 %822 to float, !dbg !68
876
+ %824 = extractvalue { i32, i32, i32, i32 } %610, 2, !dbg !62
877
+ %825 = bitcast i32 %824 to float, !dbg !62
878
+ %826 = fadd float %825, %823, !dbg !72
879
+ %827 = fadd float %642, %826, !dbg !73
880
+ %828 = fsub float %827, %792, !dbg !74
881
+ %829 = extractvalue { i32, i32, i32, i32 } %681, 1, !dbg !68
882
+ %830 = bitcast i32 %829 to float, !dbg !68
883
+ %831 = extractvalue { i32, i32, i32, i32 } %610, 1, !dbg !62
884
+ %832 = bitcast i32 %831 to float, !dbg !62
885
+ %833 = fadd float %832, %830, !dbg !72
886
+ %834 = fadd float %641, %833, !dbg !73
887
+ %835 = fsub float %834, %792, !dbg !74
888
+ %836 = extractvalue { i32, i32, i32, i32 } %681, 0, !dbg !68
889
+ %837 = bitcast i32 %836 to float, !dbg !68
890
+ %838 = extractvalue { i32, i32, i32, i32 } %610, 0, !dbg !62
891
+ %839 = bitcast i32 %838 to float, !dbg !62
892
+ %840 = fadd float %839, %837, !dbg !72
893
+ %841 = fadd float %640, %840, !dbg !73
894
+ %842 = fsub float %841, %792, !dbg !74
895
+ %843 = fmul float %842, %.0.i, !dbg !75
896
+ %844 = fmul float %835, %.0.i, !dbg !75
897
+ %845 = fmul float %828, %.0.i, !dbg !75
898
+ %846 = fmul float %821, %.0.i, !dbg !75
899
+ %847 = fmul float %814, %.0.i, !dbg !75
900
+ %848 = fmul float %807, %.0.i, !dbg !75
901
+ %849 = fmul float %800, %.0.i, !dbg !75
902
+ %850 = fmul float %793, %.0.i, !dbg !75
903
+ %851 = fmul float %784, %.0.i38, !dbg !75
904
+ %852 = fmul float %777, %.0.i38, !dbg !75
905
+ %853 = fmul float %770, %.0.i38, !dbg !75
906
+ %854 = fmul float %763, %.0.i38, !dbg !75
907
+ %855 = fmul float %756, %.0.i38, !dbg !75
908
+ %856 = fmul float %749, %.0.i38, !dbg !75
909
+ %857 = fmul float %742, %.0.i38, !dbg !75
910
+ %858 = fmul float %735, %.0.i38, !dbg !75
911
+ %859 = getelementptr float, ptr addrspace(3) @global_smem, i64 %656, !dbg !76
912
+ store i32 %658, ptr addrspace(3) %859, align 4, !dbg !76
913
+ tail call void @llvm.nvvm.barrier0(), !dbg !76
914
+ %860 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !76
915
+ %861 = load float, ptr addrspace(3) %860, align 32, !dbg !76
916
+ %862 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 1, !dbg !76
917
+ %863 = load float, ptr addrspace(3) %862, align 4, !dbg !76
918
+ %864 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 2, !dbg !76
919
+ %865 = load float, ptr addrspace(3) %864, align 8, !dbg !76
920
+ %866 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 3, !dbg !76
921
+ %867 = load float, ptr addrspace(3) %866, align 4, !dbg !76
922
+ %868 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 4, !dbg !76
923
+ %869 = load float, ptr addrspace(3) %868, align 16, !dbg !76
924
+ %870 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 5, !dbg !76
925
+ %871 = load float, ptr addrspace(3) %870, align 4, !dbg !76
926
+ %872 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 6, !dbg !76
927
+ %873 = load float, ptr addrspace(3) %872, align 8, !dbg !76
928
+ %874 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 7, !dbg !76
929
+ %875 = load float, ptr addrspace(3) %874, align 4, !dbg !76
930
+ %876 = fmul float %843, %861, !dbg !76
931
+ %877 = fmul float %844, %863, !dbg !76
932
+ %878 = fmul float %845, %865, !dbg !76
933
+ %879 = fmul float %846, %867, !dbg !76
934
+ %880 = fmul float %847, %869, !dbg !76
935
+ %881 = fmul float %848, %871, !dbg !76
936
+ %882 = fmul float %849, %873, !dbg !76
937
+ %883 = fmul float %850, %875, !dbg !76
938
+ %884 = fmul float %851, %861, !dbg !76
939
+ %885 = fmul float %852, %863, !dbg !76
940
+ %886 = fmul float %853, %865, !dbg !76
941
+ %887 = fmul float %854, %867, !dbg !76
942
+ %888 = fmul float %855, %869, !dbg !76
943
+ %889 = fmul float %856, %871, !dbg !76
944
+ %890 = fmul float %857, %873, !dbg !76
945
+ %891 = fmul float %858, %875, !dbg !76
946
+ %892 = getelementptr i16, ptr addrspace(1) %5, i64 %100, !dbg !77
947
+ %893 = getelementptr i16, ptr addrspace(1) %5, i64 %102, !dbg !77
948
+ %894 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %876) #6, !dbg !78
949
+ %895 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %877) #6, !dbg !78
950
+ %896 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %878) #6, !dbg !78
951
+ %897 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %879) #6, !dbg !78
952
+ %898 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %880) #6, !dbg !78
953
+ %899 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %881) #6, !dbg !78
954
+ %900 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %882) #6, !dbg !78
955
+ %901 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %883) #6, !dbg !78
956
+ %902 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %884) #6, !dbg !78
957
+ %903 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %885) #6, !dbg !78
958
+ %904 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %886) #6, !dbg !78
959
+ %905 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %887) #6, !dbg !78
960
+ %906 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %888) #6, !dbg !78
961
+ %907 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %889) #6, !dbg !78
962
+ %908 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %890) #6, !dbg !78
963
+ %909 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %891) #6, !dbg !78
964
+ %910 = insertelement <2 x i16> undef, i16 %894, i64 0, !dbg !78
965
+ %911 = insertelement <2 x i16> %910, i16 %895, i64 1, !dbg !78
966
+ %912 = bitcast <2 x i16> %911 to i32, !dbg !78
967
+ %913 = insertelement <2 x i16> undef, i16 %896, i64 0, !dbg !78
968
+ %914 = insertelement <2 x i16> %913, i16 %897, i64 1, !dbg !78
969
+ %915 = bitcast <2 x i16> %914 to i32, !dbg !78
970
+ %916 = insertelement <2 x i16> undef, i16 %898, i64 0, !dbg !78
971
+ %917 = insertelement <2 x i16> %916, i16 %899, i64 1, !dbg !78
972
+ %918 = bitcast <2 x i16> %917 to i32, !dbg !78
973
+ %919 = insertelement <2 x i16> undef, i16 %900, i64 0, !dbg !78
974
+ %920 = insertelement <2 x i16> %919, i16 %901, i64 1, !dbg !78
975
+ %921 = bitcast <2 x i16> %920 to i32, !dbg !78
976
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %912, i32 %915, i32 %918, i32 %921, ptr addrspace(1) %892, i1 true) #6, !dbg !78
977
+ %922 = insertelement <2 x i16> undef, i16 %902, i64 0, !dbg !78
978
+ %923 = insertelement <2 x i16> %922, i16 %903, i64 1, !dbg !78
979
+ %924 = bitcast <2 x i16> %923 to i32, !dbg !78
980
+ %925 = insertelement <2 x i16> undef, i16 %904, i64 0, !dbg !78
981
+ %926 = insertelement <2 x i16> %925, i16 %905, i64 1, !dbg !78
982
+ %927 = bitcast <2 x i16> %926 to i32, !dbg !78
983
+ %928 = insertelement <2 x i16> undef, i16 %906, i64 0, !dbg !78
984
+ %929 = insertelement <2 x i16> %928, i16 %907, i64 1, !dbg !78
985
+ %930 = bitcast <2 x i16> %929 to i32, !dbg !78
986
+ %931 = insertelement <2 x i16> undef, i16 %908, i64 0, !dbg !78
987
+ %932 = insertelement <2 x i16> %931, i16 %909, i64 1, !dbg !78
988
+ %933 = bitcast <2 x i16> %932 to i32, !dbg !78
989
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %924, i32 %927, i32 %930, i32 %933, ptr addrspace(1) %893, i1 true) #6, !dbg !78
990
+ ret void, !dbg !79
991
+ }
992
+
993
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
994
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
995
+
996
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
997
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
998
+
999
+ ; Function Attrs: convergent nocallback nounwind
1000
+ declare void @llvm.nvvm.barrier0() #2
1001
+
1002
+ ; Function Attrs: alwaysinline nounwind
1003
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
1004
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
1005
+ %.not = icmp eq i32 %1, 0
1006
+ br i1 %.not, label %4, label %2
1007
+
1008
+ 2: ; preds = %0
1009
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
1010
+ br label %6
1011
+
1012
+ 4: ; preds = %0
1013
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
1014
+ br label %6
1015
+
1016
+ 6: ; preds = %4, %2
1017
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
1018
+ ret float %.0
1019
+ }
1020
+
1021
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
1022
+
1023
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1024
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
1025
+
1026
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1027
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
1028
+
1029
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1030
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1031
+ attributes #2 = { convergent nocallback nounwind }
1032
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1033
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1034
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
1035
+ attributes #6 = { nounwind }
1036
+
1037
+ !llvm.module.flags = !{!0, !1}
1038
+ !llvm.dbg.cu = !{!2}
1039
+ !nvvm.annotations = !{!4, !5, !5, !4}
1040
+ !llvm.ident = !{!6}
1041
+
1042
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
1043
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
1044
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
1045
+ !3 = !DIFile(filename: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", directory: "/tmp/torchinductor_root/ci")
1046
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
1047
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
1048
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
1049
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
1050
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
1051
+ !9 = !{}
1052
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
1053
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
1054
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
1055
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
1056
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
1057
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
1058
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
1059
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
1060
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
1061
+ !19 = !DILocation(line: 35, column: 40, scope: !7)
1062
+ !20 = !DILocation(line: 35, column: 34, scope: !7)
1063
+ !21 = !DILocation(line: 35, column: 50, scope: !7)
1064
+ !22 = !DILocation(line: 36, column: 44, scope: !7)
1065
+ !23 = !DILocation(line: 36, column: 40, scope: !7)
1066
+ !24 = !DILocation(line: 36, column: 34, scope: !7)
1067
+ !25 = !DILocation(line: 36, column: 50, scope: !7)
1068
+ !26 = !DILocation(line: 36, column: 101, scope: !7)
1069
+ !27 = !DILocation(line: 37, column: 22, scope: !7)
1070
+ !28 = !DILocation(line: 38, column: 22, scope: !7)
1071
+ !29 = !DILocation(line: 39, column: 36, scope: !7)
1072
+ !30 = !DILocation(line: 40, column: 40, scope: !7)
1073
+ !31 = !DILocation(line: 40, column: 55, scope: !7)
1074
+ !32 = !DILocation(line: 41, column: 44, scope: !7)
1075
+ !33 = !DILocation(line: 41, column: 40, scope: !7)
1076
+ !34 = !DILocation(line: 41, column: 34, scope: !7)
1077
+ !35 = !DILocation(line: 41, column: 52, scope: !7)
1078
+ !36 = !DILocation(line: 42, column: 22, scope: !7)
1079
+ !37 = !DILocation(line: 44, column: 22, scope: !7)
1080
+ !38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
1081
+ !39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
1082
+ !40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
1083
+ !41 = !DILocation(line: 47, column: 41, scope: !39)
1084
+ !42 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
1085
+ !43 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
1086
+ !44 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41)
1087
+ !45 = !DILocation(line: 108, column: 21, scope: !46, inlinedAt: !47)
1088
+ !46 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
1089
+ !47 = !DILocation(line: 120, column: 46, scope: !46, inlinedAt: !48)
1090
+ !48 = !DILocation(line: 53, column: 44, scope: !46)
1091
+ !49 = !DILocation(line: 110, column: 60, scope: !46, inlinedAt: !47)
1092
+ !50 = !DILocation(line: 112, column: 25, scope: !46, inlinedAt: !47)
1093
+ !51 = !DILocation(line: 112, column: 17, scope: !46, inlinedAt: !47)
1094
+ !52 = !DILocation(line: 113, column: 15, scope: !46, inlinedAt: !47)
1095
+ !53 = !DILocation(line: 113, column: 30, scope: !46, inlinedAt: !47)
1096
+ !54 = !DILocation(line: 113, column: 49, scope: !46, inlinedAt: !47)
1097
+ !55 = !DILocation(line: 113, column: 22, scope: !46, inlinedAt: !47)
1098
+ !56 = !DILocation(line: 113, column: 38, scope: !46, inlinedAt: !47)
1099
+ !57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58)
1100
+ !58 = !DILocation(line: 53, column: 44, scope: !39)
1101
+ !59 = !DILocation(line: 109, column: 28, scope: !46, inlinedAt: !47)
1102
+ !60 = !DILocation(line: 110, column: 39, scope: !46, inlinedAt: !47)
1103
+ !61 = !DILocation(line: 110, column: 49, scope: !46, inlinedAt: !47)
1104
+ !62 = !DILocation(line: 62, column: 51, scope: !7)
1105
+ !63 = !DILocation(line: 63, column: 51, scope: !7)
1106
+ !64 = !DILocation(line: 63, column: 103, scope: !7)
1107
+ !65 = !DILocation(line: 64, column: 35, scope: !7)
1108
+ !66 = !DILocation(line: 64, column: 40, scope: !7)
1109
+ !67 = !DILocation(line: 68, column: 57, scope: !7)
1110
+ !68 = !DILocation(line: 69, column: 54, scope: !7)
1111
+ !69 = !DILocation(line: 75, column: 24, scope: !7)
1112
+ !70 = !DILocation(line: 77, column: 24, scope: !7)
1113
+ !71 = !DILocation(line: 78, column: 30, scope: !7)
1114
+ !72 = !DILocation(line: 70, column: 24, scope: !7)
1115
+ !73 = !DILocation(line: 72, column: 24, scope: !7)
1116
+ !74 = !DILocation(line: 73, column: 24, scope: !7)
1117
+ !75 = !DILocation(line: 79, column: 24, scope: !7)
1118
+ !76 = !DILocation(line: 80, column: 24, scope: !7)
1119
+ !77 = !DILocation(line: 82, column: 29, scope: !7)
1120
+ !78 = !DILocation(line: 82, column: 52, scope: !7)
1121
+ !79 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx ADDED
@@ -0,0 +1,1854 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 256, 1, 1
39
+ {
40
+ .reg .pred %p<137>;
41
+ .reg .b16 %rs<49>;
42
+ .reg .b32 %r<439>;
43
+ .reg .f32 %f<487>;
44
+ .reg .b64 %rd<124>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_4];
50
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_1];
51
+ ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6de7de_param_0];
52
+ $L__tmp0:
53
+ .loc 1 22 44
54
+ mov.u32 %r89, %tid.x;
55
+ ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6de7de_param_2];
56
+ bfe.u32 %r90, %r89, 5, 3;
57
+ ld.param.u64 %rd61, [triton__0d1d2d3d4d5d6de7de_param_3];
58
+ and.b32 %r91, %r89, 15;
59
+ .loc 1 24 33
60
+ shl.b32 %r92, %r89, 3;
61
+ and.b32 %r1, %r92, 248;
62
+ and.b32 %r2, %r89, 255;
63
+ .loc 1 21 28
64
+ mov.u32 %r24, %ctaid.x;
65
+ .loc 1 21 33
66
+ shl.b32 %r93, %r24, 4;
67
+ .loc 1 22 23
68
+ or.b32 %r94, %r93, %r90;
69
+ or.b32 %r95, %r94, 8;
70
+ or.b32 %r96, %r93, %r91;
71
+ .loc 1 26 30
72
+ mul.wide.s32 %rd62, %r94, 8;
73
+ add.s64 %rd20, %rd59, %rd62;
74
+ add.s64 %rd36, %rd20, 64;
75
+ mul.wide.s32 %rd63, %r96, 8;
76
+ add.s64 %rd52, %rd59, %rd63;
77
+ mov.pred %p113, -1;
78
+ .loc 1 26 35
79
+ mov.u64 %rd19, 0x0;
80
+ @%p113 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd20 + 0 ];
81
+ mov.u64 %rd21, 0x0;
82
+ @%p113 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd20 + 0 ];
83
+ mov.u64 %rd23, 0x0;
84
+ @%p113 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd20 + 0 ];
85
+ mov.u64 %rd25, 0x0;
86
+ @%p113 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd20 + 0 ];
87
+ mov.u64 %rd27, 0x0;
88
+ @%p113 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd20 + 0 ];
89
+ mov.u64 %rd29, 0x0;
90
+ @%p113 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd20 + 0 ];
91
+ mov.u64 %rd31, 0x0;
92
+ @%p113 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd20 + 0 ];
93
+ mov.u64 %rd33, 0x0;
94
+ @%p113 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd20 + 0 ];
95
+ mov.u64 %rd35, 0x0;
96
+ @%p113 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd36 + 0 ];
97
+ mov.u64 %rd37, 0x0;
98
+ @%p113 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd36 + 0 ];
99
+ mov.u64 %rd39, 0x0;
100
+ @%p113 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd36 + 0 ];
101
+ mov.u64 %rd41, 0x0;
102
+ @%p113 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd36 + 0 ];
103
+ mov.u64 %rd43, 0x0;
104
+ @%p113 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd36 + 0 ];
105
+ mov.u64 %rd45, 0x0;
106
+ @%p113 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd36 + 0 ];
107
+ mov.u64 %rd47, 0x0;
108
+ @%p113 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd36 + 0 ];
109
+ mov.u64 %rd49, 0x0;
110
+ @%p113 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd36 + 0 ];
111
+ mov.u64 %rd51, 0x0;
112
+ @%p113 ld.global.L1::evict_last.b64 { %rd51 }, [ %rd52 + 0 ];
113
+ .loc 1 27 18
114
+ bfe.s32 %r97, %r24, 27, 1;
115
+ shr.u32 %r98, %r97, 23;
116
+ add.s32 %r99, %r94, %r98;
117
+ and.b32 %r100, %r99, 16776704;
118
+ sub.s32 %r101, %r94, %r100;
119
+ add.s32 %r102, %r95, %r98;
120
+ and.b32 %r103, %r102, 16776704;
121
+ sub.s32 %r104, %r95, %r103;
122
+ .loc 1 35 44
123
+ shl.b32 %r105, %r101, 8;
124
+ shl.b32 %r106, %r104, 8;
125
+ .loc 1 35 40
126
+ or.b32 %r107, %r105, %r1;
127
+ or.b32 %r108, %r106, %r1;
128
+ .loc 1 35 34
129
+ mul.wide.s32 %rd64, %r107, 4;
130
+ add.s64 %rd89, %rd60, %rd64;
131
+ cvt.s64.s32 %rd65, %r105;
132
+ cvt.u64.u32 %rd66, %r1;
133
+ or.b64 %rd67, %rd65, %rd66;
134
+ shl.b64 %rd68, %rd67, 2;
135
+ add.s64 %rd69, %rd60, %rd68;
136
+ add.s64 %rd90, %rd69, 16;
137
+ mul.wide.s32 %rd70, %r108, 4;
138
+ add.s64 %rd91, %rd60, %rd70;
139
+ cvt.s64.s32 %rd71, %r106;
140
+ or.b64 %rd72, %rd71, %rd66;
141
+ shl.b64 %rd73, %rd72, 2;
142
+ add.s64 %rd74, %rd60, %rd73;
143
+ add.s64 %rd92, %rd74, 16;
144
+ mov.b32 %r325, 0;
145
+ .loc 1 35 50
146
+ mov.u32 %r25, 0x0;
147
+ mov.u32 %r26, 0x0;
148
+ mov.u32 %r27, 0x0;
149
+ mov.u32 %r28, 0x0;
150
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd89 + 0 ];
151
+ @!%p113 mov.u32 %r25, %r325;
152
+ @!%p113 mov.u32 %r26, %r325;
153
+ @!%p113 mov.u32 %r27, %r325;
154
+ @!%p113 mov.u32 %r28, %r325;
155
+ mov.b32 %f1, %r25;
156
+ mov.b32 %f2, %r26;
157
+ mov.b32 %f3, %r27;
158
+ mov.b32 %f4, %r28;
159
+ mov.u32 %r33, 0x0;
160
+ mov.u32 %r34, 0x0;
161
+ mov.u32 %r35, 0x0;
162
+ mov.u32 %r36, 0x0;
163
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd90 + 0 ];
164
+ @!%p113 mov.u32 %r33, %r325;
165
+ @!%p113 mov.u32 %r34, %r325;
166
+ @!%p113 mov.u32 %r35, %r325;
167
+ @!%p113 mov.u32 %r36, %r325;
168
+ mov.b32 %f5, %r33;
169
+ mov.b32 %f6, %r34;
170
+ mov.b32 %f7, %r35;
171
+ mov.b32 %f8, %r36;
172
+ mov.u32 %r41, 0x0;
173
+ mov.u32 %r42, 0x0;
174
+ mov.u32 %r43, 0x0;
175
+ mov.u32 %r44, 0x0;
176
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd91 + 0 ];
177
+ @!%p113 mov.u32 %r41, %r325;
178
+ @!%p113 mov.u32 %r42, %r325;
179
+ @!%p113 mov.u32 %r43, %r325;
180
+ @!%p113 mov.u32 %r44, %r325;
181
+ mov.b32 %f9, %r41;
182
+ mov.b32 %f10, %r42;
183
+ mov.b32 %f11, %r43;
184
+ mov.b32 %f12, %r44;
185
+ mov.u32 %r49, 0x0;
186
+ mov.u32 %r50, 0x0;
187
+ mov.u32 %r51, 0x0;
188
+ mov.u32 %r52, 0x0;
189
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r49, %r50, %r51, %r52 }, [ %rd92 + 0 ];
190
+ @!%p113 mov.u32 %r49, %r325;
191
+ @!%p113 mov.u32 %r50, %r325;
192
+ @!%p113 mov.u32 %r51, %r325;
193
+ @!%p113 mov.u32 %r52, %r325;
194
+ mov.b32 %f13, %r49;
195
+ mov.b32 %f14, %r50;
196
+ mov.b32 %f15, %r51;
197
+ mov.b32 %f16, %r52;
198
+ .loc 1 36 44
199
+ shl.b32 %r109, %r94, 8;
200
+ shl.b32 %r110, %r95, 8;
201
+ .loc 1 36 40
202
+ or.b32 %r111, %r109, %r1;
203
+ or.b32 %r112, %r110, %r1;
204
+ .loc 1 36 34
205
+ mul.wide.s32 %rd75, %r111, 2;
206
+ add.s64 %rd93, %rd61, %rd75;
207
+ mul.wide.s32 %rd76, %r112, 2;
208
+ add.s64 %rd94, %rd61, %rd76;
209
+ .loc 1 36 50
210
+ mov.u32 %r57, 0x0;
211
+ mov.u32 %r58, 0x0;
212
+ mov.u32 %r59, 0x0;
213
+ mov.u32 %r60, 0x0;
214
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r57, %r58, %r59, %r60 }, [ %rd93 + 0 ];
215
+ @!%p113 mov.u32 %r57, %r325;
216
+ @!%p113 mov.u32 %r58, %r325;
217
+ @!%p113 mov.u32 %r59, %r325;
218
+ @!%p113 mov.u32 %r60, %r325;
219
+ cvt.u16.u32 %rs1, %r57;
220
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r57; }
221
+ cvt.u16.u32 %rs3, %r58;
222
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r58; }
223
+ cvt.u16.u32 %rs5, %r59;
224
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r59; }
225
+ cvt.u16.u32 %rs7, %r60;
226
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r60; }
227
+ mov.u32 %r65, 0x0;
228
+ mov.u32 %r66, 0x0;
229
+ mov.u32 %r67, 0x0;
230
+ mov.u32 %r68, 0x0;
231
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd94 + 0 ];
232
+ @!%p113 mov.u32 %r65, %r325;
233
+ @!%p113 mov.u32 %r66, %r325;
234
+ @!%p113 mov.u32 %r67, %r325;
235
+ @!%p113 mov.u32 %r68, %r325;
236
+ cvt.u16.u32 %rs9, %r65;
237
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r65; }
238
+ cvt.u16.u32 %rs11, %r66;
239
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r66; }
240
+ cvt.u16.u32 %rs13, %r67;
241
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r67; }
242
+ cvt.u16.u32 %rs15, %r68;
243
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r68; }
244
+ .loc 1 36 101
245
+ cvt.f32.bf16 %r73, %rs1;
246
+ mov.b32 %f17, %r73;
247
+ cvt.f32.bf16 %r74, %rs2;
248
+ mov.b32 %f18, %r74;
249
+ cvt.f32.bf16 %r75, %rs3;
250
+ mov.b32 %f19, %r75;
251
+ cvt.f32.bf16 %r76, %rs4;
252
+ mov.b32 %f20, %r76;
253
+ cvt.f32.bf16 %r77, %rs5;
254
+ mov.b32 %f21, %r77;
255
+ cvt.f32.bf16 %r78, %rs6;
256
+ mov.b32 %f22, %r78;
257
+ cvt.f32.bf16 %r79, %rs7;
258
+ mov.b32 %f23, %r79;
259
+ cvt.f32.bf16 %r80, %rs8;
260
+ mov.b32 %f24, %r80;
261
+ cvt.f32.bf16 %r81, %rs9;
262
+ mov.b32 %f25, %r81;
263
+ cvt.f32.bf16 %r82, %rs10;
264
+ mov.b32 %f26, %r82;
265
+ cvt.f32.bf16 %r83, %rs11;
266
+ mov.b32 %f27, %r83;
267
+ cvt.f32.bf16 %r84, %rs12;
268
+ mov.b32 %f28, %r84;
269
+ cvt.f32.bf16 %r85, %rs13;
270
+ mov.b32 %f29, %r85;
271
+ cvt.f32.bf16 %r86, %rs14;
272
+ mov.b32 %f30, %r86;
273
+ cvt.f32.bf16 %r87, %rs15;
274
+ mov.b32 %f31, %r87;
275
+ cvt.f32.bf16 %r88, %rs16;
276
+ mov.b32 %f32, %r88;
277
+ .loc 1 37 22
278
+ add.s64 %rd77, %rd51, 50257;
279
+ .loc 1 38 22
280
+ setp.lt.s64 %p48, %rd51, 0;
281
+ .loc 1 39 36
282
+ selp.b64 %rd11, %rd77, %rd51, %p48;
283
+ .loc 1 40 40
284
+ setp.lt.u64 %p49, %rd11, 50257;
285
+ mov.b32 %r438, 883;
286
+ mov.u64 %rd123, 1;
287
+ .loc 1 40 55
288
+ @%p49 bra $L__BB0_2;
289
+ mov.u64 %rd78, assertMessage_0;
290
+ cvta.global.u64 %rd79, %rd78;
291
+ mov.u64 %rd80, assertFile_0;
292
+ cvta.global.u64 %rd81, %rd80;
293
+ mov.u64 %rd82, assertFunc_0;
294
+ cvta.global.u64 %rd83, %rd82;
295
+ { // callseq 8, 0
296
+ .reg .b32 temp_param_reg;
297
+ .param .b64 param0;
298
+ st.param.b64 [param0+0], %rd79;
299
+ .param .b64 param1;
300
+ st.param.b64 [param1+0], %rd81;
301
+ .param .b32 param2;
302
+ st.param.b32 [param2+0], %r438;
303
+ .param .b64 param3;
304
+ st.param.b64 [param3+0], %rd83;
305
+ .param .b64 param4;
306
+ st.param.b64 [param4+0], %rd123;
307
+ call.uni
308
+ __assertfail,
309
+ (
310
+ param0,
311
+ param1,
312
+ param2,
313
+ param3,
314
+ param4
315
+ );
316
+ } // callseq 8
317
+ $L__BB0_2:
318
+ .loc 1 0 55
319
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_5];
320
+ cvt.s64.s32 %rd7, %r111;
321
+ cvt.s64.s32 %rd9, %r112;
322
+ .loc 1 38 22
323
+ setp.lt.s64 %p103, %rd35, 0;
324
+ setp.lt.s64 %p104, %rd19, 0;
325
+ .loc 1 41 44
326
+ shl.b64 %rd96, %rd19, 8;
327
+ add.s64 %rd97, %rd96, 12865792;
328
+ selp.b64 %rd98, %rd97, %rd96, %p104;
329
+ shl.b64 %rd99, %rd35, 8;
330
+ add.s64 %rd100, %rd99, 12865792;
331
+ selp.b64 %rd101, %rd100, %rd99, %p103;
332
+ .loc 1 41 40
333
+ or.b64 %rd103, %rd98, %rd66;
334
+ or.b64 %rd104, %rd101, %rd66;
335
+ .loc 1 41 34
336
+ shl.b64 %rd105, %rd103, 2;
337
+ add.s64 %rd115, %rd16, %rd105;
338
+ add.s64 %rd116, %rd115, 16;
339
+ shl.b64 %rd106, %rd104, 2;
340
+ add.s64 %rd117, %rd16, %rd106;
341
+ add.s64 %rd118, %rd117, 16;
342
+ .loc 1 41 52
343
+ mov.u32 %r114, 0x0;
344
+ mov.u32 %r115, 0x0;
345
+ mov.u32 %r116, 0x0;
346
+ mov.u32 %r117, 0x0;
347
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd115 + 0 ];
348
+ @!%p113 mov.u32 %r114, %r325;
349
+ @!%p113 mov.u32 %r115, %r325;
350
+ @!%p113 mov.u32 %r116, %r325;
351
+ @!%p113 mov.u32 %r117, %r325;
352
+ mov.b32 %f59, %r114;
353
+ mov.b32 %f60, %r115;
354
+ mov.b32 %f61, %r116;
355
+ mov.b32 %f62, %r117;
356
+ mov.u32 %r122, 0x0;
357
+ mov.u32 %r123, 0x0;
358
+ mov.u32 %r124, 0x0;
359
+ mov.u32 %r125, 0x0;
360
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd116 + 0 ];
361
+ @!%p113 mov.u32 %r122, %r325;
362
+ @!%p113 mov.u32 %r123, %r325;
363
+ @!%p113 mov.u32 %r124, %r325;
364
+ @!%p113 mov.u32 %r125, %r325;
365
+ mov.b32 %f63, %r122;
366
+ mov.b32 %f64, %r123;
367
+ mov.b32 %f65, %r124;
368
+ mov.b32 %f66, %r125;
369
+ mov.u32 %r130, 0x0;
370
+ mov.u32 %r131, 0x0;
371
+ mov.u32 %r132, 0x0;
372
+ mov.u32 %r133, 0x0;
373
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r130, %r131, %r132, %r133 }, [ %rd117 + 0 ];
374
+ @!%p113 mov.u32 %r130, %r325;
375
+ @!%p113 mov.u32 %r131, %r325;
376
+ @!%p113 mov.u32 %r132, %r325;
377
+ @!%p113 mov.u32 %r133, %r325;
378
+ mov.b32 %f67, %r130;
379
+ mov.b32 %f68, %r131;
380
+ mov.b32 %f69, %r132;
381
+ mov.b32 %f70, %r133;
382
+ mov.u32 %r138, 0x0;
383
+ mov.u32 %r139, 0x0;
384
+ mov.u32 %r140, 0x0;
385
+ mov.u32 %r141, 0x0;
386
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r138, %r139, %r140, %r141 }, [ %rd118 + 0 ];
387
+ @!%p113 mov.u32 %r138, %r325;
388
+ @!%p113 mov.u32 %r139, %r325;
389
+ @!%p113 mov.u32 %r140, %r325;
390
+ @!%p113 mov.u32 %r141, %r325;
391
+ mov.b32 %f71, %r138;
392
+ mov.b32 %f72, %r139;
393
+ mov.b32 %f73, %r140;
394
+ mov.b32 %f74, %r141;
395
+ .loc 1 42 22
396
+ add.f32 %f75, %f1, %f59;
397
+ add.f32 %f76, %f2, %f60;
398
+ add.f32 %f77, %f3, %f61;
399
+ add.f32 %f78, %f4, %f62;
400
+ add.f32 %f79, %f5, %f63;
401
+ add.f32 %f80, %f6, %f64;
402
+ add.f32 %f81, %f7, %f65;
403
+ add.f32 %f82, %f8, %f66;
404
+ add.f32 %f83, %f9, %f67;
405
+ add.f32 %f84, %f10, %f68;
406
+ add.f32 %f85, %f11, %f69;
407
+ add.f32 %f86, %f12, %f70;
408
+ add.f32 %f87, %f13, %f71;
409
+ add.f32 %f88, %f14, %f72;
410
+ add.f32 %f89, %f15, %f73;
411
+ add.f32 %f90, %f16, %f74;
412
+ .loc 1 44 22
413
+ add.f32 %f91, %f17, %f75;
414
+ add.f32 %f92, %f18, %f76;
415
+ add.f32 %f93, %f19, %f77;
416
+ add.f32 %f94, %f20, %f78;
417
+ add.f32 %f95, %f21, %f79;
418
+ add.f32 %f96, %f22, %f80;
419
+ add.f32 %f97, %f23, %f81;
420
+ add.f32 %f98, %f24, %f82;
421
+ add.f32 %f99, %f25, %f83;
422
+ add.f32 %f100, %f26, %f84;
423
+ add.f32 %f101, %f27, %f85;
424
+ add.f32 %f102, %f28, %f86;
425
+ add.f32 %f103, %f29, %f87;
426
+ add.f32 %f104, %f30, %f88;
427
+ add.f32 %f105, %f31, %f89;
428
+ add.f32 %f106, %f32, %f90;
429
+ $L__tmp1:
430
+ .loc 2 98 22
431
+ add.f32 %f107, %f91, 0f00000000;
432
+ add.f32 %f108, %f92, 0f00000000;
433
+ add.f32 %f109, %f93, 0f00000000;
434
+ add.f32 %f110, %f94, 0f00000000;
435
+ add.f32 %f111, %f95, 0f00000000;
436
+ add.f32 %f112, %f96, 0f00000000;
437
+ add.f32 %f113, %f97, 0f00000000;
438
+ add.f32 %f114, %f98, 0f00000000;
439
+ add.f32 %f115, %f99, 0f00000000;
440
+ add.f32 %f116, %f100, 0f00000000;
441
+ add.f32 %f117, %f101, 0f00000000;
442
+ add.f32 %f118, %f102, 0f00000000;
443
+ add.f32 %f119, %f103, 0f00000000;
444
+ add.f32 %f120, %f104, 0f00000000;
445
+ add.f32 %f121, %f105, 0f00000000;
446
+ add.f32 %f122, %f106, 0f00000000;
447
+ .loc 2 101 30
448
+ sub.f32 %f123, %f91, %f107;
449
+ sub.f32 %f124, %f92, %f108;
450
+ sub.f32 %f125, %f93, %f109;
451
+ sub.f32 %f126, %f94, %f110;
452
+ sub.f32 %f127, %f95, %f111;
453
+ sub.f32 %f128, %f96, %f112;
454
+ sub.f32 %f129, %f97, %f113;
455
+ sub.f32 %f130, %f98, %f114;
456
+ sub.f32 %f131, %f99, %f115;
457
+ sub.f32 %f132, %f100, %f116;
458
+ sub.f32 %f133, %f101, %f117;
459
+ sub.f32 %f134, %f102, %f118;
460
+ sub.f32 %f135, %f103, %f119;
461
+ sub.f32 %f136, %f104, %f120;
462
+ sub.f32 %f137, %f105, %f121;
463
+ sub.f32 %f138, %f106, %f122;
464
+ .loc 2 101 13
465
+ fma.rn.f32 %f139, %f91, %f123, 0f00000000;
466
+ fma.rn.f32 %f140, %f92, %f124, 0f00000000;
467
+ fma.rn.f32 %f141, %f93, %f125, 0f00000000;
468
+ fma.rn.f32 %f142, %f94, %f126, 0f00000000;
469
+ fma.rn.f32 %f143, %f95, %f127, 0f00000000;
470
+ fma.rn.f32 %f144, %f96, %f128, 0f00000000;
471
+ fma.rn.f32 %f145, %f97, %f129, 0f00000000;
472
+ fma.rn.f32 %f146, %f98, %f130, 0f00000000;
473
+ fma.rn.f32 %f147, %f99, %f131, 0f00000000;
474
+ fma.rn.f32 %f148, %f100, %f132, 0f00000000;
475
+ fma.rn.f32 %f149, %f101, %f133, 0f00000000;
476
+ fma.rn.f32 %f150, %f102, %f134, 0f00000000;
477
+ fma.rn.f32 %f151, %f103, %f135, 0f00000000;
478
+ fma.rn.f32 %f152, %f104, %f136, 0f00000000;
479
+ fma.rn.f32 %f153, %f105, %f137, 0f00000000;
480
+ fma.rn.f32 %f154, %f106, %f138, 0f00000000;
481
+ $L__tmp2:
482
+ .loc 2 108 21
483
+ sub.f32 %f155, %f108, %f107;
484
+ mov.b32 %r147, 1065353216;
485
+ mov.b32 %r148, 1073741824;
486
+ .loc 2 110 60
487
+ div.full.f32 %r146, %r147, %r148;
488
+ mov.b32 %f156, %r146;
489
+ .loc 2 112 17
490
+ fma.rn.f32 %f157, %f156, %f155, %f107;
491
+ .loc 2 113 15
492
+ add.f32 %f158, %f139, %f140;
493
+ .loc 2 113 30
494
+ mul.f32 %f159, %f155, %f155;
495
+ .loc 2 113 22
496
+ fma.rn.f32 %f160, %f156, %f159, %f158;
497
+ .loc 2 108 21
498
+ sub.f32 %f161, %f109, %f157;
499
+ mov.b32 %r151, 1077936128;
500
+ .loc 2 110 60
501
+ div.full.f32 %r149, %r147, %r151;
502
+ mov.b32 %f162, %r149;
503
+ .loc 2 112 17
504
+ fma.rn.f32 %f163, %f162, %f161, %f157;
505
+ .loc 2 113 15
506
+ add.f32 %f164, %f141, %f160;
507
+ .loc 2 113 30
508
+ mul.f32 %f165, %f161, %f161;
509
+ .loc 2 113 38
510
+ fma.rn.f32 %f166, %f161, %f161, %f165;
511
+ .loc 2 113 22
512
+ fma.rn.f32 %f167, %f162, %f166, %f164;
513
+ .loc 2 108 21
514
+ sub.f32 %f168, %f110, %f163;
515
+ mov.b32 %r154, 1082130432;
516
+ .loc 2 110 60
517
+ div.full.f32 %r152, %r147, %r154;
518
+ mov.b32 %f169, %r152;
519
+ .loc 2 112 17
520
+ fma.rn.f32 %f170, %f169, %f168, %f163;
521
+ .loc 2 113 15
522
+ add.f32 %f171, %f142, %f167;
523
+ .loc 2 113 30
524
+ mul.f32 %f172, %f168, %f168;
525
+ .loc 2 113 38
526
+ mul.f32 %f173, %f172, 0f40400000;
527
+ .loc 2 113 22
528
+ fma.rn.f32 %f174, %f169, %f173, %f171;
529
+ .loc 2 108 21
530
+ sub.f32 %f175, %f111, %f170;
531
+ mov.b32 %r157, 1084227584;
532
+ .loc 2 110 60
533
+ div.full.f32 %r155, %r147, %r157;
534
+ mov.b32 %f176, %r155;
535
+ .loc 2 112 17
536
+ fma.rn.f32 %f177, %f176, %f175, %f170;
537
+ .loc 2 113 15
538
+ add.f32 %f178, %f143, %f174;
539
+ .loc 2 113 30
540
+ mul.f32 %f179, %f175, %f175;
541
+ .loc 2 113 38
542
+ mul.f32 %f180, %f179, 0f40800000;
543
+ .loc 2 113 22
544
+ fma.rn.f32 %f181, %f176, %f180, %f178;
545
+ .loc 2 108 21
546
+ sub.f32 %f182, %f112, %f177;
547
+ mov.b32 %r160, 1086324736;
548
+ .loc 2 110 60
549
+ div.full.f32 %r158, %r147, %r160;
550
+ mov.b32 %f183, %r158;
551
+ .loc 2 112 17
552
+ fma.rn.f32 %f184, %f183, %f182, %f177;
553
+ .loc 2 113 15
554
+ add.f32 %f185, %f144, %f181;
555
+ .loc 2 113 30
556
+ mul.f32 %f186, %f182, %f182;
557
+ .loc 2 113 38
558
+ mul.f32 %f187, %f186, 0f40A00000;
559
+ .loc 2 113 22
560
+ fma.rn.f32 %f188, %f183, %f187, %f185;
561
+ .loc 2 108 21
562
+ sub.f32 %f189, %f113, %f184;
563
+ mov.b32 %r163, 1088421888;
564
+ .loc 2 110 60
565
+ div.full.f32 %r161, %r147, %r163;
566
+ mov.b32 %f190, %r161;
567
+ .loc 2 112 17
568
+ fma.rn.f32 %f191, %f190, %f189, %f184;
569
+ .loc 2 113 15
570
+ add.f32 %f192, %f145, %f188;
571
+ .loc 2 113 30
572
+ mul.f32 %f193, %f189, %f189;
573
+ .loc 2 113 38
574
+ mul.f32 %f194, %f193, 0f40C00000;
575
+ .loc 2 113 22
576
+ fma.rn.f32 %f195, %f190, %f194, %f192;
577
+ .loc 2 108 21
578
+ sub.f32 %f196, %f114, %f191;
579
+ mov.b32 %r166, 1090519040;
580
+ .loc 2 110 60
581
+ div.full.f32 %r164, %r147, %r166;
582
+ mov.b32 %f197, %r164;
583
+ .loc 2 112 17
584
+ fma.rn.f32 %f198, %f197, %f196, %f191;
585
+ .loc 2 113 15
586
+ add.f32 %f199, %f146, %f195;
587
+ .loc 2 113 30
588
+ mul.f32 %f200, %f196, %f196;
589
+ .loc 2 113 38
590
+ mul.f32 %f201, %f200, 0f40E00000;
591
+ .loc 2 113 22
592
+ fma.rn.f32 %f202, %f197, %f201, %f199;
593
+ .loc 2 108 21
594
+ sub.f32 %f203, %f116, %f115;
595
+ .loc 2 110 60
596
+ div.full.f32 %r167, %r147, %r148;
597
+ mov.b32 %f204, %r167;
598
+ .loc 2 112 17
599
+ fma.rn.f32 %f205, %f203, %f204, %f115;
600
+ .loc 2 113 15
601
+ add.f32 %f206, %f147, %f148;
602
+ .loc 2 113 30
603
+ mul.f32 %f207, %f203, %f203;
604
+ .loc 2 113 22
605
+ fma.rn.f32 %f208, %f207, %f204, %f206;
606
+ .loc 2 108 21
607
+ sub.f32 %f209, %f117, %f205;
608
+ .loc 2 110 60
609
+ div.full.f32 %r170, %r147, %r151;
610
+ mov.b32 %f210, %r170;
611
+ .loc 2 112 17
612
+ fma.rn.f32 %f211, %f210, %f209, %f205;
613
+ .loc 2 113 15
614
+ add.f32 %f212, %f149, %f208;
615
+ .loc 2 113 30
616
+ mul.f32 %f213, %f209, %f209;
617
+ .loc 2 113 38
618
+ fma.rn.f32 %f214, %f209, %f209, %f213;
619
+ .loc 2 113 22
620
+ fma.rn.f32 %f215, %f210, %f214, %f212;
621
+ .loc 2 108 21
622
+ sub.f32 %f216, %f118, %f211;
623
+ .loc 2 110 60
624
+ div.full.f32 %r173, %r147, %r154;
625
+ mov.b32 %f217, %r173;
626
+ .loc 2 112 17
627
+ fma.rn.f32 %f218, %f217, %f216, %f211;
628
+ .loc 2 113 15
629
+ add.f32 %f219, %f150, %f215;
630
+ .loc 2 113 30
631
+ mul.f32 %f220, %f216, %f216;
632
+ .loc 2 113 38
633
+ mul.f32 %f221, %f220, 0f40400000;
634
+ .loc 2 113 22
635
+ fma.rn.f32 %f222, %f217, %f221, %f219;
636
+ .loc 2 108 21
637
+ sub.f32 %f223, %f119, %f218;
638
+ .loc 2 110 60
639
+ div.full.f32 %r176, %r147, %r157;
640
+ mov.b32 %f224, %r176;
641
+ .loc 2 112 17
642
+ fma.rn.f32 %f225, %f224, %f223, %f218;
643
+ .loc 2 113 15
644
+ add.f32 %f226, %f151, %f222;
645
+ .loc 2 113 30
646
+ mul.f32 %f227, %f223, %f223;
647
+ .loc 2 113 38
648
+ mul.f32 %f228, %f227, 0f40800000;
649
+ .loc 2 113 22
650
+ fma.rn.f32 %f229, %f224, %f228, %f226;
651
+ .loc 2 108 21
652
+ sub.f32 %f230, %f120, %f225;
653
+ .loc 2 110 60
654
+ div.full.f32 %r179, %r147, %r160;
655
+ mov.b32 %f231, %r179;
656
+ .loc 2 112 17
657
+ fma.rn.f32 %f232, %f231, %f230, %f225;
658
+ .loc 2 113 15
659
+ add.f32 %f233, %f152, %f229;
660
+ .loc 2 113 30
661
+ mul.f32 %f234, %f230, %f230;
662
+ .loc 2 113 38
663
+ mul.f32 %f235, %f234, 0f40A00000;
664
+ .loc 2 113 22
665
+ fma.rn.f32 %f236, %f231, %f235, %f233;
666
+ .loc 2 108 21
667
+ sub.f32 %f237, %f121, %f232;
668
+ .loc 2 110 60
669
+ div.full.f32 %r182, %r147, %r163;
670
+ mov.b32 %f238, %r182;
671
+ .loc 2 112 17
672
+ fma.rn.f32 %f239, %f238, %f237, %f232;
673
+ .loc 2 113 15
674
+ add.f32 %f240, %f153, %f236;
675
+ .loc 2 113 30
676
+ mul.f32 %f241, %f237, %f237;
677
+ .loc 2 113 38
678
+ mul.f32 %f242, %f241, 0f40C00000;
679
+ .loc 2 113 22
680
+ fma.rn.f32 %f243, %f238, %f242, %f240;
681
+ .loc 2 108 21
682
+ sub.f32 %f244, %f122, %f239;
683
+ .loc 2 110 60
684
+ div.full.f32 %r185, %r147, %r166;
685
+ mov.b32 %f245, %r185;
686
+ .loc 2 112 17
687
+ fma.rn.f32 %f246, %f245, %f244, %f239;
688
+ .loc 2 113 15
689
+ add.f32 %f247, %f154, %f243;
690
+ .loc 2 113 30
691
+ mul.f32 %f248, %f244, %f244;
692
+ .loc 2 113 38
693
+ mul.f32 %f249, %f248, 0f40E00000;
694
+ .loc 2 113 22
695
+ fma.rn.f32 %f250, %f245, %f249, %f247;
696
+ $L__tmp3:
697
+ .loc 2 120 46
698
+ mov.b32 %r284, %f198;
699
+ shfl.sync.bfly.b32 %r285, %r284, 16, 31, -1;
700
+ mov.b32 %f251, %r285;
701
+ mov.b32 %r286, %f202;
702
+ shfl.sync.bfly.b32 %r287, %r286, 16, 31, -1;
703
+ mov.b32 %f252, %r287;
704
+ shfl.sync.bfly.b32 %r189, %r166, 16, 31, -1;
705
+ mov.b32 %f253, %r189;
706
+ $L__tmp4:
707
+ .loc 2 108 21
708
+ sub.f32 %f254, %f251, %f198;
709
+ .loc 2 109 28
710
+ add.f32 %f255, %f253, 0f41000000;
711
+ .loc 2 110 39
712
+ setp.eq.f32 %p105, %f255, 0f00000000;
713
+ .loc 2 110 60
714
+ mov.b32 %r190, %f255;
715
+ div.full.f32 %r188, %r189, %r190;
716
+ mov.b32 %f256, %r188;
717
+ .loc 2 110 49
718
+ selp.f32 %f257, 0f00000000, %f256, %p105;
719
+ .loc 2 112 17
720
+ fma.rn.f32 %f258, %f257, %f254, %f198;
721
+ .loc 2 113 15
722
+ add.f32 %f259, %f202, %f252;
723
+ .loc 2 113 30
724
+ mul.f32 %f260, %f254, %f254;
725
+ .loc 2 113 38
726
+ mul.f32 %f261, %f260, 0f41000000;
727
+ .loc 2 113 22
728
+ fma.rn.f32 %f262, %f257, %f261, %f259;
729
+ $L__tmp5:
730
+ .loc 2 120 46
731
+ mov.b32 %r288, %f258;
732
+ shfl.sync.bfly.b32 %r289, %r288, 8, 31, -1;
733
+ mov.b32 %f263, %r289;
734
+ mov.b32 %r290, %f262;
735
+ shfl.sync.bfly.b32 %r291, %r290, 8, 31, -1;
736
+ mov.b32 %f264, %r291;
737
+ shfl.sync.bfly.b32 %r192, %r190, 8, 31, -1;
738
+ mov.b32 %f265, %r192;
739
+ $L__tmp6:
740
+ .loc 2 108 21
741
+ sub.f32 %f266, %f263, %f258;
742
+ .loc 2 109 28
743
+ add.f32 %f267, %f255, %f265;
744
+ .loc 2 110 39
745
+ setp.eq.f32 %p106, %f267, 0f00000000;
746
+ .loc 2 110 60
747
+ mov.b32 %r193, %f267;
748
+ div.full.f32 %r191, %r192, %r193;
749
+ mov.b32 %f268, %r191;
750
+ .loc 2 110 49
751
+ selp.f32 %f269, 0f00000000, %f268, %p106;
752
+ .loc 2 112 17
753
+ fma.rn.f32 %f270, %f269, %f266, %f258;
754
+ .loc 2 113 15
755
+ add.f32 %f271, %f262, %f264;
756
+ .loc 2 113 30
757
+ mul.f32 %f272, %f266, %f266;
758
+ .loc 2 113 38
759
+ mul.f32 %f273, %f255, %f272;
760
+ .loc 2 113 22
761
+ fma.rn.f32 %f274, %f269, %f273, %f271;
762
+ $L__tmp7:
763
+ .loc 2 120 46
764
+ mov.b32 %r292, %f270;
765
+ shfl.sync.bfly.b32 %r293, %r292, 4, 31, -1;
766
+ mov.b32 %f275, %r293;
767
+ mov.b32 %r294, %f274;
768
+ shfl.sync.bfly.b32 %r295, %r294, 4, 31, -1;
769
+ mov.b32 %f276, %r295;
770
+ shfl.sync.bfly.b32 %r195, %r193, 4, 31, -1;
771
+ mov.b32 %f277, %r195;
772
+ $L__tmp8:
773
+ .loc 2 108 21
774
+ sub.f32 %f278, %f275, %f270;
775
+ .loc 2 109 28
776
+ add.f32 %f279, %f267, %f277;
777
+ .loc 2 110 39
778
+ setp.eq.f32 %p107, %f279, 0f00000000;
779
+ .loc 2 110 60
780
+ mov.b32 %r196, %f279;
781
+ div.full.f32 %r194, %r195, %r196;
782
+ mov.b32 %f280, %r194;
783
+ .loc 2 110 49
784
+ selp.f32 %f281, 0f00000000, %f280, %p107;
785
+ .loc 2 112 17
786
+ fma.rn.f32 %f282, %f281, %f278, %f270;
787
+ .loc 2 113 15
788
+ add.f32 %f283, %f274, %f276;
789
+ .loc 2 113 30
790
+ mul.f32 %f284, %f278, %f278;
791
+ .loc 2 113 38
792
+ mul.f32 %f285, %f267, %f284;
793
+ .loc 2 113 22
794
+ fma.rn.f32 %f286, %f281, %f285, %f283;
795
+ $L__tmp9:
796
+ .loc 2 120 46
797
+ mov.b32 %r296, %f282;
798
+ shfl.sync.bfly.b32 %r297, %r296, 2, 31, -1;
799
+ mov.b32 %f287, %r297;
800
+ mov.b32 %r298, %f286;
801
+ shfl.sync.bfly.b32 %r299, %r298, 2, 31, -1;
802
+ mov.b32 %f288, %r299;
803
+ shfl.sync.bfly.b32 %r198, %r196, 2, 31, -1;
804
+ mov.b32 %f289, %r198;
805
+ $L__tmp10:
806
+ .loc 2 108 21
807
+ sub.f32 %f290, %f287, %f282;
808
+ .loc 2 109 28
809
+ add.f32 %f33, %f279, %f289;
810
+ .loc 2 110 39
811
+ setp.eq.f32 %p108, %f33, 0f00000000;
812
+ .loc 2 110 60
813
+ mov.b32 %r199, %f33;
814
+ div.full.f32 %r197, %r198, %r199;
815
+ mov.b32 %f291, %r197;
816
+ .loc 2 110 49
817
+ selp.f32 %f292, 0f00000000, %f291, %p108;
818
+ .loc 2 112 17
819
+ fma.rn.f32 %f34, %f290, %f292, %f282;
820
+ .loc 2 113 15
821
+ add.f32 %f293, %f286, %f288;
822
+ .loc 2 113 30
823
+ mul.f32 %f294, %f290, %f290;
824
+ .loc 2 113 38
825
+ mul.f32 %f295, %f279, %f294;
826
+ .loc 2 113 22
827
+ fma.rn.f32 %f35, %f292, %f295, %f293;
828
+ $L__tmp11:
829
+ .loc 2 120 46
830
+ mov.b32 %r300, %f34;
831
+ shfl.sync.bfly.b32 %r3, %r300, 1, 31, -1;
832
+ mov.b32 %r301, %f35;
833
+ shfl.sync.bfly.b32 %r4, %r301, 1, 31, -1;
834
+ shfl.sync.bfly.b32 %r201, %r199, 1, 31, -1;
835
+ mov.b32 %f296, %r201;
836
+ $L__tmp12:
837
+ .loc 2 109 28
838
+ add.f32 %f36, %f33, %f296;
839
+ .loc 2 110 60
840
+ mov.b32 %r202, %f36;
841
+ div.full.f32 %r200, %r201, %r202;
842
+ mov.b32 %f37, %r200;
843
+ $L__tmp13:
844
+ .loc 2 120 46
845
+ mov.b32 %r302, %f246;
846
+ shfl.sync.bfly.b32 %r303, %r302, 16, 31, -1;
847
+ mov.b32 %f297, %r303;
848
+ mov.b32 %r304, %f250;
849
+ shfl.sync.bfly.b32 %r305, %r304, 16, 31, -1;
850
+ mov.b32 %f298, %r305;
851
+ shfl.sync.bfly.b32 %r204, %r166, 16, 31, -1;
852
+ mov.b32 %f299, %r204;
853
+ $L__tmp14:
854
+ .loc 2 108 21
855
+ sub.f32 %f300, %f297, %f246;
856
+ .loc 2 109 28
857
+ add.f32 %f301, %f299, 0f41000000;
858
+ .loc 2 110 39
859
+ setp.eq.f32 %p109, %f301, 0f00000000;
860
+ .loc 2 110 60
861
+ mov.b32 %r205, %f301;
862
+ div.full.f32 %r203, %r204, %r205;
863
+ mov.b32 %f302, %r203;
864
+ .loc 2 110 49
865
+ selp.f32 %f303, 0f00000000, %f302, %p109;
866
+ .loc 2 112 17
867
+ fma.rn.f32 %f304, %f300, %f303, %f246;
868
+ .loc 2 113 15
869
+ add.f32 %f305, %f250, %f298;
870
+ .loc 2 113 30
871
+ mul.f32 %f306, %f300, %f300;
872
+ .loc 2 113 38
873
+ mul.f32 %f307, %f306, 0f41000000;
874
+ .loc 2 113 22
875
+ fma.rn.f32 %f308, %f307, %f303, %f305;
876
+ $L__tmp15:
877
+ .loc 2 120 46
878
+ mov.b32 %r306, %f304;
879
+ shfl.sync.bfly.b32 %r307, %r306, 8, 31, -1;
880
+ mov.b32 %f309, %r307;
881
+ mov.b32 %r308, %f308;
882
+ shfl.sync.bfly.b32 %r309, %r308, 8, 31, -1;
883
+ mov.b32 %f310, %r309;
884
+ shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1;
885
+ mov.b32 %f311, %r207;
886
+ $L__tmp16:
887
+ .loc 2 108 21
888
+ sub.f32 %f312, %f309, %f304;
889
+ .loc 2 109 28
890
+ add.f32 %f313, %f301, %f311;
891
+ .loc 2 110 39
892
+ setp.eq.f32 %p110, %f313, 0f00000000;
893
+ .loc 2 110 60
894
+ mov.b32 %r208, %f313;
895
+ div.full.f32 %r206, %r207, %r208;
896
+ mov.b32 %f314, %r206;
897
+ .loc 2 110 49
898
+ selp.f32 %f315, 0f00000000, %f314, %p110;
899
+ .loc 2 112 17
900
+ fma.rn.f32 %f316, %f312, %f315, %f304;
901
+ .loc 2 113 15
902
+ add.f32 %f317, %f308, %f310;
903
+ .loc 2 113 30
904
+ mul.f32 %f318, %f312, %f312;
905
+ .loc 2 113 38
906
+ mul.f32 %f319, %f301, %f318;
907
+ .loc 2 113 22
908
+ fma.rn.f32 %f320, %f315, %f319, %f317;
909
+ $L__tmp17:
910
+ .loc 2 120 46
911
+ mov.b32 %r310, %f316;
912
+ shfl.sync.bfly.b32 %r311, %r310, 4, 31, -1;
913
+ mov.b32 %f321, %r311;
914
+ mov.b32 %r312, %f320;
915
+ shfl.sync.bfly.b32 %r313, %r312, 4, 31, -1;
916
+ mov.b32 %f322, %r313;
917
+ shfl.sync.bfly.b32 %r210, %r208, 4, 31, -1;
918
+ mov.b32 %f323, %r210;
919
+ $L__tmp18:
920
+ .loc 2 108 21
921
+ sub.f32 %f324, %f321, %f316;
922
+ .loc 2 109 28
923
+ add.f32 %f325, %f313, %f323;
924
+ .loc 2 110 39
925
+ setp.eq.f32 %p111, %f325, 0f00000000;
926
+ .loc 2 110 60
927
+ mov.b32 %r211, %f325;
928
+ div.full.f32 %r209, %r210, %r211;
929
+ mov.b32 %f326, %r209;
930
+ .loc 2 110 49
931
+ selp.f32 %f327, 0f00000000, %f326, %p111;
932
+ .loc 2 112 17
933
+ fma.rn.f32 %f328, %f324, %f327, %f316;
934
+ .loc 2 113 15
935
+ add.f32 %f329, %f320, %f322;
936
+ .loc 2 113 30
937
+ mul.f32 %f330, %f324, %f324;
938
+ .loc 2 113 38
939
+ mul.f32 %f331, %f313, %f330;
940
+ .loc 2 113 22
941
+ fma.rn.f32 %f332, %f327, %f331, %f329;
942
+ $L__tmp19:
943
+ .loc 2 120 46
944
+ mov.b32 %r314, %f328;
945
+ shfl.sync.bfly.b32 %r315, %r314, 2, 31, -1;
946
+ mov.b32 %f333, %r315;
947
+ mov.b32 %r316, %f332;
948
+ shfl.sync.bfly.b32 %r317, %r316, 2, 31, -1;
949
+ mov.b32 %f334, %r317;
950
+ shfl.sync.bfly.b32 %r213, %r211, 2, 31, -1;
951
+ mov.b32 %f335, %r213;
952
+ $L__tmp20:
953
+ .loc 2 108 21
954
+ sub.f32 %f336, %f333, %f328;
955
+ .loc 2 109 28
956
+ add.f32 %f38, %f325, %f335;
957
+ .loc 2 110 39
958
+ setp.eq.f32 %p112, %f38, 0f00000000;
959
+ .loc 2 110 60
960
+ mov.b32 %r214, %f38;
961
+ div.full.f32 %r212, %r213, %r214;
962
+ mov.b32 %f337, %r212;
963
+ .loc 2 110 49
964
+ selp.f32 %f338, 0f00000000, %f337, %p112;
965
+ .loc 2 112 17
966
+ fma.rn.f32 %f39, %f336, %f338, %f328;
967
+ .loc 2 113 15
968
+ add.f32 %f339, %f332, %f334;
969
+ .loc 2 113 30
970
+ mul.f32 %f340, %f336, %f336;
971
+ .loc 2 113 38
972
+ mul.f32 %f341, %f325, %f340;
973
+ .loc 2 113 22
974
+ fma.rn.f32 %f40, %f338, %f341, %f339;
975
+ $L__tmp21:
976
+ .loc 2 120 46
977
+ mov.b32 %r318, %f39;
978
+ shfl.sync.bfly.b32 %r5, %r318, 1, 31, -1;
979
+ mov.b32 %r319, %f40;
980
+ shfl.sync.bfly.b32 %r6, %r319, 1, 31, -1;
981
+ shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1;
982
+ mov.b32 %f342, %r216;
983
+ $L__tmp22:
984
+ .loc 2 109 28
985
+ add.f32 %f41, %f38, %f342;
986
+ .loc 2 110 60
987
+ mov.b32 %r217, %f41;
988
+ div.full.f32 %r215, %r216, %r217;
989
+ mov.b32 %f42, %r215;
990
+ $L__tmp23:
991
+ .loc 1 62 51
992
+ mov.u32 %r218, 0x0;
993
+ mov.u32 %r219, 0x0;
994
+ mov.u32 %r220, 0x0;
995
+ mov.u32 %r221, 0x0;
996
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r218, %r219, %r220, %r221 }, [ %rd89 + 0 ];
997
+ @!%p113 mov.u32 %r218, %r325;
998
+ @!%p113 mov.u32 %r219, %r325;
999
+ @!%p113 mov.u32 %r220, %r325;
1000
+ @!%p113 mov.u32 %r221, %r325;
1001
+ mov.u32 %r226, 0x0;
1002
+ mov.u32 %r227, 0x0;
1003
+ mov.u32 %r228, 0x0;
1004
+ mov.u32 %r229, 0x0;
1005
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r226, %r227, %r228, %r229 }, [ %rd90 + 0 ];
1006
+ @!%p113 mov.u32 %r226, %r325;
1007
+ @!%p113 mov.u32 %r227, %r325;
1008
+ @!%p113 mov.u32 %r228, %r325;
1009
+ @!%p113 mov.u32 %r229, %r325;
1010
+ mov.u32 %r234, 0x0;
1011
+ mov.u32 %r235, 0x0;
1012
+ mov.u32 %r236, 0x0;
1013
+ mov.u32 %r237, 0x0;
1014
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r234, %r235, %r236, %r237 }, [ %rd91 + 0 ];
1015
+ @!%p113 mov.u32 %r234, %r325;
1016
+ @!%p113 mov.u32 %r235, %r325;
1017
+ @!%p113 mov.u32 %r236, %r325;
1018
+ @!%p113 mov.u32 %r237, %r325;
1019
+ mov.u32 %r242, 0x0;
1020
+ mov.u32 %r243, 0x0;
1021
+ mov.u32 %r244, 0x0;
1022
+ mov.u32 %r245, 0x0;
1023
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r242, %r243, %r244, %r245 }, [ %rd92 + 0 ];
1024
+ @!%p113 mov.u32 %r242, %r325;
1025
+ @!%p113 mov.u32 %r243, %r325;
1026
+ @!%p113 mov.u32 %r244, %r325;
1027
+ @!%p113 mov.u32 %r245, %r325;
1028
+ .loc 1 63 51
1029
+ mov.u32 %r250, 0x0;
1030
+ mov.u32 %r251, 0x0;
1031
+ mov.u32 %r252, 0x0;
1032
+ mov.u32 %r253, 0x0;
1033
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r250, %r251, %r252, %r253 }, [ %rd93 + 0 ];
1034
+ @!%p113 mov.u32 %r250, %r325;
1035
+ @!%p113 mov.u32 %r251, %r325;
1036
+ @!%p113 mov.u32 %r252, %r325;
1037
+ @!%p113 mov.u32 %r253, %r325;
1038
+ cvt.u16.u32 %rs17, %r250;
1039
+ { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r250; }
1040
+ cvt.u16.u32 %rs19, %r251;
1041
+ { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r251; }
1042
+ cvt.u16.u32 %rs21, %r252;
1043
+ { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r252; }
1044
+ cvt.u16.u32 %rs23, %r253;
1045
+ { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r253; }
1046
+ mov.u32 %r258, 0x0;
1047
+ mov.u32 %r259, 0x0;
1048
+ mov.u32 %r260, 0x0;
1049
+ mov.u32 %r261, 0x0;
1050
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r258, %r259, %r260, %r261 }, [ %rd94 + 0 ];
1051
+ @!%p113 mov.u32 %r258, %r325;
1052
+ @!%p113 mov.u32 %r259, %r325;
1053
+ @!%p113 mov.u32 %r260, %r325;
1054
+ @!%p113 mov.u32 %r261, %r325;
1055
+ cvt.u16.u32 %rs25, %r258;
1056
+ { .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r258; }
1057
+ cvt.u16.u32 %rs27, %r259;
1058
+ { .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r259; }
1059
+ cvt.u16.u32 %rs29, %r260;
1060
+ { .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r260; }
1061
+ cvt.u16.u32 %rs31, %r261;
1062
+ { .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r261; }
1063
+ .loc 1 63 103
1064
+ cvt.f32.bf16 %r266, %rs17;
1065
+ mov.b32 %f43, %r266;
1066
+ cvt.f32.bf16 %r267, %rs18;
1067
+ mov.b32 %f44, %r267;
1068
+ cvt.f32.bf16 %r268, %rs19;
1069
+ mov.b32 %f45, %r268;
1070
+ cvt.f32.bf16 %r269, %rs20;
1071
+ mov.b32 %f46, %r269;
1072
+ cvt.f32.bf16 %r270, %rs21;
1073
+ mov.b32 %f47, %r270;
1074
+ cvt.f32.bf16 %r271, %rs22;
1075
+ mov.b32 %f48, %r271;
1076
+ cvt.f32.bf16 %r272, %rs23;
1077
+ mov.b32 %f49, %r272;
1078
+ cvt.f32.bf16 %r273, %rs24;
1079
+ mov.b32 %f50, %r273;
1080
+ cvt.f32.bf16 %r274, %rs25;
1081
+ mov.b32 %f51, %r274;
1082
+ cvt.f32.bf16 %r275, %rs26;
1083
+ mov.b32 %f52, %r275;
1084
+ cvt.f32.bf16 %r276, %rs27;
1085
+ mov.b32 %f53, %r276;
1086
+ cvt.f32.bf16 %r277, %rs28;
1087
+ mov.b32 %f54, %r277;
1088
+ cvt.f32.bf16 %r278, %rs29;
1089
+ mov.b32 %f55, %r278;
1090
+ cvt.f32.bf16 %r279, %rs30;
1091
+ mov.b32 %f56, %r279;
1092
+ cvt.f32.bf16 %r280, %rs31;
1093
+ mov.b32 %f57, %r280;
1094
+ cvt.f32.bf16 %r281, %rs32;
1095
+ mov.b32 %f58, %r281;
1096
+ .loc 1 64 35
1097
+ mul.wide.u32 %rd107, %r2, 4;
1098
+ add.s64 %rd95, %rd17, %rd107;
1099
+ .loc 1 64 40
1100
+ mov.u32 %r282, 0x0;
1101
+ @%p113 ld.global.L1::evict_last.b32 { %r282 }, [ %rd95 + 0 ];
1102
+ @!%p113 mov.u32 %r282, %r325;
1103
+ .loc 1 68 57
1104
+ @%p49 bra $L__BB0_4;
1105
+ mov.u64 %rd108, assertMessage_1;
1106
+ cvta.global.u64 %rd109, %rd108;
1107
+ mov.u64 %rd110, assertFile_1;
1108
+ cvta.global.u64 %rd111, %rd110;
1109
+ mov.u64 %rd112, assertFunc_1;
1110
+ cvta.global.u64 %rd113, %rd112;
1111
+ { // callseq 9, 0
1112
+ .reg .b32 temp_param_reg;
1113
+ .param .b64 param0;
1114
+ st.param.b64 [param0+0], %rd109;
1115
+ .param .b64 param1;
1116
+ st.param.b64 [param1+0], %rd111;
1117
+ .param .b32 param2;
1118
+ st.param.b32 [param2+0], %r438;
1119
+ .param .b64 param3;
1120
+ st.param.b64 [param3+0], %rd113;
1121
+ .param .b64 param4;
1122
+ st.param.b64 [param4+0], %rd123;
1123
+ call.uni
1124
+ __assertfail,
1125
+ (
1126
+ param0,
1127
+ param1,
1128
+ param2,
1129
+ param3,
1130
+ param4
1131
+ );
1132
+ } // callseq 9
1133
+ $L__BB0_4:
1134
+ $L__tmp24:
1135
+ .loc 2 120 46
1136
+ mov.b32 %f343, %r6;
1137
+ $L__tmp25:
1138
+ .loc 2 113 15
1139
+ add.f32 %f344, %f40, %f343;
1140
+ $L__tmp26:
1141
+ .loc 2 120 46
1142
+ mov.b32 %f345, %r5;
1143
+ $L__tmp27:
1144
+ .loc 2 108 21
1145
+ sub.f32 %f346, %f345, %f39;
1146
+ .loc 2 113 30
1147
+ mul.f32 %f347, %f346, %f346;
1148
+ .loc 2 113 38
1149
+ mul.f32 %f348, %f38, %f347;
1150
+ .loc 2 110 39
1151
+ setp.eq.f32 %p135, %f41, 0f00000000;
1152
+ .loc 2 110 49
1153
+ selp.f32 %f349, 0f00000000, %f42, %p135;
1154
+ .loc 2 113 22
1155
+ fma.rn.f32 %f350, %f349, %f348, %f344;
1156
+ $L__tmp28:
1157
+ .loc 2 120 46
1158
+ mov.b32 %f351, %r4;
1159
+ $L__tmp29:
1160
+ .loc 2 113 15
1161
+ add.f32 %f352, %f35, %f351;
1162
+ $L__tmp30:
1163
+ .loc 2 120 46
1164
+ mov.b32 %f353, %r3;
1165
+ $L__tmp31:
1166
+ .loc 2 108 21
1167
+ sub.f32 %f354, %f353, %f34;
1168
+ .loc 2 113 30
1169
+ mul.f32 %f355, %f354, %f354;
1170
+ .loc 2 113 38
1171
+ mul.f32 %f356, %f33, %f355;
1172
+ .loc 2 110 39
1173
+ setp.eq.f32 %p136, %f36, 0f00000000;
1174
+ .loc 2 110 49
1175
+ selp.f32 %f357, 0f00000000, %f37, %p136;
1176
+ .loc 2 113 22
1177
+ fma.rn.f32 %f358, %f357, %f356, %f352;
1178
+ $L__tmp32:
1179
+ .loc 1 69 54
1180
+ mov.u32 %r321, 0x0;
1181
+ mov.u32 %r322, 0x0;
1182
+ mov.u32 %r323, 0x0;
1183
+ mov.u32 %r324, 0x0;
1184
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd115 + 0 ];
1185
+ @!%p113 mov.u32 %r321, %r325;
1186
+ @!%p113 mov.u32 %r322, %r325;
1187
+ @!%p113 mov.u32 %r323, %r325;
1188
+ @!%p113 mov.u32 %r324, %r325;
1189
+ mov.u32 %r329, 0x0;
1190
+ mov.u32 %r330, 0x0;
1191
+ mov.u32 %r331, 0x0;
1192
+ mov.u32 %r332, 0x0;
1193
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd116 + 0 ];
1194
+ @!%p113 mov.u32 %r329, %r325;
1195
+ @!%p113 mov.u32 %r330, %r325;
1196
+ @!%p113 mov.u32 %r331, %r325;
1197
+ @!%p113 mov.u32 %r332, %r325;
1198
+ mov.u32 %r337, 0x0;
1199
+ mov.u32 %r338, 0x0;
1200
+ mov.u32 %r339, 0x0;
1201
+ mov.u32 %r340, 0x0;
1202
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r337, %r338, %r339, %r340 }, [ %rd117 + 0 ];
1203
+ @!%p113 mov.u32 %r337, %r325;
1204
+ @!%p113 mov.u32 %r338, %r325;
1205
+ @!%p113 mov.u32 %r339, %r325;
1206
+ @!%p113 mov.u32 %r340, %r325;
1207
+ mov.u32 %r345, 0x0;
1208
+ mov.u32 %r346, 0x0;
1209
+ mov.u32 %r347, 0x0;
1210
+ mov.u32 %r348, 0x0;
1211
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r345, %r346, %r347, %r348 }, [ %rd118 + 0 ];
1212
+ @!%p113 mov.u32 %r345, %r325;
1213
+ @!%p113 mov.u32 %r346, %r325;
1214
+ @!%p113 mov.u32 %r347, %r325;
1215
+ @!%p113 mov.u32 %r348, %r325;
1216
+ .loc 1 75 24
1217
+ mov.b32 %r354, %f358;
1218
+ mov.b32 %r355, 1132462080;
1219
+ div.full.f32 %r353, %r354, %r355;
1220
+ mov.b32 %f359, %r353;
1221
+ mov.b32 %r378, %f350;
1222
+ div.full.f32 %r377, %r378, %r355;
1223
+ mov.b32 %f360, %r377;
1224
+ .loc 1 77 24
1225
+ add.f32 %f361, %f359, 0f3727C5AC;
1226
+ add.f32 %f362, %f360, 0f3727C5AC;
1227
+ .loc 1 78 30
1228
+ rsqrt.approx.ftz.f32 %f363, %f361;
1229
+ rsqrt.approx.ftz.f32 %f364, %f362;
1230
+ .loc 1 69 54
1231
+ mov.b32 %f365, %r348;
1232
+ .loc 1 62 51
1233
+ mov.b32 %f366, %r245;
1234
+ .loc 1 70 24
1235
+ add.f32 %f367, %f366, %f365;
1236
+ .loc 1 72 24
1237
+ add.f32 %f368, %f58, %f367;
1238
+ $L__tmp33:
1239
+ .loc 2 112 17
1240
+ fma.rn.f32 %f369, %f346, %f349, %f39;
1241
+ $L__tmp34:
1242
+ .loc 1 73 24
1243
+ sub.f32 %f370, %f368, %f369;
1244
+ .loc 1 69 54
1245
+ mov.b32 %f371, %r347;
1246
+ .loc 1 62 51
1247
+ mov.b32 %f372, %r244;
1248
+ .loc 1 70 24
1249
+ add.f32 %f373, %f372, %f371;
1250
+ .loc 1 72 24
1251
+ add.f32 %f374, %f57, %f373;
1252
+ .loc 1 73 24
1253
+ sub.f32 %f375, %f374, %f369;
1254
+ .loc 1 69 54
1255
+ mov.b32 %f376, %r346;
1256
+ .loc 1 62 51
1257
+ mov.b32 %f377, %r243;
1258
+ .loc 1 70 24
1259
+ add.f32 %f378, %f377, %f376;
1260
+ .loc 1 72 24
1261
+ add.f32 %f379, %f56, %f378;
1262
+ .loc 1 73 24
1263
+ sub.f32 %f380, %f379, %f369;
1264
+ .loc 1 69 54
1265
+ mov.b32 %f381, %r345;
1266
+ .loc 1 62 51
1267
+ mov.b32 %f382, %r242;
1268
+ .loc 1 70 24
1269
+ add.f32 %f383, %f382, %f381;
1270
+ .loc 1 72 24
1271
+ add.f32 %f384, %f55, %f383;
1272
+ .loc 1 73 24
1273
+ sub.f32 %f385, %f384, %f369;
1274
+ .loc 1 69 54
1275
+ mov.b32 %f386, %r340;
1276
+ .loc 1 62 51
1277
+ mov.b32 %f387, %r237;
1278
+ .loc 1 70 24
1279
+ add.f32 %f388, %f387, %f386;
1280
+ .loc 1 72 24
1281
+ add.f32 %f389, %f54, %f388;
1282
+ .loc 1 73 24
1283
+ sub.f32 %f390, %f389, %f369;
1284
+ .loc 1 69 54
1285
+ mov.b32 %f391, %r339;
1286
+ .loc 1 62 51
1287
+ mov.b32 %f392, %r236;
1288
+ .loc 1 70 24
1289
+ add.f32 %f393, %f392, %f391;
1290
+ .loc 1 72 24
1291
+ add.f32 %f394, %f53, %f393;
1292
+ .loc 1 73 24
1293
+ sub.f32 %f395, %f394, %f369;
1294
+ .loc 1 69 54
1295
+ mov.b32 %f396, %r338;
1296
+ .loc 1 62 51
1297
+ mov.b32 %f397, %r235;
1298
+ .loc 1 70 24
1299
+ add.f32 %f398, %f397, %f396;
1300
+ .loc 1 72 24
1301
+ add.f32 %f399, %f52, %f398;
1302
+ .loc 1 73 24
1303
+ sub.f32 %f400, %f399, %f369;
1304
+ .loc 1 69 54
1305
+ mov.b32 %f401, %r337;
1306
+ .loc 1 62 51
1307
+ mov.b32 %f402, %r234;
1308
+ .loc 1 70 24
1309
+ add.f32 %f403, %f402, %f401;
1310
+ .loc 1 72 24
1311
+ add.f32 %f404, %f51, %f403;
1312
+ .loc 1 73 24
1313
+ sub.f32 %f405, %f404, %f369;
1314
+ .loc 1 69 54
1315
+ mov.b32 %f406, %r332;
1316
+ .loc 1 62 51
1317
+ mov.b32 %f407, %r229;
1318
+ .loc 1 70 24
1319
+ add.f32 %f408, %f407, %f406;
1320
+ .loc 1 72 24
1321
+ add.f32 %f409, %f50, %f408;
1322
+ $L__tmp35:
1323
+ .loc 2 112 17
1324
+ fma.rn.f32 %f410, %f354, %f357, %f34;
1325
+ $L__tmp36:
1326
+ .loc 1 73 24
1327
+ sub.f32 %f411, %f409, %f410;
1328
+ .loc 1 69 54
1329
+ mov.b32 %f412, %r331;
1330
+ .loc 1 62 51
1331
+ mov.b32 %f413, %r228;
1332
+ .loc 1 70 24
1333
+ add.f32 %f414, %f413, %f412;
1334
+ .loc 1 72 24
1335
+ add.f32 %f415, %f49, %f414;
1336
+ .loc 1 73 24
1337
+ sub.f32 %f416, %f415, %f410;
1338
+ .loc 1 69 54
1339
+ mov.b32 %f417, %r330;
1340
+ .loc 1 62 51
1341
+ mov.b32 %f418, %r227;
1342
+ .loc 1 70 24
1343
+ add.f32 %f419, %f418, %f417;
1344
+ .loc 1 72 24
1345
+ add.f32 %f420, %f48, %f419;
1346
+ .loc 1 73 24
1347
+ sub.f32 %f421, %f420, %f410;
1348
+ .loc 1 69 54
1349
+ mov.b32 %f422, %r329;
1350
+ .loc 1 62 51
1351
+ mov.b32 %f423, %r226;
1352
+ .loc 1 70 24
1353
+ add.f32 %f424, %f423, %f422;
1354
+ .loc 1 72 24
1355
+ add.f32 %f425, %f47, %f424;
1356
+ .loc 1 73 24
1357
+ sub.f32 %f426, %f425, %f410;
1358
+ .loc 1 69 54
1359
+ mov.b32 %f427, %r324;
1360
+ .loc 1 62 51
1361
+ mov.b32 %f428, %r221;
1362
+ .loc 1 70 24
1363
+ add.f32 %f429, %f428, %f427;
1364
+ .loc 1 72 24
1365
+ add.f32 %f430, %f46, %f429;
1366
+ .loc 1 73 24
1367
+ sub.f32 %f431, %f430, %f410;
1368
+ .loc 1 69 54
1369
+ mov.b32 %f432, %r323;
1370
+ .loc 1 62 51
1371
+ mov.b32 %f433, %r220;
1372
+ .loc 1 70 24
1373
+ add.f32 %f434, %f433, %f432;
1374
+ .loc 1 72 24
1375
+ add.f32 %f435, %f45, %f434;
1376
+ .loc 1 73 24
1377
+ sub.f32 %f436, %f435, %f410;
1378
+ .loc 1 69 54
1379
+ mov.b32 %f437, %r322;
1380
+ .loc 1 62 51
1381
+ mov.b32 %f438, %r219;
1382
+ .loc 1 70 24
1383
+ add.f32 %f439, %f438, %f437;
1384
+ .loc 1 72 24
1385
+ add.f32 %f440, %f44, %f439;
1386
+ .loc 1 73 24
1387
+ sub.f32 %f441, %f440, %f410;
1388
+ .loc 1 69 54
1389
+ mov.b32 %f442, %r321;
1390
+ .loc 1 62 51
1391
+ mov.b32 %f443, %r218;
1392
+ .loc 1 70 24
1393
+ add.f32 %f444, %f443, %f442;
1394
+ .loc 1 72 24
1395
+ add.f32 %f445, %f43, %f444;
1396
+ .loc 1 73 24
1397
+ sub.f32 %f446, %f445, %f410;
1398
+ .loc 1 79 24
1399
+ mul.f32 %f447, %f446, %f363;
1400
+ mul.f32 %f448, %f441, %f363;
1401
+ mul.f32 %f449, %f436, %f363;
1402
+ mul.f32 %f450, %f431, %f363;
1403
+ mul.f32 %f451, %f426, %f363;
1404
+ mul.f32 %f452, %f421, %f363;
1405
+ mul.f32 %f453, %f416, %f363;
1406
+ mul.f32 %f454, %f411, %f363;
1407
+ mul.f32 %f455, %f405, %f364;
1408
+ mul.f32 %f456, %f400, %f364;
1409
+ mul.f32 %f457, %f395, %f364;
1410
+ mul.f32 %f458, %f390, %f364;
1411
+ mul.f32 %f459, %f385, %f364;
1412
+ mul.f32 %f460, %f380, %f364;
1413
+ mul.f32 %f461, %f375, %f364;
1414
+ mul.f32 %f462, %f370, %f364;
1415
+ .loc 1 80 24
1416
+ shl.b32 %r425, %r2, 2;
1417
+ mov.u32 %r426, global_smem;
1418
+ add.s32 %r427, %r426, %r425;
1419
+ st.shared.u32 [%r427], %r282;
1420
+ bar.sync 0;
1421
+ shl.b32 %r428, %r1, 2;
1422
+ add.s32 %r429, %r426, %r428;
1423
+ ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r429];
1424
+ ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r429+16];
1425
+ mul.f32 %f471, %f447, %f463;
1426
+ mul.f32 %f472, %f448, %f464;
1427
+ mul.f32 %f473, %f449, %f465;
1428
+ mul.f32 %f474, %f450, %f466;
1429
+ mul.f32 %f475, %f451, %f467;
1430
+ mul.f32 %f476, %f452, %f468;
1431
+ mul.f32 %f477, %f453, %f469;
1432
+ mul.f32 %f478, %f454, %f470;
1433
+ mul.f32 %f479, %f455, %f463;
1434
+ mul.f32 %f480, %f456, %f464;
1435
+ mul.f32 %f481, %f457, %f465;
1436
+ mul.f32 %f482, %f458, %f466;
1437
+ mul.f32 %f483, %f459, %f467;
1438
+ mul.f32 %f484, %f460, %f468;
1439
+ mul.f32 %f485, %f461, %f469;
1440
+ mul.f32 %f486, %f462, %f470;
1441
+ .loc 1 82 29
1442
+ shl.b64 %rd121, %rd7, 1;
1443
+ add.s64 %rd119, %rd18, %rd121;
1444
+ shl.b64 %rd122, %rd9, 1;
1445
+ add.s64 %rd120, %rd18, %rd122;
1446
+ .loc 1 82 52
1447
+ mov.b32 %r401, %f471;
1448
+ cvt.rn.bf16.f32 %rs33, %r401;
1449
+ mov.b32 %r402, %f472;
1450
+ cvt.rn.bf16.f32 %rs34, %r402;
1451
+ mov.b32 %r403, %f473;
1452
+ cvt.rn.bf16.f32 %rs35, %r403;
1453
+ mov.b32 %r404, %f474;
1454
+ cvt.rn.bf16.f32 %rs36, %r404;
1455
+ mov.b32 %r405, %f475;
1456
+ cvt.rn.bf16.f32 %rs37, %r405;
1457
+ mov.b32 %r406, %f476;
1458
+ cvt.rn.bf16.f32 %rs38, %r406;
1459
+ mov.b32 %r407, %f477;
1460
+ cvt.rn.bf16.f32 %rs39, %r407;
1461
+ mov.b32 %r408, %f478;
1462
+ cvt.rn.bf16.f32 %rs40, %r408;
1463
+ mov.b32 %r409, %f479;
1464
+ cvt.rn.bf16.f32 %rs41, %r409;
1465
+ mov.b32 %r410, %f480;
1466
+ cvt.rn.bf16.f32 %rs42, %r410;
1467
+ mov.b32 %r411, %f481;
1468
+ cvt.rn.bf16.f32 %rs43, %r411;
1469
+ mov.b32 %r412, %f482;
1470
+ cvt.rn.bf16.f32 %rs44, %r412;
1471
+ mov.b32 %r413, %f483;
1472
+ cvt.rn.bf16.f32 %rs45, %r413;
1473
+ mov.b32 %r414, %f484;
1474
+ cvt.rn.bf16.f32 %rs46, %r414;
1475
+ mov.b32 %r415, %f485;
1476
+ cvt.rn.bf16.f32 %rs47, %r415;
1477
+ mov.b32 %r416, %f486;
1478
+ cvt.rn.bf16.f32 %rs48, %r416;
1479
+ mov.b32 %r430, {%rs33, %rs34};
1480
+ mov.b32 %r431, {%rs35, %rs36};
1481
+ mov.b32 %r432, {%rs37, %rs38};
1482
+ mov.b32 %r433, {%rs39, %rs40};
1483
+ @%p113 st.global.v4.b32 [ %rd119 + 0 ], { %r430, %r431, %r432, %r433 };
1484
+ mov.b32 %r434, {%rs41, %rs42};
1485
+ mov.b32 %r435, {%rs43, %rs44};
1486
+ mov.b32 %r436, {%rs45, %rs46};
1487
+ mov.b32 %r437, {%rs47, %rs48};
1488
+ @%p113 st.global.v4.b32 [ %rd120 + 0 ], { %r434, %r435, %r436, %r437 };
1489
+ .loc 1 58 4
1490
+ ret;
1491
+ $L__tmp37:
1492
+ $L__func_end0:
1493
+
1494
+ }
1495
+ // .globl __nv_rsqrtf
1496
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
1497
+ .param .b32 __nv_rsqrtf_param_0
1498
+ )
1499
+ {
1500
+ .reg .f32 %f<3>;
1501
+ $L__func_begin1:
1502
+
1503
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
1504
+ rsqrt.approx.ftz.f32 %f2, %f1;
1505
+ st.param.f32 [func_retval0+0], %f2;
1506
+ ret;
1507
+ $L__func_end1:
1508
+
1509
+ }
1510
+ .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
1511
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
1512
+ .section .debug_abbrev
1513
+ {
1514
+ .b8 1
1515
+ .b8 17
1516
+ .b8 1
1517
+ .b8 37
1518
+ .b8 8
1519
+ .b8 19
1520
+ .b8 5
1521
+ .b8 3
1522
+ .b8 8
1523
+ .b8 16
1524
+ .b8 6
1525
+ .b8 27
1526
+ .b8 8
1527
+ .b8 180
1528
+ .b8 66
1529
+ .b8 12
1530
+ .b8 17
1531
+ .b8 1
1532
+ .b8 18
1533
+ .b8 1
1534
+ .b8 0
1535
+ .b8 0
1536
+ .b8 2
1537
+ .b8 46
1538
+ .b8 0
1539
+ .b8 135
1540
+ .b8 64
1541
+ .b8 8
1542
+ .b8 3
1543
+ .b8 8
1544
+ .b8 58
1545
+ .b8 11
1546
+ .b8 59
1547
+ .b8 11
1548
+ .b8 63
1549
+ .b8 12
1550
+ .b8 32
1551
+ .b8 11
1552
+ .b8 0
1553
+ .b8 0
1554
+ .b8 3
1555
+ .b8 46
1556
+ .b8 1
1557
+ .b8 17
1558
+ .b8 1
1559
+ .b8 18
1560
+ .b8 1
1561
+ .b8 64
1562
+ .b8 10
1563
+ .b8 49
1564
+ .b8 19
1565
+ .b8 0
1566
+ .b8 0
1567
+ .b8 4
1568
+ .b8 29
1569
+ .b8 0
1570
+ .b8 49
1571
+ .b8 19
1572
+ .b8 17
1573
+ .b8 1
1574
+ .b8 18
1575
+ .b8 1
1576
+ .b8 88
1577
+ .b8 11
1578
+ .b8 89
1579
+ .b8 11
1580
+ .b8 87
1581
+ .b8 11
1582
+ .b8 0
1583
+ .b8 0
1584
+ .b8 5
1585
+ .b8 29
1586
+ .b8 1
1587
+ .b8 49
1588
+ .b8 19
1589
+ .b8 17
1590
+ .b8 1
1591
+ .b8 18
1592
+ .b8 1
1593
+ .b8 88
1594
+ .b8 11
1595
+ .b8 89
1596
+ .b8 11
1597
+ .b8 87
1598
+ .b8 11
1599
+ .b8 0
1600
+ .b8 0
1601
+ .b8 0
1602
+ }
1603
+ .section .debug_info
1604
+ {
1605
+ .b32 302
1606
+ .b8 2
1607
+ .b8 0
1608
+ .b32 .debug_abbrev
1609
+ .b8 8
1610
+ .b8 1
1611
+ .b8 116
1612
+ .b8 114
1613
+ .b8 105
1614
+ .b8 116
1615
+ .b8 111
1616
+ .b8 110
1617
+ .b8 0
1618
+ .b8 2
1619
+ .b8 0
1620
+ .b8 99
1621
+ .b8 99
1622
+ .b8 105
1623
+ .b8 103
1624
+ .b8 54
1625
+ .b8 102
1626
+ .b8 107
1627
+ .b8 105
1628
+ .b8 54
1629
+ .b8 112
1630
+ .b8 52
1631
+ .b8 108
1632
+ .b8 120
1633
+ .b8 114
1634
+ .b8 100
1635
+ .b8 109
1636
+ .b8 103
1637
+ .b8 103
1638
+ .b8 54
1639
+ .b8 101
1640
+ .b8 117
1641
+ .b8 100
1642
+ .b8 97
1643
+ .b8 104
1644
+ .b8 105
1645
+ .b8 101
1646
+ .b8 120
1647
+ .b8 99
1648
+ .b8 118
1649
+ .b8 117
1650
+ .b8 101
1651
+ .b8 101
1652
+ .b8 111
1653
+ .b8 108
1654
+ .b8 50
1655
+ .b8 112
1656
+ .b8 52
1657
+ .b8 113
1658
+ .b8 112
1659
+ .b8 53
1660
+ .b8 51
1661
+ .b8 50
1662
+ .b8 112
1663
+ .b8 118
1664
+ .b8 118
1665
+ .b8 101
1666
+ .b8 50
1667
+ .b8 121
1668
+ .b8 52
1669
+ .b8 54
1670
+ .b8 51
1671
+ .b8 121
1672
+ .b8 46
1673
+ .b8 112
1674
+ .b8 121
1675
+ .b8 0
1676
+ .b32 .debug_line
1677
+ .b8 47
1678
+ .b8 116
1679
+ .b8 109
1680
+ .b8 112
1681
+ .b8 47
1682
+ .b8 116
1683
+ .b8 111
1684
+ .b8 114
1685
+ .b8 99
1686
+ .b8 104
1687
+ .b8 105
1688
+ .b8 110
1689
+ .b8 100
1690
+ .b8 117
1691
+ .b8 99
1692
+ .b8 116
1693
+ .b8 111
1694
+ .b8 114
1695
+ .b8 95
1696
+ .b8 114
1697
+ .b8 111
1698
+ .b8 111
1699
+ .b8 116
1700
+ .b8 47
1701
+ .b8 99
1702
+ .b8 105
1703
+ .b8 0
1704
+ .b8 1
1705
+ .b64 $L__func_begin0
1706
+ .b64 $L__func_end0
1707
+ .b8 2
1708
+ .b8 116
1709
+ .b8 114
1710
+ .b8 105
1711
+ .b8 116
1712
+ .b8 111
1713
+ .b8 110
1714
+ .b8 95
1715
+ .b8 95
1716
+ .b8 48
1717
+ .b8 100
1718
+ .b8 49
1719
+ .b8 100
1720
+ .b8 50
1721
+ .b8 100
1722
+ .b8 51
1723
+ .b8 100
1724
+ .b8 52
1725
+ .b8 100
1726
+ .b8 53
1727
+ .b8 100
1728
+ .b8 54
1729
+ .b8 100
1730
+ .b8 101
1731
+ .b8 55
1732
+ .b8 100
1733
+ .b8 101
1734
+ .b8 0
1735
+ .b8 116
1736
+ .b8 114
1737
+ .b8 105
1738
+ .b8 116
1739
+ .b8 111
1740
+ .b8 110
1741
+ .b8 95
1742
+ .b8 95
1743
+ .b8 48
1744
+ .b8 100
1745
+ .b8 49
1746
+ .b8 100
1747
+ .b8 50
1748
+ .b8 100
1749
+ .b8 51
1750
+ .b8 100
1751
+ .b8 52
1752
+ .b8 100
1753
+ .b8 53
1754
+ .b8 100
1755
+ .b8 54
1756
+ .b8 100
1757
+ .b8 101
1758
+ .b8 55
1759
+ .b8 100
1760
+ .b8 101
1761
+ .b8 0
1762
+ .b8 1
1763
+ .b8 18
1764
+ .b8 1
1765
+ .b8 1
1766
+ .b8 3
1767
+ .b64 $L__func_begin0
1768
+ .b64 $L__func_end0
1769
+ .b8 1
1770
+ .b8 156
1771
+ .b32 125
1772
+ .b8 4
1773
+ .b32 125
1774
+ .b64 $L__tmp1
1775
+ .b64 $L__tmp2
1776
+ .b8 2
1777
+ .b8 47
1778
+ .b8 41
1779
+ .b8 5
1780
+ .b32 125
1781
+ .b64 $L__tmp2
1782
+ .b64 $L__tmp36
1783
+ .b8 2
1784
+ .b8 53
1785
+ .b8 44
1786
+ .b8 4
1787
+ .b32 125
1788
+ .b64 $L__tmp2
1789
+ .b64 $L__tmp36
1790
+ .b8 2
1791
+ .b8 120
1792
+ .b8 46
1793
+ .b8 0
1794
+ .b8 4
1795
+ .b32 125
1796
+ .b64 $L__tmp3
1797
+ .b64 $L__tmp31
1798
+ .b8 2
1799
+ .b8 53
1800
+ .b8 44
1801
+ .b8 0
1802
+ .b8 0
1803
+ }
1804
+ .section .debug_pubnames
1805
+ {
1806
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1807
+ $L__pubNames_start0:
1808
+ .b8 2
1809
+ .b8 0
1810
+ .b32 .debug_info
1811
+ .b32 306
1812
+ .b32 125
1813
+ .b8 116
1814
+ .b8 114
1815
+ .b8 105
1816
+ .b8 116
1817
+ .b8 111
1818
+ .b8 110
1819
+ .b8 95
1820
+ .b8 95
1821
+ .b8 48
1822
+ .b8 100
1823
+ .b8 49
1824
+ .b8 100
1825
+ .b8 50
1826
+ .b8 100
1827
+ .b8 51
1828
+ .b8 100
1829
+ .b8 52
1830
+ .b8 100
1831
+ .b8 53
1832
+ .b8 100
1833
+ .b8 54
1834
+ .b8 100
1835
+ .b8 101
1836
+ .b8 55
1837
+ .b8 100
1838
+ .b8 101
1839
+ .b8 0
1840
+ .b32 0
1841
+ $L__pubNames_end0:
1842
+ }
1843
+ .section .debug_pubtypes
1844
+ {
1845
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1846
+ $L__pubTypes_start0:
1847
+ .b8 2
1848
+ .b8 0
1849
+ .b32 .debug_info
1850
+ .b32 306
1851
+ .b32 0
1852
+ $L__pubTypes_end0:
1853
+ }
1854
+ .section .debug_loc { }
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked>
12
+ %cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1>
15
+ %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1>
16
+ %cst_9 = arith.constant 0.000000e+00 : f32
17
+ %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2>
18
+ %cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2>
19
+ %cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked>
20
+ %cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked>
21
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked>
22
+ %cst_15 = arith.constant dense<0.000000e+00> : tensor<16x256xbf16, #blocked>
23
+ %c16_i32 = arith.constant 16 : i32
24
+ %0 = tt.get_program_id x : i32
25
+ %1 = arith.muli %0, %c16_i32 : i32
26
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
27
+ %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
28
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
29
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
30
+ %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
31
+ %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
32
+ %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked>
33
+ %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1>
34
+ %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
35
+ %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
36
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
37
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2>
38
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
39
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked1>
40
+ %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
41
+ %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr<i64, 1>, #blocked1>, tensor<16x1xi32, #blocked1>
42
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
43
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1>
44
+ %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked>
45
+ %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
46
+ %22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2>
47
+ %23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked>
48
+ %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked>
49
+ %25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
50
+ %26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked>
51
+ %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
52
+ %28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi32, #blocked>
53
+ %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked>
54
+ %30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
55
+ %31 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked>
56
+ %32 = tt.broadcast %31 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
57
+ %33 = arith.addi %24, %32 : tensor<16x256xi32, #blocked>
58
+ %34 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
59
+ %35 = tt.addptr %34, %33 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked>
60
+ %36 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked>
61
+ %37 = arith.extf %36 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked>
62
+ %38 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked>
63
+ %39 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1>
64
+ %40 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked>
65
+ %41 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1>
66
+ %42 = arith.select %40, %38, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
67
+ %43 = arith.select %41, %39, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1>
68
+ %44 = arith.cmpi sge, %43, %cst_7 : tensor<16x1xi64, #blocked1>
69
+ %45 = arith.cmpi slt, %43, %cst_8 : tensor<16x1xi64, #blocked1>
70
+ %46 = arith.andi %44, %45 : tensor<16x1xi1, #blocked1>
71
+ tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
72
+ %47 = arith.muli %42, %cst_4 : tensor<16x1xi64, #blocked>
73
+ %48 = tt.broadcast %47 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked>
74
+ %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
75
+ %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked>
76
+ %51 = arith.addi %50, %48 : tensor<16x256xi64, #blocked>
77
+ %52 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
78
+ %53 = tt.addptr %52, %51 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi64, #blocked>
79
+ %54 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
80
+ %55 = arith.addf %54, %30 : tensor<16x256xf32, #blocked>
81
+ %56 = arith.addf %55, %37 : tensor<16x256xf32, #blocked>
82
+ %57 = arith.addf %56, %cst_14 : tensor<16x256xf32, #blocked>
83
+ %58 = arith.subf %56, %57 : tensor<16x256xf32, #blocked>
84
+ %59 = arith.mulf %56, %58 : tensor<16x256xf32, #blocked>
85
+ %60 = arith.addf %59, %cst_14 : tensor<16x256xf32, #blocked>
86
+ %61 = arith.select %29, %57, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
87
+ %62 = arith.select %29, %60, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
88
+ %63 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
89
+ %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
90
+ %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({
91
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
92
+ %90 = arith.subf %arg11, %arg8 : f32
93
+ %91 = arith.addf %arg10, %arg13 : f32
94
+ %92 = arith.cmpf oeq, %91, %cst_9 : f32
95
+ %93 = arith.divf %arg13, %91 : f32
96
+ %94 = arith.select %92, %cst_9, %93 : f32
97
+ %95 = arith.mulf %90, %94 : f32
98
+ %96 = arith.addf %arg8, %95 : f32
99
+ %97 = arith.addf %arg9, %arg12 : f32
100
+ %98 = arith.mulf %90, %90 : f32
101
+ %99 = arith.mulf %98, %arg10 : f32
102
+ %100 = arith.mulf %99, %94 : f32
103
+ %101 = arith.addf %97, %100 : f32
104
+ tt.reduce.return %96, %101, %91 : f32, f32, f32
105
+ }) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
106
+ %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
107
+ %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
108
+ %68 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
109
+ %69 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked>
110
+ %70 = arith.extf %69 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked>
111
+ %71 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked2>
112
+ %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked2>, tensor<1x256xi32, #blocked2>
113
+ %73 = tt.load %72, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2>
114
+ tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
115
+ %74 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
116
+ %75 = arith.addf %74, %68 : tensor<16x256xf32, #blocked>
117
+ %76 = arith.addf %75, %70 : tensor<16x256xf32, #blocked>
118
+ %77 = tt.broadcast %66 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
119
+ %78 = arith.subf %76, %77 : tensor<16x256xf32, #blocked>
120
+ %79 = arith.divf %67, %cst_13 : tensor<16x1xf32, #blocked>
121
+ %80 = arith.addf %79, %cst_12 : tensor<16x1xf32, #blocked>
122
+ %81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked>
123
+ %82 = tt.broadcast %81 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
124
+ %83 = arith.mulf %78, %82 : tensor<16x256xf32, #blocked>
125
+ %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked>
126
+ %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
127
+ %86 = arith.mulf %83, %85 : tensor<16x256xf32, #blocked>
128
+ %87 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
129
+ %88 = tt.addptr %87, %33 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked>
130
+ %89 = arith.truncf %86 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked>
131
+ tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked>
132
+ tt.return
133
+ }
134
+ }
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = and i32 %6, 3, !dbg !8
11
+ %10 = and i32 %8, 3, !dbg !9
12
+ %urem = and i32 %6, 127, !dbg !9
13
+ %11 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
14
+ %12 = shl i32 %11, 2, !dbg !11
15
+ %13 = or i32 %12, %9, !dbg !12
16
+ %14 = icmp ult i32 %urem, 120, !dbg !13
17
+ %15 = shl nuw nsw i32 %urem, 17, !dbg !14
18
+ %16 = add i32 %12, %15, !dbg !15
19
+ %17 = sext i32 %16 to i64, !dbg !16
20
+ %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !16
21
+ %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14) #3, !dbg !17
22
+ %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !17
23
+ %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !17
24
+ %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !17
25
+ %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !17
26
+ %24 = bitcast i32 %20 to float, !dbg !17
27
+ %25 = bitcast i32 %21 to float, !dbg !17
28
+ %26 = bitcast i32 %22 to float, !dbg !17
29
+ %27 = bitcast i32 %23 to float, !dbg !17
30
+ %28 = fadd float %24, 0.000000e+00, !dbg !18
31
+ %29 = fadd float %25, 0.000000e+00, !dbg !18
32
+ %30 = fadd float %26, 0.000000e+00, !dbg !18
33
+ %31 = fadd float %27, 0.000000e+00, !dbg !18
34
+ %32 = select i1 %14, float %28, float 0.000000e+00, !dbg !19
35
+ %33 = select i1 %14, float %29, float 0.000000e+00, !dbg !19
36
+ %34 = select i1 %14, float %30, float 0.000000e+00, !dbg !19
37
+ %35 = select i1 %14, float %31, float 0.000000e+00, !dbg !19
38
+ %36 = bitcast float %32 to i32, !dbg !20
39
+ %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 16, i32 31), !dbg !20
40
+ %38 = bitcast i32 %37 to float, !dbg !20
41
+ %39 = fadd float %32, %38, !dbg !24
42
+ %40 = bitcast float %39 to i32, !dbg !20
43
+ %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 8, i32 31), !dbg !20
44
+ %42 = bitcast i32 %41 to float, !dbg !20
45
+ %43 = fadd float %39, %42, !dbg !24
46
+ %44 = bitcast float %43 to i32, !dbg !20
47
+ %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !20
48
+ %46 = bitcast i32 %45 to float, !dbg !20
49
+ %47 = fadd float %43, %46, !dbg !24
50
+ %48 = bitcast float %47 to i32, !dbg !20
51
+ %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 2, i32 31), !dbg !20
52
+ %50 = bitcast i32 %49 to float, !dbg !20
53
+ %51 = fadd float %47, %50, !dbg !24
54
+ %52 = bitcast float %51 to i32, !dbg !20
55
+ %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 1, i32 31), !dbg !20
56
+ %54 = bitcast i32 %53 to float, !dbg !20
57
+ %55 = fadd float %51, %54, !dbg !24
58
+ %56 = bitcast float %33 to i32, !dbg !20
59
+ %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20
60
+ %58 = bitcast i32 %57 to float, !dbg !20
61
+ %59 = fadd float %33, %58, !dbg !24
62
+ %60 = bitcast float %59 to i32, !dbg !20
63
+ %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !20
64
+ %62 = bitcast i32 %61 to float, !dbg !20
65
+ %63 = fadd float %59, %62, !dbg !24
66
+ %64 = bitcast float %63 to i32, !dbg !20
67
+ %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 4, i32 31), !dbg !20
68
+ %66 = bitcast i32 %65 to float, !dbg !20
69
+ %67 = fadd float %63, %66, !dbg !24
70
+ %68 = bitcast float %67 to i32, !dbg !20
71
+ %69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %68, i32 2, i32 31), !dbg !20
72
+ %70 = bitcast i32 %69 to float, !dbg !20
73
+ %71 = fadd float %67, %70, !dbg !24
74
+ %72 = bitcast float %71 to i32, !dbg !20
75
+ %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !20
76
+ %74 = bitcast i32 %73 to float, !dbg !20
77
+ %75 = fadd float %71, %74, !dbg !24
78
+ %76 = bitcast float %34 to i32, !dbg !20
79
+ %77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 16, i32 31), !dbg !20
80
+ %78 = bitcast i32 %77 to float, !dbg !20
81
+ %79 = fadd float %34, %78, !dbg !24
82
+ %80 = bitcast float %79 to i32, !dbg !20
83
+ %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 8, i32 31), !dbg !20
84
+ %82 = bitcast i32 %81 to float, !dbg !20
85
+ %83 = fadd float %79, %82, !dbg !24
86
+ %84 = bitcast float %83 to i32, !dbg !20
87
+ %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 4, i32 31), !dbg !20
88
+ %86 = bitcast i32 %85 to float, !dbg !20
89
+ %87 = fadd float %83, %86, !dbg !24
90
+ %88 = bitcast float %87 to i32, !dbg !20
91
+ %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !20
92
+ %90 = bitcast i32 %89 to float, !dbg !20
93
+ %91 = fadd float %87, %90, !dbg !24
94
+ %92 = bitcast float %91 to i32, !dbg !20
95
+ %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !20
96
+ %94 = bitcast i32 %93 to float, !dbg !20
97
+ %95 = fadd float %91, %94, !dbg !24
98
+ %96 = bitcast float %35 to i32, !dbg !20
99
+ %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !20
100
+ %98 = bitcast i32 %97 to float, !dbg !20
101
+ %99 = fadd float %35, %98, !dbg !24
102
+ %100 = bitcast float %99 to i32, !dbg !20
103
+ %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !20
104
+ %102 = bitcast i32 %101 to float, !dbg !20
105
+ %103 = fadd float %99, %102, !dbg !24
106
+ %104 = bitcast float %103 to i32, !dbg !20
107
+ %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !20
108
+ %106 = bitcast i32 %105 to float, !dbg !20
109
+ %107 = fadd float %103, %106, !dbg !24
110
+ %108 = bitcast float %107 to i32, !dbg !20
111
+ %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !20
112
+ %110 = bitcast i32 %109 to float, !dbg !20
113
+ %111 = fadd float %107, %110, !dbg !24
114
+ %112 = bitcast float %111 to i32, !dbg !20
115
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !20
116
+ %114 = bitcast i32 %113 to float, !dbg !20
117
+ %115 = fadd float %111, %114, !dbg !24
118
+ %116 = icmp eq i32 %7, 0, !dbg !20
119
+ %117 = zext nneg i32 %10 to i64, !dbg !20
120
+ %118 = getelementptr float, ptr addrspace(3) @global_smem, i64 %117, !dbg !20
121
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %118, float %55, i1 %116) #3, !dbg !20
122
+ %119 = or i32 %10, 4, !dbg !20
123
+ %120 = zext nneg i32 %119 to i64, !dbg !20
124
+ %121 = getelementptr float, ptr addrspace(3) @global_smem, i64 %120, !dbg !20
125
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %121, float %75, i1 %116) #3, !dbg !20
126
+ %122 = or i32 %10, 8, !dbg !20
127
+ %123 = zext nneg i32 %122 to i64, !dbg !20
128
+ %124 = getelementptr float, ptr addrspace(3) @global_smem, i64 %123, !dbg !20
129
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %124, float %95, i1 %116) #3, !dbg !20
130
+ %125 = or i32 %10, 12, !dbg !20
131
+ %126 = zext nneg i32 %125 to i64, !dbg !20
132
+ %127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !20
133
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %115, i1 %116) #3, !dbg !20
134
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
135
+ %128 = icmp slt i32 %6, 16, !dbg !20
136
+ %129 = sext i32 %6 to i64, !dbg !20
137
+ %130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !20
138
+ %131 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %130, i1 %128) #3, !dbg !20
139
+ %132 = bitcast float %131 to i32, !dbg !20
140
+ %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !20
141
+ %134 = bitcast i32 %133 to float, !dbg !20
142
+ %135 = fadd float %131, %134, !dbg !24
143
+ %136 = bitcast float %135 to i32, !dbg !20
144
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !20
145
+ %138 = bitcast i32 %137 to float, !dbg !20
146
+ %139 = fadd float %135, %138, !dbg !24
147
+ %140 = icmp eq i32 %9, 0, !dbg !20
148
+ %141 = and i1 %128, %140, !dbg !20
149
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %139, i1 %141) #3, !dbg !20
150
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
151
+ %142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !20
152
+ %143 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), align 4, !dbg !20
153
+ %144 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !20
154
+ %145 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 48), align 4, !dbg !20
155
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
156
+ %146 = insertelement <1 x float> undef, float %142, i64 0, !dbg !28
157
+ store <1 x float> %146, ptr addrspace(3) @global_smem, align 4, !dbg !28
158
+ %147 = insertelement <1 x float> undef, float %143, i64 0, !dbg !28
159
+ store <1 x float> %147, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4), align 4, !dbg !28
160
+ %148 = insertelement <1 x float> undef, float %144, i64 0, !dbg !28
161
+ store <1 x float> %148, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !28
162
+ %149 = insertelement <1 x float> undef, float %145, i64 0, !dbg !28
163
+ store <1 x float> %149, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 12), align 4, !dbg !28
164
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
165
+ %150 = zext nneg i32 %9 to i64, !dbg !28
166
+ %151 = getelementptr float, ptr addrspace(3) @global_smem, i64 %150, !dbg !28
167
+ %152 = load <1 x float>, ptr addrspace(3) %151, align 4, !dbg !28
168
+ %.frozen = freeze i32 %13
169
+ %153 = sdiv i32 %.frozen, 256, !dbg !29
170
+ %154 = mul i32 %153, 256
171
+ %.decomposed = sub i32 %.frozen, %154
172
+ %155 = sext i32 %153 to i64, !dbg !30
173
+ %156 = getelementptr i64, ptr addrspace(1) %1, i64 %155, !dbg !30
174
+ %157 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %156, i1 true) #3, !dbg !31
175
+ %158 = lshr i64 %157, 54, !dbg !32
176
+ %159 = and i64 %158, 512, !dbg !32
177
+ %160 = add i64 %159, %157, !dbg !32
178
+ %161 = shl i64 %160, 8, !dbg !33
179
+ %162 = sext i32 %.decomposed to i64, !dbg !34
180
+ %163 = getelementptr float, ptr addrspace(1) %2, i64 %161, !dbg !35
181
+ %164 = getelementptr float, ptr addrspace(1) %163, i64 %162, !dbg !35
182
+ %165 = lshr i32 %7, 2, !dbg !36
183
+ %166 = shl nuw nsw i32 %10, 3, !dbg !36
184
+ %167 = or i32 %166, %165, !dbg !36
185
+ %168 = icmp eq i32 %167, 0, !dbg !36
186
+ %169 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %164, <1 x float> %152, i1 %168) #3, !dbg !36
187
+ ret void, !dbg !37
188
+ }
189
+
190
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
191
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
192
+
193
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
194
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
195
+
196
+ ; Function Attrs: convergent nocallback nounwind
197
+ declare void @llvm.nvvm.barrier0() #2
198
+
199
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
200
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
201
+ attributes #2 = { convergent nocallback nounwind }
202
+ attributes #3 = { nounwind }
203
+
204
+ !llvm.module.flags = !{!0}
205
+ !llvm.dbg.cu = !{!1}
206
+ !nvvm.annotations = !{!3, !4, !4, !3}
207
+
208
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
209
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
210
+ !2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
211
+ !3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
212
+ !4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128}
213
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
214
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
215
+ !7 = !{}
216
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
217
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
218
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
219
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
220
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
221
+ !13 = !DILocation(line: 29, column: 25, scope: !5)
222
+ !14 = !DILocation(line: 31, column: 47, scope: !5)
223
+ !15 = !DILocation(line: 31, column: 40, scope: !5)
224
+ !16 = !DILocation(line: 31, column: 34, scope: !5)
225
+ !17 = !DILocation(line: 31, column: 53, scope: !5)
226
+ !18 = !DILocation(line: 33, column: 23, scope: !5)
227
+ !19 = !DILocation(line: 34, column: 38, scope: !5)
228
+ !20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
229
+ !21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
230
+ !22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
231
+ !23 = !DILocation(line: 35, column: 25, scope: !21)
232
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
233
+ !25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
234
+ !26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
235
+ !27 = !DILocation(line: 35, column: 25, scope: !25)
236
+ !28 = !DILocation(line: 35, column: 28, scope: !5)
237
+ !29 = !DILocation(line: 36, column: 20, scope: !5)
238
+ !30 = !DILocation(line: 38, column: 30, scope: !5)
239
+ !31 = !DILocation(line: 38, column: 35, scope: !5)
240
+ !32 = !DILocation(line: 41, column: 32, scope: !5)
241
+ !33 = !DILocation(line: 45, column: 40, scope: !5)
242
+ !34 = !DILocation(line: 45, column: 36, scope: !5)
243
+ !35 = !DILocation(line: 45, column: 30, scope: !5)
244
+ !36 = !DILocation(line: 45, column: 55, scope: !5)
245
+ !37 = !DILocation(line: 45, column: 4, scope: !5)
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir ADDED
@@ -0,0 +1,858 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
5
+
6
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !7 {
7
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
8
+ %4 = shl i32 %3, 3, !dbg !10
9
+ %5 = and i32 %4, 1016, !dbg !10
10
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
11
+ %7 = shl i32 %6, 10, !dbg !12
12
+ %8 = or i32 %7, %5, !dbg !13
13
+ %9 = sext i32 %8 to i64, !dbg !14
14
+ %10 = getelementptr i16, ptr addrspace(1) %0, i64 %9, !dbg !14
15
+ %11 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %10, i1 true) #4, !dbg !15
16
+ %12 = extractvalue { i32, i32, i32, i32 } %11, 0, !dbg !15
17
+ %13 = extractvalue { i32, i32, i32, i32 } %11, 1, !dbg !15
18
+ %14 = extractvalue { i32, i32, i32, i32 } %11, 2, !dbg !15
19
+ %15 = extractvalue { i32, i32, i32, i32 } %11, 3, !dbg !15
20
+ %16 = trunc i32 %12 to i16, !dbg !15
21
+ %extelt.offset = lshr i32 %12, 16, !dbg !15
22
+ %17 = trunc i32 %extelt.offset to i16, !dbg !15
23
+ %18 = trunc i32 %13 to i16, !dbg !15
24
+ %extelt.offset1 = lshr i32 %13, 16, !dbg !15
25
+ %19 = trunc i32 %extelt.offset1 to i16, !dbg !15
26
+ %20 = trunc i32 %14 to i16, !dbg !15
27
+ %extelt.offset2 = lshr i32 %14, 16, !dbg !15
28
+ %21 = trunc i32 %extelt.offset2 to i16, !dbg !15
29
+ %22 = trunc i32 %15 to i16, !dbg !15
30
+ %extelt.offset3 = lshr i32 %15, 16, !dbg !15
31
+ %23 = trunc i32 %extelt.offset3 to i16, !dbg !15
32
+ %24 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %16) #4, !dbg !16
33
+ %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
34
+ %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
35
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
36
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
37
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
38
+ %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
39
+ %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
40
+ %32 = fmul float %24, 0x3FE6A09E60000000, !dbg !17
41
+ %33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17
42
+ %34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17
43
+ %35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17
44
+ %36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17
45
+ %37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17
46
+ %38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17
47
+ %39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17
48
+ %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
49
+ %.not.i = icmp eq i32 %40, 0, !dbg !18
50
+ %41 = tail call float @llvm.nvvm.fabs.ftz.f(float %32) #4, !dbg !18
51
+ %42 = tail call float @llvm.nvvm.fabs.f(float %32) #4, !dbg !18
52
+ %.0.i = select i1 %.not.i, float %42, float %41, !dbg !18
53
+ %43 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
54
+ br i1 %43, label %__nv_fabsf.exit1.i, label %45, !dbg !18
55
+
56
+ __nv_fabsf.exit1.i: ; preds = %2
57
+ %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
58
+ %.not1.i = icmp eq i32 %44, 0, !dbg !18
59
+ %.01.i = select i1 %.not1.i, float %42, float %41, !dbg !18
60
+ br label %__internal_fmad.exit.i, !dbg !18
61
+
62
+ 45: ; preds = %2
63
+ %46 = fmul float %32, %32, !dbg !18
64
+ br label %__internal_fmad.exit.i, !dbg !18
65
+
66
+ __internal_fmad.exit.i: ; preds = %45, %__nv_fabsf.exit1.i
67
+ %47 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %45 ], !dbg !18
68
+ %48 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %45 ], !dbg !18
69
+ %49 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %45 ], !dbg !18
70
+ %50 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %45 ], !dbg !18
71
+ %51 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %45 ], !dbg !18
72
+ %52 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %45 ], !dbg !18
73
+ %53 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %45 ], !dbg !18
74
+ %54 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %46, %45 ], !dbg !18
75
+ %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
76
+ %.not2.i = icmp eq i32 %55, 0, !dbg !18
77
+ %56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %53, float %54, float %52) #4, !dbg !18
78
+ %57 = tail call float @llvm.nvvm.fma.rn.f(float %53, float %54, float %52) #4, !dbg !18
79
+ %.02.i = select i1 %.not2.i, float %57, float %56, !dbg !18
80
+ %58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
81
+ %.not3.i = icmp eq i32 %58, 0, !dbg !18
82
+ %59 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %54, float %51) #4, !dbg !18
83
+ %60 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %54, float %51) #4, !dbg !18
84
+ %.03.i = select i1 %.not3.i, float %60, float %59, !dbg !18
85
+ %61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
86
+ %.not4.i = icmp eq i32 %61, 0, !dbg !18
87
+ %62 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %54, float %50) #4, !dbg !18
88
+ %63 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %54, float %50) #4, !dbg !18
89
+ %.04.i = select i1 %.not4.i, float %63, float %62, !dbg !18
90
+ %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
91
+ %.not5.i = icmp eq i32 %64, 0, !dbg !18
92
+ %65 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %54, float %49) #4, !dbg !18
93
+ %66 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %54, float %49) #4, !dbg !18
94
+ %.05.i = select i1 %.not5.i, float %66, float %65, !dbg !18
95
+ %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
96
+ %.not6.i = icmp eq i32 %67, 0, !dbg !18
97
+ %68 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %54, float %48) #4, !dbg !18
98
+ %69 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %54, float %48) #4, !dbg !18
99
+ %.06.i = select i1 %.not6.i, float %69, float %68, !dbg !18
100
+ %70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
101
+ %.not7.i = icmp eq i32 %70, 0, !dbg !18
102
+ %71 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %54, float %47) #4, !dbg !18
103
+ %72 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %54, float %47) #4, !dbg !18
104
+ %.07.i = select i1 %.not7.i, float %72, float %71, !dbg !18
105
+ %73 = fneg float %54, !dbg !18
106
+ %74 = select i1 %43, float %73, float %32, !dbg !18
107
+ %75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
108
+ %.not8.i = icmp eq i32 %75, 0, !dbg !18
109
+ %76 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %74, float %74) #4, !dbg !18
110
+ %77 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %74, float %74) #4, !dbg !18
111
+ %.08.i = select i1 %.not8.i, float %77, float %76, !dbg !18
112
+ br i1 %43, label %78, label %__nv_erff.exit, !dbg !18
113
+
114
+ 78: ; preds = %__internal_fmad.exit.i
115
+ %79 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
116
+ %80 = fsub float 1.000000e+00, %79, !dbg !18
117
+ %81 = bitcast float %80 to i32, !dbg !18
118
+ %82 = bitcast float %32 to i32, !dbg !18
119
+ %83 = and i32 %82, -2147483648, !dbg !18
120
+ %84 = or i32 %83, %81, !dbg !18
121
+ %85 = bitcast i32 %84 to float, !dbg !18
122
+ br label %__nv_erff.exit, !dbg !18
123
+
124
+ __nv_erff.exit: ; preds = %__internal_fmad.exit.i, %78
125
+ %r.0.i = phi float [ %85, %78 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
126
+ %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
127
+ %.not.i4 = icmp eq i32 %86, 0, !dbg !18
128
+ %87 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18
129
+ %88 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18
130
+ %.0.i5 = select i1 %.not.i4, float %88, float %87, !dbg !18
131
+ %89 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18
132
+ br i1 %89, label %__nv_fabsf.exit1.i22, label %91, !dbg !18
133
+
134
+ __nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit
135
+ %90 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
136
+ %.not1.i23 = icmp eq i32 %90, 0, !dbg !18
137
+ %.01.i24 = select i1 %.not1.i23, float %88, float %87, !dbg !18
138
+ br label %__internal_fmad.exit.i6, !dbg !18
139
+
140
+ 91: ; preds = %__nv_erff.exit
141
+ %92 = fmul float %33, %33, !dbg !18
142
+ br label %__internal_fmad.exit.i6, !dbg !18
143
+
144
+ __internal_fmad.exit.i6: ; preds = %91, %__nv_fabsf.exit1.i22
145
+ %93 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %91 ], !dbg !18
146
+ %94 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %91 ], !dbg !18
147
+ %95 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %91 ], !dbg !18
148
+ %96 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %91 ], !dbg !18
149
+ %97 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %91 ], !dbg !18
150
+ %98 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %91 ], !dbg !18
151
+ %99 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %91 ], !dbg !18
152
+ %100 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %92, %91 ], !dbg !18
153
+ %101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
154
+ %.not2.i7 = icmp eq i32 %101, 0, !dbg !18
155
+ %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %99, float %100, float %98) #4, !dbg !18
156
+ %103 = tail call float @llvm.nvvm.fma.rn.f(float %99, float %100, float %98) #4, !dbg !18
157
+ %.02.i8 = select i1 %.not2.i7, float %103, float %102, !dbg !18
158
+ %104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
159
+ %.not3.i9 = icmp eq i32 %104, 0, !dbg !18
160
+ %105 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %100, float %97) #4, !dbg !18
161
+ %106 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %100, float %97) #4, !dbg !18
162
+ %.03.i10 = select i1 %.not3.i9, float %106, float %105, !dbg !18
163
+ %107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
164
+ %.not4.i11 = icmp eq i32 %107, 0, !dbg !18
165
+ %108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %100, float %96) #4, !dbg !18
166
+ %109 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %100, float %96) #4, !dbg !18
167
+ %.04.i12 = select i1 %.not4.i11, float %109, float %108, !dbg !18
168
+ %110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
169
+ %.not5.i13 = icmp eq i32 %110, 0, !dbg !18
170
+ %111 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %100, float %95) #4, !dbg !18
171
+ %112 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %100, float %95) #4, !dbg !18
172
+ %.05.i14 = select i1 %.not5.i13, float %112, float %111, !dbg !18
173
+ %113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
174
+ %.not6.i15 = icmp eq i32 %113, 0, !dbg !18
175
+ %114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %100, float %94) #4, !dbg !18
176
+ %115 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %100, float %94) #4, !dbg !18
177
+ %.06.i16 = select i1 %.not6.i15, float %115, float %114, !dbg !18
178
+ %116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
179
+ %.not7.i17 = icmp eq i32 %116, 0, !dbg !18
180
+ %117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %100, float %93) #4, !dbg !18
181
+ %118 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %100, float %93) #4, !dbg !18
182
+ %.07.i18 = select i1 %.not7.i17, float %118, float %117, !dbg !18
183
+ %119 = fneg float %100, !dbg !18
184
+ %120 = select i1 %89, float %119, float %33, !dbg !18
185
+ %121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
186
+ %.not8.i19 = icmp eq i32 %121, 0, !dbg !18
187
+ %122 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %120, float %120) #4, !dbg !18
188
+ %123 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %120, float %120) #4, !dbg !18
189
+ %.08.i20 = select i1 %.not8.i19, float %123, float %122, !dbg !18
190
+ br i1 %89, label %124, label %__nv_erff.exit25, !dbg !18
191
+
192
+ 124: ; preds = %__internal_fmad.exit.i6
193
+ %125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18
194
+ %126 = fsub float 1.000000e+00, %125, !dbg !18
195
+ %127 = bitcast float %126 to i32, !dbg !18
196
+ %128 = bitcast float %33 to i32, !dbg !18
197
+ %129 = and i32 %128, -2147483648, !dbg !18
198
+ %130 = or i32 %129, %127, !dbg !18
199
+ %131 = bitcast i32 %130 to float, !dbg !18
200
+ br label %__nv_erff.exit25, !dbg !18
201
+
202
+ __nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %124
203
+ %r.0.i21 = phi float [ %131, %124 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18
204
+ %132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
205
+ %.not.i26 = icmp eq i32 %132, 0, !dbg !18
206
+ %133 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18
207
+ %134 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18
208
+ %.0.i27 = select i1 %.not.i26, float %134, float %133, !dbg !18
209
+ %135 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18
210
+ br i1 %135, label %__nv_fabsf.exit1.i44, label %137, !dbg !18
211
+
212
+ __nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25
213
+ %136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
214
+ %.not1.i45 = icmp eq i32 %136, 0, !dbg !18
215
+ %.01.i46 = select i1 %.not1.i45, float %134, float %133, !dbg !18
216
+ br label %__internal_fmad.exit.i28, !dbg !18
217
+
218
+ 137: ; preds = %__nv_erff.exit25
219
+ %138 = fmul float %34, %34, !dbg !18
220
+ br label %__internal_fmad.exit.i28, !dbg !18
221
+
222
+ __internal_fmad.exit.i28: ; preds = %137, %__nv_fabsf.exit1.i44
223
+ %139 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %137 ], !dbg !18
224
+ %140 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %137 ], !dbg !18
225
+ %141 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %137 ], !dbg !18
226
+ %142 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %137 ], !dbg !18
227
+ %143 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %137 ], !dbg !18
228
+ %144 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %137 ], !dbg !18
229
+ %145 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %137 ], !dbg !18
230
+ %146 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %138, %137 ], !dbg !18
231
+ %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
232
+ %.not2.i29 = icmp eq i32 %147, 0, !dbg !18
233
+ %148 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %145, float %146, float %144) #4, !dbg !18
234
+ %149 = tail call float @llvm.nvvm.fma.rn.f(float %145, float %146, float %144) #4, !dbg !18
235
+ %.02.i30 = select i1 %.not2.i29, float %149, float %148, !dbg !18
236
+ %150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
237
+ %.not3.i31 = icmp eq i32 %150, 0, !dbg !18
238
+ %151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %146, float %143) #4, !dbg !18
239
+ %152 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %146, float %143) #4, !dbg !18
240
+ %.03.i32 = select i1 %.not3.i31, float %152, float %151, !dbg !18
241
+ %153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
242
+ %.not4.i33 = icmp eq i32 %153, 0, !dbg !18
243
+ %154 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %146, float %142) #4, !dbg !18
244
+ %155 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %146, float %142) #4, !dbg !18
245
+ %.04.i34 = select i1 %.not4.i33, float %155, float %154, !dbg !18
246
+ %156 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
247
+ %.not5.i35 = icmp eq i32 %156, 0, !dbg !18
248
+ %157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %146, float %141) #4, !dbg !18
249
+ %158 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %146, float %141) #4, !dbg !18
250
+ %.05.i36 = select i1 %.not5.i35, float %158, float %157, !dbg !18
251
+ %159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
252
+ %.not6.i37 = icmp eq i32 %159, 0, !dbg !18
253
+ %160 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %146, float %140) #4, !dbg !18
254
+ %161 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %146, float %140) #4, !dbg !18
255
+ %.06.i38 = select i1 %.not6.i37, float %161, float %160, !dbg !18
256
+ %162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
257
+ %.not7.i39 = icmp eq i32 %162, 0, !dbg !18
258
+ %163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %146, float %139) #4, !dbg !18
259
+ %164 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %146, float %139) #4, !dbg !18
260
+ %.07.i40 = select i1 %.not7.i39, float %164, float %163, !dbg !18
261
+ %165 = fneg float %146, !dbg !18
262
+ %166 = select i1 %135, float %165, float %34, !dbg !18
263
+ %167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
264
+ %.not8.i41 = icmp eq i32 %167, 0, !dbg !18
265
+ %168 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %166, float %166) #4, !dbg !18
266
+ %169 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %166, float %166) #4, !dbg !18
267
+ %.08.i42 = select i1 %.not8.i41, float %169, float %168, !dbg !18
268
+ br i1 %135, label %170, label %__nv_erff.exit47, !dbg !18
269
+
270
+ 170: ; preds = %__internal_fmad.exit.i28
271
+ %171 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18
272
+ %172 = fsub float 1.000000e+00, %171, !dbg !18
273
+ %173 = bitcast float %172 to i32, !dbg !18
274
+ %174 = bitcast float %34 to i32, !dbg !18
275
+ %175 = and i32 %174, -2147483648, !dbg !18
276
+ %176 = or i32 %175, %173, !dbg !18
277
+ %177 = bitcast i32 %176 to float, !dbg !18
278
+ br label %__nv_erff.exit47, !dbg !18
279
+
280
+ __nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %170
281
+ %r.0.i43 = phi float [ %177, %170 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18
282
+ %178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
283
+ %.not.i48 = icmp eq i32 %178, 0, !dbg !18
284
+ %179 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18
285
+ %180 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18
286
+ %.0.i49 = select i1 %.not.i48, float %180, float %179, !dbg !18
287
+ %181 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18
288
+ br i1 %181, label %__nv_fabsf.exit1.i66, label %183, !dbg !18
289
+
290
+ __nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47
291
+ %182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
292
+ %.not1.i67 = icmp eq i32 %182, 0, !dbg !18
293
+ %.01.i68 = select i1 %.not1.i67, float %180, float %179, !dbg !18
294
+ br label %__internal_fmad.exit.i50, !dbg !18
295
+
296
+ 183: ; preds = %__nv_erff.exit47
297
+ %184 = fmul float %35, %35, !dbg !18
298
+ br label %__internal_fmad.exit.i50, !dbg !18
299
+
300
+ __internal_fmad.exit.i50: ; preds = %183, %__nv_fabsf.exit1.i66
301
+ %185 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %183 ], !dbg !18
302
+ %186 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %183 ], !dbg !18
303
+ %187 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %183 ], !dbg !18
304
+ %188 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %183 ], !dbg !18
305
+ %189 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %183 ], !dbg !18
306
+ %190 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %183 ], !dbg !18
307
+ %191 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %183 ], !dbg !18
308
+ %192 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %184, %183 ], !dbg !18
309
+ %193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
310
+ %.not2.i51 = icmp eq i32 %193, 0, !dbg !18
311
+ %194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %191, float %192, float %190) #4, !dbg !18
312
+ %195 = tail call float @llvm.nvvm.fma.rn.f(float %191, float %192, float %190) #4, !dbg !18
313
+ %.02.i52 = select i1 %.not2.i51, float %195, float %194, !dbg !18
314
+ %196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
315
+ %.not3.i53 = icmp eq i32 %196, 0, !dbg !18
316
+ %197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %192, float %189) #4, !dbg !18
317
+ %198 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %192, float %189) #4, !dbg !18
318
+ %.03.i54 = select i1 %.not3.i53, float %198, float %197, !dbg !18
319
+ %199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
320
+ %.not4.i55 = icmp eq i32 %199, 0, !dbg !18
321
+ %200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %192, float %188) #4, !dbg !18
322
+ %201 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %192, float %188) #4, !dbg !18
323
+ %.04.i56 = select i1 %.not4.i55, float %201, float %200, !dbg !18
324
+ %202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
325
+ %.not5.i57 = icmp eq i32 %202, 0, !dbg !18
326
+ %203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %192, float %187) #4, !dbg !18
327
+ %204 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %192, float %187) #4, !dbg !18
328
+ %.05.i58 = select i1 %.not5.i57, float %204, float %203, !dbg !18
329
+ %205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
330
+ %.not6.i59 = icmp eq i32 %205, 0, !dbg !18
331
+ %206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %192, float %186) #4, !dbg !18
332
+ %207 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %192, float %186) #4, !dbg !18
333
+ %.06.i60 = select i1 %.not6.i59, float %207, float %206, !dbg !18
334
+ %208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
335
+ %.not7.i61 = icmp eq i32 %208, 0, !dbg !18
336
+ %209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %192, float %185) #4, !dbg !18
337
+ %210 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %192, float %185) #4, !dbg !18
338
+ %.07.i62 = select i1 %.not7.i61, float %210, float %209, !dbg !18
339
+ %211 = fneg float %192, !dbg !18
340
+ %212 = select i1 %181, float %211, float %35, !dbg !18
341
+ %213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
342
+ %.not8.i63 = icmp eq i32 %213, 0, !dbg !18
343
+ %214 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %212, float %212) #4, !dbg !18
344
+ %215 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %212, float %212) #4, !dbg !18
345
+ %.08.i64 = select i1 %.not8.i63, float %215, float %214, !dbg !18
346
+ br i1 %181, label %216, label %__nv_erff.exit69, !dbg !18
347
+
348
+ 216: ; preds = %__internal_fmad.exit.i50
349
+ %217 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18
350
+ %218 = fsub float 1.000000e+00, %217, !dbg !18
351
+ %219 = bitcast float %218 to i32, !dbg !18
352
+ %220 = bitcast float %35 to i32, !dbg !18
353
+ %221 = and i32 %220, -2147483648, !dbg !18
354
+ %222 = or i32 %221, %219, !dbg !18
355
+ %223 = bitcast i32 %222 to float, !dbg !18
356
+ br label %__nv_erff.exit69, !dbg !18
357
+
358
+ __nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %216
359
+ %r.0.i65 = phi float [ %223, %216 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18
360
+ %224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
361
+ %.not.i70 = icmp eq i32 %224, 0, !dbg !18
362
+ %225 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18
363
+ %226 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18
364
+ %.0.i71 = select i1 %.not.i70, float %226, float %225, !dbg !18
365
+ %227 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18
366
+ br i1 %227, label %__nv_fabsf.exit1.i88, label %229, !dbg !18
367
+
368
+ __nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69
369
+ %228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
370
+ %.not1.i89 = icmp eq i32 %228, 0, !dbg !18
371
+ %.01.i90 = select i1 %.not1.i89, float %226, float %225, !dbg !18
372
+ br label %__internal_fmad.exit.i72, !dbg !18
373
+
374
+ 229: ; preds = %__nv_erff.exit69
375
+ %230 = fmul float %36, %36, !dbg !18
376
+ br label %__internal_fmad.exit.i72, !dbg !18
377
+
378
+ __internal_fmad.exit.i72: ; preds = %229, %__nv_fabsf.exit1.i88
379
+ %231 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %229 ], !dbg !18
380
+ %232 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %229 ], !dbg !18
381
+ %233 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %229 ], !dbg !18
382
+ %234 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %229 ], !dbg !18
383
+ %235 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %229 ], !dbg !18
384
+ %236 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %229 ], !dbg !18
385
+ %237 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %229 ], !dbg !18
386
+ %238 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %230, %229 ], !dbg !18
387
+ %239 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
388
+ %.not2.i73 = icmp eq i32 %239, 0, !dbg !18
389
+ %240 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %237, float %238, float %236) #4, !dbg !18
390
+ %241 = tail call float @llvm.nvvm.fma.rn.f(float %237, float %238, float %236) #4, !dbg !18
391
+ %.02.i74 = select i1 %.not2.i73, float %241, float %240, !dbg !18
392
+ %242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
393
+ %.not3.i75 = icmp eq i32 %242, 0, !dbg !18
394
+ %243 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %238, float %235) #4, !dbg !18
395
+ %244 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %238, float %235) #4, !dbg !18
396
+ %.03.i76 = select i1 %.not3.i75, float %244, float %243, !dbg !18
397
+ %245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
398
+ %.not4.i77 = icmp eq i32 %245, 0, !dbg !18
399
+ %246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %238, float %234) #4, !dbg !18
400
+ %247 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %238, float %234) #4, !dbg !18
401
+ %.04.i78 = select i1 %.not4.i77, float %247, float %246, !dbg !18
402
+ %248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
403
+ %.not5.i79 = icmp eq i32 %248, 0, !dbg !18
404
+ %249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %238, float %233) #4, !dbg !18
405
+ %250 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %238, float %233) #4, !dbg !18
406
+ %.05.i80 = select i1 %.not5.i79, float %250, float %249, !dbg !18
407
+ %251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
408
+ %.not6.i81 = icmp eq i32 %251, 0, !dbg !18
409
+ %252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %238, float %232) #4, !dbg !18
410
+ %253 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %238, float %232) #4, !dbg !18
411
+ %.06.i82 = select i1 %.not6.i81, float %253, float %252, !dbg !18
412
+ %254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
413
+ %.not7.i83 = icmp eq i32 %254, 0, !dbg !18
414
+ %255 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %238, float %231) #4, !dbg !18
415
+ %256 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %238, float %231) #4, !dbg !18
416
+ %.07.i84 = select i1 %.not7.i83, float %256, float %255, !dbg !18
417
+ %257 = fneg float %238, !dbg !18
418
+ %258 = select i1 %227, float %257, float %36, !dbg !18
419
+ %259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
420
+ %.not8.i85 = icmp eq i32 %259, 0, !dbg !18
421
+ %260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %258, float %258) #4, !dbg !18
422
+ %261 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %258, float %258) #4, !dbg !18
423
+ %.08.i86 = select i1 %.not8.i85, float %261, float %260, !dbg !18
424
+ br i1 %227, label %262, label %__nv_erff.exit91, !dbg !18
425
+
426
+ 262: ; preds = %__internal_fmad.exit.i72
427
+ %263 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18
428
+ %264 = fsub float 1.000000e+00, %263, !dbg !18
429
+ %265 = bitcast float %264 to i32, !dbg !18
430
+ %266 = bitcast float %36 to i32, !dbg !18
431
+ %267 = and i32 %266, -2147483648, !dbg !18
432
+ %268 = or i32 %267, %265, !dbg !18
433
+ %269 = bitcast i32 %268 to float, !dbg !18
434
+ br label %__nv_erff.exit91, !dbg !18
435
+
436
+ __nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %262
437
+ %r.0.i87 = phi float [ %269, %262 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18
438
+ %270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
439
+ %.not.i92 = icmp eq i32 %270, 0, !dbg !18
440
+ %271 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18
441
+ %272 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18
442
+ %.0.i93 = select i1 %.not.i92, float %272, float %271, !dbg !18
443
+ %273 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18
444
+ br i1 %273, label %__nv_fabsf.exit1.i110, label %275, !dbg !18
445
+
446
+ __nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91
447
+ %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
448
+ %.not1.i111 = icmp eq i32 %274, 0, !dbg !18
449
+ %.01.i112 = select i1 %.not1.i111, float %272, float %271, !dbg !18
450
+ br label %__internal_fmad.exit.i94, !dbg !18
451
+
452
+ 275: ; preds = %__nv_erff.exit91
453
+ %276 = fmul float %37, %37, !dbg !18
454
+ br label %__internal_fmad.exit.i94, !dbg !18
455
+
456
+ __internal_fmad.exit.i94: ; preds = %275, %__nv_fabsf.exit1.i110
457
+ %277 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %275 ], !dbg !18
458
+ %278 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %275 ], !dbg !18
459
+ %279 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %275 ], !dbg !18
460
+ %280 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %275 ], !dbg !18
461
+ %281 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %275 ], !dbg !18
462
+ %282 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %275 ], !dbg !18
463
+ %283 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %275 ], !dbg !18
464
+ %284 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %276, %275 ], !dbg !18
465
+ %285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
466
+ %.not2.i95 = icmp eq i32 %285, 0, !dbg !18
467
+ %286 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %283, float %284, float %282) #4, !dbg !18
468
+ %287 = tail call float @llvm.nvvm.fma.rn.f(float %283, float %284, float %282) #4, !dbg !18
469
+ %.02.i96 = select i1 %.not2.i95, float %287, float %286, !dbg !18
470
+ %288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
471
+ %.not3.i97 = icmp eq i32 %288, 0, !dbg !18
472
+ %289 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %284, float %281) #4, !dbg !18
473
+ %290 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %284, float %281) #4, !dbg !18
474
+ %.03.i98 = select i1 %.not3.i97, float %290, float %289, !dbg !18
475
+ %291 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
476
+ %.not4.i99 = icmp eq i32 %291, 0, !dbg !18
477
+ %292 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %284, float %280) #4, !dbg !18
478
+ %293 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %284, float %280) #4, !dbg !18
479
+ %.04.i100 = select i1 %.not4.i99, float %293, float %292, !dbg !18
480
+ %294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
481
+ %.not5.i101 = icmp eq i32 %294, 0, !dbg !18
482
+ %295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %284, float %279) #4, !dbg !18
483
+ %296 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %284, float %279) #4, !dbg !18
484
+ %.05.i102 = select i1 %.not5.i101, float %296, float %295, !dbg !18
485
+ %297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
486
+ %.not6.i103 = icmp eq i32 %297, 0, !dbg !18
487
+ %298 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %284, float %278) #4, !dbg !18
488
+ %299 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %284, float %278) #4, !dbg !18
489
+ %.06.i104 = select i1 %.not6.i103, float %299, float %298, !dbg !18
490
+ %300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
491
+ %.not7.i105 = icmp eq i32 %300, 0, !dbg !18
492
+ %301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %284, float %277) #4, !dbg !18
493
+ %302 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %284, float %277) #4, !dbg !18
494
+ %.07.i106 = select i1 %.not7.i105, float %302, float %301, !dbg !18
495
+ %303 = fneg float %284, !dbg !18
496
+ %304 = select i1 %273, float %303, float %37, !dbg !18
497
+ %305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
498
+ %.not8.i107 = icmp eq i32 %305, 0, !dbg !18
499
+ %306 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %304, float %304) #4, !dbg !18
500
+ %307 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %304, float %304) #4, !dbg !18
501
+ %.08.i108 = select i1 %.not8.i107, float %307, float %306, !dbg !18
502
+ br i1 %273, label %308, label %__nv_erff.exit113, !dbg !18
503
+
504
+ 308: ; preds = %__internal_fmad.exit.i94
505
+ %309 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18
506
+ %310 = fsub float 1.000000e+00, %309, !dbg !18
507
+ %311 = bitcast float %310 to i32, !dbg !18
508
+ %312 = bitcast float %37 to i32, !dbg !18
509
+ %313 = and i32 %312, -2147483648, !dbg !18
510
+ %314 = or i32 %313, %311, !dbg !18
511
+ %315 = bitcast i32 %314 to float, !dbg !18
512
+ br label %__nv_erff.exit113, !dbg !18
513
+
514
+ __nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %308
515
+ %r.0.i109 = phi float [ %315, %308 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18
516
+ %316 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
517
+ %.not.i114 = icmp eq i32 %316, 0, !dbg !18
518
+ %317 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18
519
+ %318 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18
520
+ %.0.i115 = select i1 %.not.i114, float %318, float %317, !dbg !18
521
+ %319 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18
522
+ br i1 %319, label %__nv_fabsf.exit1.i132, label %321, !dbg !18
523
+
524
+ __nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113
525
+ %320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
526
+ %.not1.i133 = icmp eq i32 %320, 0, !dbg !18
527
+ %.01.i134 = select i1 %.not1.i133, float %318, float %317, !dbg !18
528
+ br label %__internal_fmad.exit.i116, !dbg !18
529
+
530
+ 321: ; preds = %__nv_erff.exit113
531
+ %322 = fmul float %38, %38, !dbg !18
532
+ br label %__internal_fmad.exit.i116, !dbg !18
533
+
534
+ __internal_fmad.exit.i116: ; preds = %321, %__nv_fabsf.exit1.i132
535
+ %323 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %321 ], !dbg !18
536
+ %324 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %321 ], !dbg !18
537
+ %325 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %321 ], !dbg !18
538
+ %326 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %321 ], !dbg !18
539
+ %327 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %321 ], !dbg !18
540
+ %328 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %321 ], !dbg !18
541
+ %329 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %321 ], !dbg !18
542
+ %330 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %322, %321 ], !dbg !18
543
+ %331 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
544
+ %.not2.i117 = icmp eq i32 %331, 0, !dbg !18
545
+ %332 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %329, float %330, float %328) #4, !dbg !18
546
+ %333 = tail call float @llvm.nvvm.fma.rn.f(float %329, float %330, float %328) #4, !dbg !18
547
+ %.02.i118 = select i1 %.not2.i117, float %333, float %332, !dbg !18
548
+ %334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
549
+ %.not3.i119 = icmp eq i32 %334, 0, !dbg !18
550
+ %335 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %330, float %327) #4, !dbg !18
551
+ %336 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %330, float %327) #4, !dbg !18
552
+ %.03.i120 = select i1 %.not3.i119, float %336, float %335, !dbg !18
553
+ %337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
554
+ %.not4.i121 = icmp eq i32 %337, 0, !dbg !18
555
+ %338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %330, float %326) #4, !dbg !18
556
+ %339 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %330, float %326) #4, !dbg !18
557
+ %.04.i122 = select i1 %.not4.i121, float %339, float %338, !dbg !18
558
+ %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
559
+ %.not5.i123 = icmp eq i32 %340, 0, !dbg !18
560
+ %341 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %330, float %325) #4, !dbg !18
561
+ %342 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %330, float %325) #4, !dbg !18
562
+ %.05.i124 = select i1 %.not5.i123, float %342, float %341, !dbg !18
563
+ %343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
564
+ %.not6.i125 = icmp eq i32 %343, 0, !dbg !18
565
+ %344 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %330, float %324) #4, !dbg !18
566
+ %345 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %330, float %324) #4, !dbg !18
567
+ %.06.i126 = select i1 %.not6.i125, float %345, float %344, !dbg !18
568
+ %346 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
569
+ %.not7.i127 = icmp eq i32 %346, 0, !dbg !18
570
+ %347 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %330, float %323) #4, !dbg !18
571
+ %348 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %330, float %323) #4, !dbg !18
572
+ %.07.i128 = select i1 %.not7.i127, float %348, float %347, !dbg !18
573
+ %349 = fneg float %330, !dbg !18
574
+ %350 = select i1 %319, float %349, float %38, !dbg !18
575
+ %351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
576
+ %.not8.i129 = icmp eq i32 %351, 0, !dbg !18
577
+ %352 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %350, float %350) #4, !dbg !18
578
+ %353 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %350, float %350) #4, !dbg !18
579
+ %.08.i130 = select i1 %.not8.i129, float %353, float %352, !dbg !18
580
+ br i1 %319, label %354, label %__nv_erff.exit135, !dbg !18
581
+
582
+ 354: ; preds = %__internal_fmad.exit.i116
583
+ %355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18
584
+ %356 = fsub float 1.000000e+00, %355, !dbg !18
585
+ %357 = bitcast float %356 to i32, !dbg !18
586
+ %358 = bitcast float %38 to i32, !dbg !18
587
+ %359 = and i32 %358, -2147483648, !dbg !18
588
+ %360 = or i32 %359, %357, !dbg !18
589
+ %361 = bitcast i32 %360 to float, !dbg !18
590
+ br label %__nv_erff.exit135, !dbg !18
591
+
592
+ __nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %354
593
+ %r.0.i131 = phi float [ %361, %354 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18
594
+ %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
595
+ %.not.i136 = icmp eq i32 %362, 0, !dbg !18
596
+ %363 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18
597
+ %364 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18
598
+ %.0.i137 = select i1 %.not.i136, float %364, float %363, !dbg !18
599
+ %365 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18
600
+ br i1 %365, label %__nv_fabsf.exit1.i154, label %367, !dbg !18
601
+
602
+ __nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135
603
+ %366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
604
+ %.not1.i155 = icmp eq i32 %366, 0, !dbg !18
605
+ %.01.i156 = select i1 %.not1.i155, float %364, float %363, !dbg !18
606
+ br label %__internal_fmad.exit.i138, !dbg !18
607
+
608
+ 367: ; preds = %__nv_erff.exit135
609
+ %368 = fmul float %39, %39, !dbg !18
610
+ br label %__internal_fmad.exit.i138, !dbg !18
611
+
612
+ __internal_fmad.exit.i138: ; preds = %367, %__nv_fabsf.exit1.i154
613
+ %369 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %367 ], !dbg !18
614
+ %370 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %367 ], !dbg !18
615
+ %371 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %367 ], !dbg !18
616
+ %372 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %367 ], !dbg !18
617
+ %373 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %367 ], !dbg !18
618
+ %374 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %367 ], !dbg !18
619
+ %375 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %367 ], !dbg !18
620
+ %376 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %368, %367 ], !dbg !18
621
+ %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
622
+ %.not2.i139 = icmp eq i32 %377, 0, !dbg !18
623
+ %378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float %376, float %374) #4, !dbg !18
624
+ %379 = tail call float @llvm.nvvm.fma.rn.f(float %375, float %376, float %374) #4, !dbg !18
625
+ %.02.i140 = select i1 %.not2.i139, float %379, float %378, !dbg !18
626
+ %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
627
+ %.not3.i141 = icmp eq i32 %380, 0, !dbg !18
628
+ %381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %376, float %373) #4, !dbg !18
629
+ %382 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %376, float %373) #4, !dbg !18
630
+ %.03.i142 = select i1 %.not3.i141, float %382, float %381, !dbg !18
631
+ %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
632
+ %.not4.i143 = icmp eq i32 %383, 0, !dbg !18
633
+ %384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %376, float %372) #4, !dbg !18
634
+ %385 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %376, float %372) #4, !dbg !18
635
+ %.04.i144 = select i1 %.not4.i143, float %385, float %384, !dbg !18
636
+ %386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
637
+ %.not5.i145 = icmp eq i32 %386, 0, !dbg !18
638
+ %387 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %376, float %371) #4, !dbg !18
639
+ %388 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %376, float %371) #4, !dbg !18
640
+ %.05.i146 = select i1 %.not5.i145, float %388, float %387, !dbg !18
641
+ %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
642
+ %.not6.i147 = icmp eq i32 %389, 0, !dbg !18
643
+ %390 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %376, float %370) #4, !dbg !18
644
+ %391 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %376, float %370) #4, !dbg !18
645
+ %.06.i148 = select i1 %.not6.i147, float %391, float %390, !dbg !18
646
+ %392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
647
+ %.not7.i149 = icmp eq i32 %392, 0, !dbg !18
648
+ %393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %376, float %369) #4, !dbg !18
649
+ %394 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %376, float %369) #4, !dbg !18
650
+ %.07.i150 = select i1 %.not7.i149, float %394, float %393, !dbg !18
651
+ %395 = fneg float %376, !dbg !18
652
+ %396 = select i1 %365, float %395, float %39, !dbg !18
653
+ %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
654
+ %.not8.i151 = icmp eq i32 %397, 0, !dbg !18
655
+ %398 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %396, float %396) #4, !dbg !18
656
+ %399 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %396, float %396) #4, !dbg !18
657
+ %.08.i152 = select i1 %.not8.i151, float %399, float %398, !dbg !18
658
+ br i1 %365, label %400, label %__nv_erff.exit157, !dbg !18
659
+
660
+ 400: ; preds = %__internal_fmad.exit.i138
661
+ %401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18
662
+ %402 = fsub float 1.000000e+00, %401, !dbg !18
663
+ %403 = bitcast float %402 to i32, !dbg !18
664
+ %404 = bitcast float %39 to i32, !dbg !18
665
+ %405 = and i32 %404, -2147483648, !dbg !18
666
+ %406 = or i32 %405, %403, !dbg !18
667
+ %407 = bitcast i32 %406 to float, !dbg !18
668
+ br label %__nv_erff.exit157, !dbg !18
669
+
670
+ __nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %400
671
+ %r.0.i153 = phi float [ %407, %400 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18
672
+ %408 = fmul float %31, 5.000000e-01, !dbg !19
673
+ %409 = fmul float %30, 5.000000e-01, !dbg !19
674
+ %410 = fmul float %29, 5.000000e-01, !dbg !19
675
+ %411 = fmul float %28, 5.000000e-01, !dbg !19
676
+ %412 = fmul float %27, 5.000000e-01, !dbg !19
677
+ %413 = fmul float %26, 5.000000e-01, !dbg !19
678
+ %414 = fmul float %25, 5.000000e-01, !dbg !19
679
+ %415 = fmul float %24, 5.000000e-01, !dbg !19
680
+ %416 = fadd float %r.0.i, 1.000000e+00, !dbg !20
681
+ %417 = fadd float %r.0.i21, 1.000000e+00, !dbg !20
682
+ %418 = fadd float %r.0.i43, 1.000000e+00, !dbg !20
683
+ %419 = fadd float %r.0.i65, 1.000000e+00, !dbg !20
684
+ %420 = fadd float %r.0.i87, 1.000000e+00, !dbg !20
685
+ %421 = fadd float %r.0.i109, 1.000000e+00, !dbg !20
686
+ %422 = fadd float %r.0.i131, 1.000000e+00, !dbg !20
687
+ %423 = fadd float %r.0.i153, 1.000000e+00, !dbg !20
688
+ %424 = fmul float %415, %416, !dbg !21
689
+ %425 = fmul float %414, %417, !dbg !21
690
+ %426 = fmul float %413, %418, !dbg !21
691
+ %427 = fmul float %412, %419, !dbg !21
692
+ %428 = fmul float %411, %420, !dbg !21
693
+ %429 = fmul float %410, %421, !dbg !21
694
+ %430 = fmul float %409, %422, !dbg !21
695
+ %431 = fmul float %408, %423, !dbg !21
696
+ %432 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %424) #4, !dbg !22
697
+ %433 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !22
698
+ %434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !22
699
+ %435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !22
700
+ %436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !22
701
+ %437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !22
702
+ %438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !22
703
+ %439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !22
704
+ %440 = insertelement <2 x i16> undef, i16 %432, i64 0, !dbg !22
705
+ %441 = insertelement <2 x i16> %440, i16 %433, i64 1, !dbg !22
706
+ %442 = bitcast <2 x i16> %441 to i32, !dbg !22
707
+ %443 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !22
708
+ %444 = insertelement <2 x i16> %443, i16 %435, i64 1, !dbg !22
709
+ %445 = bitcast <2 x i16> %444 to i32, !dbg !22
710
+ %446 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !22
711
+ %447 = insertelement <2 x i16> %446, i16 %437, i64 1, !dbg !22
712
+ %448 = bitcast <2 x i16> %447 to i32, !dbg !22
713
+ %449 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !22
714
+ %450 = insertelement <2 x i16> %449, i16 %439, i64 1, !dbg !22
715
+ %451 = bitcast <2 x i16> %450 to i32, !dbg !22
716
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %442, i32 %445, i32 %448, i32 %451, ptr addrspace(1) %10, i1 true) #4, !dbg !22
717
+ ret void, !dbg !23
718
+ }
719
+
720
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
721
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
722
+
723
+ ; Function Attrs: alwaysinline nounwind
724
+ define float @__nv_erff(float %a) local_unnamed_addr #1 {
725
+ __nv_fabsf.exit:
726
+ %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
727
+ %.not = icmp eq i32 %0, 0
728
+ %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
729
+ %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
730
+ %.0 = select i1 %.not, float %2, float %1
731
+ %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
732
+ br i1 %3, label %__nv_fabsf.exit1, label %5
733
+
734
+ __nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
735
+ %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
736
+ %.not1 = icmp eq i32 %4, 0
737
+ %.01 = select i1 %.not1, float %2, float %1
738
+ br label %__internal_fmad.exit
739
+
740
+ 5: ; preds = %__nv_fabsf.exit
741
+ %6 = fmul float %a, %a
742
+ br label %__internal_fmad.exit
743
+
744
+ __internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
745
+ %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
746
+ %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
747
+ %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
748
+ %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
749
+ %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
750
+ %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
751
+ %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
752
+ %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
753
+ %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
754
+ %.not2 = icmp eq i32 %15, 0
755
+ %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
756
+ %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
757
+ %.02 = select i1 %.not2, float %17, float %16
758
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
759
+ %.not3 = icmp eq i32 %18, 0
760
+ %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
761
+ %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
762
+ %.03 = select i1 %.not3, float %20, float %19
763
+ %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
764
+ %.not4 = icmp eq i32 %21, 0
765
+ %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
766
+ %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
767
+ %.04 = select i1 %.not4, float %23, float %22
768
+ %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
769
+ %.not5 = icmp eq i32 %24, 0
770
+ %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
771
+ %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
772
+ %.05 = select i1 %.not5, float %26, float %25
773
+ %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
774
+ %.not6 = icmp eq i32 %27, 0
775
+ %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
776
+ %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
777
+ %.06 = select i1 %.not6, float %29, float %28
778
+ %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
779
+ %.not7 = icmp eq i32 %30, 0
780
+ %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
781
+ %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
782
+ %.07 = select i1 %.not7, float %32, float %31
783
+ %33 = fneg float %14
784
+ %34 = select i1 %3, float %33, float %a
785
+ %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
786
+ %.not8 = icmp eq i32 %35, 0
787
+ %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
788
+ %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
789
+ %.08 = select i1 %.not8, float %37, float %36
790
+ br i1 %3, label %38, label %46
791
+
792
+ 38: ; preds = %__internal_fmad.exit
793
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
794
+ %40 = fsub float 1.000000e+00, %39
795
+ %41 = bitcast float %40 to i32
796
+ %42 = bitcast float %a to i32
797
+ %43 = and i32 %42, -2147483648
798
+ %44 = or i32 %43, %41
799
+ %45 = bitcast i32 %44 to float
800
+ br label %46
801
+
802
+ 46: ; preds = %38, %__internal_fmad.exit
803
+ %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
804
+ ret float %r.0
805
+ }
806
+
807
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
808
+
809
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
810
+ declare float @llvm.nvvm.fabs.ftz.f(float) #0
811
+
812
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
813
+ declare float @llvm.nvvm.fabs.f(float) #0
814
+
815
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
816
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
817
+
818
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
819
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
820
+
821
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
822
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
823
+
824
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
825
+ attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
826
+ attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
827
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
828
+ attributes #4 = { nounwind }
829
+
830
+ !llvm.module.flags = !{!0, !1}
831
+ !llvm.dbg.cu = !{!2}
832
+ !nvvm.annotations = !{!4, !5, !5, !4}
833
+ !llvm.ident = !{!6}
834
+
835
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
836
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
837
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
838
+ !3 = !DIFile(filename: "cafucwnmq4o436kwzkmrinerrnocxll7q6wsadcl726g6cradipo.py", directory: "/tmp/torchinductor_root/af")
839
+ !4 = !{ptr @triton__0d1de, !"kernel", i32 1}
840
+ !5 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
841
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
842
+ !7 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
843
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
844
+ !9 = !{}
845
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
846
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
847
+ !12 = !DILocation(line: 20, column: 33, scope: !7)
848
+ !13 = !DILocation(line: 21, column: 23, scope: !7)
849
+ !14 = !DILocation(line: 24, column: 34, scope: !7)
850
+ !15 = !DILocation(line: 24, column: 39, scope: !7)
851
+ !16 = !DILocation(line: 24, column: 48, scope: !7)
852
+ !17 = !DILocation(line: 29, column: 18, scope: !7)
853
+ !18 = !DILocation(line: 30, column: 23, scope: !7)
854
+ !19 = !DILocation(line: 27, column: 18, scope: !7)
855
+ !20 = !DILocation(line: 32, column: 18, scope: !7)
856
+ !21 = !DILocation(line: 33, column: 18, scope: !7)
857
+ !22 = !DILocation(line: 35, column: 40, scope: !7)
858
+ !23 = !DILocation(line: 35, column: 4, scope: !7)
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c1024_i32 = arith.constant 1024 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c1024_i32 : i32
6
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
8
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
12
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
13
+ %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
14
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
15
+ tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.llir ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
11
+
12
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
13
+
14
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
15
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
16
+ %9 = lshr i32 %8, 2, !dbg !10
17
+ %10 = and i32 %9, 63, !dbg !10
18
+ %11 = and i32 %8, 63, !dbg !10
19
+ %12 = and i32 %8, 3, !dbg !11
20
+ %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !12
21
+ %14 = shl i32 %13, 6, !dbg !13
22
+ %15 = or i32 %14, %10, !dbg !14
23
+ %16 = or i32 %14, %11, !dbg !14
24
+ %17 = sext i32 %15 to i64, !dbg !15
25
+ %18 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !15
26
+ %19 = sext i32 %16 to i64, !dbg !15
27
+ %20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
28
+ %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %18, i1 true) #5, !dbg !16
29
+ %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #5, !dbg !16
30
+ %23 = srem i32 %15, 512, !dbg !17
31
+ %24 = shl nsw i32 %23, 8, !dbg !18
32
+ %25 = add i64 %22, 50257, !dbg !19
33
+ %26 = icmp slt i64 %21, 0, !dbg !20
34
+ %27 = icmp slt i64 %22, 0, !dbg !20
35
+ %28 = select i1 %27, i64 %25, i64 %22, !dbg !21
36
+ %.fr8 = freeze i64 %28, !dbg !22
37
+ %29 = icmp ugt i64 %.fr8, 50256, !dbg !22
38
+ %30 = shl i64 %21, 8, !dbg !23
39
+ %31 = add i64 %30, 12865792, !dbg !23
40
+ %32 = select i1 %26, i64 %31, i64 %30, !dbg !23
41
+ %33 = getelementptr float, ptr addrspace(1) %1, i64 %32
42
+ br i1 %29, label %.split.us, label %.split, !dbg !24
43
+
44
+ .split.us: ; preds = %7, %.split.us
45
+ %34 = phi float [ %50, %.split.us ], [ 0.000000e+00, %7 ]
46
+ %35 = phi float [ %55, %.split.us ], [ 0.000000e+00, %7 ]
47
+ %36 = phi float [ %52, %.split.us ], [ 0.000000e+00, %7 ]
48
+ %37 = phi i32 [ %56, %.split.us ], [ 0, %7 ]
49
+ %38 = or i32 %37, %12, !dbg !25
50
+ %39 = add i32 %38, %24, !dbg !26
51
+ %40 = sext i32 %39 to i64, !dbg !27
52
+ %41 = getelementptr float, ptr addrspace(1) %2, i64 %40, !dbg !27
53
+ %42 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true) #5, !dbg !28
54
+ %43 = bitcast i32 %42 to float, !dbg !28
55
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !24
56
+ %44 = zext nneg i32 %38 to i64, !dbg !29
57
+ %45 = getelementptr float, ptr addrspace(1) %33, i64 %44, !dbg !30
58
+ %46 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %45, i1 true, i32 0, i1 true) #5, !dbg !31
59
+ %47 = bitcast i32 %46 to float, !dbg !31
60
+ %48 = fadd float %43, %47, !dbg !32
61
+ %49 = fsub float %48, %36, !dbg !33
62
+ %50 = fadd float %34, 1.000000e+00, !dbg !37
63
+ %51 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %49, float %50) #5, !dbg !38
64
+ %52 = fadd float %36, %51, !dbg !39
65
+ %53 = fsub float %48, %52, !dbg !40
66
+ %54 = fmul float %49, %53, !dbg !41
67
+ %55 = fadd float %35, %54, !dbg !42
68
+ %56 = add nuw nsw i32 %37, 4, !dbg !43
69
+ %57 = icmp ult i32 %37, 252, !dbg !43
70
+ br i1 %57, label %.split.us, label %.split5.us, !dbg !43
71
+
72
+ .split: ; preds = %7, %.split
73
+ %58 = phi float [ %74, %.split ], [ 0.000000e+00, %7 ]
74
+ %59 = phi float [ %79, %.split ], [ 0.000000e+00, %7 ]
75
+ %60 = phi float [ %76, %.split ], [ 0.000000e+00, %7 ]
76
+ %61 = phi i32 [ %80, %.split ], [ 0, %7 ]
77
+ %62 = or i32 %61, %12, !dbg !25
78
+ %63 = add i32 %62, %24, !dbg !26
79
+ %64 = sext i32 %63 to i64, !dbg !27
80
+ %65 = getelementptr float, ptr addrspace(1) %2, i64 %64, !dbg !27
81
+ %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true) #5, !dbg !28
82
+ %67 = bitcast i32 %66 to float, !dbg !28
83
+ %68 = zext nneg i32 %62 to i64, !dbg !29
84
+ %69 = getelementptr float, ptr addrspace(1) %33, i64 %68, !dbg !30
85
+ %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %69, i1 true, i32 0, i1 true) #5, !dbg !31
86
+ %71 = bitcast i32 %70 to float, !dbg !31
87
+ %72 = fadd float %67, %71, !dbg !32
88
+ %73 = fsub float %72, %60, !dbg !33
89
+ %74 = fadd float %58, 1.000000e+00, !dbg !37
90
+ %75 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %73, float %74) #5, !dbg !38
91
+ %76 = fadd float %60, %75, !dbg !39
92
+ %77 = fsub float %72, %76, !dbg !40
93
+ %78 = fmul float %73, %77, !dbg !41
94
+ %79 = fadd float %59, %78, !dbg !42
95
+ %80 = add nuw nsw i32 %61, 4, !dbg !43
96
+ %81 = icmp ult i32 %61, 252, !dbg !43
97
+ br i1 %81, label %.split, label %.split5.us, !dbg !43
98
+
99
+ .split5.us: ; preds = %.split, %.split.us
100
+ %.us-phi = phi float [ %52, %.split.us ], [ %76, %.split ]
101
+ %.us-phi6 = phi float [ %55, %.split.us ], [ %79, %.split ]
102
+ %.us-phi7 = phi float [ %50, %.split.us ], [ %74, %.split ]
103
+ %82 = bitcast float %.us-phi to i32, !dbg !44
104
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !44
105
+ %84 = bitcast i32 %83 to float, !dbg !44
106
+ %85 = bitcast float %.us-phi6 to i32, !dbg !44
107
+ %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 2, i32 31), !dbg !44
108
+ %87 = bitcast i32 %86 to float, !dbg !44
109
+ %88 = bitcast float %.us-phi7 to i32, !dbg !44
110
+ %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !44
111
+ %90 = bitcast i32 %89 to float, !dbg !44
112
+ %91 = fsub float %84, %.us-phi, !dbg !46
113
+ %92 = fadd float %.us-phi7, %90, !dbg !50
114
+ %93 = fcmp oeq float %92, 0.000000e+00, !dbg !51
115
+ %94 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %90, float %92) #5, !dbg !52
116
+ %95 = select i1 %93, float 0.000000e+00, float %94, !dbg !53
117
+ %96 = fmul float %91, %95, !dbg !54
118
+ %97 = fadd float %.us-phi, %96, !dbg !55
119
+ %98 = fadd float %.us-phi6, %87, !dbg !56
120
+ %99 = fmul float %91, %91, !dbg !57
121
+ %100 = fmul float %.us-phi7, %99, !dbg !58
122
+ %101 = fmul float %100, %95, !dbg !59
123
+ %102 = fadd float %98, %101, !dbg !60
124
+ %103 = bitcast float %97 to i32, !dbg !44
125
+ %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 1, i32 31), !dbg !44
126
+ %105 = bitcast i32 %104 to float, !dbg !44
127
+ %106 = bitcast float %102 to i32, !dbg !44
128
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !44
129
+ %108 = bitcast i32 %107 to float, !dbg !44
130
+ %109 = bitcast float %92 to i32, !dbg !44
131
+ %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 1, i32 31), !dbg !44
132
+ %111 = bitcast i32 %110 to float, !dbg !44
133
+ %112 = fsub float %105, %97, !dbg !46
134
+ %113 = fadd float %92, %111, !dbg !50
135
+ %114 = fcmp oeq float %113, 0.000000e+00, !dbg !51
136
+ %115 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %111, float %113) #5, !dbg !52
137
+ %116 = select i1 %114, float 0.000000e+00, float %115, !dbg !53
138
+ %117 = fmul float %112, %116, !dbg !54
139
+ %118 = fadd float %97, %117, !dbg !55
140
+ %119 = fadd float %102, %108, !dbg !56
141
+ %120 = fmul float %112, %112, !dbg !57
142
+ %121 = fmul float %92, %120, !dbg !58
143
+ %122 = fmul float %116, %121, !dbg !59
144
+ %123 = fadd float %119, %122, !dbg !60
145
+ %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #5, !dbg !61
146
+ %125 = fadd float %124, 0x3EE4F8B580000000, !dbg !62
147
+ %126 = shl i32 %15, 8, !dbg !63
148
+ br label %127, !dbg !64
149
+
150
+ 127: ; preds = %.split5.us, %__nv_rsqrtf.exit
151
+ %128 = phi i32 [ 0, %.split5.us ], [ %157, %__nv_rsqrtf.exit ]
152
+ %129 = or i32 %128, %12, !dbg !65
153
+ %130 = add i32 %129, %24, !dbg !66
154
+ %131 = sext i32 %130 to i64, !dbg !67
155
+ %132 = getelementptr float, ptr addrspace(1) %2, i64 %131, !dbg !67
156
+ %133 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %132, i1 true, i32 0, i1 true) #5, !dbg !68
157
+ %134 = bitcast i32 %133 to float, !dbg !68
158
+ %135 = zext nneg i32 %129 to i64, !dbg !69
159
+ %136 = getelementptr float, ptr addrspace(1) %3, i64 %135, !dbg !69
160
+ %137 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %136, i1 true, i32 0, i1 true) #5, !dbg !70
161
+ %138 = bitcast i32 %137 to float, !dbg !70
162
+ br i1 %29, label %139, label %140, !dbg !71
163
+
164
+ 139: ; preds = %127
165
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
166
+ br label %140, !dbg !71
167
+
168
+ 140: ; preds = %139, %127
169
+ %141 = getelementptr float, ptr addrspace(1) %33, i64 %135, !dbg !72
170
+ %142 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true) #5, !dbg !73
171
+ %143 = bitcast i32 %142 to float, !dbg !73
172
+ %144 = fadd float %134, %143, !dbg !74
173
+ %145 = fsub float %144, %118, !dbg !75
174
+ %146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !76
175
+ %.not.i = icmp eq i32 %146, 0, !dbg !76
176
+ br i1 %.not.i, label %149, label %147, !dbg !76
177
+
178
+ 147: ; preds = %140
179
+ %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %125), !dbg !76
180
+ br label %__nv_rsqrtf.exit, !dbg !76
181
+
182
+ 149: ; preds = %140
183
+ %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %125), !dbg !76
184
+ br label %__nv_rsqrtf.exit, !dbg !76
185
+
186
+ __nv_rsqrtf.exit: ; preds = %147, %149
187
+ %.0.i = phi float [ %148, %147 ], [ %150, %149 ], !dbg !76
188
+ %151 = fmul float %145, %.0.i, !dbg !77
189
+ %152 = fmul float %151, %138, !dbg !78
190
+ %153 = add i32 %129, %126, !dbg !79
191
+ %154 = sext i32 %153 to i64, !dbg !80
192
+ %155 = getelementptr i16, ptr addrspace(1) %4, i64 %154, !dbg !80
193
+ %156 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %152) #5, !dbg !81
194
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %156, ptr addrspace(1) %155, i1 true) #5, !dbg !81
195
+ %157 = add nuw nsw i32 %128, 4, !dbg !64
196
+ %158 = icmp ult i32 %128, 252, !dbg !64
197
+ br i1 %158, label %127, label %159, !dbg !64
198
+
199
+ 159: ; preds = %__nv_rsqrtf.exit
200
+ ret void, !dbg !82
201
+ }
202
+
203
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
204
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
205
+
206
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
207
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
208
+
209
+ ; Function Attrs: alwaysinline nounwind
210
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #2 {
211
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
212
+ %.not = icmp eq i32 %1, 0
213
+ br i1 %.not, label %4, label %2
214
+
215
+ 2: ; preds = %0
216
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
217
+ br label %6
218
+
219
+ 4: ; preds = %0
220
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
221
+ br label %6
222
+
223
+ 6: ; preds = %4, %2
224
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
225
+ ret float %.0
226
+ }
227
+
228
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #3
229
+
230
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
231
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
232
+
233
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
234
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #4
235
+
236
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
237
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
238
+ attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
239
+ attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
240
+ attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
241
+ attributes #5 = { nounwind }
242
+
243
+ !llvm.module.flags = !{!0, !1}
244
+ !llvm.dbg.cu = !{!2}
245
+ !nvvm.annotations = !{!4, !5, !5, !4}
246
+ !llvm.ident = !{!6}
247
+
248
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
249
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
250
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
251
+ !3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh")
252
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
253
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
254
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
255
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
256
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
257
+ !9 = !{}
258
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
259
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
260
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
261
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
262
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
263
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
264
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
265
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
266
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
267
+ !19 = !DILocation(line: 36, column: 22, scope: !7)
268
+ !20 = !DILocation(line: 37, column: 22, scope: !7)
269
+ !21 = !DILocation(line: 38, column: 36, scope: !7)
270
+ !22 = !DILocation(line: 39, column: 40, scope: !7)
271
+ !23 = !DILocation(line: 40, column: 44, scope: !7)
272
+ !24 = !DILocation(line: 39, column: 55, scope: !7)
273
+ !25 = !DILocation(line: 32, column: 27, scope: !7)
274
+ !26 = !DILocation(line: 35, column: 40, scope: !7)
275
+ !27 = !DILocation(line: 35, column: 34, scope: !7)
276
+ !28 = !DILocation(line: 35, column: 50, scope: !7)
277
+ !29 = !DILocation(line: 40, column: 40, scope: !7)
278
+ !30 = !DILocation(line: 40, column: 34, scope: !7)
279
+ !31 = !DILocation(line: 40, column: 52, scope: !7)
280
+ !32 = !DILocation(line: 41, column: 22, scope: !7)
281
+ !33 = !DILocation(line: 96, column: 20, scope: !34, inlinedAt: !36)
282
+ !34 = distinct !DILexicalBlockFile(scope: !7, file: !35, discriminator: 0)
283
+ !35 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
284
+ !36 = !DILocation(line: 44, column: 38, scope: !34)
285
+ !37 = !DILocation(line: 97, column: 26, scope: !34, inlinedAt: !36)
286
+ !38 = !DILocation(line: 98, column: 30, scope: !34, inlinedAt: !36)
287
+ !39 = !DILocation(line: 98, column: 22, scope: !34, inlinedAt: !36)
288
+ !40 = !DILocation(line: 101, column: 30, scope: !34, inlinedAt: !36)
289
+ !41 = !DILocation(line: 101, column: 22, scope: !34, inlinedAt: !36)
290
+ !42 = !DILocation(line: 47, column: 48, scope: !7)
291
+ !43 = !DILocation(line: 31, column: 36, scope: !7)
292
+ !44 = !DILocation(line: 120, column: 46, scope: !34, inlinedAt: !45)
293
+ !45 = !DILocation(line: 50, column: 41, scope: !34)
294
+ !46 = !DILocation(line: 108, column: 21, scope: !47, inlinedAt: !48)
295
+ !47 = distinct !DILexicalBlockFile(scope: !34, file: !35, discriminator: 0)
296
+ !48 = !DILocation(line: 120, column: 46, scope: !47, inlinedAt: !49)
297
+ !49 = !DILocation(line: 50, column: 41, scope: !47)
298
+ !50 = !DILocation(line: 109, column: 28, scope: !47, inlinedAt: !48)
299
+ !51 = !DILocation(line: 110, column: 39, scope: !47, inlinedAt: !48)
300
+ !52 = !DILocation(line: 110, column: 60, scope: !47, inlinedAt: !48)
301
+ !53 = !DILocation(line: 110, column: 49, scope: !47, inlinedAt: !48)
302
+ !54 = !DILocation(line: 112, column: 25, scope: !47, inlinedAt: !48)
303
+ !55 = !DILocation(line: 112, column: 17, scope: !47, inlinedAt: !48)
304
+ !56 = !DILocation(line: 113, column: 15, scope: !47, inlinedAt: !48)
305
+ !57 = !DILocation(line: 113, column: 30, scope: !47, inlinedAt: !48)
306
+ !58 = !DILocation(line: 113, column: 38, scope: !47, inlinedAt: !48)
307
+ !59 = !DILocation(line: 113, column: 49, scope: !47, inlinedAt: !48)
308
+ !60 = !DILocation(line: 113, column: 22, scope: !47, inlinedAt: !48)
309
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
310
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
311
+ !63 = !DILocation(line: 76, column: 39, scope: !7)
312
+ !64 = !DILocation(line: 55, column: 36, scope: !7)
313
+ !65 = !DILocation(line: 56, column: 27, scope: !7)
314
+ !66 = !DILocation(line: 59, column: 41, scope: !7)
315
+ !67 = !DILocation(line: 59, column: 35, scope: !7)
316
+ !68 = !DILocation(line: 59, column: 51, scope: !7)
317
+ !69 = !DILocation(line: 60, column: 35, scope: !7)
318
+ !70 = !DILocation(line: 60, column: 40, scope: !7)
319
+ !71 = !DILocation(line: 64, column: 57, scope: !7)
320
+ !72 = !DILocation(line: 65, column: 35, scope: !7)
321
+ !73 = !DILocation(line: 65, column: 54, scope: !7)
322
+ !74 = !DILocation(line: 66, column: 24, scope: !7)
323
+ !75 = !DILocation(line: 67, column: 24, scope: !7)
324
+ !76 = !DILocation(line: 72, column: 30, scope: !7)
325
+ !77 = !DILocation(line: 73, column: 24, scope: !7)
326
+ !78 = !DILocation(line: 74, column: 24, scope: !7)
327
+ !79 = !DILocation(line: 76, column: 35, scope: !7)
328
+ !80 = !DILocation(line: 76, column: 29, scope: !7)
329
+ !81 = !DILocation(line: 76, column: 52, scope: !7)
330
+ !82 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx ADDED
@@ -0,0 +1,756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
26
+
27
+ .visible .entry triton__0d1d2d3d4d5de6de(
28
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
29
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
30
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
31
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
32
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
33
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
34
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
35
+ )
36
+ .maxntid 256, 1, 1
37
+ {
38
+ .reg .pred %p<27>;
39
+ .reg .b16 %rs<3>;
40
+ .reg .b32 %r<81>;
41
+ .reg .f32 %f<73>;
42
+ .reg .b64 %rd<84>;
43
+ .loc 1 18 0
44
+ $L__func_begin0:
45
+ .loc 1 18 0
46
+
47
+ ld.param.u64 %rd35, [triton__0d1d2d3d4d5de6de_param_3];
48
+ ld.param.u64 %rd34, [triton__0d1d2d3d4d5de6de_param_2];
49
+ ld.param.u64 %rd33, [triton__0d1d2d3d4d5de6de_param_1];
50
+ ld.param.u64 %rd41, [triton__0d1d2d3d4d5de6de_param_0];
51
+ $L__tmp0:
52
+ .loc 1 22 44
53
+ mov.u32 %r1, %tid.x;
54
+ bfe.u32 %r2, %r1, 2, 6;
55
+ and.b32 %r14, %r1, 63;
56
+ .loc 1 24 33
57
+ and.b32 %r3, %r1, 3;
58
+ .loc 1 21 28
59
+ mov.u32 %r13, %ctaid.x;
60
+ .loc 1 21 33
61
+ shl.b32 %r15, %r13, 6;
62
+ .loc 1 22 23
63
+ or.b32 %r16, %r15, %r2;
64
+ or.b32 %r17, %r15, %r14;
65
+ .loc 1 26 30
66
+ mul.wide.s32 %rd42, %r16, 8;
67
+ add.s64 %rd38, %rd41, %rd42;
68
+ mul.wide.s32 %rd43, %r17, 8;
69
+ add.s64 %rd40, %rd41, %rd43;
70
+ mov.pred %p11, -1;
71
+ .loc 1 26 35
72
+ mov.u64 %rd37, 0x0;
73
+ @%p11 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd38 + 0 ];
74
+ mov.u64 %rd39, 0x0;
75
+ @%p11 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ];
76
+ .loc 1 27 18
77
+ bfe.s32 %r18, %r13, 25, 1;
78
+ shr.u32 %r19, %r18, 23;
79
+ add.s32 %r20, %r16, %r19;
80
+ and.b32 %r21, %r20, 16776704;
81
+ sub.s32 %r22, %r16, %r21;
82
+ .loc 1 35 44
83
+ shl.b32 %r5, %r22, 8;
84
+ .loc 1 36 22
85
+ add.s64 %rd44, %rd39, 50257;
86
+ .loc 1 37 22
87
+ setp.lt.s64 %p3, %rd37, 0;
88
+ setp.lt.s64 %p4, %rd39, 0;
89
+ .loc 1 38 36
90
+ selp.b64 %rd45, %rd44, %rd39, %p4;
91
+ .loc 1 39 40
92
+ setp.gt.u64 %p5, %rd45, 50256;
93
+ .loc 1 40 44
94
+ shl.b64 %rd46, %rd37, 8;
95
+ add.s64 %rd47, %rd46, 12865792;
96
+ selp.b64 %rd2, %rd47, %rd46, %p3;
97
+ mov.b32 %r67, 0;
98
+ mov.b32 %r77, 883;
99
+ mov.u64 %rd73, 1;
100
+ .loc 1 39 55
101
+ @%p5 bra $L__BB0_3;
102
+ bra.uni $L__BB0_1;
103
+ $L__BB0_3:
104
+ .loc 1 31 36
105
+ shl.b64 %rd51, %rd2, 2;
106
+ mul.wide.u32 %rd80, %r3, 4;
107
+ add.s64 %rd79, %rd51, %rd80;
108
+ add.s64 %rd75, %rd33, %rd79;
109
+ add.s32 %r35, %r5, %r3;
110
+ mul.wide.s32 %rd78, %r35, 4;
111
+ add.s64 %rd74, %rd34, %rd78;
112
+ mov.f32 %f72, 0f00000000;
113
+ mov.b32 %r78, -4;
114
+ mov.f32 %f71, %f72;
115
+ mov.f32 %f70, %f72;
116
+ $L__BB0_4:
117
+ .loc 1 35 50
118
+ mov.u32 %r36, 0x0;
119
+ @%p11 ld.global.L1::evict_last.b32 { %r36 }, [ %rd74 + 0 ];
120
+ @!%p11 mov.u32 %r36, %r67;
121
+ mov.b32 %f28, %r36;
122
+ .loc 1 39 55
123
+ mov.u64 %rd54, assertMessage_0;
124
+ cvta.global.u64 %rd55, %rd54;
125
+ mov.u64 %rd56, assertFile_0;
126
+ cvta.global.u64 %rd57, %rd56;
127
+ mov.u64 %rd58, assertFunc_0;
128
+ cvta.global.u64 %rd59, %rd58;
129
+ { // callseq 10, 0
130
+ .reg .b32 temp_param_reg;
131
+ .param .b64 param0;
132
+ st.param.b64 [param0+0], %rd55;
133
+ .param .b64 param1;
134
+ st.param.b64 [param1+0], %rd57;
135
+ .param .b32 param2;
136
+ st.param.b32 [param2+0], %r77;
137
+ .param .b64 param3;
138
+ st.param.b64 [param3+0], %rd59;
139
+ .param .b64 param4;
140
+ st.param.b64 [param4+0], %rd73;
141
+ call.uni
142
+ __assertfail,
143
+ (
144
+ param0,
145
+ param1,
146
+ param2,
147
+ param3,
148
+ param4
149
+ );
150
+ } // callseq 10
151
+ .loc 1 40 52
152
+ mov.u32 %r38, 0x0;
153
+ @%p11 ld.global.L1::evict_last.b32 { %r38 }, [ %rd75 + 0 ];
154
+ @!%p11 mov.u32 %r38, %r67;
155
+ mov.b32 %f29, %r38;
156
+ .loc 1 41 22
157
+ add.f32 %f30, %f28, %f29;
158
+ $L__tmp1:
159
+ .loc 2 96 20
160
+ sub.f32 %f31, %f30, %f70;
161
+ .loc 2 97 26
162
+ add.f32 %f72, %f72, 0f3F800000;
163
+ .loc 2 98 30
164
+ mov.b32 %r41, %f31;
165
+ mov.b32 %r42, %f72;
166
+ div.full.f32 %r40, %r41, %r42;
167
+ mov.b32 %f32, %r40;
168
+ .loc 2 98 22
169
+ add.f32 %f70, %f70, %f32;
170
+ .loc 2 101 30
171
+ sub.f32 %f33, %f30, %f70;
172
+ $L__tmp2:
173
+ .loc 1 47 48
174
+ fma.rn.f32 %f71, %f31, %f33, %f71;
175
+ .loc 1 31 36
176
+ add.s32 %r78, %r78, 4;
177
+ add.s64 %rd75, %rd75, 16;
178
+ add.s64 %rd74, %rd74, 16;
179
+ setp.lt.u32 %p15, %r78, 252;
180
+ @%p15 bra $L__BB0_4;
181
+ bra.uni $L__BB0_5;
182
+ $L__BB0_1:
183
+ .loc 1 0 36
184
+ mov.b32 %r79, -4;
185
+ .loc 1 31 36
186
+ shl.b64 %rd48, %rd2, 2;
187
+ mul.wide.u32 %rd80, %r3, 4;
188
+ add.s64 %rd79, %rd48, %rd80;
189
+ add.s64 %rd77, %rd33, %rd79;
190
+ add.s32 %r25, %r5, %r3;
191
+ mul.wide.s32 %rd78, %r25, 4;
192
+ add.s64 %rd76, %rd34, %rd78;
193
+ mov.f32 %f72, 0f00000000;
194
+ mov.f32 %f71, %f72;
195
+ mov.f32 %f70, %f72;
196
+ $L__BB0_2:
197
+ .loc 1 35 50
198
+ mov.u32 %r26, 0x0;
199
+ @%p11 ld.global.L1::evict_last.b32 { %r26 }, [ %rd76 + 0 ];
200
+ @!%p11 mov.u32 %r26, %r67;
201
+ mov.b32 %f21, %r26;
202
+ .loc 1 40 52
203
+ mov.u32 %r28, 0x0;
204
+ @%p11 ld.global.L1::evict_last.b32 { %r28 }, [ %rd77 + 0 ];
205
+ @!%p11 mov.u32 %r28, %r67;
206
+ mov.b32 %f22, %r28;
207
+ .loc 1 41 22
208
+ add.f32 %f23, %f21, %f22;
209
+ $L__tmp3:
210
+ .loc 2 96 20
211
+ sub.f32 %f24, %f23, %f70;
212
+ .loc 2 97 26
213
+ add.f32 %f72, %f72, 0f3F800000;
214
+ .loc 2 98 30
215
+ mov.b32 %r31, %f24;
216
+ mov.b32 %r32, %f72;
217
+ div.full.f32 %r30, %r31, %r32;
218
+ mov.b32 %f25, %r30;
219
+ .loc 2 98 22
220
+ add.f32 %f70, %f70, %f25;
221
+ .loc 2 101 30
222
+ sub.f32 %f26, %f23, %f70;
223
+ $L__tmp4:
224
+ .loc 1 47 48
225
+ fma.rn.f32 %f71, %f24, %f26, %f71;
226
+ .loc 1 31 36
227
+ add.s32 %r79, %r79, 4;
228
+ add.s64 %rd77, %rd77, 16;
229
+ add.s64 %rd76, %rd76, 16;
230
+ setp.lt.u32 %p10, %r79, 252;
231
+ @%p10 bra $L__BB0_2;
232
+ $L__BB0_5:
233
+ .loc 1 0 36
234
+ ld.param.u64 %rd36, [triton__0d1d2d3d4d5de6de_param_4];
235
+ $L__tmp5:
236
+ .loc 2 120 46
237
+ mov.b32 %r54, %f70;
238
+ shfl.sync.bfly.b32 %r55, %r54, 2, 31, -1;
239
+ mov.b32 %f34, %r55;
240
+ mov.b32 %r56, %f71;
241
+ shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1;
242
+ mov.b32 %f35, %r57;
243
+ mov.b32 %r58, %f72;
244
+ shfl.sync.bfly.b32 %r45, %r58, 2, 31, -1;
245
+ mov.b32 %f36, %r45;
246
+ $L__tmp6:
247
+ .loc 2 108 21
248
+ sub.f32 %f37, %f34, %f70;
249
+ .loc 2 109 28
250
+ add.f32 %f38, %f72, %f36;
251
+ .loc 2 110 39
252
+ setp.eq.f32 %p16, %f38, 0f00000000;
253
+ .loc 2 110 60
254
+ mov.b32 %r46, %f38;
255
+ div.full.f32 %r44, %r45, %r46;
256
+ mov.b32 %f39, %r44;
257
+ .loc 2 110 49
258
+ selp.f32 %f40, 0f00000000, %f39, %p16;
259
+ .loc 2 112 17
260
+ fma.rn.f32 %f41, %f37, %f40, %f70;
261
+ .loc 2 113 15
262
+ add.f32 %f42, %f71, %f35;
263
+ .loc 2 113 30
264
+ mul.f32 %f43, %f37, %f37;
265
+ .loc 2 113 38
266
+ mul.f32 %f44, %f72, %f43;
267
+ .loc 2 113 22
268
+ fma.rn.f32 %f45, %f44, %f40, %f42;
269
+ $L__tmp7:
270
+ .loc 2 120 46
271
+ mov.b32 %r59, %f41;
272
+ shfl.sync.bfly.b32 %r60, %r59, 1, 31, -1;
273
+ mov.b32 %f46, %r60;
274
+ mov.b32 %r61, %f45;
275
+ shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1;
276
+ mov.b32 %f47, %r62;
277
+ shfl.sync.bfly.b32 %r48, %r46, 1, 31, -1;
278
+ mov.b32 %f48, %r48;
279
+ $L__tmp8:
280
+ .loc 2 108 21
281
+ sub.f32 %f49, %f46, %f41;
282
+ .loc 2 109 28
283
+ add.f32 %f50, %f38, %f48;
284
+ .loc 2 110 39
285
+ setp.eq.f32 %p17, %f50, 0f00000000;
286
+ .loc 2 110 60
287
+ mov.b32 %r49, %f50;
288
+ div.full.f32 %r47, %r48, %r49;
289
+ mov.b32 %f51, %r47;
290
+ .loc 2 110 49
291
+ selp.f32 %f52, 0f00000000, %f51, %p17;
292
+ .loc 2 112 17
293
+ fma.rn.f32 %f16, %f49, %f52, %f41;
294
+ .loc 2 113 15
295
+ add.f32 %f53, %f45, %f47;
296
+ .loc 2 113 30
297
+ mul.f32 %f54, %f49, %f49;
298
+ .loc 2 113 38
299
+ mul.f32 %f55, %f38, %f54;
300
+ .loc 2 113 22
301
+ fma.rn.f32 %f56, %f52, %f55, %f53;
302
+ $L__tmp9:
303
+ .loc 1 69 23
304
+ mov.b32 %r51, %f56;
305
+ mov.b32 %r52, 1132462080;
306
+ div.full.f32 %r50, %r51, %r52;
307
+ mov.b32 %f57, %r50;
308
+ .loc 1 71 24
309
+ add.f32 %f17, %f57, 0f3727C5AC;
310
+ .loc 1 55 36
311
+ shl.b32 %r63, %r13, 14;
312
+ shl.b32 %r64, %r2, 8;
313
+ or.b32 %r65, %r63, %r64;
314
+ or.b32 %r10, %r65, %r3;
315
+ add.s64 %rd83, %rd33, %rd79;
316
+ add.s64 %rd82, %rd35, %rd80;
317
+ add.s64 %rd81, %rd34, %rd78;
318
+ mov.b32 %r80, -4;
319
+ setp.lt.u64 %p22, %rd45, 50257;
320
+ rsqrt.approx.ftz.f32 %f61, %f17;
321
+ bra.uni $L__BB0_6;
322
+ $L__BB0_8:
323
+ .loc 1 0 0
324
+ mov.b32 %f18, %r66;
325
+ mov.b32 %f19, %r68;
326
+ .loc 1 65 54
327
+ mov.u32 %r71, 0x0;
328
+ @%p11 ld.global.L1::evict_first.b32 { %r71 }, [ %rd83 + 0 ];
329
+ @!%p11 mov.u32 %r71, %r67;
330
+ mov.b32 %f58, %r71;
331
+ .loc 1 66 24
332
+ add.f32 %f59, %f18, %f58;
333
+ .loc 1 67 24
334
+ sub.f32 %f60, %f59, %f16;
335
+ .loc 1 73 24
336
+ mul.f32 %f62, %f60, %f61;
337
+ .loc 1 74 24
338
+ mul.f32 %f63, %f62, %f19;
339
+ .loc 1 55 36
340
+ add.s32 %r80, %r80, 4;
341
+ .loc 1 76 29
342
+ add.s32 %r74, %r80, %r10;
343
+ mul.wide.s32 %rd72, %r74, 2;
344
+ add.s64 %rd71, %rd36, %rd72;
345
+ .loc 1 76 52
346
+ mov.b32 %r73, %f63;
347
+ cvt.rn.bf16.f32 %rs1, %r73;
348
+ @%p11 st.global.b16 [ %rd71 + 0 ], { %rs1 };
349
+ .loc 1 55 36
350
+ add.s64 %rd83, %rd83, 16;
351
+ add.s64 %rd82, %rd82, 16;
352
+ add.s64 %rd81, %rd81, 16;
353
+ setp.lt.u32 %p26, %r80, 252;
354
+ @%p26 bra $L__BB0_6;
355
+ bra.uni $L__BB0_9;
356
+ $L__BB0_6:
357
+ .loc 1 59 51
358
+ mov.u32 %r66, 0x0;
359
+ @%p11 ld.global.L1::evict_last.b32 { %r66 }, [ %rd81 + 0 ];
360
+ @!%p11 mov.u32 %r66, %r67;
361
+ .loc 1 60 40
362
+ mov.u32 %r68, 0x0;
363
+ @%p11 ld.global.L1::evict_last.b32 { %r68 }, [ %rd82 + 0 ];
364
+ @!%p11 mov.u32 %r68, %r67;
365
+ .loc 1 64 57
366
+ @%p22 bra $L__BB0_8;
367
+ mov.u64 %rd63, assertMessage_1;
368
+ cvta.global.u64 %rd64, %rd63;
369
+ mov.u64 %rd65, assertFile_1;
370
+ cvta.global.u64 %rd66, %rd65;
371
+ mov.u64 %rd67, assertFunc_1;
372
+ cvta.global.u64 %rd68, %rd67;
373
+ { // callseq 11, 0
374
+ .reg .b32 temp_param_reg;
375
+ .param .b64 param0;
376
+ st.param.b64 [param0+0], %rd64;
377
+ .param .b64 param1;
378
+ st.param.b64 [param1+0], %rd66;
379
+ .param .b32 param2;
380
+ st.param.b32 [param2+0], %r77;
381
+ .param .b64 param3;
382
+ st.param.b64 [param3+0], %rd68;
383
+ .param .b64 param4;
384
+ st.param.b64 [param4+0], %rd73;
385
+ call.uni
386
+ __assertfail,
387
+ (
388
+ param0,
389
+ param1,
390
+ param2,
391
+ param3,
392
+ param4
393
+ );
394
+ } // callseq 11
395
+ bra.uni $L__BB0_8;
396
+ $L__BB0_9:
397
+ .loc 1 55 4
398
+ ret;
399
+ $L__tmp10:
400
+ $L__func_end0:
401
+
402
+ }
403
+ // .globl __nv_rsqrtf
404
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
405
+ .param .b32 __nv_rsqrtf_param_0
406
+ )
407
+ {
408
+ .reg .f32 %f<3>;
409
+ $L__func_begin1:
410
+
411
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
412
+ rsqrt.approx.ftz.f32 %f2, %f1;
413
+ st.param.f32 [func_retval0+0], %f2;
414
+ ret;
415
+ $L__func_end1:
416
+
417
+ }
418
+ .file 1 "/tmp/torchinductor_root/lh/clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py"
419
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
420
+ .section .debug_abbrev
421
+ {
422
+ .b8 1
423
+ .b8 17
424
+ .b8 1
425
+ .b8 37
426
+ .b8 8
427
+ .b8 19
428
+ .b8 5
429
+ .b8 3
430
+ .b8 8
431
+ .b8 16
432
+ .b8 6
433
+ .b8 27
434
+ .b8 8
435
+ .b8 180
436
+ .b8 66
437
+ .b8 12
438
+ .b8 17
439
+ .b8 1
440
+ .b8 18
441
+ .b8 1
442
+ .b8 0
443
+ .b8 0
444
+ .b8 2
445
+ .b8 46
446
+ .b8 0
447
+ .b8 135
448
+ .b8 64
449
+ .b8 8
450
+ .b8 3
451
+ .b8 8
452
+ .b8 58
453
+ .b8 11
454
+ .b8 59
455
+ .b8 11
456
+ .b8 63
457
+ .b8 12
458
+ .b8 32
459
+ .b8 11
460
+ .b8 0
461
+ .b8 0
462
+ .b8 3
463
+ .b8 46
464
+ .b8 1
465
+ .b8 17
466
+ .b8 1
467
+ .b8 18
468
+ .b8 1
469
+ .b8 64
470
+ .b8 10
471
+ .b8 49
472
+ .b8 19
473
+ .b8 0
474
+ .b8 0
475
+ .b8 4
476
+ .b8 29
477
+ .b8 0
478
+ .b8 49
479
+ .b8 19
480
+ .b8 17
481
+ .b8 1
482
+ .b8 18
483
+ .b8 1
484
+ .b8 88
485
+ .b8 11
486
+ .b8 89
487
+ .b8 11
488
+ .b8 87
489
+ .b8 11
490
+ .b8 0
491
+ .b8 0
492
+ .b8 5
493
+ .b8 29
494
+ .b8 1
495
+ .b8 49
496
+ .b8 19
497
+ .b8 17
498
+ .b8 1
499
+ .b8 18
500
+ .b8 1
501
+ .b8 88
502
+ .b8 11
503
+ .b8 89
504
+ .b8 11
505
+ .b8 87
506
+ .b8 11
507
+ .b8 0
508
+ .b8 0
509
+ .b8 0
510
+ }
511
+ .section .debug_info
512
+ {
513
+ .b32 298
514
+ .b8 2
515
+ .b8 0
516
+ .b32 .debug_abbrev
517
+ .b8 8
518
+ .b8 1
519
+ .b8 116
520
+ .b8 114
521
+ .b8 105
522
+ .b8 116
523
+ .b8 111
524
+ .b8 110
525
+ .b8 0
526
+ .b8 2
527
+ .b8 0
528
+ .b8 99
529
+ .b8 108
530
+ .b8 104
531
+ .b8 101
532
+ .b8 52
533
+ .b8 97
534
+ .b8 51
535
+ .b8 115
536
+ .b8 116
537
+ .b8 118
538
+ .b8 117
539
+ .b8 102
540
+ .b8 120
541
+ .b8 97
542
+ .b8 102
543
+ .b8 109
544
+ .b8 113
545
+ .b8 51
546
+ .b8 107
547
+ .b8 107
548
+ .b8 53
549
+ .b8 104
550
+ .b8 111
551
+ .b8 100
552
+ .b8 97
553
+ .b8 122
554
+ .b8 122
555
+ .b8 50
556
+ .b8 101
557
+ .b8 102
558
+ .b8 99
559
+ .b8 116
560
+ .b8 102
561
+ .b8 102
562
+ .b8 116
563
+ .b8 101
564
+ .b8 54
565
+ .b8 52
566
+ .b8 54
567
+ .b8 122
568
+ .b8 110
569
+ .b8 106
570
+ .b8 100
571
+ .b8 110
572
+ .b8 118
573
+ .b8 51
574
+ .b8 108
575
+ .b8 113
576
+ .b8 105
577
+ .b8 53
578
+ .b8 111
579
+ .b8 97
580
+ .b8 46
581
+ .b8 112
582
+ .b8 121
583
+ .b8 0
584
+ .b32 .debug_line
585
+ .b8 47
586
+ .b8 116
587
+ .b8 109
588
+ .b8 112
589
+ .b8 47
590
+ .b8 116
591
+ .b8 111
592
+ .b8 114
593
+ .b8 99
594
+ .b8 104
595
+ .b8 105
596
+ .b8 110
597
+ .b8 100
598
+ .b8 117
599
+ .b8 99
600
+ .b8 116
601
+ .b8 111
602
+ .b8 114
603
+ .b8 95
604
+ .b8 114
605
+ .b8 111
606
+ .b8 111
607
+ .b8 116
608
+ .b8 47
609
+ .b8 108
610
+ .b8 104
611
+ .b8 0
612
+ .b8 1
613
+ .b64 $L__func_begin0
614
+ .b64 $L__func_end0
615
+ .b8 2
616
+ .b8 116
617
+ .b8 114
618
+ .b8 105
619
+ .b8 116
620
+ .b8 111
621
+ .b8 110
622
+ .b8 95
623
+ .b8 95
624
+ .b8 48
625
+ .b8 100
626
+ .b8 49
627
+ .b8 100
628
+ .b8 50
629
+ .b8 100
630
+ .b8 51
631
+ .b8 100
632
+ .b8 52
633
+ .b8 100
634
+ .b8 53
635
+ .b8 100
636
+ .b8 101
637
+ .b8 54
638
+ .b8 100
639
+ .b8 101
640
+ .b8 0
641
+ .b8 116
642
+ .b8 114
643
+ .b8 105
644
+ .b8 116
645
+ .b8 111
646
+ .b8 110
647
+ .b8 95
648
+ .b8 95
649
+ .b8 48
650
+ .b8 100
651
+ .b8 49
652
+ .b8 100
653
+ .b8 50
654
+ .b8 100
655
+ .b8 51
656
+ .b8 100
657
+ .b8 52
658
+ .b8 100
659
+ .b8 53
660
+ .b8 100
661
+ .b8 101
662
+ .b8 54
663
+ .b8 100
664
+ .b8 101
665
+ .b8 0
666
+ .b8 1
667
+ .b8 18
668
+ .b8 1
669
+ .b8 1
670
+ .b8 3
671
+ .b64 $L__func_begin0
672
+ .b64 $L__func_end0
673
+ .b8 1
674
+ .b8 156
675
+ .b32 125
676
+ .b8 4
677
+ .b32 125
678
+ .b64 $L__tmp1
679
+ .b64 $L__tmp4
680
+ .b8 2
681
+ .b8 44
682
+ .b8 38
683
+ .b8 4
684
+ .b32 125
685
+ .b64 $L__tmp5
686
+ .b64 $L__tmp8
687
+ .b8 2
688
+ .b8 50
689
+ .b8 41
690
+ .b8 5
691
+ .b32 125
692
+ .b64 $L__tmp6
693
+ .b64 $L__tmp9
694
+ .b8 2
695
+ .b8 50
696
+ .b8 41
697
+ .b8 4
698
+ .b32 125
699
+ .b64 $L__tmp6
700
+ .b64 $L__tmp9
701
+ .b8 2
702
+ .b8 120
703
+ .b8 46
704
+ .b8 0
705
+ .b8 0
706
+ .b8 0
707
+ }
708
+ .section .debug_pubnames
709
+ {
710
+ .b32 $L__pubNames_end0-$L__pubNames_start0
711
+ $L__pubNames_start0:
712
+ .b8 2
713
+ .b8 0
714
+ .b32 .debug_info
715
+ .b32 302
716
+ .b32 125
717
+ .b8 116
718
+ .b8 114
719
+ .b8 105
720
+ .b8 116
721
+ .b8 111
722
+ .b8 110
723
+ .b8 95
724
+ .b8 95
725
+ .b8 48
726
+ .b8 100
727
+ .b8 49
728
+ .b8 100
729
+ .b8 50
730
+ .b8 100
731
+ .b8 51
732
+ .b8 100
733
+ .b8 52
734
+ .b8 100
735
+ .b8 53
736
+ .b8 100
737
+ .b8 101
738
+ .b8 54
739
+ .b8 100
740
+ .b8 101
741
+ .b8 0
742
+ .b32 0
743
+ $L__pubNames_end0:
744
+ }
745
+ .section .debug_pubtypes
746
+ {
747
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
748
+ $L__pubTypes_start0:
749
+ .b8 2
750
+ .b8 0
751
+ .b32 .debug_info
752
+ .b32 302
753
+ .b32 0
754
+ $L__pubTypes_end0:
755
+ }
756
+ .section .debug_loc { }
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
6
+ %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
7
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
9
+ %cst_3 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
10
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
11
+ %cst_5 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
12
+ %c0_i32 = arith.constant 0 : i32
13
+ %c4_i32 = arith.constant 4 : i32
14
+ %c256_i32 = arith.constant 256 : i32
15
+ %cst_6 = arith.constant dense<1.000000e+00> : tensor<64x4xf32, #blocked>
16
+ %cst_7 = arith.constant 0.000000e+00 : f32
17
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<1x4xf32, #blocked>
18
+ %cst_9 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
19
+ %cst_10 = arith.constant dense<256> : tensor<1x4xi32, #blocked>
20
+ %cst_11 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
21
+ %cst_12 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
22
+ %c64_i32 = arith.constant 64 : i32
23
+ %0 = tt.get_program_id x : i32
24
+ %1 = arith.muli %0, %c64_i32 : i32
25
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
26
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
27
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
28
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
29
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
30
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
31
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
32
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
33
+ %10 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
34
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
35
+ %12 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
36
+ %13 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
37
+ %14 = tt.addptr %12, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
38
+ %15 = tt.addptr %13, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
39
+ %16 = tt.load %14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
40
+ %17 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
41
+ %18 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
42
+ %19 = arith.muli %18, %cst_0 : tensor<64x1xi32, #blocked>
43
+ %20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
44
+ %21 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
45
+ %22 = arith.addi %16, %cst_3 : tensor<64x1xi64, #blocked>
46
+ %23 = arith.addi %17, %cst_4 : tensor<64x1xi64, #blocked1>
47
+ %24 = arith.cmpi slt, %16, %cst_2 : tensor<64x1xi64, #blocked>
48
+ %25 = arith.cmpi slt, %17, %cst_5 : tensor<64x1xi64, #blocked1>
49
+ %26 = arith.select %24, %22, %16 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
50
+ %27 = arith.select %25, %23, %17 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
51
+ %28 = arith.cmpi sge, %27, %cst_5 : tensor<64x1xi64, #blocked1>
52
+ %29 = arith.cmpi slt, %27, %cst_4 : tensor<64x1xi64, #blocked1>
53
+ %30 = arith.andi %28, %29 : tensor<64x1xi1, #blocked1>
54
+ %31 = arith.muli %26, %cst_1 : tensor<64x1xi64, #blocked>
55
+ %32 = tt.broadcast %31 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
56
+ %33 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
57
+ %34:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_9, %arg9 = %cst_9, %arg10 = %cst_9) -> (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) : i32 {
58
+ %45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked>
59
+ %46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked>
60
+ %47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked>
61
+ %48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
62
+ %49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked>
63
+ %50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
64
+ %51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
65
+ %52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
66
+ tt.assert %30, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
67
+ %53 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
68
+ %54 = tt.broadcast %53 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
69
+ %55 = arith.addi %54, %32 : tensor<64x4xi64, #blocked>
70
+ %56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
71
+ %57 = tt.load %56, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
72
+ %58 = arith.addf %57, %52 : tensor<64x4xf32, #blocked>
73
+ %59 = arith.subf %58, %arg8 : tensor<64x4xf32, #blocked>
74
+ %60 = arith.addf %arg10, %cst_6 : tensor<64x4xf32, #blocked>
75
+ %61 = arith.divf %59, %60 : tensor<64x4xf32, #blocked>
76
+ %62 = arith.addf %arg8, %61 : tensor<64x4xf32, #blocked>
77
+ %63 = arith.subf %58, %62 : tensor<64x4xf32, #blocked>
78
+ %64 = arith.mulf %59, %63 : tensor<64x4xf32, #blocked>
79
+ %65 = arith.addf %arg9, %64 : tensor<64x4xf32, #blocked>
80
+ %66 = arith.select %51, %62, %arg8 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
81
+ %67 = arith.select %51, %65, %arg9 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
82
+ %68 = arith.select %51, %60, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
83
+ scf.yield %66, %67, %68 : tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>
84
+ }
85
+ %35:3 = "tt.reduce"(%34#0, %34#1, %34#2) <{axis = 1 : i32}> ({
86
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
87
+ %45 = arith.subf %arg10, %arg7 : f32
88
+ %46 = arith.addf %arg9, %arg12 : f32
89
+ %47 = arith.cmpf oeq, %46, %cst_7 : f32
90
+ %48 = arith.divf %arg12, %46 : f32
91
+ %49 = arith.select %47, %cst_7, %48 : f32
92
+ %50 = arith.mulf %45, %49 : f32
93
+ %51 = arith.addf %arg7, %50 : f32
94
+ %52 = arith.addf %arg8, %arg11 : f32
95
+ %53 = arith.mulf %45, %45 : f32
96
+ %54 = arith.mulf %53, %arg9 : f32
97
+ %55 = arith.mulf %54, %49 : f32
98
+ %56 = arith.addf %52, %55 : f32
99
+ tt.reduce.return %51, %56, %46 : f32, f32, f32
100
+ }) : (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
101
+ %36 = tt.expand_dims %35#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
102
+ %37 = tt.expand_dims %35#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
103
+ %38 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>, #blocked>
104
+ %39 = tt.broadcast %36 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
105
+ %40 = arith.divf %37, %cst_12 : tensor<64x1xf32, #blocked>
106
+ %41 = arith.addf %40, %cst_11 : tensor<64x1xf32, #blocked>
107
+ %42 = arith.muli %8, %cst_0 : tensor<64x1xi32, #blocked>
108
+ %43 = tt.broadcast %42 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
109
+ %44 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
110
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
111
+ %45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked>
112
+ %46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked>
113
+ %47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked>
114
+ %48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
115
+ %49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked>
116
+ %50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
117
+ %51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
118
+ %52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
119
+ %53 = tt.addptr %38, %46 : tensor<1x4x!tt.ptr<f32, 1>, #blocked>, tensor<1x4xi32, #blocked>
120
+ %54 = tt.load %53, %47, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32, #blocked>
121
+ tt.assert %30, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
122
+ %55 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
123
+ %56 = tt.broadcast %55 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
124
+ %57 = arith.addi %56, %32 : tensor<64x4xi64, #blocked>
125
+ %58 = tt.addptr %33, %57 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
126
+ %59 = tt.load %58, %51, %cst_9 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
127
+ %60 = arith.addf %59, %52 : tensor<64x4xf32, #blocked>
128
+ %61 = arith.subf %60, %39 : tensor<64x4xf32, #blocked>
129
+ %62 = tt.extern_elementwise %41 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
130
+ %63 = tt.broadcast %62 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
131
+ %64 = arith.mulf %61, %63 : tensor<64x4xf32, #blocked>
132
+ %65 = tt.broadcast %54 : (tensor<1x4xf32, #blocked>) -> tensor<64x4xf32, #blocked>
133
+ %66 = arith.mulf %64, %65 : tensor<64x4xf32, #blocked>
134
+ %67 = arith.addi %48, %43 : tensor<64x4xi32, #blocked>
135
+ %68 = tt.addptr %44, %67 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
136
+ %69 = arith.truncf %66 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
137
+ tt.store %68, %69, %51 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
138
+ }
139
+ tt.return
140
+ }
141
+ }
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant 0.000000e+00 : f32
4
+ %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x4xf32>
5
+ %c256_i32 = arith.constant 256 : i32
6
+ %c4_i32 = arith.constant 4 : i32
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi64>
9
+ %cst_2 = arith.constant dense<0> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<50257> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
12
+ %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x4xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
15
+ %cst_8 = arith.constant dense<256> : tensor<64x1xi32>
16
+ %cst_9 = arith.constant dense<256> : tensor<1x4xi32>
17
+ %cst_10 = arith.constant dense<512> : tensor<64x1xi32>
18
+ %c64_i32 = arith.constant 64 : i32
19
+ %0 = tt.get_program_id x : i32
20
+ %1 = arith.muli %0, %c64_i32 : i32
21
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
22
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
23
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
24
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
25
+ %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
26
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
27
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
28
+ %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
29
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
30
+ %11 = arith.remsi %5, %cst_10 : tensor<64x1xi32>
31
+ %12 = arith.muli %11, %cst_8 : tensor<64x1xi32>
32
+ %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32>
33
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
34
+ %15 = arith.addi %10, %cst_3 : tensor<64x1xi64>
35
+ %16 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64>
36
+ %17 = arith.select %16, %15, %10 : tensor<64x1xi1>, tensor<64x1xi64>
37
+ %18 = arith.cmpi sge, %17, %cst_2 : tensor<64x1xi64>
38
+ %19 = arith.cmpi slt, %17, %cst_3 : tensor<64x1xi64>
39
+ %20 = arith.andi %18, %19 : tensor<64x1xi1>
40
+ %21 = arith.muli %17, %cst_1 : tensor<64x1xi64>
41
+ %22 = tt.broadcast %21 : (tensor<64x1xi64>) -> tensor<64x4xi64>
42
+ %23 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
43
+ %24:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 {
44
+ %47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32>
45
+ %48 = arith.addi %47, %7 : tensor<1x4xi32>
46
+ %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32>
47
+ %50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32>
48
+ %51 = arith.addi %50, %13 : tensor<64x4xi32>
49
+ %52 = tt.addptr %14, %51 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
50
+ %53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1>
51
+ %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
52
+ tt.assert %20, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
53
+ %55 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64>
54
+ %56 = tt.broadcast %55 : (tensor<1x4xi64>) -> tensor<64x4xi64>
55
+ %57 = arith.addi %56, %22 : tensor<64x4xi64>
56
+ %58 = tt.addptr %23, %57 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
57
+ %59 = tt.load %58, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
58
+ %60 = arith.addf %59, %54 : tensor<64x4xf32>
59
+ %61 = arith.subf %60, %arg8 : tensor<64x4xf32>
60
+ %62 = arith.addf %arg10, %cst_0 : tensor<64x4xf32>
61
+ %63 = arith.divf %61, %62 : tensor<64x4xf32>
62
+ %64 = arith.addf %arg8, %63 : tensor<64x4xf32>
63
+ %65 = arith.subf %60, %64 : tensor<64x4xf32>
64
+ %66 = arith.mulf %61, %65 : tensor<64x4xf32>
65
+ %67 = arith.addf %arg9, %66 : tensor<64x4xf32>
66
+ %68 = arith.select %53, %64, %arg8 : tensor<64x4xi1>, tensor<64x4xf32>
67
+ %69 = arith.select %53, %67, %arg9 : tensor<64x4xi1>, tensor<64x4xf32>
68
+ %70 = arith.select %53, %62, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
69
+ scf.yield %68, %69, %70 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>
70
+ }
71
+ %25:3 = "tt.reduce"(%24#0, %24#1, %24#2) <{axis = 1 : i32}> ({
72
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
73
+ %47 = arith.subf %arg10, %arg7 : f32
74
+ %48 = arith.addf %arg9, %arg12 : f32
75
+ %49 = arith.cmpf oeq, %48, %cst : f32
76
+ %50 = arith.divf %arg12, %48 : f32
77
+ %51 = arith.select %49, %cst, %50 : f32
78
+ %52 = arith.mulf %47, %51 : f32
79
+ %53 = arith.addf %arg7, %52 : f32
80
+ %54 = arith.addf %arg8, %arg11 : f32
81
+ %55 = arith.mulf %47, %47 : f32
82
+ %56 = arith.mulf %55, %arg9 : f32
83
+ %57 = arith.mulf %56, %51 : f32
84
+ %58 = arith.addf %54, %57 : f32
85
+ tt.reduce.return %53, %58, %48 : f32, f32, f32
86
+ }) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
87
+ %26 = tt.expand_dims %25#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
88
+ %27 = tt.expand_dims %25#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
89
+ %28 = arith.muli %11, %cst_8 : tensor<64x1xi32>
90
+ %29 = tt.broadcast %28 : (tensor<64x1xi32>) -> tensor<64x4xi32>
91
+ %30 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
92
+ %31 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>>
93
+ %32 = arith.addi %10, %cst_3 : tensor<64x1xi64>
94
+ %33 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64>
95
+ %34 = arith.select %33, %32, %10 : tensor<64x1xi1>, tensor<64x1xi64>
96
+ %35 = arith.cmpi sge, %34, %cst_2 : tensor<64x1xi64>
97
+ %36 = arith.cmpi slt, %34, %cst_3 : tensor<64x1xi64>
98
+ %37 = arith.andi %35, %36 : tensor<64x1xi1>
99
+ %38 = arith.muli %34, %cst_1 : tensor<64x1xi64>
100
+ %39 = tt.broadcast %38 : (tensor<64x1xi64>) -> tensor<64x4xi64>
101
+ %40 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
102
+ %41 = tt.broadcast %26 : (tensor<64x1xf32>) -> tensor<64x4xf32>
103
+ %42 = arith.divf %27, %cst_5 : tensor<64x1xf32>
104
+ %43 = arith.addf %42, %cst_4 : tensor<64x1xf32>
105
+ %44 = arith.muli %5, %cst_8 : tensor<64x1xi32>
106
+ %45 = tt.broadcast %44 : (tensor<64x1xi32>) -> tensor<64x4xi32>
107
+ %46 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
108
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
109
+ %47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32>
110
+ %48 = arith.addi %47, %7 : tensor<1x4xi32>
111
+ %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32>
112
+ %50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32>
113
+ %51 = arith.addi %50, %29 : tensor<64x4xi32>
114
+ %52 = tt.addptr %30, %51 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
115
+ %53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1>
116
+ %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
117
+ %55 = tt.addptr %31, %48 : tensor<1x4x!tt.ptr<f32, 1>>, tensor<1x4xi32>
118
+ %56 = tt.load %55, %49, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32>
119
+ tt.assert %37, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
120
+ %57 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64>
121
+ %58 = tt.broadcast %57 : (tensor<1x4xi64>) -> tensor<64x4xi64>
122
+ %59 = arith.addi %58, %39 : tensor<64x4xi64>
123
+ %60 = tt.addptr %40, %59 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
124
+ %61 = tt.load %60, %53, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
125
+ %62 = arith.addf %61, %54 : tensor<64x4xf32>
126
+ %63 = arith.subf %62, %41 : tensor<64x4xf32>
127
+ %64 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
128
+ %65 = tt.broadcast %64 : (tensor<64x1xf32>) -> tensor<64x4xf32>
129
+ %66 = arith.mulf %63, %65 : tensor<64x4xf32>
130
+ %67 = tt.broadcast %56 : (tensor<1x4xf32>) -> tensor<64x4xf32>
131
+ %68 = arith.mulf %66, %67 : tensor<64x4xf32>
132
+ %69 = arith.addi %50, %45 : tensor<64x4xi32>
133
+ %70 = tt.addptr %46, %69 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
134
+ %71 = arith.truncf %68 : tensor<64x4xf32> to tensor<64x4xbf16>
135
+ tt.store %70, %71, %53 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
136
+ }
137
+ tt.return
138
+ }
139
+ }
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = shl i32 %6, 2, !dbg !8
11
+ %10 = and i32 %9, 60, !dbg !8
12
+ %11 = and i32 %8, 3, !dbg !9
13
+ %12 = lshr i32 %7, 4, !dbg !9
14
+ %13 = shl nuw nsw i32 %11, 1, !dbg !9
15
+ %14 = or i32 %13, %12, !dbg !9
16
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
17
+ %16 = shl i32 %15, 6, !dbg !11
18
+ %17 = or i32 %16, %10, !dbg !12
19
+ %.frozen = freeze i32 %17
20
+ %18 = sdiv i32 %.frozen, 256, !dbg !13
21
+ %19 = mul i32 %18, 256
22
+ %.decomposed = sub i32 %.frozen, %19
23
+ %20 = shl i32 %18, 15, !dbg !14
24
+ %21 = add i32 %20, %.decomposed
25
+ br label %22, !dbg !15
26
+
27
+ 22: ; preds = %5, %22
28
+ %23 = phi i32 [ 0, %5 ], [ %53, %22 ]
29
+ %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %52, %22 ]
30
+ %25 = or i32 %23, %14, !dbg !16
31
+ %26 = shl i32 %25, 8, !dbg !17
32
+ %27 = add i32 %21, %26, !dbg !18
33
+ %28 = sext i32 %27 to i64, !dbg !19
34
+ %29 = getelementptr float, ptr addrspace(1) %0, i64 %28, !dbg !19
35
+ %30 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
36
+ %31 = extractvalue { i32, i32, i32, i32 } %30, 0, !dbg !20
37
+ %32 = extractvalue { i32, i32, i32, i32 } %30, 1, !dbg !20
38
+ %33 = extractvalue { i32, i32, i32, i32 } %30, 2, !dbg !20
39
+ %34 = extractvalue { i32, i32, i32, i32 } %30, 3, !dbg !20
40
+ %35 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !21
41
+ %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !22
42
+ %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !22
43
+ %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !22
44
+ %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !22
45
+ %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !22
46
+ %41 = insertelement <4 x i32> poison, i32 %31, i64 0, !dbg !20
47
+ %42 = insertelement <4 x i32> %41, i32 %32, i64 1, !dbg !20
48
+ %43 = insertelement <4 x i32> %42, i32 %33, i64 2, !dbg !20
49
+ %44 = insertelement <4 x i32> %43, i32 %34, i64 3, !dbg !20
50
+ %45 = bitcast <4 x i32> %44 to <4 x float>, !dbg !20
51
+ %46 = insertelement <4 x i32> poison, i32 %37, i64 0, !dbg !22
52
+ %47 = insertelement <4 x i32> %46, i32 %38, i64 1, !dbg !22
53
+ %48 = insertelement <4 x i32> %47, i32 %39, i64 2, !dbg !22
54
+ %49 = insertelement <4 x i32> %48, i32 %40, i64 3, !dbg !22
55
+ %50 = bitcast <4 x i32> %49 to <4 x float>, !dbg !22
56
+ %51 = fmul <4 x float> %45, %50, !dbg !23
57
+ %52 = fadd <4 x float> %24, %51, !dbg !24
58
+ %53 = add nuw nsw i32 %23, 8, !dbg !15
59
+ %54 = icmp ult i32 %23, 120, !dbg !15
60
+ br i1 %54, label %22, label %55, !dbg !15
61
+
62
+ 55: ; preds = %22
63
+ %56 = and i32 %6, 63, !dbg !8
64
+ %57 = or i32 %16, %56, !dbg !12
65
+ %58 = or i32 %10, 3, !dbg !25
66
+ %59 = or i32 %10, 2, !dbg !25
67
+ %60 = or i32 %10, 1, !dbg !25
68
+ %61 = extractelement <4 x float> %52, i64 0, !dbg !25
69
+ %62 = bitcast float %61 to i32, !dbg !25
70
+ %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 16, i32 31), !dbg !25
71
+ %64 = bitcast i32 %63 to float, !dbg !25
72
+ %65 = fadd float %61, %64, !dbg !29
73
+ %66 = extractelement <4 x float> %52, i64 1, !dbg !25
74
+ %67 = bitcast float %66 to i32, !dbg !25
75
+ %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !25
76
+ %69 = bitcast i32 %68 to float, !dbg !25
77
+ %70 = fadd float %66, %69, !dbg !29
78
+ %71 = extractelement <4 x float> %52, i64 2, !dbg !25
79
+ %72 = bitcast float %71 to i32, !dbg !25
80
+ %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !25
81
+ %74 = bitcast i32 %73 to float, !dbg !25
82
+ %75 = fadd float %71, %74, !dbg !29
83
+ %76 = extractelement <4 x float> %52, i64 3, !dbg !25
84
+ %77 = bitcast float %76 to i32, !dbg !25
85
+ %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !25
86
+ %79 = bitcast i32 %78 to float, !dbg !25
87
+ %80 = fadd float %76, %79, !dbg !29
88
+ %81 = icmp ult i32 %7, 16, !dbg !25
89
+ %82 = shl nuw nsw i32 %10, 2, !dbg !25
90
+ %83 = or i32 %82, %11, !dbg !25
91
+ %84 = zext nneg i32 %83 to i64, !dbg !25
92
+ %85 = getelementptr float, ptr addrspace(3) @global_smem, i64 %84, !dbg !25
93
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, float %65, i1 %81) #3, !dbg !25
94
+ %86 = shl nuw nsw i32 %60, 2, !dbg !25
95
+ %87 = or i32 %86, %11, !dbg !25
96
+ %88 = zext nneg i32 %87 to i64, !dbg !25
97
+ %89 = getelementptr float, ptr addrspace(3) @global_smem, i64 %88, !dbg !25
98
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %89, float %70, i1 %81) #3, !dbg !25
99
+ %90 = shl nuw nsw i32 %59, 2, !dbg !25
100
+ %91 = or i32 %90, %11, !dbg !25
101
+ %92 = zext nneg i32 %91 to i64, !dbg !25
102
+ %93 = getelementptr float, ptr addrspace(3) @global_smem, i64 %92, !dbg !25
103
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %93, float %75, i1 %81) #3, !dbg !25
104
+ %94 = shl nuw nsw i32 %58, 2, !dbg !25
105
+ %95 = or i32 %94, %11, !dbg !25
106
+ %96 = zext nneg i32 %95 to i64, !dbg !25
107
+ %97 = getelementptr float, ptr addrspace(3) @global_smem, i64 %96, !dbg !25
108
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %80, i1 %81) #3, !dbg !25
109
+ tail call void @llvm.nvvm.barrier0(), !dbg !25
110
+ %98 = icmp slt i32 %6, 256, !dbg !25
111
+ %99 = sext i32 %6 to i64, !dbg !25
112
+ %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !25
113
+ %101 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %100, i1 %98) #3, !dbg !25
114
+ %102 = bitcast float %101 to i32, !dbg !25
115
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !25
116
+ %104 = bitcast i32 %103 to float, !dbg !25
117
+ %105 = fadd float %101, %104, !dbg !29
118
+ %106 = bitcast float %105 to i32, !dbg !25
119
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !25
120
+ %108 = bitcast i32 %107 to float, !dbg !25
121
+ %109 = fadd float %105, %108, !dbg !29
122
+ %110 = and i32 %6, 3, !dbg !25
123
+ %111 = icmp eq i32 %110, 0, !dbg !25
124
+ %112 = and i1 %98, %111, !dbg !25
125
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %109, i1 %112) #3, !dbg !25
126
+ %113 = add i32 %6, 128, !dbg !25
127
+ %114 = sext i32 %113 to i64, !dbg !25
128
+ %115 = getelementptr float, ptr addrspace(3) @global_smem, i64 %114, !dbg !25
129
+ %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %115, i1 %98) #3, !dbg !25
130
+ %117 = bitcast float %116 to i32, !dbg !25
131
+ %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 2, i32 31), !dbg !25
132
+ %119 = bitcast i32 %118 to float, !dbg !25
133
+ %120 = fadd float %116, %119, !dbg !29
134
+ %121 = bitcast float %120 to i32, !dbg !25
135
+ %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 1, i32 31), !dbg !25
136
+ %123 = bitcast i32 %122 to float, !dbg !25
137
+ %124 = fadd float %120, %123, !dbg !29
138
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %115, float %124, i1 %112) #3, !dbg !25
139
+ tail call void @llvm.nvvm.barrier0(), !dbg !25
140
+ %125 = zext nneg i32 %82 to i64, !dbg !25
141
+ %126 = getelementptr float, ptr addrspace(3) @global_smem, i64 %125, !dbg !25
142
+ %127 = load float, ptr addrspace(3) %126, align 4, !dbg !25
143
+ %128 = zext nneg i32 %86 to i64, !dbg !25
144
+ %129 = getelementptr float, ptr addrspace(3) @global_smem, i64 %128, !dbg !25
145
+ %130 = load float, ptr addrspace(3) %129, align 4, !dbg !25
146
+ %131 = zext nneg i32 %90 to i64, !dbg !25
147
+ %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !25
148
+ %133 = load float, ptr addrspace(3) %132, align 4, !dbg !25
149
+ %134 = zext nneg i32 %94 to i64, !dbg !25
150
+ %135 = getelementptr float, ptr addrspace(3) @global_smem, i64 %134, !dbg !25
151
+ %136 = load float, ptr addrspace(3) %135, align 4, !dbg !25
152
+ tail call void @llvm.nvvm.barrier0(), !dbg !33
153
+ %137 = zext nneg i32 %10 to i64, !dbg !33
154
+ %138 = getelementptr float, ptr addrspace(3) @global_smem, i64 %137, !dbg !33
155
+ %139 = insertelement <1 x float> undef, float %127, i64 0, !dbg !33
156
+ store <1 x float> %139, ptr addrspace(3) %138, align 4, !dbg !33
157
+ %140 = zext nneg i32 %60 to i64, !dbg !33
158
+ %141 = getelementptr float, ptr addrspace(3) @global_smem, i64 %140, !dbg !33
159
+ %142 = insertelement <1 x float> undef, float %130, i64 0, !dbg !33
160
+ store <1 x float> %142, ptr addrspace(3) %141, align 4, !dbg !33
161
+ %143 = zext nneg i32 %59 to i64, !dbg !33
162
+ %144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !33
163
+ %145 = insertelement <1 x float> undef, float %133, i64 0, !dbg !33
164
+ store <1 x float> %145, ptr addrspace(3) %144, align 4, !dbg !33
165
+ %146 = zext nneg i32 %58 to i64, !dbg !33
166
+ %147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !33
167
+ %148 = insertelement <1 x float> undef, float %136, i64 0, !dbg !33
168
+ store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !33
169
+ tail call void @llvm.nvvm.barrier0(), !dbg !33
170
+ %149 = zext nneg i32 %56 to i64, !dbg !33
171
+ %150 = getelementptr float, ptr addrspace(3) @global_smem, i64 %149, !dbg !33
172
+ %151 = load i32, ptr addrspace(3) %150, align 4, !dbg !33
173
+ %152 = sext i32 %57 to i64, !dbg !34
174
+ %153 = getelementptr float, ptr addrspace(1) %2, i64 %152, !dbg !34
175
+ %154 = and i32 %6, 64, !dbg !35
176
+ %155 = icmp eq i32 %154, 0, !dbg !35
177
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %151, ptr addrspace(1) %153, i1 %155) #3, !dbg !35
178
+ ret void, !dbg !36
179
+ }
180
+
181
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
182
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
183
+
184
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
185
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
186
+
187
+ ; Function Attrs: convergent nocallback nounwind
188
+ declare void @llvm.nvvm.barrier0() #2
189
+
190
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
191
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
192
+ attributes #2 = { convergent nocallback nounwind }
193
+ attributes #3 = { nounwind }
194
+
195
+ !llvm.module.flags = !{!0}
196
+ !llvm.dbg.cu = !{!1}
197
+ !nvvm.annotations = !{!3, !4, !4, !3}
198
+
199
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
200
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
201
+ !2 = !DIFile(filename: "cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py", directory: "/tmp/torchinductor_root/qd")
202
+ !3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
203
+ !4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
204
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
205
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
206
+ !7 = !{}
207
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
208
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
209
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
210
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
211
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
212
+ !13 = !DILocation(line: 26, column: 20, scope: !5)
213
+ !14 = !DILocation(line: 33, column: 57, scope: !5)
214
+ !15 = !DILocation(line: 29, column: 36, scope: !5)
215
+ !16 = !DILocation(line: 30, column: 27, scope: !5)
216
+ !17 = !DILocation(line: 33, column: 44, scope: !5)
217
+ !18 = !DILocation(line: 33, column: 51, scope: !5)
218
+ !19 = !DILocation(line: 33, column: 34, scope: !5)
219
+ !20 = !DILocation(line: 33, column: 63, scope: !5)
220
+ !21 = !DILocation(line: 34, column: 34, scope: !5)
221
+ !22 = !DILocation(line: 34, column: 63, scope: !5)
222
+ !23 = !DILocation(line: 35, column: 22, scope: !5)
223
+ !24 = !DILocation(line: 38, column: 38, scope: !5)
224
+ !25 = !DILocation(line: 243, column: 36, scope: !26, inlinedAt: !28)
225
+ !26 = distinct !DILexicalBlockFile(scope: !5, file: !27, discriminator: 0)
226
+ !27 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
227
+ !28 = !DILocation(line: 39, column: 25, scope: !26)
228
+ !29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !31)
229
+ !30 = distinct !DILexicalBlockFile(scope: !26, file: !27, discriminator: 0)
230
+ !31 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
231
+ !32 = !DILocation(line: 39, column: 25, scope: !30)
232
+ !33 = !DILocation(line: 39, column: 28, scope: !5)
233
+ !34 = !DILocation(line: 40, column: 25, scope: !5)
234
+ !35 = !DILocation(line: 40, column: 36, scope: !5)
235
+ !36 = !DILocation(line: 40, column: 4, scope: !5)
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c8_i32 = arith.constant 8 : i32
4
+ %c128_i32 = arith.constant 128 : i32
5
+ %c0_i32 = arith.constant 0 : i32
6
+ %cst = arith.constant dense<32768> : tensor<64x1xi32>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x8xi32>
8
+ %cst_1 = arith.constant dense<128> : tensor<1x8xi32>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
10
+ %cst_3 = arith.constant dense<256> : tensor<64x1xi32>
11
+ %c64_i32 = arith.constant 64 : i32
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c64_i32 : i32
14
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
15
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
16
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
17
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
18
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
19
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
20
+ %8 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
21
+ %9 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
22
+ %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
23
+ %11 = arith.muli %9, %cst : tensor<64x1xi32>
24
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
25
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
26
+ %14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
27
+ %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_2) -> (tensor<64x8xf32>) : i32 {
28
+ %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
29
+ %21 = arith.addi %20, %7 : tensor<1x8xi32>
30
+ %22 = arith.cmpi slt, %21, %cst_1 : tensor<1x8xi32>
31
+ %23 = arith.muli %21, %cst_0 : tensor<1x8xi32>
32
+ %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32>
33
+ %25 = arith.addi %10, %24 : tensor<64x8xi32>
34
+ %26 = arith.addi %25, %12 : tensor<64x8xi32>
35
+ %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
36
+ %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1>
37
+ %29 = tt.load %27, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
38
+ %30 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
39
+ %31 = tt.load %30, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
40
+ %32 = arith.mulf %29, %31 : tensor<64x8xf32>
41
+ %33 = arith.addf %arg6, %32 : tensor<64x8xf32>
42
+ %34 = arith.select %28, %33, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
43
+ scf.yield %34 : tensor<64x8xf32>
44
+ }
45
+ %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({
46
+ ^bb0(%arg5: f32, %arg6: f32):
47
+ %20 = arith.addf %arg5, %arg6 : f32
48
+ tt.reduce.return %20 : f32
49
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
50
+ %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
51
+ %18 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
52
+ %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
53
+ tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
54
+ tt.return
55
+ }
56
+ }
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin ADDED
Binary file (29 kB). View file
 
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
16
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %9 = and i32 %8, 31, !dbg !10
18
+ %10 = lshr i32 %8, 5, !dbg !10
19
+ %11 = lshr i32 %8, 6, !dbg !10
20
+ %12 = and i32 %11, 1, !dbg !10
21
+ %13 = and i32 %8, 1, !dbg !10
22
+ %14 = and i32 %10, 1, !dbg !11
23
+ %urem = shl i32 %8, 2, !dbg !11
24
+ %15 = and i32 %urem, 252, !dbg !11
25
+ %16 = shl i32 %8, 1, !dbg !11
26
+ %17 = and i32 %16, 254, !dbg !11
27
+ %18 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
28
+ %19 = shl i32 %18, 1, !dbg !13
29
+ %20 = or i32 %19, %12, !dbg !14
30
+ %21 = or i32 %19, %13, !dbg !14
31
+ %22 = sext i32 %20 to i64, !dbg !15
32
+ %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15
33
+ %24 = sext i32 %21 to i64, !dbg !15
34
+ %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15
35
+ %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
36
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
37
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
38
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
39
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16
40
+ %31 = srem i32 %20, 512, !dbg !17
41
+ %32 = shl nsw i32 %31, 8, !dbg !18
42
+ %33 = or i32 %32, %15, !dbg !19
43
+ %34 = sext i32 %33 to i64, !dbg !20
44
+ %35 = getelementptr float, ptr addrspace(1) %2, i64 %34, !dbg !20
45
+ %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
46
+ %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !21
47
+ %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !21
48
+ %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !21
49
+ %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !21
50
+ %41 = bitcast i32 %37 to float, !dbg !21
51
+ %42 = bitcast i32 %38 to float, !dbg !21
52
+ %43 = bitcast i32 %39 to float, !dbg !21
53
+ %44 = bitcast i32 %40 to float, !dbg !21
54
+ %45 = add i64 %30, 50257, !dbg !22
55
+ %46 = icmp slt i64 %26, 0, !dbg !23
56
+ %47 = icmp slt i64 %30, 0, !dbg !23
57
+ %48 = select i1 %47, i64 %45, i64 %30, !dbg !24
58
+ %49 = icmp ugt i64 %48, 50256, !dbg !25
59
+ br i1 %49, label %50, label %51, !dbg !26
60
+
61
+ 50: ; preds = %7
62
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26
63
+ br label %51, !dbg !26
64
+
65
+ 51: ; preds = %50, %7
66
+ %52 = shl i64 %26, 8, !dbg !27
67
+ %53 = add i64 %52, 12865792, !dbg !27
68
+ %54 = select i1 %46, i64 %53, i64 %52, !dbg !27
69
+ %55 = zext nneg i32 %15 to i64
70
+ %56 = or i64 %54, %55, !dbg !28
71
+ %57 = getelementptr float, ptr addrspace(1) %1, i64 %56, !dbg !29
72
+ %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
73
+ %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !30
74
+ %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !30
75
+ %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !30
76
+ %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !30
77
+ %63 = bitcast i32 %59 to float, !dbg !30
78
+ %64 = bitcast i32 %60 to float, !dbg !30
79
+ %65 = bitcast i32 %61 to float, !dbg !30
80
+ %66 = bitcast i32 %62 to float, !dbg !30
81
+ %67 = fadd float %41, %63, !dbg !31
82
+ %68 = fadd float %42, %64, !dbg !31
83
+ %69 = fadd float %43, %65, !dbg !31
84
+ %70 = fadd float %44, %66, !dbg !31
85
+ %71 = fadd float %67, 0.000000e+00, !dbg !32
86
+ %72 = fadd float %68, 0.000000e+00, !dbg !32
87
+ %73 = fadd float %69, 0.000000e+00, !dbg !32
88
+ %74 = fadd float %70, 0.000000e+00, !dbg !32
89
+ %75 = fsub float %67, %71, !dbg !36
90
+ %76 = fsub float %68, %72, !dbg !36
91
+ %77 = fsub float %69, %73, !dbg !36
92
+ %78 = fsub float %70, %74, !dbg !36
93
+ %79 = fmul float %67, %75, !dbg !37
94
+ %80 = fmul float %68, %76, !dbg !37
95
+ %81 = fmul float %69, %77, !dbg !37
96
+ %82 = fmul float %70, %78, !dbg !37
97
+ %83 = fadd float %79, 0.000000e+00, !dbg !38
98
+ %84 = fadd float %80, 0.000000e+00, !dbg !38
99
+ %85 = fadd float %81, 0.000000e+00, !dbg !38
100
+ %86 = fadd float %82, 0.000000e+00, !dbg !38
101
+ %87 = fsub float %72, %71, !dbg !39
102
+ %88 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
103
+ %89 = fmul float %88, %87, !dbg !44
104
+ %90 = fadd float %71, %89, !dbg !45
105
+ %91 = fadd float %83, %84, !dbg !46
106
+ %92 = fmul float %87, %87, !dbg !47
107
+ %93 = fmul float %88, %92, !dbg !48
108
+ %94 = fadd float %93, %91, !dbg !49
109
+ %95 = fsub float %73, %90, !dbg !39
110
+ %96 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
111
+ %97 = fmul float %96, %95, !dbg !44
112
+ %98 = fadd float %90, %97, !dbg !45
113
+ %99 = fadd float %85, %94, !dbg !46
114
+ %100 = fmul float %95, %95, !dbg !47
115
+ %101 = fmul float %100, 2.000000e+00, !dbg !50
116
+ %102 = fmul float %96, %101, !dbg !48
117
+ %103 = fadd float %99, %102, !dbg !49
118
+ %104 = fsub float %74, %98, !dbg !39
119
+ %105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
120
+ %106 = fmul float %105, %104, !dbg !44
121
+ %107 = fadd float %98, %106, !dbg !45
122
+ %108 = fadd float %86, %103, !dbg !46
123
+ %109 = fmul float %104, %104, !dbg !47
124
+ %110 = fmul float %109, 3.000000e+00, !dbg !50
125
+ %111 = fmul float %105, %110, !dbg !48
126
+ %112 = fadd float %108, %111, !dbg !49
127
+ %113 = bitcast float %107 to i32, !dbg !51
128
+ %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 16, i32 31), !dbg !51
129
+ %115 = bitcast i32 %114 to float, !dbg !51
130
+ %116 = bitcast float %112 to i32, !dbg !51
131
+ %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !51
132
+ %118 = bitcast i32 %117 to float, !dbg !51
133
+ %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !51
134
+ %120 = bitcast i32 %119 to float, !dbg !51
135
+ %121 = fsub float %115, %107, !dbg !39
136
+ %122 = fadd float %120, 4.000000e+00, !dbg !53
137
+ %123 = fcmp oeq float %122, 0.000000e+00, !dbg !54
138
+ %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %120, float %122) #6, !dbg !43
139
+ %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !55
140
+ %126 = fmul float %125, %121, !dbg !44
141
+ %127 = fadd float %107, %126, !dbg !45
142
+ %128 = fadd float %112, %118, !dbg !46
143
+ %129 = fmul float %121, %121, !dbg !47
144
+ %130 = fmul float %129, 4.000000e+00, !dbg !50
145
+ %131 = fmul float %125, %130, !dbg !48
146
+ %132 = fadd float %128, %131, !dbg !49
147
+ %133 = bitcast float %127 to i32, !dbg !51
148
+ %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 8, i32 31), !dbg !51
149
+ %135 = bitcast i32 %134 to float, !dbg !51
150
+ %136 = bitcast float %132 to i32, !dbg !51
151
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !51
152
+ %138 = bitcast i32 %137 to float, !dbg !51
153
+ %139 = bitcast float %122 to i32, !dbg !51
154
+ %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 8, i32 31), !dbg !51
155
+ %141 = bitcast i32 %140 to float, !dbg !51
156
+ %142 = fsub float %135, %127, !dbg !39
157
+ %143 = fadd float %122, %141, !dbg !53
158
+ %144 = fcmp oeq float %143, 0.000000e+00, !dbg !54
159
+ %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %143) #6, !dbg !43
160
+ %146 = select i1 %144, float 0.000000e+00, float %145, !dbg !55
161
+ %147 = fmul float %146, %142, !dbg !44
162
+ %148 = fadd float %127, %147, !dbg !45
163
+ %149 = fadd float %132, %138, !dbg !46
164
+ %150 = fmul float %142, %142, !dbg !47
165
+ %151 = fmul float %122, %150, !dbg !50
166
+ %152 = fmul float %146, %151, !dbg !48
167
+ %153 = fadd float %149, %152, !dbg !49
168
+ %154 = bitcast float %148 to i32, !dbg !51
169
+ %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 4, i32 31), !dbg !51
170
+ %156 = bitcast i32 %155 to float, !dbg !51
171
+ %157 = bitcast float %153 to i32, !dbg !51
172
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 4, i32 31), !dbg !51
173
+ %159 = bitcast i32 %158 to float, !dbg !51
174
+ %160 = bitcast float %143 to i32, !dbg !51
175
+ %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 4, i32 31), !dbg !51
176
+ %162 = bitcast i32 %161 to float, !dbg !51
177
+ %163 = fsub float %156, %148, !dbg !39
178
+ %164 = fadd float %143, %162, !dbg !53
179
+ %165 = fcmp oeq float %164, 0.000000e+00, !dbg !54
180
+ %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float %164) #6, !dbg !43
181
+ %167 = select i1 %165, float 0.000000e+00, float %166, !dbg !55
182
+ %168 = fmul float %167, %163, !dbg !44
183
+ %169 = fadd float %148, %168, !dbg !45
184
+ %170 = fadd float %153, %159, !dbg !46
185
+ %171 = fmul float %163, %163, !dbg !47
186
+ %172 = fmul float %143, %171, !dbg !50
187
+ %173 = fmul float %167, %172, !dbg !48
188
+ %174 = fadd float %170, %173, !dbg !49
189
+ %175 = bitcast float %169 to i32, !dbg !51
190
+ %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 2, i32 31), !dbg !51
191
+ %177 = bitcast i32 %176 to float, !dbg !51
192
+ %178 = bitcast float %174 to i32, !dbg !51
193
+ %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 2, i32 31), !dbg !51
194
+ %180 = bitcast i32 %179 to float, !dbg !51
195
+ %181 = bitcast float %164 to i32, !dbg !51
196
+ %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 2, i32 31), !dbg !51
197
+ %183 = bitcast i32 %182 to float, !dbg !51
198
+ %184 = fsub float %177, %169, !dbg !39
199
+ %185 = fadd float %164, %183, !dbg !53
200
+ %186 = fcmp oeq float %185, 0.000000e+00, !dbg !54
201
+ %187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float %185) #6, !dbg !43
202
+ %188 = select i1 %186, float 0.000000e+00, float %187, !dbg !55
203
+ %189 = fmul float %188, %184, !dbg !44
204
+ %190 = fadd float %169, %189, !dbg !45
205
+ %191 = fadd float %174, %180, !dbg !46
206
+ %192 = fmul float %184, %184, !dbg !47
207
+ %193 = fmul float %164, %192, !dbg !50
208
+ %194 = fmul float %188, %193, !dbg !48
209
+ %195 = fadd float %191, %194, !dbg !49
210
+ %196 = bitcast float %190 to i32, !dbg !51
211
+ %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !51
212
+ %198 = bitcast i32 %197 to float, !dbg !51
213
+ %199 = bitcast float %195 to i32, !dbg !51
214
+ %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !51
215
+ %201 = bitcast i32 %200 to float, !dbg !51
216
+ %202 = bitcast float %185 to i32, !dbg !51
217
+ %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !51
218
+ %204 = bitcast i32 %203 to float, !dbg !51
219
+ %205 = fsub float %198, %190, !dbg !39
220
+ %206 = fadd float %185, %204, !dbg !53
221
+ %207 = fcmp oeq float %206, 0.000000e+00, !dbg !54
222
+ %208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !43
223
+ %209 = select i1 %207, float 0.000000e+00, float %208, !dbg !55
224
+ %210 = fmul float %205, %209, !dbg !44
225
+ %211 = fadd float %190, %210, !dbg !45
226
+ %212 = fadd float %195, %201, !dbg !46
227
+ %213 = fmul float %205, %205, !dbg !47
228
+ %214 = fmul float %185, %213, !dbg !50
229
+ %215 = fmul float %209, %214, !dbg !48
230
+ %216 = fadd float %212, %215, !dbg !49
231
+ %217 = icmp eq i32 %9, 0, !dbg !51
232
+ %218 = shl nuw nsw i32 %12, 1, !dbg !51
233
+ %219 = or i32 %218, %14, !dbg !51
234
+ %220 = zext nneg i32 %219 to i64, !dbg !51
235
+ %221 = getelementptr float, ptr addrspace(3) @global_smem, i64 %220, !dbg !51
236
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %221, float %211, i1 %217) #6, !dbg !51
237
+ %222 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %220, !dbg !51
238
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %222, float %216, i1 %217) #6, !dbg !51
239
+ %223 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %220, !dbg !51
240
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %223, float %206, i1 %217) #6, !dbg !51
241
+ tail call void @llvm.nvvm.barrier0(), !dbg !51
242
+ %224 = icmp slt i32 %8, 4, !dbg !51
243
+ %225 = sext i32 %8 to i64, !dbg !51
244
+ %226 = getelementptr float, ptr addrspace(3) @global_smem, i64 %225, !dbg !51
245
+ %227 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %226, i1 %224) #6, !dbg !51
246
+ %228 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %225, !dbg !51
247
+ %229 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %228, i1 %224) #6, !dbg !51
248
+ %230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %225, !dbg !51
249
+ %231 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %230, i1 %224) #6, !dbg !51
250
+ %232 = bitcast float %227 to i32, !dbg !51
251
+ %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !51
252
+ %234 = bitcast i32 %233 to float, !dbg !51
253
+ %235 = bitcast float %229 to i32, !dbg !51
254
+ %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !51
255
+ %237 = bitcast i32 %236 to float, !dbg !51
256
+ %238 = bitcast float %231 to i32, !dbg !51
257
+ %239 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !51
258
+ %240 = bitcast i32 %239 to float, !dbg !51
259
+ %241 = fsub float %234, %227, !dbg !39
260
+ %242 = fadd float %231, %240, !dbg !53
261
+ %243 = fcmp oeq float %242, 0.000000e+00, !dbg !54
262
+ %244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %242) #6, !dbg !43
263
+ %245 = select i1 %243, float 0.000000e+00, float %244, !dbg !55
264
+ %246 = fmul float %241, %245, !dbg !44
265
+ %247 = fadd float %227, %246, !dbg !45
266
+ %248 = fadd float %229, %237, !dbg !46
267
+ %249 = fmul float %241, %241, !dbg !47
268
+ %250 = fmul float %231, %249, !dbg !50
269
+ %251 = fmul float %250, %245, !dbg !48
270
+ %252 = fadd float %248, %251, !dbg !49
271
+ %253 = icmp eq i32 %13, 0, !dbg !51
272
+ %254 = and i1 %224, %253, !dbg !51
273
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %226, float %247, i1 %254) #6, !dbg !51
274
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %228, float %252, i1 %254) #6, !dbg !51
275
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %230, float %242, i1 %254) #6, !dbg !51
276
+ tail call void @llvm.nvvm.barrier0(), !dbg !51
277
+ %255 = zext nneg i32 %218 to i64, !dbg !51
278
+ %256 = getelementptr float, ptr addrspace(3) @global_smem, i64 %255, !dbg !51
279
+ %257 = load float, ptr addrspace(3) %256, align 4, !dbg !51
280
+ %258 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %255, !dbg !51
281
+ %259 = load float, ptr addrspace(3) %258, align 4, !dbg !51
282
+ %260 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
283
+ %261 = zext nneg i32 %17 to i64, !dbg !57
284
+ %262 = getelementptr float, ptr addrspace(1) %3, i64 %261, !dbg !57
285
+ %263 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %262, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !58
286
+ br i1 %49, label %264, label %265, !dbg !59
287
+
288
+ 264: ; preds = %51
289
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59
290
+ br label %265, !dbg !59
291
+
292
+ 265: ; preds = %264, %51
293
+ %266 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
294
+ %267 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
295
+ %268 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
296
+ %269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
297
+ %270 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
298
+ %271 = fadd float %267, 0x3EE4F8B580000000, !dbg !62
299
+ %272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
300
+ %.not.i = icmp eq i32 %272, 0, !dbg !63
301
+ br i1 %.not.i, label %275, label %273, !dbg !63
302
+
303
+ 273: ; preds = %265
304
+ %274 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %271), !dbg !63
305
+ br label %__nv_rsqrtf.exit, !dbg !63
306
+
307
+ 275: ; preds = %265
308
+ %276 = tail call float @llvm.nvvm.rsqrt.approx.f(float %271), !dbg !63
309
+ br label %__nv_rsqrtf.exit, !dbg !63
310
+
311
+ __nv_rsqrtf.exit: ; preds = %273, %275
312
+ %.0.i = phi float [ %274, %273 ], [ %276, %275 ], !dbg !63
313
+ %277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
314
+ %278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
315
+ %279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
316
+ %280 = extractvalue { i32, i32, i32, i32 } %266, 3, !dbg !60
317
+ %281 = bitcast i32 %280 to float, !dbg !60
318
+ %282 = extractvalue { i32, i32, i32, i32 } %260, 3, !dbg !56
319
+ %283 = bitcast i32 %282 to float, !dbg !56
320
+ %284 = fadd float %283, %281, !dbg !64
321
+ %285 = fsub float %284, %257, !dbg !65
322
+ %286 = extractvalue { i32, i32, i32, i32 } %266, 2, !dbg !60
323
+ %287 = bitcast i32 %286 to float, !dbg !60
324
+ %288 = extractvalue { i32, i32, i32, i32 } %260, 2, !dbg !56
325
+ %289 = bitcast i32 %288 to float, !dbg !56
326
+ %290 = fadd float %289, %287, !dbg !64
327
+ %291 = fsub float %290, %257, !dbg !65
328
+ %292 = extractvalue { i32, i32, i32, i32 } %266, 1, !dbg !60
329
+ %293 = bitcast i32 %292 to float, !dbg !60
330
+ %294 = extractvalue { i32, i32, i32, i32 } %260, 1, !dbg !56
331
+ %295 = bitcast i32 %294 to float, !dbg !56
332
+ %296 = fadd float %295, %293, !dbg !64
333
+ %297 = fsub float %296, %257, !dbg !65
334
+ %298 = extractvalue { i32, i32, i32, i32 } %266, 0, !dbg !60
335
+ %299 = bitcast i32 %298 to float, !dbg !60
336
+ %300 = extractvalue { i32, i32, i32, i32 } %260, 0, !dbg !56
337
+ %301 = bitcast i32 %300 to float, !dbg !56
338
+ %302 = fadd float %301, %299, !dbg !64
339
+ %303 = fsub float %302, %257, !dbg !65
340
+ %304 = extractvalue { i32, i32 } %263, 0, !dbg !58
341
+ %305 = extractvalue { i32, i32 } %263, 1, !dbg !58
342
+ %306 = fmul float %303, %.0.i, !dbg !66
343
+ %307 = fmul float %297, %.0.i, !dbg !66
344
+ %308 = fmul float %291, %.0.i, !dbg !66
345
+ %309 = fmul float %285, %.0.i, !dbg !66
346
+ tail call void @llvm.nvvm.barrier0(), !dbg !67
347
+ %310 = getelementptr float, ptr addrspace(3) @global_smem, i64 %261, !dbg !67
348
+ %311 = insertelement <2 x i32> undef, i32 %304, i64 0, !dbg !67
349
+ %312 = insertelement <2 x i32> %311, i32 %305, i64 1, !dbg !67
350
+ store <2 x i32> %312, ptr addrspace(3) %310, align 8, !dbg !67
351
+ tail call void @llvm.nvvm.barrier0(), !dbg !67
352
+ %313 = getelementptr float, ptr addrspace(3) @global_smem, i64 %55, !dbg !67
353
+ %314 = load float, ptr addrspace(3) %313, align 16, !dbg !67
354
+ %315 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 1, !dbg !67
355
+ %316 = load float, ptr addrspace(3) %315, align 4, !dbg !67
356
+ %317 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 2, !dbg !67
357
+ %318 = load float, ptr addrspace(3) %317, align 8, !dbg !67
358
+ %319 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 3, !dbg !67
359
+ %320 = load float, ptr addrspace(3) %319, align 4, !dbg !67
360
+ %321 = fmul float %306, %314, !dbg !67
361
+ %322 = fmul float %307, %316, !dbg !67
362
+ %323 = fmul float %308, %318, !dbg !67
363
+ %324 = fmul float %309, %320, !dbg !67
364
+ %325 = shl i32 %20, 8, !dbg !68
365
+ %326 = or i32 %325, %15, !dbg !69
366
+ %327 = sext i32 %326 to i64, !dbg !70
367
+ %328 = getelementptr i16, ptr addrspace(1) %4, i64 %327, !dbg !70
368
+ %329 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !71
369
+ %330 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %322) #6, !dbg !71
370
+ %331 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %323) #6, !dbg !71
371
+ %332 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %324) #6, !dbg !71
372
+ %333 = insertelement <2 x i16> undef, i16 %329, i64 0, !dbg !71
373
+ %334 = insertelement <2 x i16> %333, i16 %330, i64 1, !dbg !71
374
+ %335 = bitcast <2 x i16> %334 to i32, !dbg !71
375
+ %336 = insertelement <2 x i16> undef, i16 %331, i64 0, !dbg !71
376
+ %337 = insertelement <2 x i16> %336, i16 %332, i64 1, !dbg !71
377
+ %338 = bitcast <2 x i16> %337 to i32, !dbg !71
378
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %335, i32 %338, ptr addrspace(1) %328, i1 true) #6, !dbg !71
379
+ ret void, !dbg !72
380
+ }
381
+
382
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
383
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
384
+
385
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
386
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
387
+
388
+ ; Function Attrs: convergent nocallback nounwind
389
+ declare void @llvm.nvvm.barrier0() #2
390
+
391
+ ; Function Attrs: alwaysinline nounwind
392
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
393
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
394
+ %.not = icmp eq i32 %1, 0
395
+ br i1 %.not, label %4, label %2
396
+
397
+ 2: ; preds = %0
398
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
399
+ br label %6
400
+
401
+ 4: ; preds = %0
402
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
403
+ br label %6
404
+
405
+ 6: ; preds = %4, %2
406
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
407
+ ret float %.0
408
+ }
409
+
410
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
411
+
412
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
413
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
414
+
415
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
416
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
417
+
418
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
419
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
420
+ attributes #2 = { convergent nocallback nounwind }
421
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
422
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
423
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
424
+ attributes #6 = { nounwind }
425
+
426
+ !llvm.module.flags = !{!0, !1}
427
+ !llvm.dbg.cu = !{!2}
428
+ !nvvm.annotations = !{!4, !5, !5, !4}
429
+ !llvm.ident = !{!6}
430
+
431
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
432
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
433
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
434
+ !3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh")
435
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
436
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128}
437
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
438
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
439
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
440
+ !9 = !{}
441
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
442
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
443
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
444
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
445
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
446
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
447
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
448
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
449
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
450
+ !19 = !DILocation(line: 35, column: 40, scope: !7)
451
+ !20 = !DILocation(line: 35, column: 34, scope: !7)
452
+ !21 = !DILocation(line: 35, column: 50, scope: !7)
453
+ !22 = !DILocation(line: 36, column: 22, scope: !7)
454
+ !23 = !DILocation(line: 37, column: 22, scope: !7)
455
+ !24 = !DILocation(line: 38, column: 36, scope: !7)
456
+ !25 = !DILocation(line: 39, column: 40, scope: !7)
457
+ !26 = !DILocation(line: 39, column: 55, scope: !7)
458
+ !27 = !DILocation(line: 40, column: 44, scope: !7)
459
+ !28 = !DILocation(line: 40, column: 40, scope: !7)
460
+ !29 = !DILocation(line: 40, column: 34, scope: !7)
461
+ !30 = !DILocation(line: 40, column: 52, scope: !7)
462
+ !31 = !DILocation(line: 41, column: 22, scope: !7)
463
+ !32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35)
464
+ !33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0)
465
+ !34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
466
+ !35 = !DILocation(line: 44, column: 38, scope: !33)
467
+ !36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35)
468
+ !37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35)
469
+ !38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35)
470
+ !39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
471
+ !40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0)
472
+ !41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
473
+ !42 = !DILocation(line: 50, column: 41, scope: !40)
474
+ !43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
475
+ !44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
476
+ !45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
477
+ !46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
478
+ !47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
479
+ !48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
480
+ !49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
481
+ !50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
482
+ !51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52)
483
+ !52 = !DILocation(line: 50, column: 41, scope: !33)
484
+ !53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
485
+ !54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
486
+ !55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
487
+ !56 = !DILocation(line: 59, column: 51, scope: !7)
488
+ !57 = !DILocation(line: 60, column: 35, scope: !7)
489
+ !58 = !DILocation(line: 60, column: 40, scope: !7)
490
+ !59 = !DILocation(line: 64, column: 57, scope: !7)
491
+ !60 = !DILocation(line: 65, column: 54, scope: !7)
492
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
493
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
494
+ !63 = !DILocation(line: 72, column: 30, scope: !7)
495
+ !64 = !DILocation(line: 66, column: 24, scope: !7)
496
+ !65 = !DILocation(line: 67, column: 24, scope: !7)
497
+ !66 = !DILocation(line: 73, column: 24, scope: !7)
498
+ !67 = !DILocation(line: 74, column: 24, scope: !7)
499
+ !68 = !DILocation(line: 76, column: 39, scope: !7)
500
+ !69 = !DILocation(line: 76, column: 35, scope: !7)
501
+ !70 = !DILocation(line: 76, column: 29, scope: !7)
502
+ !71 = !DILocation(line: 76, column: 52, scope: !7)
503
+ !72 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1>
9
+ %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
10
+ %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2>
16
+ %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2>
17
+ %cst_10 = arith.constant 0.000000e+00 : f32
18
+ %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
19
+ %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
20
+ %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked>
21
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1>
22
+ %c2_i32 = arith.constant 2 : i32
23
+ %0 = tt.get_program_id x : i32
24
+ %1 = arith.muli %0, %c2_i32 : i32
25
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
26
+ %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
27
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
28
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2>
29
+ %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
30
+ %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2>
31
+ %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
32
+ %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2>
33
+ %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
34
+ %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
35
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
36
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1>
37
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
38
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked2>
39
+ %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
40
+ %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked2>, tensor<2x1xi32, #blocked2>
41
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
42
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2>
43
+ %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
44
+ %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
45
+ %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1>
46
+ %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked>
47
+ %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked>
48
+ %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
49
+ %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked>
50
+ %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
51
+ %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi32, #blocked>
52
+ %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked>
53
+ %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
54
+ %31 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
55
+ %32 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2>
56
+ %33 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked>
57
+ %34 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2>
58
+ %35 = arith.select %33, %31, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
59
+ %36 = arith.select %34, %32, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2>
60
+ %37 = arith.cmpi sge, %36, %cst_8 : tensor<2x1xi64, #blocked2>
61
+ %38 = arith.cmpi slt, %36, %cst_9 : tensor<2x1xi64, #blocked2>
62
+ %39 = arith.andi %37, %38 : tensor<2x1xi1, #blocked2>
63
+ tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
64
+ %40 = arith.muli %35, %cst_5 : tensor<2x1xi64, #blocked>
65
+ %41 = tt.broadcast %40 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked>
66
+ %42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
67
+ %43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked>
68
+ %44 = arith.addi %43, %41 : tensor<2x256xi64, #blocked>
69
+ %45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
70
+ %46 = tt.addptr %45, %44 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi64, #blocked>
71
+ %47 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
72
+ %48 = arith.addf %47, %30 : tensor<2x256xf32, #blocked>
73
+ %49 = arith.addf %48, %cst_13 : tensor<2x256xf32, #blocked>
74
+ %50 = arith.subf %48, %49 : tensor<2x256xf32, #blocked>
75
+ %51 = arith.mulf %48, %50 : tensor<2x256xf32, #blocked>
76
+ %52 = arith.addf %51, %cst_13 : tensor<2x256xf32, #blocked>
77
+ %53 = arith.select %29, %49, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
78
+ %54 = arith.select %29, %52, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
79
+ %55 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
80
+ %56 = tt.broadcast %55 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
81
+ %57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({
82
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
83
+ %82 = arith.subf %arg10, %arg7 : f32
84
+ %83 = arith.addf %arg9, %arg12 : f32
85
+ %84 = arith.cmpf oeq, %83, %cst_10 : f32
86
+ %85 = arith.divf %arg12, %83 : f32
87
+ %86 = arith.select %84, %cst_10, %85 : f32
88
+ %87 = arith.mulf %82, %86 : f32
89
+ %88 = arith.addf %arg7, %87 : f32
90
+ %89 = arith.addf %arg8, %arg11 : f32
91
+ %90 = arith.mulf %82, %82 : f32
92
+ %91 = arith.mulf %90, %arg9 : f32
93
+ %92 = arith.mulf %91, %86 : f32
94
+ %93 = arith.addf %89, %92 : f32
95
+ tt.reduce.return %88, %93, %83 : f32, f32, f32
96
+ }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
97
+ %58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
98
+ %59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
99
+ %60 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
100
+ %61 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked1>
101
+ %62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked1>, tensor<1x256xi32, #blocked1>
102
+ %63 = tt.load %62, %22, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1>
103
+ tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
104
+ %64 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
105
+ %65 = arith.addf %64, %60 : tensor<2x256xf32, #blocked>
106
+ %66 = tt.broadcast %58 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
107
+ %67 = arith.subf %65, %66 : tensor<2x256xf32, #blocked>
108
+ %68 = arith.divf %59, %cst_12 : tensor<2x1xf32, #blocked>
109
+ %69 = arith.addf %68, %cst_11 : tensor<2x1xf32, #blocked>
110
+ %70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
111
+ %71 = tt.broadcast %70 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
112
+ %72 = arith.mulf %67, %71 : tensor<2x256xf32, #blocked>
113
+ %73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked>
114
+ %74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
115
+ %75 = arith.mulf %72, %74 : tensor<2x256xf32, #blocked>
116
+ %76 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked>
117
+ %77 = tt.broadcast %76 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
118
+ %78 = arith.addi %24, %77 : tensor<2x256xi32, #blocked>
119
+ %79 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
120
+ %80 = tt.addptr %79, %78 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
121
+ %81 = arith.truncf %75 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked>
122
+ tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked>
123
+ tt.return
124
+ }
125
+ }
.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttgir ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked>
12
+ %cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1>
15
+ %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1>
16
+ %cst_9 = arith.constant 0.000000e+00 : f32
17
+ %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2>
18
+ %cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2>
19
+ %cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked>
20
+ %cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked>
21
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked>
22
+ %c16_i32 = arith.constant 16 : i32
23
+ %0 = tt.get_program_id x : i32
24
+ %1 = arith.muli %0, %c16_i32 : i32
25
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
26
+ %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
27
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
28
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
29
+ %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
30
+ %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
31
+ %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked>
32
+ %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1>
33
+ %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
34
+ %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
35
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
36
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2>
37
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
38
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked1>
39
+ %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
40
+ %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr<i64, 1>, #blocked1>, tensor<16x1xi32, #blocked1>
41
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
42
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1>
43
+ %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked>
44
+ %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
45
+ %22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2>
46
+ %23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked>
47
+ %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked>
48
+ %25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
49
+ %26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked>
50
+ %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
51
+ %28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi32, #blocked>
52
+ %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked>
53
+ %30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
54
+ %31 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked>
55
+ %32 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1>
56
+ %33 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked>
57
+ %34 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1>
58
+ %35 = arith.select %33, %31, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
59
+ %36 = arith.select %34, %32, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1>
60
+ %37 = arith.cmpi sge, %36, %cst_7 : tensor<16x1xi64, #blocked1>
61
+ %38 = arith.cmpi slt, %36, %cst_8 : tensor<16x1xi64, #blocked1>
62
+ %39 = arith.andi %37, %38 : tensor<16x1xi1, #blocked1>
63
+ tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
64
+ %40 = arith.muli %35, %cst_4 : tensor<16x1xi64, #blocked>
65
+ %41 = tt.broadcast %40 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked>
66
+ %42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
67
+ %43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked>
68
+ %44 = arith.addi %43, %41 : tensor<16x256xi64, #blocked>
69
+ %45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
70
+ %46 = tt.addptr %45, %44 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi64, #blocked>
71
+ %47 = tt.load %46, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
72
+ %48 = arith.addf %47, %30 : tensor<16x256xf32, #blocked>
73
+ %49 = arith.addf %48, %cst_14 : tensor<16x256xf32, #blocked>
74
+ %50 = arith.subf %48, %49 : tensor<16x256xf32, #blocked>
75
+ %51 = arith.mulf %48, %50 : tensor<16x256xf32, #blocked>
76
+ %52 = arith.addf %51, %cst_14 : tensor<16x256xf32, #blocked>
77
+ %53 = arith.select %29, %49, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
78
+ %54 = arith.select %29, %52, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
79
+ %55 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
80
+ %56 = tt.broadcast %55 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
81
+ %57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({
82
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
83
+ %82 = arith.subf %arg10, %arg7 : f32
84
+ %83 = arith.addf %arg9, %arg12 : f32
85
+ %84 = arith.cmpf oeq, %83, %cst_9 : f32
86
+ %85 = arith.divf %arg12, %83 : f32
87
+ %86 = arith.select %84, %cst_9, %85 : f32
88
+ %87 = arith.mulf %82, %86 : f32
89
+ %88 = arith.addf %arg7, %87 : f32
90
+ %89 = arith.addf %arg8, %arg11 : f32
91
+ %90 = arith.mulf %82, %82 : f32
92
+ %91 = arith.mulf %90, %arg9 : f32
93
+ %92 = arith.mulf %91, %86 : f32
94
+ %93 = arith.addf %89, %92 : f32
95
+ tt.reduce.return %88, %93, %83 : f32, f32, f32
96
+ }) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
97
+ %58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
98
+ %59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
99
+ %60 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
100
+ %61 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked2>
101
+ %62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked2>, tensor<1x256xi32, #blocked2>
102
+ %63 = tt.load %62, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2>
103
+ tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
104
+ %64 = tt.load %46, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
105
+ %65 = arith.addf %64, %60 : tensor<16x256xf32, #blocked>
106
+ %66 = tt.broadcast %58 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
107
+ %67 = arith.subf %65, %66 : tensor<16x256xf32, #blocked>
108
+ %68 = arith.divf %59, %cst_13 : tensor<16x1xf32, #blocked>
109
+ %69 = arith.addf %68, %cst_12 : tensor<16x1xf32, #blocked>
110
+ %70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked>
111
+ %71 = tt.broadcast %70 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
112
+ %72 = arith.mulf %67, %71 : tensor<16x256xf32, #blocked>
113
+ %73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked>
114
+ %74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
115
+ %75 = arith.mulf %72, %74 : tensor<16x256xf32, #blocked>
116
+ %76 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked>
117
+ %77 = tt.broadcast %76 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
118
+ %78 = arith.addi %24, %77 : tensor<16x256xi32, #blocked>
119
+ %79 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
120
+ %80 = tt.addptr %79, %78 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked>
121
+ %81 = arith.truncf %75 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked>
122
+ tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked>
123
+ tt.return
124
+ }
125
+ }
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4e
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4e(
13
+ .param .u64 triton__0d1d2d3de4e_param_0,
14
+ .param .u64 triton__0d1d2d3de4e_param_1,
15
+ .param .u64 triton__0d1d2d3de4e_param_2,
16
+ .param .u32 triton__0d1d2d3de4e_param_3,
17
+ .param .u32 triton__0d1d2d3de4e_param_4
18
+ )
19
+ .maxntid 256, 1, 1
20
+ {
21
+ .reg .pred %p<10>;
22
+ .reg .b32 %r<44>;
23
+ .reg .f32 %f<11>;
24
+ .reg .b64 %rd<16>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2];
30
+ ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1];
31
+ ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0];
32
+ $L__tmp0:
33
+ .loc 1 22 44
34
+ mov.u32 %r1, %tid.x;
35
+ and.b32 %r2, %r1, 63;
36
+ .loc 1 24 33
37
+ bfe.u32 %r3, %r1, 6, 2;
38
+ .loc 1 21 28
39
+ mov.u32 %r10, %ctaid.x;
40
+ .loc 1 21 33
41
+ shl.b32 %r12, %r10, 6;
42
+ .loc 1 22 23
43
+ or.b32 %r4, %r12, %r2;
44
+ .loc 1 27 36
45
+ shl.b32 %r13, %r3, 17;
46
+ add.s32 %r14, %r13, %r12;
47
+ or.b32 %r42, %r14, %r2;
48
+ mov.f32 %f10, 0f00000000;
49
+ mov.b32 %r43, -4;
50
+ mov.pred %p4, -1;
51
+ $L__BB0_1:
52
+ .loc 1 31 34
53
+ mul.wide.s32 %rd5, %r42, 4;
54
+ add.s64 %rd4, %rd1, %rd5;
55
+ mov.b32 %r16, 0;
56
+ .loc 1 31 53
57
+ mov.u32 %r15, 0x0;
58
+ @%p4 ld.global.L1::evict_first.b32 { %r15 }, [ %rd4 + 0 ];
59
+ @!%p4 mov.u32 %r15, %r16;
60
+ mov.b32 %f4, %r15;
61
+ .loc 1 34 38
62
+ add.f32 %f10, %f10, %f4;
63
+ .loc 1 27 36
64
+ add.s32 %r43, %r43, 4;
65
+ add.s32 %r42, %r42, 524288;
66
+ setp.lt.u32 %p3, %r43, 116;
67
+ @%p3 bra $L__BB0_1;
68
+ $L__tmp1:
69
+ .loc 2 243 36
70
+ shl.b32 %r25, %r3, 2;
71
+ shl.b32 %r26, %r2, 4;
72
+ or.b32 %r27, %r26, %r25;
73
+ mov.u32 %r28, global_smem;
74
+ add.s32 %r17, %r28, %r27;
75
+ mov.b32 %r18, %f10;
76
+ @%p4 st.shared.b32 [ %r17 + 0 ], %r18;
77
+ bar.sync 0;
78
+ setp.lt.s32 %p5, %r1, 256;
79
+ shl.b32 %r29, %r1, 2;
80
+ add.s32 %r20, %r28, %r29;
81
+ @%p5 ld.shared.b32 %r19, [ %r20 + 0 ];
82
+ mov.b32 %f5, %r19;
83
+ shfl.sync.bfly.b32 %r30, %r19, 2, 31, -1;
84
+ mov.b32 %f6, %r30;
85
+ $L__tmp2:
86
+ .loc 2 233 15
87
+ add.f32 %f7, %f5, %f6;
88
+ $L__tmp3:
89
+ .loc 2 243 36
90
+ mov.b32 %r31, %f7;
91
+ shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1;
92
+ mov.b32 %f8, %r32;
93
+ $L__tmp4:
94
+ .loc 2 233 15
95
+ add.f32 %f9, %f7, %f8;
96
+ $L__tmp5:
97
+ .loc 2 243 36
98
+ and.b32 %r33, %r1, 3;
99
+ setp.eq.s32 %p9, %r33, 0;
100
+ and.pred %p6, %p5, %p9;
101
+ mov.b32 %r22, %f9;
102
+ @%p6 st.shared.b32 [ %r20 + 0 ], %r22;
103
+ bar.sync 0;
104
+ add.s32 %r34, %r28, %r26;
105
+ $L__tmp6:
106
+ .loc 1 36 20
107
+ shr.s32 %r36, %r4, 31;
108
+ shr.u32 %r37, %r36, 24;
109
+ add.s32 %r38, %r4, %r37;
110
+ shr.s32 %r39, %r38, 8;
111
+ and.b32 %r40, %r38, -256;
112
+ sub.s32 %r41, %r4, %r40;
113
+ .loc 1 38 30
114
+ mul.wide.s32 %rd9, %r39, 8;
115
+ add.s64 %rd7, %rd2, %rd9;
116
+ .loc 1 45 55
117
+ ld.shared.u32 %r24, [%r34];
118
+ .loc 1 38 35
119
+ mov.u64 %rd6, 0x0;
120
+ @%p4 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ];
121
+ .loc 1 41 32
122
+ shr.u64 %rd10, %rd6, 54;
123
+ and.b64 %rd11, %rd10, 512;
124
+ add.s64 %rd12, %rd11, %rd6;
125
+ .loc 1 45 30
126
+ shl.b64 %rd13, %rd12, 10;
127
+ add.s64 %rd14, %rd3, %rd13;
128
+ mul.wide.s32 %rd15, %r41, 4;
129
+ add.s64 %rd8, %rd14, %rd15;
130
+ .loc 1 45 55
131
+ setp.eq.s32 %p8, %r3, 0;
132
+ mov.u32 %r23, 0x0;
133
+ @%p8 atom.global.gpu.acq_rel.add.f32 %r23, [ %rd8 + 0 ], %r24;
134
+ .loc 1 45 4
135
+ ret;
136
+ $L__tmp7:
137
+ $L__func_end0:
138
+
139
+ }
140
+ .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
141
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
142
+ .section .debug_abbrev
143
+ {
144
+ .b8 1
145
+ .b8 17
146
+ .b8 1
147
+ .b8 37
148
+ .b8 8
149
+ .b8 19
150
+ .b8 5
151
+ .b8 3
152
+ .b8 8
153
+ .b8 16
154
+ .b8 6
155
+ .b8 27
156
+ .b8 8
157
+ .b8 180
158
+ .b8 66
159
+ .b8 12
160
+ .b8 17
161
+ .b8 1
162
+ .b8 18
163
+ .b8 1
164
+ .b8 0
165
+ .b8 0
166
+ .b8 2
167
+ .b8 46
168
+ .b8 0
169
+ .b8 135
170
+ .b8 64
171
+ .b8 8
172
+ .b8 3
173
+ .b8 8
174
+ .b8 58
175
+ .b8 11
176
+ .b8 59
177
+ .b8 11
178
+ .b8 63
179
+ .b8 12
180
+ .b8 32
181
+ .b8 11
182
+ .b8 0
183
+ .b8 0
184
+ .b8 3
185
+ .b8 46
186
+ .b8 1
187
+ .b8 17
188
+ .b8 1
189
+ .b8 18
190
+ .b8 1
191
+ .b8 64
192
+ .b8 10
193
+ .b8 49
194
+ .b8 19
195
+ .b8 0
196
+ .b8 0
197
+ .b8 4
198
+ .b8 29
199
+ .b8 0
200
+ .b8 49
201
+ .b8 19
202
+ .b8 17
203
+ .b8 1
204
+ .b8 18
205
+ .b8 1
206
+ .b8 88
207
+ .b8 11
208
+ .b8 89
209
+ .b8 11
210
+ .b8 87
211
+ .b8 11
212
+ .b8 0
213
+ .b8 0
214
+ .b8 5
215
+ .b8 29
216
+ .b8 1
217
+ .b8 49
218
+ .b8 19
219
+ .b8 17
220
+ .b8 1
221
+ .b8 18
222
+ .b8 1
223
+ .b8 88
224
+ .b8 11
225
+ .b8 89
226
+ .b8 11
227
+ .b8 87
228
+ .b8 11
229
+ .b8 0
230
+ .b8 0
231
+ .b8 0
232
+ }
233
+ .section .debug_info
234
+ {
235
+ .b32 264
236
+ .b8 2
237
+ .b8 0
238
+ .b32 .debug_abbrev
239
+ .b8 8
240
+ .b8 1
241
+ .b8 116
242
+ .b8 114
243
+ .b8 105
244
+ .b8 116
245
+ .b8 111
246
+ .b8 110
247
+ .b8 0
248
+ .b8 2
249
+ .b8 0
250
+ .b8 99
251
+ .b8 54
252
+ .b8 105
253
+ .b8 107
254
+ .b8 53
255
+ .b8 118
256
+ .b8 120
257
+ .b8 55
258
+ .b8 112
259
+ .b8 50
260
+ .b8 50
261
+ .b8 102
262
+ .b8 112
263
+ .b8 107
264
+ .b8 52
265
+ .b8 100
266
+ .b8 99
267
+ .b8 118
268
+ .b8 104
269
+ .b8 53
270
+ .b8 53
271
+ .b8 122
272
+ .b8 105
273
+ .b8 109
274
+ .b8 119
275
+ .b8 52
276
+ .b8 116
277
+ .b8 53
278
+ .b8 110
279
+ .b8 114
280
+ .b8 53
281
+ .b8 122
282
+ .b8 110
283
+ .b8 50
284
+ .b8 98
285
+ .b8 55
286
+ .b8 105
287
+ .b8 110
288
+ .b8 117
289
+ .b8 106
290
+ .b8 120
291
+ .b8 106
292
+ .b8 97
293
+ .b8 117
294
+ .b8 120
295
+ .b8 115
296
+ .b8 104
297
+ .b8 108
298
+ .b8 106
299
+ .b8 117
300
+ .b8 109
301
+ .b8 109
302
+ .b8 46
303
+ .b8 112
304
+ .b8 121
305
+ .b8 0
306
+ .b32 .debug_line
307
+ .b8 47
308
+ .b8 116
309
+ .b8 109
310
+ .b8 112
311
+ .b8 47
312
+ .b8 116
313
+ .b8 111
314
+ .b8 114
315
+ .b8 99
316
+ .b8 104
317
+ .b8 105
318
+ .b8 110
319
+ .b8 100
320
+ .b8 117
321
+ .b8 99
322
+ .b8 116
323
+ .b8 111
324
+ .b8 114
325
+ .b8 95
326
+ .b8 114
327
+ .b8 111
328
+ .b8 111
329
+ .b8 116
330
+ .b8 47
331
+ .b8 54
332
+ .b8 105
333
+ .b8 0
334
+ .b8 1
335
+ .b64 $L__func_begin0
336
+ .b64 $L__func_end0
337
+ .b8 2
338
+ .b8 116
339
+ .b8 114
340
+ .b8 105
341
+ .b8 116
342
+ .b8 111
343
+ .b8 110
344
+ .b8 95
345
+ .b8 95
346
+ .b8 48
347
+ .b8 100
348
+ .b8 49
349
+ .b8 100
350
+ .b8 50
351
+ .b8 100
352
+ .b8 51
353
+ .b8 100
354
+ .b8 101
355
+ .b8 52
356
+ .b8 101
357
+ .b8 0
358
+ .b8 116
359
+ .b8 114
360
+ .b8 105
361
+ .b8 116
362
+ .b8 111
363
+ .b8 110
364
+ .b8 95
365
+ .b8 95
366
+ .b8 48
367
+ .b8 100
368
+ .b8 49
369
+ .b8 100
370
+ .b8 50
371
+ .b8 100
372
+ .b8 51
373
+ .b8 100
374
+ .b8 101
375
+ .b8 52
376
+ .b8 101
377
+ .b8 0
378
+ .b8 1
379
+ .b8 18
380
+ .b8 1
381
+ .b8 1
382
+ .b8 3
383
+ .b64 $L__func_begin0
384
+ .b64 $L__func_end0
385
+ .b8 1
386
+ .b8 156
387
+ .b32 125
388
+ .b8 4
389
+ .b32 125
390
+ .b64 $L__tmp1
391
+ .b64 $L__tmp6
392
+ .b8 2
393
+ .b8 35
394
+ .b8 25
395
+ .b8 5
396
+ .b32 125
397
+ .b64 $L__tmp2
398
+ .b64 $L__tmp5
399
+ .b8 2
400
+ .b8 35
401
+ .b8 25
402
+ .b8 4
403
+ .b32 125
404
+ .b64 $L__tmp2
405
+ .b64 $L__tmp5
406
+ .b8 2
407
+ .b8 243
408
+ .b8 36
409
+ .b8 0
410
+ .b8 0
411
+ .b8 0
412
+ }
413
+ .section .debug_pubnames
414
+ {
415
+ .b32 $L__pubNames_end0-$L__pubNames_start0
416
+ $L__pubNames_start0:
417
+ .b8 2
418
+ .b8 0
419
+ .b32 .debug_info
420
+ .b32 268
421
+ .b32 125
422
+ .b8 116
423
+ .b8 114
424
+ .b8 105
425
+ .b8 116
426
+ .b8 111
427
+ .b8 110
428
+ .b8 95
429
+ .b8 95
430
+ .b8 48
431
+ .b8 100
432
+ .b8 49
433
+ .b8 100
434
+ .b8 50
435
+ .b8 100
436
+ .b8 51
437
+ .b8 100
438
+ .b8 101
439
+ .b8 52
440
+ .b8 101
441
+ .b8 0
442
+ .b32 0
443
+ $L__pubNames_end0:
444
+ }
445
+ .section .debug_pubtypes
446
+ {
447
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
448
+ $L__pubTypes_start0:
449
+ .b8 2
450
+ .b8 0
451
+ .b32 .debug_info
452
+ .b32 268
453
+ .b32 0
454
+ $L__pubTypes_end0:
455
+ }
456
+ .section .debug_loc { }
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<64x1xi64, #blocked>
5
+ %cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked>
7
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
8
+ %cst_3 = arith.constant dense<131072> : tensor<1x4xi32, #blocked>
9
+ %cst_4 = arith.constant dense<120> : tensor<1x4xi32, #blocked>
10
+ %c0_i32 = arith.constant 0 : i32
11
+ %c120_i32 = arith.constant 120 : i32
12
+ %c4_i32 = arith.constant 4 : i32
13
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
14
+ %cst_6 = arith.constant dense<true> : tensor<64x1xi1, #blocked>
15
+ %c64_i32 = arith.constant 64 : i32
16
+ %0 = tt.get_program_id x : i32
17
+ %1 = arith.muli %0, %c64_i32 : i32
18
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
19
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
20
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
21
+ %5 = arith.addi %4, %3 : tensor<64x1xi32, #blocked>
22
+ %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
23
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
24
+ %8 = tt.broadcast %5 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
25
+ %9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
26
+ %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x4xf32, #blocked>) : i32 {
27
+ %27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32, #blocked>
28
+ %28 = arith.addi %27, %7 : tensor<1x4xi32, #blocked>
29
+ %29 = arith.cmpi slt, %28, %cst_4 : tensor<1x4xi32, #blocked>
30
+ %30 = arith.muli %28, %cst_3 : tensor<1x4xi32, #blocked>
31
+ %31 = tt.broadcast %30 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
32
+ %32 = arith.addi %8, %31 : tensor<64x4xi32, #blocked>
33
+ %33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
34
+ %34 = tt.broadcast %29 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
35
+ %35 = tt.load %33, %34, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
36
+ %36 = arith.addf %arg6, %35 : tensor<64x4xf32, #blocked>
37
+ %37 = arith.select %34, %36, %arg6 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
38
+ scf.yield %37 : tensor<64x4xf32, #blocked>
39
+ }
40
+ %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
41
+ ^bb0(%arg5: f32, %arg6: f32):
42
+ %27 = arith.addf %arg5, %arg6 : f32
43
+ tt.reduce.return %27 : f32
44
+ }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
45
+ %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
46
+ %13 = arith.divsi %5, %cst_2 : tensor<64x1xi32, #blocked>
47
+ %14 = arith.remsi %5, %cst_2 : tensor<64x1xi32, #blocked>
48
+ %15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
49
+ %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
50
+ %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
51
+ %18 = arith.addi %17, %cst_1 : tensor<64x1xi64, #blocked>
52
+ %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64, #blocked>
53
+ %20 = arith.select %19, %18, %17 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
54
+ %21 = arith.muli %20, %cst : tensor<64x1xi64, #blocked>
55
+ %22 = arith.extsi %14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
56
+ %23 = arith.addi %22, %21 : tensor<64x1xi64, #blocked>
57
+ %24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked>
58
+ %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xi64, #blocked>
59
+ %26 = "tt.atomic_rmw"(%25, %12, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked>
60
+ tt.return
61
+ }
62
+ }
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<256> : tensor<64x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<64x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<64x1xi64>
6
+ %c4_i32 = arith.constant 4 : i32
7
+ %c120_i32 = arith.constant 120 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<true> : tensor<64x1xi1>
10
+ %cst_3 = arith.constant dense<256> : tensor<64x1xi32>
11
+ %cst_4 = arith.constant dense<131072> : tensor<1x4xi32>
12
+ %cst_5 = arith.constant dense<120> : tensor<1x4xi32>
13
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
14
+ %c64_i32 = arith.constant 64 : i32
15
+ %0 = tt.get_program_id x : i32
16
+ %1 = arith.muli %0, %c64_i32 : i32
17
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
18
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
19
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
20
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
21
+ %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
22
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
23
+ %8 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x4xi32>
24
+ %9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
25
+ %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x4xf32>) : i32 {
26
+ %27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32>
27
+ %28 = arith.addi %27, %7 : tensor<1x4xi32>
28
+ %29 = arith.cmpi slt, %28, %cst_5 : tensor<1x4xi32>
29
+ %30 = arith.muli %28, %cst_4 : tensor<1x4xi32>
30
+ %31 = tt.broadcast %30 : (tensor<1x4xi32>) -> tensor<64x4xi32>
31
+ %32 = arith.addi %8, %31 : tensor<64x4xi32>
32
+ %33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
33
+ %34 = tt.broadcast %29 : (tensor<1x4xi1>) -> tensor<64x4xi1>
34
+ %35 = tt.load %33, %34, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
35
+ %36 = arith.addf %arg6, %35 : tensor<64x4xf32>
36
+ %37 = arith.select %34, %36, %arg6 : tensor<64x4xi1>, tensor<64x4xf32>
37
+ scf.yield %37 : tensor<64x4xf32>
38
+ }
39
+ %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
40
+ ^bb0(%arg5: f32, %arg6: f32):
41
+ %27 = arith.addf %arg5, %arg6 : f32
42
+ tt.reduce.return %27 : f32
43
+ }) : (tensor<64x4xf32>) -> tensor<64xf32>
44
+ %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
45
+ %13 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
46
+ %14 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
47
+ %15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
48
+ %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
49
+ %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
50
+ %18 = arith.addi %17, %cst_1 : tensor<64x1xi64>
51
+ %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64>
52
+ %20 = arith.select %19, %18, %17 : tensor<64x1xi1>, tensor<64x1xi64>
53
+ %21 = arith.muli %20, %cst : tensor<64x1xi64>
54
+ %22 = arith.extsi %14 : tensor<64x1xi32> to tensor<64x1xi64>
55
+ %23 = arith.addi %22, %21 : tensor<64x1xi64>
56
+ %24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
57
+ %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi64>
58
+ %26 = "tt.atomic_rmw"(%25, %12, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32>
59
+ tt.return
60
+ }
61
+ }
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin ADDED
Binary file (40.4 kB). View file
 
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.llir ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
11
+
12
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
13
+
14
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
15
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
16
+ %10 = lshr i32 %9, 2, !dbg !10
17
+ %11 = and i32 %10, 63, !dbg !10
18
+ %12 = and i32 %9, 63, !dbg !10
19
+ %13 = and i32 %9, 3, !dbg !11
20
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !12
21
+ %15 = shl i32 %14, 6, !dbg !13
22
+ %16 = or i32 %15, %11, !dbg !14
23
+ %17 = or i32 %15, %12, !dbg !14
24
+ %18 = sext i32 %16 to i64, !dbg !15
25
+ %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15
26
+ %20 = sext i32 %17 to i64, !dbg !15
27
+ %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
28
+ %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #5, !dbg !16
29
+ %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #5, !dbg !16
30
+ %24 = srem i32 %16, 512, !dbg !17
31
+ %25 = shl nsw i32 %24, 8, !dbg !18
32
+ %26 = shl i32 %16, 8, !dbg !19
33
+ %27 = add i64 %23, 50257, !dbg !20
34
+ %28 = icmp slt i64 %22, 0, !dbg !21
35
+ %29 = icmp slt i64 %23, 0, !dbg !21
36
+ %30 = select i1 %29, i64 %27, i64 %23, !dbg !22
37
+ %.fr8 = freeze i64 %30, !dbg !23
38
+ %31 = icmp ugt i64 %.fr8, 50256, !dbg !23
39
+ %32 = shl i64 %22, 8, !dbg !24
40
+ %33 = add i64 %32, 12865792, !dbg !24
41
+ %34 = select i1 %28, i64 %33, i64 %32, !dbg !24
42
+ %35 = getelementptr float, ptr addrspace(1) %1, i64 %34
43
+ br i1 %31, label %.split.us, label %.split, !dbg !25
44
+
45
+ .split.us: ; preds = %8, %.split.us
46
+ %36 = phi float [ %58, %.split.us ], [ 0.000000e+00, %8 ]
47
+ %37 = phi float [ %63, %.split.us ], [ 0.000000e+00, %8 ]
48
+ %38 = phi float [ %60, %.split.us ], [ 0.000000e+00, %8 ]
49
+ %39 = phi i32 [ %64, %.split.us ], [ 0, %8 ]
50
+ %40 = or i32 %39, %13, !dbg !26
51
+ %41 = add i32 %40, %25, !dbg !27
52
+ %42 = sext i32 %41 to i64, !dbg !28
53
+ %43 = getelementptr float, ptr addrspace(1) %2, i64 %42, !dbg !28
54
+ %44 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %43, i1 true, i32 0, i1 true) #5, !dbg !29
55
+ %45 = bitcast i32 %44 to float, !dbg !29
56
+ %46 = add i32 %40, %26, !dbg !30
57
+ %47 = sext i32 %46 to i64, !dbg !31
58
+ %48 = getelementptr i16, ptr addrspace(1) %3, i64 %47, !dbg !31
59
+ %49 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %48, i1 true, i16 0, i1 true) #5, !dbg !32
60
+ %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #5, !dbg !33
61
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !25
62
+ %51 = zext nneg i32 %40 to i64, !dbg !34
63
+ %52 = getelementptr float, ptr addrspace(1) %35, i64 %51, !dbg !35
64
+ %53 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true) #5, !dbg !36
65
+ %54 = bitcast i32 %53 to float, !dbg !36
66
+ %55 = fadd float %45, %54, !dbg !37
67
+ %56 = fadd float %50, %55, !dbg !38
68
+ %57 = fsub float %56, %38, !dbg !39
69
+ %58 = fadd float %36, 1.000000e+00, !dbg !43
70
+ %59 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %57, float %58) #5, !dbg !44
71
+ %60 = fadd float %38, %59, !dbg !45
72
+ %61 = fsub float %56, %60, !dbg !46
73
+ %62 = fmul float %57, %61, !dbg !47
74
+ %63 = fadd float %37, %62, !dbg !48
75
+ %64 = add nuw nsw i32 %39, 4, !dbg !49
76
+ %65 = icmp ult i32 %39, 252, !dbg !49
77
+ br i1 %65, label %.split.us, label %.split5.us, !dbg !49
78
+
79
+ .split: ; preds = %8, %.split
80
+ %66 = phi float [ %88, %.split ], [ 0.000000e+00, %8 ]
81
+ %67 = phi float [ %93, %.split ], [ 0.000000e+00, %8 ]
82
+ %68 = phi float [ %90, %.split ], [ 0.000000e+00, %8 ]
83
+ %69 = phi i32 [ %94, %.split ], [ 0, %8 ]
84
+ %70 = or i32 %69, %13, !dbg !26
85
+ %71 = add i32 %70, %25, !dbg !27
86
+ %72 = sext i32 %71 to i64, !dbg !28
87
+ %73 = getelementptr float, ptr addrspace(1) %2, i64 %72, !dbg !28
88
+ %74 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %73, i1 true, i32 0, i1 true) #5, !dbg !29
89
+ %75 = bitcast i32 %74 to float, !dbg !29
90
+ %76 = add i32 %70, %26, !dbg !30
91
+ %77 = sext i32 %76 to i64, !dbg !31
92
+ %78 = getelementptr i16, ptr addrspace(1) %3, i64 %77, !dbg !31
93
+ %79 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %78, i1 true, i16 0, i1 true) #5, !dbg !32
94
+ %80 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %79) #5, !dbg !33
95
+ %81 = zext nneg i32 %70 to i64, !dbg !34
96
+ %82 = getelementptr float, ptr addrspace(1) %35, i64 %81, !dbg !35
97
+ %83 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true) #5, !dbg !36
98
+ %84 = bitcast i32 %83 to float, !dbg !36
99
+ %85 = fadd float %75, %84, !dbg !37
100
+ %86 = fadd float %80, %85, !dbg !38
101
+ %87 = fsub float %86, %68, !dbg !39
102
+ %88 = fadd float %66, 1.000000e+00, !dbg !43
103
+ %89 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %87, float %88) #5, !dbg !44
104
+ %90 = fadd float %68, %89, !dbg !45
105
+ %91 = fsub float %86, %90, !dbg !46
106
+ %92 = fmul float %87, %91, !dbg !47
107
+ %93 = fadd float %67, %92, !dbg !48
108
+ %94 = add nuw nsw i32 %69, 4, !dbg !49
109
+ %95 = icmp ult i32 %69, 252, !dbg !49
110
+ br i1 %95, label %.split, label %.split5.us, !dbg !49
111
+
112
+ .split5.us: ; preds = %.split, %.split.us
113
+ %.us-phi = phi float [ %60, %.split.us ], [ %90, %.split ]
114
+ %.us-phi6 = phi float [ %63, %.split.us ], [ %93, %.split ]
115
+ %.us-phi7 = phi float [ %58, %.split.us ], [ %88, %.split ]
116
+ %96 = bitcast float %.us-phi to i32, !dbg !50
117
+ %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 2, i32 31), !dbg !50
118
+ %98 = bitcast i32 %97 to float, !dbg !50
119
+ %99 = bitcast float %.us-phi6 to i32, !dbg !50
120
+ %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 2, i32 31), !dbg !50
121
+ %101 = bitcast i32 %100 to float, !dbg !50
122
+ %102 = bitcast float %.us-phi7 to i32, !dbg !50
123
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !50
124
+ %104 = bitcast i32 %103 to float, !dbg !50
125
+ %105 = fsub float %98, %.us-phi, !dbg !52
126
+ %106 = fadd float %.us-phi7, %104, !dbg !56
127
+ %107 = fcmp oeq float %106, 0.000000e+00, !dbg !57
128
+ %108 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %104, float %106) #5, !dbg !58
129
+ %109 = select i1 %107, float 0.000000e+00, float %108, !dbg !59
130
+ %110 = fmul float %105, %109, !dbg !60
131
+ %111 = fadd float %.us-phi, %110, !dbg !61
132
+ %112 = fadd float %.us-phi6, %101, !dbg !62
133
+ %113 = fmul float %105, %105, !dbg !63
134
+ %114 = fmul float %.us-phi7, %113, !dbg !64
135
+ %115 = fmul float %114, %109, !dbg !65
136
+ %116 = fadd float %112, %115, !dbg !66
137
+ %117 = bitcast float %111 to i32, !dbg !50
138
+ %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !50
139
+ %119 = bitcast i32 %118 to float, !dbg !50
140
+ %120 = bitcast float %116 to i32, !dbg !50
141
+ %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 1, i32 31), !dbg !50
142
+ %122 = bitcast i32 %121 to float, !dbg !50
143
+ %123 = bitcast float %106 to i32, !dbg !50
144
+ %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 1, i32 31), !dbg !50
145
+ %125 = bitcast i32 %124 to float, !dbg !50
146
+ %126 = fsub float %119, %111, !dbg !52
147
+ %127 = fadd float %106, %125, !dbg !56
148
+ %128 = fcmp oeq float %127, 0.000000e+00, !dbg !57
149
+ %129 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %125, float %127) #5, !dbg !58
150
+ %130 = select i1 %128, float 0.000000e+00, float %129, !dbg !59
151
+ %131 = fmul float %126, %130, !dbg !60
152
+ %132 = fadd float %111, %131, !dbg !61
153
+ %133 = fadd float %116, %122, !dbg !62
154
+ %134 = fmul float %126, %126, !dbg !63
155
+ %135 = fmul float %106, %134, !dbg !64
156
+ %136 = fmul float %130, %135, !dbg !65
157
+ %137 = fadd float %133, %136, !dbg !66
158
+ %138 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %137, float 2.560000e+02) #5, !dbg !67
159
+ %139 = fadd float %138, 0x3EE4F8B580000000, !dbg !68
160
+ br label %140, !dbg !69
161
+
162
+ 140: ; preds = %.split5.us, %__nv_rsqrtf.exit
163
+ %141 = phi i32 [ 0, %.split5.us ], [ %174, %__nv_rsqrtf.exit ]
164
+ %142 = or i32 %141, %13, !dbg !70
165
+ %143 = add i32 %142, %25, !dbg !71
166
+ %144 = sext i32 %143 to i64, !dbg !72
167
+ %145 = getelementptr float, ptr addrspace(1) %2, i64 %144, !dbg !72
168
+ %146 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %145, i1 true, i32 0, i1 true) #5, !dbg !73
169
+ %147 = bitcast i32 %146 to float, !dbg !73
170
+ %148 = add i32 %142, %26, !dbg !74
171
+ %149 = sext i32 %148 to i64, !dbg !75
172
+ %150 = getelementptr i16, ptr addrspace(1) %3, i64 %149, !dbg !75
173
+ %151 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %150, i1 true, i16 0, i1 true) #5, !dbg !76
174
+ %152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %151) #5, !dbg !77
175
+ %153 = zext nneg i32 %142 to i64, !dbg !78
176
+ %154 = getelementptr float, ptr addrspace(1) %4, i64 %153, !dbg !78
177
+ %155 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %154, i1 true, i32 0, i1 true) #5, !dbg !79
178
+ %156 = bitcast i32 %155 to float, !dbg !79
179
+ br i1 %31, label %157, label %158, !dbg !80
180
+
181
+ 157: ; preds = %140
182
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
183
+ br label %158, !dbg !80
184
+
185
+ 158: ; preds = %157, %140
186
+ %159 = getelementptr float, ptr addrspace(1) %35, i64 %153, !dbg !81
187
+ %160 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true) #5, !dbg !82
188
+ %161 = bitcast i32 %160 to float, !dbg !82
189
+ %162 = fadd float %147, %161, !dbg !83
190
+ %163 = fadd float %152, %162, !dbg !84
191
+ %164 = fsub float %163, %132, !dbg !85
192
+ %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !86
193
+ %.not.i = icmp eq i32 %165, 0, !dbg !86
194
+ br i1 %.not.i, label %168, label %166, !dbg !86
195
+
196
+ 166: ; preds = %158
197
+ %167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %139), !dbg !86
198
+ br label %__nv_rsqrtf.exit, !dbg !86
199
+
200
+ 168: ; preds = %158
201
+ %169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %139), !dbg !86
202
+ br label %__nv_rsqrtf.exit, !dbg !86
203
+
204
+ __nv_rsqrtf.exit: ; preds = %166, %168
205
+ %.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !86
206
+ %170 = fmul float %164, %.0.i, !dbg !87
207
+ %171 = fmul float %170, %156, !dbg !88
208
+ %172 = getelementptr i16, ptr addrspace(1) %5, i64 %149, !dbg !89
209
+ %173 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %171) #5, !dbg !90
210
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %173, ptr addrspace(1) %172, i1 true) #5, !dbg !90
211
+ %174 = add nuw nsw i32 %141, 4, !dbg !69
212
+ %175 = icmp ult i32 %141, 252, !dbg !69
213
+ br i1 %175, label %140, label %176, !dbg !69
214
+
215
+ 176: ; preds = %__nv_rsqrtf.exit
216
+ ret void, !dbg !91
217
+ }
218
+
219
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
220
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
221
+
222
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
223
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
224
+
225
+ ; Function Attrs: alwaysinline nounwind
226
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #2 {
227
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
228
+ %.not = icmp eq i32 %1, 0
229
+ br i1 %.not, label %4, label %2
230
+
231
+ 2: ; preds = %0
232
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
233
+ br label %6
234
+
235
+ 4: ; preds = %0
236
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
237
+ br label %6
238
+
239
+ 6: ; preds = %4, %2
240
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
241
+ ret float %.0
242
+ }
243
+
244
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #3
245
+
246
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
247
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
248
+
249
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
250
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #4
251
+
252
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
253
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
254
+ attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
255
+ attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
256
+ attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
257
+ attributes #5 = { nounwind }
258
+
259
+ !llvm.module.flags = !{!0, !1}
260
+ !llvm.dbg.cu = !{!2}
261
+ !nvvm.annotations = !{!4, !5, !5, !4}
262
+ !llvm.ident = !{!6}
263
+
264
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
265
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
266
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
267
+ !3 = !DIFile(filename: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", directory: "/tmp/torchinductor_root/ci")
268
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
269
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
270
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
271
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
272
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
273
+ !9 = !{}
274
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
275
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
276
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
277
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
278
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
279
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
280
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
281
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
282
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
283
+ !19 = !DILocation(line: 36, column: 44, scope: !7)
284
+ !20 = !DILocation(line: 37, column: 22, scope: !7)
285
+ !21 = !DILocation(line: 38, column: 22, scope: !7)
286
+ !22 = !DILocation(line: 39, column: 36, scope: !7)
287
+ !23 = !DILocation(line: 40, column: 40, scope: !7)
288
+ !24 = !DILocation(line: 41, column: 44, scope: !7)
289
+ !25 = !DILocation(line: 40, column: 55, scope: !7)
290
+ !26 = !DILocation(line: 32, column: 27, scope: !7)
291
+ !27 = !DILocation(line: 35, column: 40, scope: !7)
292
+ !28 = !DILocation(line: 35, column: 34, scope: !7)
293
+ !29 = !DILocation(line: 35, column: 50, scope: !7)
294
+ !30 = !DILocation(line: 36, column: 40, scope: !7)
295
+ !31 = !DILocation(line: 36, column: 34, scope: !7)
296
+ !32 = !DILocation(line: 36, column: 50, scope: !7)
297
+ !33 = !DILocation(line: 36, column: 101, scope: !7)
298
+ !34 = !DILocation(line: 41, column: 40, scope: !7)
299
+ !35 = !DILocation(line: 41, column: 34, scope: !7)
300
+ !36 = !DILocation(line: 41, column: 52, scope: !7)
301
+ !37 = !DILocation(line: 42, column: 22, scope: !7)
302
+ !38 = !DILocation(line: 44, column: 22, scope: !7)
303
+ !39 = !DILocation(line: 96, column: 20, scope: !40, inlinedAt: !42)
304
+ !40 = distinct !DILexicalBlockFile(scope: !7, file: !41, discriminator: 0)
305
+ !41 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
306
+ !42 = !DILocation(line: 47, column: 41, scope: !40)
307
+ !43 = !DILocation(line: 97, column: 26, scope: !40, inlinedAt: !42)
308
+ !44 = !DILocation(line: 98, column: 30, scope: !40, inlinedAt: !42)
309
+ !45 = !DILocation(line: 98, column: 22, scope: !40, inlinedAt: !42)
310
+ !46 = !DILocation(line: 101, column: 30, scope: !40, inlinedAt: !42)
311
+ !47 = !DILocation(line: 101, column: 22, scope: !40, inlinedAt: !42)
312
+ !48 = !DILocation(line: 50, column: 50, scope: !7)
313
+ !49 = !DILocation(line: 31, column: 36, scope: !7)
314
+ !50 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !51)
315
+ !51 = !DILocation(line: 53, column: 44, scope: !40)
316
+ !52 = !DILocation(line: 108, column: 21, scope: !53, inlinedAt: !54)
317
+ !53 = distinct !DILexicalBlockFile(scope: !40, file: !41, discriminator: 0)
318
+ !54 = !DILocation(line: 120, column: 46, scope: !53, inlinedAt: !55)
319
+ !55 = !DILocation(line: 53, column: 44, scope: !53)
320
+ !56 = !DILocation(line: 109, column: 28, scope: !53, inlinedAt: !54)
321
+ !57 = !DILocation(line: 110, column: 39, scope: !53, inlinedAt: !54)
322
+ !58 = !DILocation(line: 110, column: 60, scope: !53, inlinedAt: !54)
323
+ !59 = !DILocation(line: 110, column: 49, scope: !53, inlinedAt: !54)
324
+ !60 = !DILocation(line: 112, column: 25, scope: !53, inlinedAt: !54)
325
+ !61 = !DILocation(line: 112, column: 17, scope: !53, inlinedAt: !54)
326
+ !62 = !DILocation(line: 113, column: 15, scope: !53, inlinedAt: !54)
327
+ !63 = !DILocation(line: 113, column: 30, scope: !53, inlinedAt: !54)
328
+ !64 = !DILocation(line: 113, column: 38, scope: !53, inlinedAt: !54)
329
+ !65 = !DILocation(line: 113, column: 49, scope: !53, inlinedAt: !54)
330
+ !66 = !DILocation(line: 113, column: 22, scope: !53, inlinedAt: !54)
331
+ !67 = !DILocation(line: 75, column: 24, scope: !7)
332
+ !68 = !DILocation(line: 77, column: 24, scope: !7)
333
+ !69 = !DILocation(line: 58, column: 36, scope: !7)
334
+ !70 = !DILocation(line: 59, column: 27, scope: !7)
335
+ !71 = !DILocation(line: 62, column: 41, scope: !7)
336
+ !72 = !DILocation(line: 62, column: 35, scope: !7)
337
+ !73 = !DILocation(line: 62, column: 51, scope: !7)
338
+ !74 = !DILocation(line: 63, column: 41, scope: !7)
339
+ !75 = !DILocation(line: 63, column: 35, scope: !7)
340
+ !76 = !DILocation(line: 63, column: 51, scope: !7)
341
+ !77 = !DILocation(line: 63, column: 103, scope: !7)
342
+ !78 = !DILocation(line: 64, column: 35, scope: !7)
343
+ !79 = !DILocation(line: 64, column: 40, scope: !7)
344
+ !80 = !DILocation(line: 68, column: 57, scope: !7)
345
+ !81 = !DILocation(line: 69, column: 35, scope: !7)
346
+ !82 = !DILocation(line: 69, column: 54, scope: !7)
347
+ !83 = !DILocation(line: 70, column: 24, scope: !7)
348
+ !84 = !DILocation(line: 72, column: 24, scope: !7)
349
+ !85 = !DILocation(line: 73, column: 24, scope: !7)
350
+ !86 = !DILocation(line: 78, column: 30, scope: !7)
351
+ !87 = !DILocation(line: 79, column: 24, scope: !7)
352
+ !88 = !DILocation(line: 80, column: 24, scope: !7)
353
+ !89 = !DILocation(line: 82, column: 29, scope: !7)
354
+ !90 = !DILocation(line: 82, column: 52, scope: !7)
355
+ !91 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16>
4
+ %cst_0 = arith.constant 0.000000e+00 : f32
5
+ %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x4xf32>
6
+ %c256_i32 = arith.constant 256 : i32
7
+ %c4_i32 = arith.constant 4 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<0> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
12
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x4xf32>
15
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
16
+ %cst_9 = arith.constant dense<256> : tensor<64x1xi32>
17
+ %cst_10 = arith.constant dense<256> : tensor<1x4xi32>
18
+ %cst_11 = arith.constant dense<512> : tensor<64x1xi32>
19
+ %c64_i32 = arith.constant 64 : i32
20
+ %0 = tt.get_program_id x : i32
21
+ %1 = arith.muli %0, %c64_i32 : i32
22
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
23
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
24
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
25
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
26
+ %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
27
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
28
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
29
+ %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
30
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
31
+ %11 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
32
+ %12 = arith.muli %11, %cst_9 : tensor<64x1xi32>
33
+ %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32>
34
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
35
+ %15 = arith.muli %5, %cst_9 : tensor<64x1xi32>
36
+ %16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x4xi32>
37
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
38
+ %18 = arith.addi %10, %cst_4 : tensor<64x1xi64>
39
+ %19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
40
+ %20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64>
41
+ %21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64>
42
+ %22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64>
43
+ %23 = arith.andi %21, %22 : tensor<64x1xi1>
44
+ %24 = arith.muli %20, %cst_2 : tensor<64x1xi64>
45
+ %25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x4xi64>
46
+ %26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
47
+ %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 {
48
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32>
49
+ %52 = arith.addi %51, %7 : tensor<1x4xi32>
50
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32>
51
+ %54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32>
52
+ %55 = arith.addi %54, %13 : tensor<64x4xi32>
53
+ %56 = tt.addptr %14, %55 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
54
+ %57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1>
55
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
56
+ %59 = arith.addi %54, %16 : tensor<64x4xi32>
57
+ %60 = tt.addptr %17, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
58
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xbf16>
59
+ %62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32>
60
+ tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
61
+ %63 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64>
62
+ %64 = tt.broadcast %63 : (tensor<1x4xi64>) -> tensor<64x4xi64>
63
+ %65 = arith.addi %64, %25 : tensor<64x4xi64>
64
+ %66 = tt.addptr %26, %65 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
65
+ %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
66
+ %68 = arith.addf %67, %58 : tensor<64x4xf32>
67
+ %69 = arith.addf %68, %62 : tensor<64x4xf32>
68
+ %70 = arith.subf %69, %arg9 : tensor<64x4xf32>
69
+ %71 = arith.addf %arg11, %cst_1 : tensor<64x4xf32>
70
+ %72 = arith.divf %70, %71 : tensor<64x4xf32>
71
+ %73 = arith.addf %arg9, %72 : tensor<64x4xf32>
72
+ %74 = arith.subf %69, %73 : tensor<64x4xf32>
73
+ %75 = arith.mulf %70, %74 : tensor<64x4xf32>
74
+ %76 = arith.addf %arg10, %75 : tensor<64x4xf32>
75
+ %77 = arith.select %57, %73, %arg9 : tensor<64x4xi1>, tensor<64x4xf32>
76
+ %78 = arith.select %57, %76, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
77
+ %79 = arith.select %57, %71, %arg11 : tensor<64x4xi1>, tensor<64x4xf32>
78
+ scf.yield %77, %78, %79 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>
79
+ }
80
+ %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
81
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
82
+ %51 = arith.subf %arg11, %arg8 : f32
83
+ %52 = arith.addf %arg10, %arg13 : f32
84
+ %53 = arith.cmpf oeq, %52, %cst_0 : f32
85
+ %54 = arith.divf %arg13, %52 : f32
86
+ %55 = arith.select %53, %cst_0, %54 : f32
87
+ %56 = arith.mulf %51, %55 : f32
88
+ %57 = arith.addf %arg8, %56 : f32
89
+ %58 = arith.addf %arg9, %arg12 : f32
90
+ %59 = arith.mulf %51, %51 : f32
91
+ %60 = arith.mulf %59, %arg10 : f32
92
+ %61 = arith.mulf %60, %55 : f32
93
+ %62 = arith.addf %58, %61 : f32
94
+ tt.reduce.return %57, %62, %52 : f32, f32, f32
95
+ }) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
96
+ %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
97
+ %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
98
+ %31 = arith.muli %11, %cst_9 : tensor<64x1xi32>
99
+ %32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x4xi32>
100
+ %33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
101
+ %34 = arith.muli %5, %cst_9 : tensor<64x1xi32>
102
+ %35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x4xi32>
103
+ %36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
104
+ %37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>>
105
+ %38 = arith.addi %10, %cst_4 : tensor<64x1xi64>
106
+ %39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
107
+ %40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64>
108
+ %41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64>
109
+ %42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64>
110
+ %43 = arith.andi %41, %42 : tensor<64x1xi1>
111
+ %44 = arith.muli %40, %cst_2 : tensor<64x1xi64>
112
+ %45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x4xi64>
113
+ %46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
114
+ %47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x4xf32>
115
+ %48 = arith.divf %30, %cst_6 : tensor<64x1xf32>
116
+ %49 = arith.addf %48, %cst_5 : tensor<64x1xf32>
117
+ %50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
118
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
119
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32>
120
+ %52 = arith.addi %51, %7 : tensor<1x4xi32>
121
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32>
122
+ %54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32>
123
+ %55 = arith.addi %54, %32 : tensor<64x4xi32>
124
+ %56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
125
+ %57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1>
126
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
127
+ %59 = arith.addi %54, %35 : tensor<64x4xi32>
128
+ %60 = tt.addptr %36, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
129
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
130
+ %62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32>
131
+ %63 = tt.addptr %37, %52 : tensor<1x4x!tt.ptr<f32, 1>>, tensor<1x4xi32>
132
+ %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32>
133
+ tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
134
+ %65 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64>
135
+ %66 = tt.broadcast %65 : (tensor<1x4xi64>) -> tensor<64x4xi64>
136
+ %67 = arith.addi %66, %45 : tensor<64x4xi64>
137
+ %68 = tt.addptr %46, %67 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
138
+ %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
139
+ %70 = arith.addf %69, %58 : tensor<64x4xf32>
140
+ %71 = arith.addf %70, %62 : tensor<64x4xf32>
141
+ %72 = arith.subf %71, %47 : tensor<64x4xf32>
142
+ %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
143
+ %74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x4xf32>
144
+ %75 = arith.mulf %72, %74 : tensor<64x4xf32>
145
+ %76 = tt.broadcast %64 : (tensor<1x4xf32>) -> tensor<64x4xf32>
146
+ %77 = arith.mulf %75, %76 : tensor<64x4xf32>
147
+ %78 = tt.addptr %50, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
148
+ %79 = arith.truncf %77 : tensor<64x4xf32> to tensor<64x4xbf16>
149
+ tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
150
+ }
151
+ tt.return
152
+ }
153
+ }
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4d5de6de(
14
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
15
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
16
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
17
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
18
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
19
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
20
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
21
+ )
22
+ .maxntid 64, 1, 1
23
+ {
24
+ .reg .pred %p<26>;
25
+ .reg .b16 %rs<9>;
26
+ .reg .b32 %r<88>;
27
+ .reg .f32 %f<78>;
28
+ .reg .b64 %rd<14>;
29
+ .loc 1 18 0
30
+ $L__func_begin0:
31
+ .loc 1 18 0
32
+
33
+ ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_0];
34
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_1];
35
+ $L__tmp0:
36
+ .loc 1 26 26
37
+ mov.u32 %r56, %tid.x;
38
+ and.b32 %r57, %r56, 31;
39
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2];
40
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3];
41
+ ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4];
42
+ shl.b32 %r58, %r56, 2;
43
+ and.b32 %r59, %r58, 252;
44
+ .loc 1 23 28
45
+ mov.u32 %r1, %ctaid.x;
46
+ .loc 1 30 40
47
+ shl.b32 %r60, %r1, 8;
48
+ .loc 1 30 36
49
+ or.b32 %r61, %r60, %r59;
50
+ .loc 1 30 30
51
+ mul.wide.s32 %rd11, %r61, 4;
52
+ add.s64 %rd1, %rd6, %rd11;
53
+ mov.b32 %r6, 0;
54
+ mov.pred %p1, -1;
55
+ .loc 1 30 46
56
+ mov.u32 %r2, 0x0;
57
+ mov.u32 %r3, 0x0;
58
+ mov.u32 %r4, 0x0;
59
+ mov.u32 %r5, 0x0;
60
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
61
+ @!%p1 mov.u32 %r2, %r6;
62
+ @!%p1 mov.u32 %r3, %r6;
63
+ @!%p1 mov.u32 %r4, %r6;
64
+ @!%p1 mov.u32 %r5, %r6;
65
+ mov.b32 %f1, %r4;
66
+ mov.b32 %f2, %r5;
67
+ .loc 1 31 30
68
+ mul.wide.s32 %rd12, %r61, 2;
69
+ add.s64 %rd2, %rd7, %rd12;
70
+ .loc 1 31 46
71
+ mov.u32 %r10, 0x0;
72
+ mov.u32 %r11, 0x0;
73
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
74
+ @!%p1 mov.u32 %r10, %r6;
75
+ @!%p1 mov.u32 %r11, %r6;
76
+ cvt.u16.u32 %rs1, %r10;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
78
+ cvt.u16.u32 %rs3, %r11;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
80
+ .loc 1 31 67
81
+ cvt.f32.bf16 %r14, %rs1;
82
+ mov.b32 %f3, %r14;
83
+ cvt.f32.bf16 %r15, %rs2;
84
+ mov.b32 %f4, %r15;
85
+ cvt.f32.bf16 %r16, %rs3;
86
+ mov.b32 %f5, %r16;
87
+ cvt.f32.bf16 %r17, %rs4;
88
+ mov.b32 %f6, %r17;
89
+ .loc 1 32 30
90
+ add.s64 %rd3, %rd8, %rd12;
91
+ .loc 1 32 46
92
+ mov.u32 %r18, 0x0;
93
+ mov.u32 %r19, 0x0;
94
+ @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
95
+ @!%p1 mov.u32 %r18, %r6;
96
+ @!%p1 mov.u32 %r19, %r6;
97
+ cvt.u16.u32 %rs5, %r18;
98
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
99
+ cvt.u16.u32 %rs7, %r19;
100
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
101
+ .loc 1 32 67
102
+ cvt.f32.bf16 %r22, %rs5;
103
+ mov.b32 %f7, %r22;
104
+ cvt.f32.bf16 %r23, %rs6;
105
+ mov.b32 %f8, %r23;
106
+ cvt.f32.bf16 %r24, %rs7;
107
+ mov.b32 %f9, %r24;
108
+ cvt.f32.bf16 %r25, %rs8;
109
+ mov.b32 %f10, %r25;
110
+ .loc 1 33 31
111
+ mul.wide.u32 %rd13, %r59, 4;
112
+ add.s64 %rd4, %rd9, %rd13;
113
+ .loc 1 33 36
114
+ mov.u32 %r26, 0x0;
115
+ mov.u32 %r27, 0x0;
116
+ mov.u32 %r28, 0x0;
117
+ mov.u32 %r29, 0x0;
118
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
119
+ @!%p1 mov.u32 %r26, %r6;
120
+ @!%p1 mov.u32 %r27, %r6;
121
+ @!%p1 mov.u32 %r28, %r6;
122
+ @!%p1 mov.u32 %r29, %r6;
123
+ .loc 1 35 18
124
+ add.f32 %f11, %f5, %f1;
125
+ add.f32 %f12, %f6, %f2;
126
+ .loc 1 30 46
127
+ mov.b32 %f13, %r3;
128
+ mov.b32 %f14, %r2;
129
+ .loc 1 35 18
130
+ add.f32 %f15, %f3, %f14;
131
+ add.f32 %f16, %f4, %f13;
132
+ .loc 1 37 18
133
+ add.f32 %f17, %f16, %f8;
134
+ add.f32 %f18, %f15, %f7;
135
+ add.f32 %f19, %f11, %f9;
136
+ add.f32 %f20, %f12, %f10;
137
+ $L__tmp1:
138
+ .loc 2 233 15
139
+ add.f32 %f21, %f18, %f17;
140
+ add.f32 %f22, %f21, %f19;
141
+ add.f32 %f23, %f22, %f20;
142
+ $L__tmp2:
143
+ .loc 2 243 36
144
+ mov.b32 %r62, %f23;
145
+ shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
146
+ mov.b32 %f24, %r63;
147
+ $L__tmp3:
148
+ .loc 2 233 15
149
+ add.f32 %f25, %f23, %f24;
150
+ $L__tmp4:
151
+ .loc 2 243 36
152
+ mov.b32 %r64, %f25;
153
+ shfl.sync.bfly.b32 %r65, %r64, 8, 31, -1;
154
+ mov.b32 %f26, %r65;
155
+ $L__tmp5:
156
+ .loc 2 233 15
157
+ add.f32 %f27, %f25, %f26;
158
+ $L__tmp6:
159
+ .loc 2 243 36
160
+ mov.b32 %r66, %f27;
161
+ shfl.sync.bfly.b32 %r67, %r66, 4, 31, -1;
162
+ mov.b32 %f28, %r67;
163
+ $L__tmp7:
164
+ .loc 2 233 15
165
+ add.f32 %f29, %f27, %f28;
166
+ $L__tmp8:
167
+ .loc 2 243 36
168
+ mov.b32 %r68, %f29;
169
+ shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1;
170
+ mov.b32 %f30, %r69;
171
+ $L__tmp9:
172
+ .loc 2 233 15
173
+ add.f32 %f31, %f29, %f30;
174
+ $L__tmp10:
175
+ .loc 2 243 36
176
+ mov.b32 %r70, %f31;
177
+ shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1;
178
+ mov.b32 %f32, %r71;
179
+ $L__tmp11:
180
+ .loc 2 233 15
181
+ add.f32 %f33, %f31, %f32;
182
+ $L__tmp12:
183
+ .loc 2 243 36
184
+ setp.eq.s32 %p17, %r57, 0;
185
+ shr.u32 %r72, %r56, 3;
186
+ and.b32 %r73, %r72, 4;
187
+ mov.u32 %r74, global_smem;
188
+ add.s32 %r34, %r74, %r73;
189
+ mov.b32 %r35, %f33;
190
+ @%p17 st.shared.b32 [ %r34 + 0 ], %r35;
191
+ bar.sync 0;
192
+ setp.lt.s32 %p18, %r56, 2;
193
+ add.s32 %r37, %r74, %r58;
194
+ @%p18 ld.shared.b32 %r36, [ %r37 + 0 ];
195
+ mov.b32 %f34, %r36;
196
+ shfl.sync.bfly.b32 %r75, %r36, 1, 31, -1;
197
+ mov.b32 %f35, %r75;
198
+ $L__tmp13:
199
+ .loc 2 233 15
200
+ add.f32 %f36, %f34, %f35;
201
+ $L__tmp14:
202
+ .loc 2 243 36
203
+ and.b32 %r76, %r56, 1;
204
+ setp.eq.b32 %p24, %r76, 1;
205
+ not.pred %p25, %p24;
206
+ and.pred %p19, %p18, %p25;
207
+ mov.b32 %r39, %f36;
208
+ @%p19 st.shared.b32 [ %r37 + 0 ], %r39;
209
+ bar.sync 0;
210
+ ld.shared.f32 %f37, [global_smem];
211
+ $L__tmp15:
212
+ .loc 3 8 15
213
+ add.f32 %f38, %f37, 0f00000000;
214
+ $L__tmp16:
215
+ .loc 1 45 20
216
+ mov.b32 %r41, %f38;
217
+ mov.b32 %r42, 1132462080;
218
+ div.full.f32 %r40, %r41, %r42;
219
+ mov.b32 %f39, %r40;
220
+ .loc 1 46 19
221
+ sub.f32 %f40, %f18, %f39;
222
+ sub.f32 %f41, %f17, %f39;
223
+ sub.f32 %f42, %f19, %f39;
224
+ sub.f32 %f43, %f20, %f39;
225
+ .loc 1 47 20
226
+ mul.f32 %f44, %f41, %f41;
227
+ $L__tmp17:
228
+ .loc 2 243 36
229
+ bar.sync 0;
230
+ $L__tmp18:
231
+ .loc 2 233 15
232
+ fma.rn.f32 %f45, %f40, %f40, %f44;
233
+ fma.rn.f32 %f46, %f42, %f42, %f45;
234
+ fma.rn.f32 %f47, %f43, %f43, %f46;
235
+ $L__tmp19:
236
+ .loc 2 243 36
237
+ mov.b32 %r77, %f47;
238
+ shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1;
239
+ mov.b32 %f48, %r78;
240
+ $L__tmp20:
241
+ .loc 2 233 15
242
+ add.f32 %f49, %f47, %f48;
243
+ $L__tmp21:
244
+ .loc 2 243 36
245
+ mov.b32 %r79, %f49;
246
+ shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1;
247
+ mov.b32 %f50, %r80;
248
+ $L__tmp22:
249
+ .loc 2 233 15
250
+ add.f32 %f51, %f49, %f50;
251
+ $L__tmp23:
252
+ .loc 2 243 36
253
+ mov.b32 %r81, %f51;
254
+ shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1;
255
+ mov.b32 %f52, %r82;
256
+ $L__tmp24:
257
+ .loc 2 233 15
258
+ add.f32 %f53, %f51, %f52;
259
+ $L__tmp25:
260
+ .loc 2 243 36
261
+ mov.b32 %r83, %f53;
262
+ shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1;
263
+ mov.b32 %f54, %r84;
264
+ $L__tmp26:
265
+ .loc 2 233 15
266
+ add.f32 %f55, %f53, %f54;
267
+ $L__tmp27:
268
+ .loc 2 243 36
269
+ mov.b32 %r85, %f55;
270
+ shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1;
271
+ mov.b32 %f56, %r86;
272
+ $L__tmp28:
273
+ .loc 2 233 15
274
+ add.f32 %f57, %f55, %f56;
275
+ $L__tmp29:
276
+ .loc 2 243 36
277
+ mov.b32 %r44, %f57;
278
+ @%p17 st.shared.b32 [ %r34 + 0 ], %r44;
279
+ bar.sync 0;
280
+ @%p18 ld.shared.b32 %r45, [ %r37 + 0 ];
281
+ mov.b32 %f58, %r45;
282
+ shfl.sync.bfly.b32 %r87, %r45, 1, 31, -1;
283
+ mov.b32 %f59, %r87;
284
+ $L__tmp30:
285
+ .loc 2 233 15
286
+ add.f32 %f60, %f58, %f59;
287
+ $L__tmp31:
288
+ .loc 2 243 36
289
+ mov.b32 %r48, %f60;
290
+ @%p19 st.shared.b32 [ %r37 + 0 ], %r48;
291
+ bar.sync 0;
292
+ ld.shared.f32 %f61, [global_smem];
293
+ $L__tmp32:
294
+ .loc 3 8 15
295
+ add.f32 %f62, %f61, 0f00000000;
296
+ $L__tmp33:
297
+ .loc 1 53 20
298
+ mov.b32 %r50, %f62;
299
+ div.full.f32 %r49, %r50, %r42;
300
+ mov.b32 %f63, %r49;
301
+ .loc 1 55 20
302
+ add.f32 %f64, %f63, 0f3727C5AC;
303
+ .loc 1 56 26
304
+ rsqrt.approx.ftz.f32 %f65, %f64;
305
+ .loc 1 33 36
306
+ mov.b32 %f66, %r29;
307
+ mov.b32 %f67, %r28;
308
+ mov.b32 %f68, %r27;
309
+ mov.b32 %f69, %r26;
310
+ .loc 1 57 20
311
+ mul.f32 %f70, %f40, %f65;
312
+ mul.f32 %f71, %f41, %f65;
313
+ mul.f32 %f72, %f42, %f65;
314
+ mul.f32 %f73, %f43, %f65;
315
+ .loc 1 58 20
316
+ mul.f32 %f74, %f70, %f69;
317
+ mul.f32 %f75, %f71, %f68;
318
+ mul.f32 %f76, %f72, %f67;
319
+ mul.f32 %f77, %f73, %f66;
320
+ .loc 1 59 25
321
+ add.s64 %rd5, %rd10, %rd11;
322
+ .loc 1 59 48
323
+ mov.b32 %r52, %f74;
324
+ mov.b32 %r53, %f75;
325
+ mov.b32 %r54, %f76;
326
+ mov.b32 %r55, %f77;
327
+ @%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r52, %r53, %r54, %r55 };
328
+ .loc 1 59 4
329
+ ret;
330
+ $L__tmp34:
331
+ $L__func_end0:
332
+
333
+ }
334
+ // .globl __nv_rsqrtf
335
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
336
+ .param .b32 __nv_rsqrtf_param_0
337
+ )
338
+ {
339
+ .reg .f32 %f<3>;
340
+ $L__func_begin1:
341
+
342
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
343
+ rsqrt.approx.ftz.f32 %f2, %f1;
344
+ st.param.f32 [func_retval0+0], %f2;
345
+ ret;
346
+ $L__func_end1:
347
+
348
+ }
349
+ .file 1 "/tmp/torchinductor_root/pe/cpedrbcgvftrmo3x6vfpo6dhkxbweq3ucfj5jibyyvr3hf67gsvx.py"
350
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
351
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
352
+ .section .debug_abbrev
353
+ {
354
+ .b8 1
355
+ .b8 17
356
+ .b8 1
357
+ .b8 37
358
+ .b8 8
359
+ .b8 19
360
+ .b8 5
361
+ .b8 3
362
+ .b8 8
363
+ .b8 16
364
+ .b8 6
365
+ .b8 27
366
+ .b8 8
367
+ .b8 180
368
+ .b8 66
369
+ .b8 12
370
+ .b8 17
371
+ .b8 1
372
+ .b8 18
373
+ .b8 1
374
+ .b8 0
375
+ .b8 0
376
+ .b8 2
377
+ .b8 46
378
+ .b8 0
379
+ .b8 135
380
+ .b8 64
381
+ .b8 8
382
+ .b8 3
383
+ .b8 8
384
+ .b8 58
385
+ .b8 11
386
+ .b8 59
387
+ .b8 11
388
+ .b8 63
389
+ .b8 12
390
+ .b8 32
391
+ .b8 11
392
+ .b8 0
393
+ .b8 0
394
+ .b8 3
395
+ .b8 46
396
+ .b8 1
397
+ .b8 17
398
+ .b8 1
399
+ .b8 18
400
+ .b8 1
401
+ .b8 64
402
+ .b8 10
403
+ .b8 49
404
+ .b8 19
405
+ .b8 0
406
+ .b8 0
407
+ .b8 4
408
+ .b8 29
409
+ .b8 1
410
+ .b8 49
411
+ .b8 19
412
+ .b8 17
413
+ .b8 1
414
+ .b8 18
415
+ .b8 1
416
+ .b8 88
417
+ .b8 11
418
+ .b8 89
419
+ .b8 11
420
+ .b8 87
421
+ .b8 11
422
+ .b8 0
423
+ .b8 0
424
+ .b8 5
425
+ .b8 29
426
+ .b8 0
427
+ .b8 49
428
+ .b8 19
429
+ .b8 17
430
+ .b8 1
431
+ .b8 18
432
+ .b8 1
433
+ .b8 88
434
+ .b8 11
435
+ .b8 89
436
+ .b8 11
437
+ .b8 87
438
+ .b8 11
439
+ .b8 0
440
+ .b8 0
441
+ .b8 0
442
+ }
443
+ .section .debug_info
444
+ {
445
+ .b32 395
446
+ .b8 2
447
+ .b8 0
448
+ .b32 .debug_abbrev
449
+ .b8 8
450
+ .b8 1
451
+ .b8 116
452
+ .b8 114
453
+ .b8 105
454
+ .b8 116
455
+ .b8 111
456
+ .b8 110
457
+ .b8 0
458
+ .b8 2
459
+ .b8 0
460
+ .b8 99
461
+ .b8 112
462
+ .b8 101
463
+ .b8 100
464
+ .b8 114
465
+ .b8 98
466
+ .b8 99
467
+ .b8 103
468
+ .b8 118
469
+ .b8 102
470
+ .b8 116
471
+ .b8 114
472
+ .b8 109
473
+ .b8 111
474
+ .b8 51
475
+ .b8 120
476
+ .b8 54
477
+ .b8 118
478
+ .b8 102
479
+ .b8 112
480
+ .b8 111
481
+ .b8 54
482
+ .b8 100
483
+ .b8 104
484
+ .b8 107
485
+ .b8 120
486
+ .b8 98
487
+ .b8 119
488
+ .b8 101
489
+ .b8 113
490
+ .b8 51
491
+ .b8 117
492
+ .b8 99
493
+ .b8 102
494
+ .b8 106
495
+ .b8 53
496
+ .b8 106
497
+ .b8 105
498
+ .b8 98
499
+ .b8 121
500
+ .b8 121
501
+ .b8 118
502
+ .b8 114
503
+ .b8 51
504
+ .b8 104
505
+ .b8 102
506
+ .b8 54
507
+ .b8 55
508
+ .b8 103
509
+ .b8 115
510
+ .b8 118
511
+ .b8 120
512
+ .b8 46
513
+ .b8 112
514
+ .b8 121
515
+ .b8 0
516
+ .b32 .debug_line
517
+ .b8 47
518
+ .b8 116
519
+ .b8 109
520
+ .b8 112
521
+ .b8 47
522
+ .b8 116
523
+ .b8 111
524
+ .b8 114
525
+ .b8 99
526
+ .b8 104
527
+ .b8 105
528
+ .b8 110
529
+ .b8 100
530
+ .b8 117
531
+ .b8 99
532
+ .b8 116
533
+ .b8 111
534
+ .b8 114
535
+ .b8 95
536
+ .b8 114
537
+ .b8 111
538
+ .b8 111
539
+ .b8 116
540
+ .b8 47
541
+ .b8 112
542
+ .b8 101
543
+ .b8 0
544
+ .b8 1
545
+ .b64 $L__func_begin0
546
+ .b64 $L__func_end0
547
+ .b8 2
548
+ .b8 116
549
+ .b8 114
550
+ .b8 105
551
+ .b8 116
552
+ .b8 111
553
+ .b8 110
554
+ .b8 95
555
+ .b8 95
556
+ .b8 48
557
+ .b8 100
558
+ .b8 49
559
+ .b8 100
560
+ .b8 50
561
+ .b8 100
562
+ .b8 51
563
+ .b8 100
564
+ .b8 52
565
+ .b8 100
566
+ .b8 53
567
+ .b8 100
568
+ .b8 101
569
+ .b8 54
570
+ .b8 100
571
+ .b8 101
572
+ .b8 0
573
+ .b8 116
574
+ .b8 114
575
+ .b8 105
576
+ .b8 116
577
+ .b8 111
578
+ .b8 110
579
+ .b8 95
580
+ .b8 95
581
+ .b8 48
582
+ .b8 100
583
+ .b8 49
584
+ .b8 100
585
+ .b8 50
586
+ .b8 100
587
+ .b8 51
588
+ .b8 100
589
+ .b8 52
590
+ .b8 100
591
+ .b8 53
592
+ .b8 100
593
+ .b8 101
594
+ .b8 54
595
+ .b8 100
596
+ .b8 101
597
+ .b8 0
598
+ .b8 1
599
+ .b8 18
600
+ .b8 1
601
+ .b8 1
602
+ .b8 3
603
+ .b64 $L__func_begin0
604
+ .b64 $L__func_end0
605
+ .b8 1
606
+ .b8 156
607
+ .b32 125
608
+ .b8 4
609
+ .b32 125
610
+ .b64 $L__tmp1
611
+ .b64 $L__tmp14
612
+ .b8 2
613
+ .b8 42
614
+ .b8 59
615
+ .b8 5
616
+ .b32 125
617
+ .b64 $L__tmp1
618
+ .b64 $L__tmp14
619
+ .b8 2
620
+ .b8 243
621
+ .b8 36
622
+ .b8 0
623
+ .b8 5
624
+ .b32 125
625
+ .b64 $L__tmp2
626
+ .b64 $L__tmp15
627
+ .b8 2
628
+ .b8 42
629
+ .b8 59
630
+ .b8 5
631
+ .b32 125
632
+ .b64 $L__tmp15
633
+ .b64 $L__tmp16
634
+ .b8 3
635
+ .b8 42
636
+ .b8 45
637
+ .b8 5
638
+ .b32 125
639
+ .b64 $L__tmp17
640
+ .b64 $L__tmp32
641
+ .b8 2
642
+ .b8 50
643
+ .b8 59
644
+ .b8 4
645
+ .b32 125
646
+ .b64 $L__tmp18
647
+ .b64 $L__tmp31
648
+ .b8 2
649
+ .b8 50
650
+ .b8 59
651
+ .b8 5
652
+ .b32 125
653
+ .b64 $L__tmp18
654
+ .b64 $L__tmp31
655
+ .b8 2
656
+ .b8 243
657
+ .b8 36
658
+ .b8 0
659
+ .b8 5
660
+ .b32 125
661
+ .b64 $L__tmp32
662
+ .b64 $L__tmp33
663
+ .b8 3
664
+ .b8 50
665
+ .b8 45
666
+ .b8 0
667
+ .b8 0
668
+ }
669
+ .section .debug_pubnames
670
+ {
671
+ .b32 $L__pubNames_end0-$L__pubNames_start0
672
+ $L__pubNames_start0:
673
+ .b8 2
674
+ .b8 0
675
+ .b32 .debug_info
676
+ .b32 399
677
+ .b32 125
678
+ .b8 116
679
+ .b8 114
680
+ .b8 105
681
+ .b8 116
682
+ .b8 111
683
+ .b8 110
684
+ .b8 95
685
+ .b8 95
686
+ .b8 48
687
+ .b8 100
688
+ .b8 49
689
+ .b8 100
690
+ .b8 50
691
+ .b8 100
692
+ .b8 51
693
+ .b8 100
694
+ .b8 52
695
+ .b8 100
696
+ .b8 53
697
+ .b8 100
698
+ .b8 101
699
+ .b8 54
700
+ .b8 100
701
+ .b8 101
702
+ .b8 0
703
+ .b32 0
704
+ $L__pubNames_end0:
705
+ }
706
+ .section .debug_pubtypes
707
+ {
708
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
709
+ $L__pubTypes_start0:
710
+ .b8 2
711
+ .b8 0
712
+ .b32 .debug_info
713
+ .b32 399
714
+ .b32 0
715
+ $L__pubTypes_end0:
716
+ }
717
+ .section .debug_loc { }
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2d34e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
5
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %7 = and i32 %6, 7, !dbg !8
7
+ %8 = zext nneg i32 %7 to i64, !dbg !9
8
+ %9 = getelementptr float, ptr addrspace(1) %1, i64 %8, !dbg !9
9
+ %10 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %9, i1 true, i32 0, i1 true) #3, !dbg !10
10
+ %11 = bitcast i32 %10 to float, !dbg !10
11
+ %12 = getelementptr i64, ptr addrspace(1) %2, i64 %8, !dbg !11
12
+ %13 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];\0A\09@!$3 mov.u64 $0, 0x0;", "=l,l,b,b"(ptr addrspace(1) %12, i1 true, i1 true) #3, !dbg !12
13
+ %14 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %10, i32 4, i32 31), !dbg !13
14
+ %15 = bitcast i32 %14 to float, !dbg !13
15
+ %16 = fadd float %11, %15, !dbg !17
16
+ %17 = bitcast float %16 to i32, !dbg !13
17
+ %18 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %17, i32 2, i32 31), !dbg !13
18
+ %19 = bitcast i32 %18 to float, !dbg !13
19
+ %20 = fadd float %16, %19, !dbg !17
20
+ %21 = bitcast float %20 to i32, !dbg !13
21
+ %22 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %21, i32 1, i32 31), !dbg !13
22
+ %23 = bitcast i32 %22 to float, !dbg !13
23
+ %24 = fadd float %20, %23, !dbg !17
24
+ %25 = trunc i64 %13 to i32, !dbg !21
25
+ %26 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %25, i32 4, i32 31), !dbg !21
26
+ %bc = bitcast i64 %13 to <2 x i32>, !dbg !21
27
+ %27 = extractelement <2 x i32> %bc, i64 1, !dbg !21
28
+ %28 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %27, i32 4, i32 31), !dbg !21
29
+ %29 = insertelement <2 x i32> undef, i32 %26, i64 0, !dbg !21
30
+ %30 = insertelement <2 x i32> %29, i32 %28, i64 1, !dbg !21
31
+ %31 = bitcast <2 x i32> %30 to i64, !dbg !21
32
+ %32 = add i64 %13, %31, !dbg !23
33
+ %33 = trunc i64 %32 to i32, !dbg !21
34
+ %34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 2, i32 31), !dbg !21
35
+ %bc1 = bitcast i64 %32 to <2 x i32>, !dbg !21
36
+ %35 = extractelement <2 x i32> %bc1, i64 1, !dbg !21
37
+ %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !21
38
+ %37 = insertelement <2 x i32> undef, i32 %34, i64 0, !dbg !21
39
+ %38 = insertelement <2 x i32> %37, i32 %36, i64 1, !dbg !21
40
+ %39 = bitcast <2 x i32> %38 to i64, !dbg !21
41
+ %40 = add i64 %32, %39, !dbg !23
42
+ %41 = trunc i64 %40 to i32, !dbg !21
43
+ %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !21
44
+ %bc2 = bitcast i64 %40 to <2 x i32>, !dbg !21
45
+ %43 = extractelement <2 x i32> %bc2, i64 1, !dbg !21
46
+ %44 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %43, i32 1, i32 31), !dbg !21
47
+ %45 = insertelement <2 x i32> undef, i32 %42, i64 0, !dbg !21
48
+ %46 = insertelement <2 x i32> %45, i32 %44, i64 1, !dbg !21
49
+ %47 = bitcast <2 x i32> %46 to i64, !dbg !21
50
+ %48 = add i64 %40, %47, !dbg !23
51
+ %49 = sitofp i64 %48 to float, !dbg !26
52
+ %50 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %24, float %49) #3, !dbg !27
53
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
54
+ %51 = and i32 %6, 63, !dbg !29
55
+ %52 = icmp eq i32 %51, 0, !dbg !29
56
+ %53 = bitcast float %50 to i32, !dbg !29
57
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %53, ptr addrspace(1) %0, i1 %52) #3, !dbg !29
58
+ ret void, !dbg !30
59
+ }
60
+
61
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
62
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
63
+
64
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
65
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
66
+
67
+ ; Function Attrs: convergent nocallback nounwind
68
+ declare void @llvm.nvvm.barrier0() #2
69
+
70
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
71
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
72
+ attributes #2 = { convergent nocallback nounwind }
73
+ attributes #3 = { nounwind }
74
+
75
+ !llvm.module.flags = !{!0}
76
+ !llvm.dbg.cu = !{!1}
77
+ !nvvm.annotations = !{!3, !4, !4, !3}
78
+
79
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
80
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
81
+ !2 = !DIFile(filename: "c2qomesxoic3sfzpdzftrhej7z6hhd6pritis2f4ye2ckqoetmyt.py", directory: "/tmp/torchinductor_root/2q")
82
+ !3 = !{ptr @triton__0d1d2d34e, !"kernel", i32 1}
83
+ !4 = !{ptr @triton__0d1d2d34e, !"maxntidx", i32 64}
84
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d34e", linkageName: "triton__0d1d2d34e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
85
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
86
+ !7 = !{}
87
+ !8 = !DILocation(line: 25, column: 34, scope: !5)
88
+ !9 = !DILocation(line: 28, column: 30, scope: !5)
89
+ !10 = !DILocation(line: 28, column: 35, scope: !5)
90
+ !11 = !DILocation(line: 29, column: 30, scope: !5)
91
+ !12 = !DILocation(line: 29, column: 35, scope: !5)
92
+ !13 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !16)
93
+ !14 = distinct !DILexicalBlockFile(scope: !5, file: !15, discriminator: 0)
94
+ !15 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
95
+ !16 = !DILocation(line: 32, column: 24, scope: !14)
96
+ !17 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !19)
97
+ !18 = distinct !DILexicalBlockFile(scope: !14, file: !15, discriminator: 0)
98
+ !19 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !20)
99
+ !20 = !DILocation(line: 32, column: 24, scope: !18)
100
+ !21 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !22)
101
+ !22 = !DILocation(line: 35, column: 24, scope: !14)
102
+ !23 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !24)
103
+ !24 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !25)
104
+ !25 = !DILocation(line: 35, column: 24, scope: !18)
105
+ !26 = !DILocation(line: 36, column: 20, scope: !5)
106
+ !27 = !DILocation(line: 37, column: 19, scope: !5)
107
+ !28 = !DILocation(line: 38, column: 4, scope: !5)
108
+ !29 = !DILocation(line: 39, column: 71, scope: !5)
109
+ !30 = !DILocation(line: 39, column: 4, scope: !5)
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx ADDED
@@ -0,0 +1,1054 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 128, 1, 1
39
+ {
40
+ .reg .pred %p<56>;
41
+ .reg .b16 %rs<13>;
42
+ .reg .b32 %r<185>;
43
+ .reg .f32 %f<169>;
44
+ .reg .b64 %rd<59>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
50
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
51
+ ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
52
+ $L__tmp0:
53
+ .loc 1 22 44
54
+ mov.u32 %r1, %tid.x;
55
+ and.b32 %r2, %r1, 31;
56
+ ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
57
+ ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
58
+ bfe.u32 %r3, %r1, 6, 1;
59
+ and.b32 %r4, %r1, 1;
60
+ .loc 1 24 33
61
+ bfe.u32 %r5, %r1, 5, 1;
62
+ shl.b32 %r31, %r1, 2;
63
+ and.b32 %r6, %r31, 252;
64
+ shl.b32 %r32, %r1, 1;
65
+ and.b32 %r7, %r32, 254;
66
+ .loc 1 21 28
67
+ mov.u32 %r14, %ctaid.x;
68
+ .loc 1 21 33
69
+ shl.b32 %r33, %r14, 1;
70
+ .loc 1 22 23
71
+ or.b32 %r34, %r33, %r3;
72
+ or.b32 %r35, %r33, %r4;
73
+ .loc 1 26 30
74
+ mul.wide.s32 %rd25, %r34, 8;
75
+ add.s64 %rd11, %rd22, %rd25;
76
+ mul.wide.s32 %rd26, %r35, 8;
77
+ add.s64 %rd19, %rd22, %rd26;
78
+ mov.pred %p50, -1;
79
+ .loc 1 26 35
80
+ mov.u64 %rd10, 0x0;
81
+ @%p50 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
82
+ mov.u64 %rd12, 0x0;
83
+ @%p50 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
84
+ mov.u64 %rd14, 0x0;
85
+ @%p50 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
86
+ mov.u64 %rd16, 0x0;
87
+ @%p50 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
88
+ mov.u64 %rd18, 0x0;
89
+ @%p50 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
90
+ .loc 1 27 18
91
+ bfe.s32 %r36, %r14, 30, 1;
92
+ shr.u32 %r37, %r36, 23;
93
+ add.s32 %r38, %r34, %r37;
94
+ and.b32 %r39, %r38, 16776704;
95
+ sub.s32 %r40, %r34, %r39;
96
+ .loc 1 35 44
97
+ shl.b32 %r41, %r40, 8;
98
+ .loc 1 35 40
99
+ or.b32 %r42, %r41, %r6;
100
+ .loc 1 35 34
101
+ mul.wide.s32 %rd27, %r42, 4;
102
+ add.s64 %rd38, %rd23, %rd27;
103
+ mov.b32 %r155, 0;
104
+ .loc 1 35 50
105
+ mov.u32 %r15, 0x0;
106
+ mov.u32 %r16, 0x0;
107
+ mov.u32 %r17, 0x0;
108
+ mov.u32 %r18, 0x0;
109
+ @%p50 ld.global.L1::evict_last.v4.b32 { %r15, %r16, %r17, %r18 }, [ %rd38 + 0 ];
110
+ @!%p50 mov.u32 %r15, %r155;
111
+ @!%p50 mov.u32 %r16, %r155;
112
+ @!%p50 mov.u32 %r17, %r155;
113
+ @!%p50 mov.u32 %r18, %r155;
114
+ mov.b32 %f2, %r15;
115
+ mov.b32 %f1, %r16;
116
+ mov.b32 %f3, %r17;
117
+ mov.b32 %f4, %r18;
118
+ .loc 1 36 44
119
+ shl.b32 %r43, %r34, 8;
120
+ .loc 1 36 40
121
+ or.b32 %r44, %r43, %r6;
122
+ .loc 1 36 34
123
+ mul.wide.s32 %rd28, %r44, 2;
124
+ add.s64 %rd39, %rd24, %rd28;
125
+ .loc 1 36 50
126
+ mov.u32 %r23, 0x0;
127
+ mov.u32 %r24, 0x0;
128
+ @%p50 ld.global.L1::evict_last.v2.b32 { %r23, %r24 }, [ %rd39 + 0 ];
129
+ @!%p50 mov.u32 %r23, %r155;
130
+ @!%p50 mov.u32 %r24, %r155;
131
+ cvt.u16.u32 %rs1, %r23;
132
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r23; }
133
+ cvt.u16.u32 %rs3, %r24;
134
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r24; }
135
+ .loc 1 36 101
136
+ cvt.f32.bf16 %r27, %rs1;
137
+ mov.b32 %f5, %r27;
138
+ cvt.f32.bf16 %r28, %rs2;
139
+ mov.b32 %f6, %r28;
140
+ cvt.f32.bf16 %r29, %rs3;
141
+ mov.b32 %f7, %r29;
142
+ cvt.f32.bf16 %r30, %rs4;
143
+ mov.b32 %f8, %r30;
144
+ .loc 1 37 22
145
+ add.s64 %rd29, %rd18, 50257;
146
+ .loc 1 38 22
147
+ setp.lt.s64 %p14, %rd18, 0;
148
+ .loc 1 39 36
149
+ selp.b64 %rd5, %rd29, %rd18, %p14;
150
+ .loc 1 40 40
151
+ setp.lt.u64 %p15, %rd5, 50257;
152
+ mov.b32 %r184, 883;
153
+ mov.u64 %rd58, 1;
154
+ .loc 1 40 55
155
+ @%p15 bra $L__BB0_2;
156
+ mov.u64 %rd30, assertMessage_0;
157
+ cvta.global.u64 %rd31, %rd30;
158
+ mov.u64 %rd32, assertFile_0;
159
+ cvta.global.u64 %rd33, %rd32;
160
+ mov.u64 %rd34, assertFunc_0;
161
+ cvta.global.u64 %rd35, %rd34;
162
+ { // callseq 4, 0
163
+ .reg .b32 temp_param_reg;
164
+ .param .b64 param0;
165
+ st.param.b64 [param0+0], %rd31;
166
+ .param .b64 param1;
167
+ st.param.b64 [param1+0], %rd33;
168
+ .param .b32 param2;
169
+ st.param.b32 [param2+0], %r184;
170
+ .param .b64 param3;
171
+ st.param.b64 [param3+0], %rd35;
172
+ .param .b64 param4;
173
+ st.param.b64 [param4+0], %rd58;
174
+ call.uni
175
+ __assertfail,
176
+ (
177
+ param0,
178
+ param1,
179
+ param2,
180
+ param3,
181
+ param4
182
+ );
183
+ } // callseq 4
184
+ $L__BB0_2:
185
+ .loc 1 0 55
186
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
187
+ cvt.s64.s32 %rd3, %r44;
188
+ .loc 1 38 22
189
+ setp.lt.s64 %p42, %rd10, 0;
190
+ .loc 1 41 44
191
+ shl.b64 %rd41, %rd10, 8;
192
+ add.s64 %rd42, %rd41, 12865792;
193
+ selp.b64 %rd43, %rd42, %rd41, %p42;
194
+ cvt.u64.u32 %rd44, %r6;
195
+ .loc 1 41 40
196
+ or.b64 %rd45, %rd43, %rd44;
197
+ .loc 1 41 34
198
+ shl.b64 %rd46, %rd45, 2;
199
+ add.s64 %rd55, %rd7, %rd46;
200
+ .loc 1 41 52
201
+ mov.u32 %r46, 0x0;
202
+ mov.u32 %r47, 0x0;
203
+ mov.u32 %r48, 0x0;
204
+ mov.u32 %r49, 0x0;
205
+ @%p50 ld.global.L1::evict_last.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd55 + 0 ];
206
+ @!%p50 mov.u32 %r46, %r155;
207
+ @!%p50 mov.u32 %r47, %r155;
208
+ @!%p50 mov.u32 %r48, %r155;
209
+ @!%p50 mov.u32 %r49, %r155;
210
+ mov.b32 %f15, %r48;
211
+ mov.b32 %f16, %r49;
212
+ .loc 1 42 22
213
+ add.f32 %f17, %f3, %f15;
214
+ add.f32 %f18, %f4, %f16;
215
+ .loc 1 44 22
216
+ add.f32 %f19, %f7, %f17;
217
+ add.f32 %f20, %f8, %f18;
218
+ .loc 1 41 52
219
+ mov.b32 %f21, %r46;
220
+ mov.b32 %f22, %r47;
221
+ .loc 1 42 22
222
+ add.f32 %f23, %f1, %f22;
223
+ add.f32 %f24, %f2, %f21;
224
+ .loc 1 44 22
225
+ add.f32 %f25, %f5, %f24;
226
+ add.f32 %f26, %f6, %f23;
227
+ $L__tmp1:
228
+ .loc 2 98 22
229
+ add.f32 %f27, %f26, 0f00000000;
230
+ add.f32 %f28, %f25, 0f00000000;
231
+ add.f32 %f29, %f19, 0f00000000;
232
+ add.f32 %f30, %f20, 0f00000000;
233
+ .loc 2 101 30
234
+ sub.f32 %f31, %f25, %f28;
235
+ sub.f32 %f32, %f26, %f27;
236
+ sub.f32 %f33, %f19, %f29;
237
+ sub.f32 %f34, %f20, %f30;
238
+ .loc 2 101 13
239
+ fma.rn.f32 %f35, %f25, %f31, 0f00000000;
240
+ fma.rn.f32 %f36, %f26, %f32, 0f00000000;
241
+ fma.rn.f32 %f37, %f19, %f33, 0f00000000;
242
+ fma.rn.f32 %f38, %f20, %f34, 0f00000000;
243
+ $L__tmp2:
244
+ .loc 2 108 21
245
+ sub.f32 %f39, %f27, %f28;
246
+ mov.b32 %r55, 1065353216;
247
+ mov.b32 %r56, 1073741824;
248
+ .loc 2 110 60
249
+ div.full.f32 %r54, %r55, %r56;
250
+ mov.b32 %f40, %r54;
251
+ .loc 2 112 17
252
+ fma.rn.f32 %f41, %f40, %f39, %f28;
253
+ .loc 2 113 15
254
+ add.f32 %f42, %f35, %f36;
255
+ .loc 2 113 30
256
+ mul.f32 %f43, %f39, %f39;
257
+ .loc 2 113 22
258
+ fma.rn.f32 %f44, %f40, %f43, %f42;
259
+ .loc 2 108 21
260
+ sub.f32 %f45, %f29, %f41;
261
+ mov.b32 %r59, 1077936128;
262
+ .loc 2 110 60
263
+ div.full.f32 %r57, %r55, %r59;
264
+ mov.b32 %f46, %r57;
265
+ .loc 2 112 17
266
+ fma.rn.f32 %f47, %f46, %f45, %f41;
267
+ .loc 2 113 15
268
+ add.f32 %f48, %f37, %f44;
269
+ .loc 2 113 30
270
+ mul.f32 %f49, %f45, %f45;
271
+ .loc 2 113 38
272
+ fma.rn.f32 %f50, %f45, %f45, %f49;
273
+ .loc 2 113 22
274
+ fma.rn.f32 %f51, %f46, %f50, %f48;
275
+ .loc 2 108 21
276
+ sub.f32 %f52, %f30, %f47;
277
+ mov.b32 %r62, 1082130432;
278
+ .loc 2 110 60
279
+ div.full.f32 %r60, %r55, %r62;
280
+ mov.b32 %f53, %r60;
281
+ .loc 2 112 17
282
+ fma.rn.f32 %f54, %f53, %f52, %f47;
283
+ .loc 2 113 15
284
+ add.f32 %f55, %f38, %f51;
285
+ .loc 2 113 30
286
+ mul.f32 %f56, %f52, %f52;
287
+ .loc 2 113 38
288
+ mul.f32 %f57, %f56, 0f40400000;
289
+ .loc 2 113 22
290
+ fma.rn.f32 %f58, %f53, %f57, %f55;
291
+ $L__tmp3:
292
+ .loc 2 120 46
293
+ mov.b32 %r119, %f54;
294
+ shfl.sync.bfly.b32 %r120, %r119, 16, 31, -1;
295
+ mov.b32 %f59, %r120;
296
+ mov.b32 %r121, %f58;
297
+ shfl.sync.bfly.b32 %r122, %r121, 16, 31, -1;
298
+ mov.b32 %f60, %r122;
299
+ shfl.sync.bfly.b32 %r64, %r62, 16, 31, -1;
300
+ mov.b32 %f61, %r64;
301
+ $L__tmp4:
302
+ .loc 2 108 21
303
+ sub.f32 %f62, %f59, %f54;
304
+ .loc 2 109 28
305
+ add.f32 %f63, %f61, 0f40800000;
306
+ .loc 2 110 39
307
+ setp.eq.f32 %p43, %f63, 0f00000000;
308
+ .loc 2 110 60
309
+ mov.b32 %r65, %f63;
310
+ div.full.f32 %r63, %r64, %r65;
311
+ mov.b32 %f64, %r63;
312
+ .loc 2 110 49
313
+ selp.f32 %f65, 0f00000000, %f64, %p43;
314
+ .loc 2 112 17
315
+ fma.rn.f32 %f66, %f65, %f62, %f54;
316
+ .loc 2 113 15
317
+ add.f32 %f67, %f58, %f60;
318
+ .loc 2 113 30
319
+ mul.f32 %f68, %f62, %f62;
320
+ .loc 2 113 38
321
+ mul.f32 %f69, %f68, 0f40800000;
322
+ .loc 2 113 22
323
+ fma.rn.f32 %f70, %f65, %f69, %f67;
324
+ $L__tmp5:
325
+ .loc 2 120 46
326
+ mov.b32 %r123, %f66;
327
+ shfl.sync.bfly.b32 %r124, %r123, 8, 31, -1;
328
+ mov.b32 %f71, %r124;
329
+ mov.b32 %r125, %f70;
330
+ shfl.sync.bfly.b32 %r126, %r125, 8, 31, -1;
331
+ mov.b32 %f72, %r126;
332
+ shfl.sync.bfly.b32 %r67, %r65, 8, 31, -1;
333
+ mov.b32 %f73, %r67;
334
+ $L__tmp6:
335
+ .loc 2 108 21
336
+ sub.f32 %f74, %f71, %f66;
337
+ .loc 2 109 28
338
+ add.f32 %f75, %f63, %f73;
339
+ .loc 2 110 39
340
+ setp.eq.f32 %p44, %f75, 0f00000000;
341
+ .loc 2 110 60
342
+ mov.b32 %r68, %f75;
343
+ div.full.f32 %r66, %r67, %r68;
344
+ mov.b32 %f76, %r66;
345
+ .loc 2 110 49
346
+ selp.f32 %f77, 0f00000000, %f76, %p44;
347
+ .loc 2 112 17
348
+ fma.rn.f32 %f78, %f77, %f74, %f66;
349
+ .loc 2 113 15
350
+ add.f32 %f79, %f70, %f72;
351
+ .loc 2 113 30
352
+ mul.f32 %f80, %f74, %f74;
353
+ .loc 2 113 38
354
+ mul.f32 %f81, %f63, %f80;
355
+ .loc 2 113 22
356
+ fma.rn.f32 %f82, %f77, %f81, %f79;
357
+ $L__tmp7:
358
+ .loc 2 120 46
359
+ mov.b32 %r127, %f78;
360
+ shfl.sync.bfly.b32 %r128, %r127, 4, 31, -1;
361
+ mov.b32 %f83, %r128;
362
+ mov.b32 %r129, %f82;
363
+ shfl.sync.bfly.b32 %r130, %r129, 4, 31, -1;
364
+ mov.b32 %f84, %r130;
365
+ shfl.sync.bfly.b32 %r70, %r68, 4, 31, -1;
366
+ mov.b32 %f85, %r70;
367
+ $L__tmp8:
368
+ .loc 2 108 21
369
+ sub.f32 %f86, %f83, %f78;
370
+ .loc 2 109 28
371
+ add.f32 %f87, %f75, %f85;
372
+ .loc 2 110 39
373
+ setp.eq.f32 %p45, %f87, 0f00000000;
374
+ .loc 2 110 60
375
+ mov.b32 %r71, %f87;
376
+ div.full.f32 %r69, %r70, %r71;
377
+ mov.b32 %f88, %r69;
378
+ .loc 2 110 49
379
+ selp.f32 %f89, 0f00000000, %f88, %p45;
380
+ .loc 2 112 17
381
+ fma.rn.f32 %f90, %f89, %f86, %f78;
382
+ .loc 2 113 15
383
+ add.f32 %f91, %f82, %f84;
384
+ .loc 2 113 30
385
+ mul.f32 %f92, %f86, %f86;
386
+ .loc 2 113 38
387
+ mul.f32 %f93, %f75, %f92;
388
+ .loc 2 113 22
389
+ fma.rn.f32 %f94, %f89, %f93, %f91;
390
+ $L__tmp9:
391
+ .loc 2 120 46
392
+ mov.b32 %r131, %f90;
393
+ shfl.sync.bfly.b32 %r132, %r131, 2, 31, -1;
394
+ mov.b32 %f95, %r132;
395
+ mov.b32 %r133, %f94;
396
+ shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1;
397
+ mov.b32 %f96, %r134;
398
+ shfl.sync.bfly.b32 %r73, %r71, 2, 31, -1;
399
+ mov.b32 %f97, %r73;
400
+ $L__tmp10:
401
+ .loc 2 108 21
402
+ sub.f32 %f98, %f95, %f90;
403
+ .loc 2 109 28
404
+ add.f32 %f99, %f87, %f97;
405
+ .loc 2 110 39
406
+ setp.eq.f32 %p46, %f99, 0f00000000;
407
+ .loc 2 110 60
408
+ mov.b32 %r74, %f99;
409
+ div.full.f32 %r72, %r73, %r74;
410
+ mov.b32 %f100, %r72;
411
+ .loc 2 110 49
412
+ selp.f32 %f101, 0f00000000, %f100, %p46;
413
+ .loc 2 112 17
414
+ fma.rn.f32 %f102, %f101, %f98, %f90;
415
+ .loc 2 113 15
416
+ add.f32 %f103, %f94, %f96;
417
+ .loc 2 113 30
418
+ mul.f32 %f104, %f98, %f98;
419
+ .loc 2 113 38
420
+ mul.f32 %f105, %f87, %f104;
421
+ .loc 2 113 22
422
+ fma.rn.f32 %f106, %f101, %f105, %f103;
423
+ $L__tmp11:
424
+ .loc 2 120 46
425
+ mov.b32 %r135, %f102;
426
+ shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1;
427
+ mov.b32 %f107, %r136;
428
+ mov.b32 %r137, %f106;
429
+ shfl.sync.bfly.b32 %r138, %r137, 1, 31, -1;
430
+ mov.b32 %f108, %r138;
431
+ shfl.sync.bfly.b32 %r76, %r74, 1, 31, -1;
432
+ mov.b32 %f109, %r76;
433
+ $L__tmp12:
434
+ .loc 2 108 21
435
+ sub.f32 %f110, %f107, %f102;
436
+ .loc 2 109 28
437
+ add.f32 %f111, %f99, %f109;
438
+ .loc 2 110 39
439
+ setp.eq.f32 %p47, %f111, 0f00000000;
440
+ .loc 2 110 60
441
+ mov.b32 %r77, %f111;
442
+ div.full.f32 %r75, %r76, %r77;
443
+ mov.b32 %f112, %r75;
444
+ .loc 2 110 49
445
+ selp.f32 %f113, 0f00000000, %f112, %p47;
446
+ .loc 2 112 17
447
+ fma.rn.f32 %f114, %f113, %f110, %f102;
448
+ .loc 2 113 15
449
+ add.f32 %f115, %f106, %f108;
450
+ .loc 2 113 30
451
+ mul.f32 %f116, %f110, %f110;
452
+ .loc 2 113 38
453
+ mul.f32 %f117, %f99, %f116;
454
+ .loc 2 113 22
455
+ fma.rn.f32 %f118, %f113, %f117, %f115;
456
+ $L__tmp13:
457
+ .loc 2 120 46
458
+ setp.eq.s32 %p21, %r2, 0;
459
+ shl.b32 %r139, %r5, 2;
460
+ shl.b32 %r140, %r3, 3;
461
+ or.b32 %r141, %r140, %r139;
462
+ mov.u32 %r142, global_smem;
463
+ add.s32 %r78, %r142, %r141;
464
+ mov.b32 %r79, %f114;
465
+ @%p21 st.shared.b32 [ %r78 + 0 ], %r79;
466
+ add.s32 %r143, %r142, 16;
467
+ add.s32 %r80, %r143, %r141;
468
+ mov.b32 %r81, %f118;
469
+ @%p21 st.shared.b32 [ %r80 + 0 ], %r81;
470
+ add.s32 %r144, %r142, 32;
471
+ add.s32 %r82, %r144, %r141;
472
+ @%p21 st.shared.b32 [ %r82 + 0 ], %r77;
473
+ bar.sync 0;
474
+ setp.lt.s32 %p24, %r1, 4;
475
+ add.s32 %r85, %r142, %r31;
476
+ @%p24 ld.shared.b32 %r84, [ %r85 + 0 ];
477
+ mov.b32 %f119, %r84;
478
+ add.s32 %r87, %r143, %r31;
479
+ @%p24 ld.shared.b32 %r86, [ %r87 + 0 ];
480
+ mov.b32 %f120, %r86;
481
+ add.s32 %r89, %r144, %r31;
482
+ @%p24 ld.shared.b32 %r88, [ %r89 + 0 ];
483
+ mov.b32 %f121, %r88;
484
+ shfl.sync.bfly.b32 %r146, %r84, 1, 31, -1;
485
+ mov.b32 %f122, %r146;
486
+ shfl.sync.bfly.b32 %r147, %r86, 1, 31, -1;
487
+ mov.b32 %f123, %r147;
488
+ shfl.sync.bfly.b32 %r91, %r88, 1, 31, -1;
489
+ mov.b32 %f124, %r91;
490
+ $L__tmp14:
491
+ .loc 2 108 21
492
+ sub.f32 %f125, %f122, %f119;
493
+ .loc 2 109 28
494
+ add.f32 %f126, %f121, %f124;
495
+ .loc 2 110 39
496
+ setp.eq.f32 %p48, %f126, 0f00000000;
497
+ .loc 2 110 60
498
+ mov.b32 %r92, %f126;
499
+ div.full.f32 %r90, %r91, %r92;
500
+ mov.b32 %f127, %r90;
501
+ .loc 2 110 49
502
+ selp.f32 %f128, 0f00000000, %f127, %p48;
503
+ .loc 2 112 17
504
+ fma.rn.f32 %f129, %f125, %f128, %f119;
505
+ .loc 2 113 15
506
+ add.f32 %f130, %f120, %f123;
507
+ .loc 2 113 30
508
+ mul.f32 %f131, %f125, %f125;
509
+ .loc 2 113 38
510
+ mul.f32 %f132, %f121, %f131;
511
+ .loc 2 113 22
512
+ fma.rn.f32 %f133, %f132, %f128, %f130;
513
+ $L__tmp15:
514
+ .loc 2 120 46
515
+ setp.eq.s32 %p49, %r4, 0;
516
+ and.pred %p27, %p24, %p49;
517
+ mov.b32 %r94, %f129;
518
+ @%p27 st.shared.b32 [ %r85 + 0 ], %r94;
519
+ mov.b32 %r96, %f133;
520
+ @%p27 st.shared.b32 [ %r87 + 0 ], %r96;
521
+ @%p27 st.shared.b32 [ %r89 + 0 ], %r92;
522
+ bar.sync 0;
523
+ add.s32 %r148, %r142, %r140;
524
+ ld.shared.f32 %f9, [%r148];
525
+ add.s32 %r149, %r143, %r140;
526
+ ld.shared.f32 %f10, [%r149];
527
+ $L__tmp16:
528
+ .loc 1 62 51
529
+ mov.u32 %r99, 0x0;
530
+ mov.u32 %r100, 0x0;
531
+ mov.u32 %r101, 0x0;
532
+ mov.u32 %r102, 0x0;
533
+ @%p50 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd38 + 0 ];
534
+ @!%p50 mov.u32 %r99, %r155;
535
+ @!%p50 mov.u32 %r100, %r155;
536
+ @!%p50 mov.u32 %r101, %r155;
537
+ @!%p50 mov.u32 %r102, %r155;
538
+ .loc 1 63 51
539
+ mov.u32 %r107, 0x0;
540
+ mov.u32 %r108, 0x0;
541
+ @%p50 ld.global.L1::evict_first.v2.b32 { %r107, %r108 }, [ %rd39 + 0 ];
542
+ @!%p50 mov.u32 %r107, %r155;
543
+ @!%p50 mov.u32 %r108, %r155;
544
+ cvt.u16.u32 %rs5, %r107;
545
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r107; }
546
+ cvt.u16.u32 %rs7, %r108;
547
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r108; }
548
+ .loc 1 63 103
549
+ cvt.f32.bf16 %r111, %rs5;
550
+ mov.b32 %f11, %r111;
551
+ cvt.f32.bf16 %r112, %rs6;
552
+ mov.b32 %f12, %r112;
553
+ cvt.f32.bf16 %r113, %rs7;
554
+ mov.b32 %f13, %r113;
555
+ cvt.f32.bf16 %r114, %rs8;
556
+ mov.b32 %f14, %r114;
557
+ .loc 1 64 35
558
+ mul.wide.u32 %rd47, %r7, 4;
559
+ add.s64 %rd40, %rd8, %rd47;
560
+ .loc 1 64 40
561
+ mov.u32 %r115, 0x0;
562
+ mov.u32 %r116, 0x0;
563
+ @%p50 ld.global.L1::evict_last.v2.b32 { %r115, %r116 }, [ %rd40 + 0 ];
564
+ @!%p50 mov.u32 %r115, %r155;
565
+ @!%p50 mov.u32 %r116, %r155;
566
+ .loc 1 68 57
567
+ @%p15 bra $L__BB0_4;
568
+ mov.u64 %rd48, assertMessage_1;
569
+ cvta.global.u64 %rd49, %rd48;
570
+ mov.u64 %rd50, assertFile_1;
571
+ cvta.global.u64 %rd51, %rd50;
572
+ mov.u64 %rd52, assertFunc_1;
573
+ cvta.global.u64 %rd53, %rd52;
574
+ { // callseq 5, 0
575
+ .reg .b32 temp_param_reg;
576
+ .param .b64 param0;
577
+ st.param.b64 [param0+0], %rd49;
578
+ .param .b64 param1;
579
+ st.param.b64 [param1+0], %rd51;
580
+ .param .b32 param2;
581
+ st.param.b32 [param2+0], %r184;
582
+ .param .b64 param3;
583
+ st.param.b64 [param3+0], %rd53;
584
+ .param .b64 param4;
585
+ st.param.b64 [param4+0], %rd58;
586
+ call.uni
587
+ __assertfail,
588
+ (
589
+ param0,
590
+ param1,
591
+ param2,
592
+ param3,
593
+ param4
594
+ );
595
+ } // callseq 5
596
+ $L__BB0_4:
597
+ .loc 1 69 54
598
+ mov.u32 %r151, 0x0;
599
+ mov.u32 %r152, 0x0;
600
+ mov.u32 %r153, 0x0;
601
+ mov.u32 %r154, 0x0;
602
+ @%p50 ld.global.L1::evict_first.v4.b32 { %r151, %r152, %r153, %r154 }, [ %rd55 + 0 ];
603
+ @!%p50 mov.u32 %r151, %r155;
604
+ @!%p50 mov.u32 %r152, %r155;
605
+ @!%p50 mov.u32 %r153, %r155;
606
+ @!%p50 mov.u32 %r154, %r155;
607
+ .loc 1 75 24
608
+ mov.b32 %r160, %f10;
609
+ mov.b32 %r161, 1132462080;
610
+ div.full.f32 %r159, %r160, %r161;
611
+ mov.b32 %f134, %r159;
612
+ .loc 1 77 24
613
+ add.f32 %f135, %f134, 0f3727C5AC;
614
+ .loc 1 78 30
615
+ rsqrt.approx.ftz.f32 %f136, %f135;
616
+ .loc 1 69 54
617
+ mov.b32 %f137, %r154;
618
+ .loc 1 62 51
619
+ mov.b32 %f138, %r102;
620
+ .loc 1 70 24
621
+ add.f32 %f139, %f138, %f137;
622
+ .loc 1 72 24
623
+ add.f32 %f140, %f14, %f139;
624
+ .loc 1 73 24
625
+ sub.f32 %f141, %f140, %f9;
626
+ .loc 1 69 54
627
+ mov.b32 %f142, %r153;
628
+ .loc 1 62 51
629
+ mov.b32 %f143, %r101;
630
+ .loc 1 70 24
631
+ add.f32 %f144, %f143, %f142;
632
+ .loc 1 72 24
633
+ add.f32 %f145, %f13, %f144;
634
+ .loc 1 73 24
635
+ sub.f32 %f146, %f145, %f9;
636
+ .loc 1 69 54
637
+ mov.b32 %f147, %r152;
638
+ .loc 1 62 51
639
+ mov.b32 %f148, %r100;
640
+ .loc 1 70 24
641
+ add.f32 %f149, %f148, %f147;
642
+ .loc 1 72 24
643
+ add.f32 %f150, %f12, %f149;
644
+ .loc 1 73 24
645
+ sub.f32 %f151, %f150, %f9;
646
+ .loc 1 69 54
647
+ mov.b32 %f152, %r151;
648
+ .loc 1 62 51
649
+ mov.b32 %f153, %r99;
650
+ .loc 1 70 24
651
+ add.f32 %f154, %f153, %f152;
652
+ .loc 1 72 24
653
+ add.f32 %f155, %f11, %f154;
654
+ .loc 1 73 24
655
+ sub.f32 %f156, %f155, %f9;
656
+ .loc 1 79 24
657
+ mul.f32 %f157, %f156, %f136;
658
+ mul.f32 %f158, %f151, %f136;
659
+ mul.f32 %f159, %f146, %f136;
660
+ mul.f32 %f160, %f141, %f136;
661
+ .loc 1 80 24
662
+ bar.sync 0;
663
+ shl.b32 %r177, %r7, 2;
664
+ add.s32 %r179, %r142, %r177;
665
+ st.shared.v2.u32 [%r179], {%r115, %r116};
666
+ bar.sync 0;
667
+ shl.b32 %r180, %r6, 2;
668
+ add.s32 %r181, %r142, %r180;
669
+ ld.shared.v4.f32 {%f161, %f162, %f163, %f164}, [%r181];
670
+ mul.f32 %f165, %f157, %f161;
671
+ mul.f32 %f166, %f158, %f162;
672
+ mul.f32 %f167, %f159, %f163;
673
+ mul.f32 %f168, %f160, %f164;
674
+ .loc 1 82 29
675
+ shl.b64 %rd57, %rd3, 1;
676
+ add.s64 %rd56, %rd9, %rd57;
677
+ .loc 1 82 52
678
+ mov.b32 %r171, %f165;
679
+ cvt.rn.bf16.f32 %rs9, %r171;
680
+ mov.b32 %r172, %f166;
681
+ cvt.rn.bf16.f32 %rs10, %r172;
682
+ mov.b32 %r173, %f167;
683
+ cvt.rn.bf16.f32 %rs11, %r173;
684
+ mov.b32 %r174, %f168;
685
+ cvt.rn.bf16.f32 %rs12, %r174;
686
+ mov.b32 %r182, {%rs9, %rs10};
687
+ mov.b32 %r183, {%rs11, %rs12};
688
+ @%p50 st.global.v2.b32 [ %rd56 + 0 ], { %r182, %r183 };
689
+ .loc 1 58 4
690
+ ret;
691
+ $L__tmp17:
692
+ $L__func_end0:
693
+
694
+ }
695
+ // .globl __nv_rsqrtf
696
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
697
+ .param .b32 __nv_rsqrtf_param_0
698
+ )
699
+ {
700
+ .reg .f32 %f<3>;
701
+ $L__func_begin1:
702
+
703
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
704
+ rsqrt.approx.ftz.f32 %f2, %f1;
705
+ st.param.f32 [func_retval0+0], %f2;
706
+ ret;
707
+ $L__func_end1:
708
+
709
+ }
710
+ .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
711
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
712
+ .section .debug_abbrev
713
+ {
714
+ .b8 1
715
+ .b8 17
716
+ .b8 1
717
+ .b8 37
718
+ .b8 8
719
+ .b8 19
720
+ .b8 5
721
+ .b8 3
722
+ .b8 8
723
+ .b8 16
724
+ .b8 6
725
+ .b8 27
726
+ .b8 8
727
+ .b8 180
728
+ .b8 66
729
+ .b8 12
730
+ .b8 17
731
+ .b8 1
732
+ .b8 18
733
+ .b8 1
734
+ .b8 0
735
+ .b8 0
736
+ .b8 2
737
+ .b8 46
738
+ .b8 0
739
+ .b8 135
740
+ .b8 64
741
+ .b8 8
742
+ .b8 3
743
+ .b8 8
744
+ .b8 58
745
+ .b8 11
746
+ .b8 59
747
+ .b8 11
748
+ .b8 63
749
+ .b8 12
750
+ .b8 32
751
+ .b8 11
752
+ .b8 0
753
+ .b8 0
754
+ .b8 3
755
+ .b8 46
756
+ .b8 1
757
+ .b8 17
758
+ .b8 1
759
+ .b8 18
760
+ .b8 1
761
+ .b8 64
762
+ .b8 10
763
+ .b8 49
764
+ .b8 19
765
+ .b8 0
766
+ .b8 0
767
+ .b8 4
768
+ .b8 29
769
+ .b8 0
770
+ .b8 49
771
+ .b8 19
772
+ .b8 17
773
+ .b8 1
774
+ .b8 18
775
+ .b8 1
776
+ .b8 88
777
+ .b8 11
778
+ .b8 89
779
+ .b8 11
780
+ .b8 87
781
+ .b8 11
782
+ .b8 0
783
+ .b8 0
784
+ .b8 5
785
+ .b8 29
786
+ .b8 1
787
+ .b8 49
788
+ .b8 19
789
+ .b8 17
790
+ .b8 1
791
+ .b8 18
792
+ .b8 1
793
+ .b8 88
794
+ .b8 11
795
+ .b8 89
796
+ .b8 11
797
+ .b8 87
798
+ .b8 11
799
+ .b8 0
800
+ .b8 0
801
+ .b8 0
802
+ }
803
+ .section .debug_info
804
+ {
805
+ .b32 302
806
+ .b8 2
807
+ .b8 0
808
+ .b32 .debug_abbrev
809
+ .b8 8
810
+ .b8 1
811
+ .b8 116
812
+ .b8 114
813
+ .b8 105
814
+ .b8 116
815
+ .b8 111
816
+ .b8 110
817
+ .b8 0
818
+ .b8 2
819
+ .b8 0
820
+ .b8 99
821
+ .b8 99
822
+ .b8 105
823
+ .b8 103
824
+ .b8 54
825
+ .b8 102
826
+ .b8 107
827
+ .b8 105
828
+ .b8 54
829
+ .b8 112
830
+ .b8 52
831
+ .b8 108
832
+ .b8 120
833
+ .b8 114
834
+ .b8 100
835
+ .b8 109
836
+ .b8 103
837
+ .b8 103
838
+ .b8 54
839
+ .b8 101
840
+ .b8 117
841
+ .b8 100
842
+ .b8 97
843
+ .b8 104
844
+ .b8 105
845
+ .b8 101
846
+ .b8 120
847
+ .b8 99
848
+ .b8 118
849
+ .b8 117
850
+ .b8 101
851
+ .b8 101
852
+ .b8 111
853
+ .b8 108
854
+ .b8 50
855
+ .b8 112
856
+ .b8 52
857
+ .b8 113
858
+ .b8 112
859
+ .b8 53
860
+ .b8 51
861
+ .b8 50
862
+ .b8 112
863
+ .b8 118
864
+ .b8 118
865
+ .b8 101
866
+ .b8 50
867
+ .b8 121
868
+ .b8 52
869
+ .b8 54
870
+ .b8 51
871
+ .b8 121
872
+ .b8 46
873
+ .b8 112
874
+ .b8 121
875
+ .b8 0
876
+ .b32 .debug_line
877
+ .b8 47
878
+ .b8 116
879
+ .b8 109
880
+ .b8 112
881
+ .b8 47
882
+ .b8 116
883
+ .b8 111
884
+ .b8 114
885
+ .b8 99
886
+ .b8 104
887
+ .b8 105
888
+ .b8 110
889
+ .b8 100
890
+ .b8 117
891
+ .b8 99
892
+ .b8 116
893
+ .b8 111
894
+ .b8 114
895
+ .b8 95
896
+ .b8 114
897
+ .b8 111
898
+ .b8 111
899
+ .b8 116
900
+ .b8 47
901
+ .b8 99
902
+ .b8 105
903
+ .b8 0
904
+ .b8 1
905
+ .b64 $L__func_begin0
906
+ .b64 $L__func_end0
907
+ .b8 2
908
+ .b8 116
909
+ .b8 114
910
+ .b8 105
911
+ .b8 116
912
+ .b8 111
913
+ .b8 110
914
+ .b8 95
915
+ .b8 95
916
+ .b8 48
917
+ .b8 100
918
+ .b8 49
919
+ .b8 100
920
+ .b8 50
921
+ .b8 100
922
+ .b8 51
923
+ .b8 100
924
+ .b8 52
925
+ .b8 100
926
+ .b8 53
927
+ .b8 100
928
+ .b8 54
929
+ .b8 100
930
+ .b8 101
931
+ .b8 55
932
+ .b8 100
933
+ .b8 101
934
+ .b8 0
935
+ .b8 116
936
+ .b8 114
937
+ .b8 105
938
+ .b8 116
939
+ .b8 111
940
+ .b8 110
941
+ .b8 95
942
+ .b8 95
943
+ .b8 48
944
+ .b8 100
945
+ .b8 49
946
+ .b8 100
947
+ .b8 50
948
+ .b8 100
949
+ .b8 51
950
+ .b8 100
951
+ .b8 52
952
+ .b8 100
953
+ .b8 53
954
+ .b8 100
955
+ .b8 54
956
+ .b8 100
957
+ .b8 101
958
+ .b8 55
959
+ .b8 100
960
+ .b8 101
961
+ .b8 0
962
+ .b8 1
963
+ .b8 18
964
+ .b8 1
965
+ .b8 1
966
+ .b8 3
967
+ .b64 $L__func_begin0
968
+ .b64 $L__func_end0
969
+ .b8 1
970
+ .b8 156
971
+ .b32 125
972
+ .b8 4
973
+ .b32 125
974
+ .b64 $L__tmp1
975
+ .b64 $L__tmp2
976
+ .b8 2
977
+ .b8 47
978
+ .b8 41
979
+ .b8 5
980
+ .b32 125
981
+ .b64 $L__tmp2
982
+ .b64 $L__tmp15
983
+ .b8 2
984
+ .b8 53
985
+ .b8 44
986
+ .b8 4
987
+ .b32 125
988
+ .b64 $L__tmp2
989
+ .b64 $L__tmp15
990
+ .b8 2
991
+ .b8 120
992
+ .b8 46
993
+ .b8 0
994
+ .b8 4
995
+ .b32 125
996
+ .b64 $L__tmp3
997
+ .b64 $L__tmp16
998
+ .b8 2
999
+ .b8 53
1000
+ .b8 44
1001
+ .b8 0
1002
+ .b8 0
1003
+ }
1004
+ .section .debug_pubnames
1005
+ {
1006
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1007
+ $L__pubNames_start0:
1008
+ .b8 2
1009
+ .b8 0
1010
+ .b32 .debug_info
1011
+ .b32 306
1012
+ .b32 125
1013
+ .b8 116
1014
+ .b8 114
1015
+ .b8 105
1016
+ .b8 116
1017
+ .b8 111
1018
+ .b8 110
1019
+ .b8 95
1020
+ .b8 95
1021
+ .b8 48
1022
+ .b8 100
1023
+ .b8 49
1024
+ .b8 100
1025
+ .b8 50
1026
+ .b8 100
1027
+ .b8 51
1028
+ .b8 100
1029
+ .b8 52
1030
+ .b8 100
1031
+ .b8 53
1032
+ .b8 100
1033
+ .b8 54
1034
+ .b8 100
1035
+ .b8 101
1036
+ .b8 55
1037
+ .b8 100
1038
+ .b8 101
1039
+ .b8 0
1040
+ .b32 0
1041
+ $L__pubNames_end0:
1042
+ }
1043
+ .section .debug_pubtypes
1044
+ {
1045
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1046
+ $L__pubTypes_start0:
1047
+ .b8 2
1048
+ .b8 0
1049
+ .b32 .debug_info
1050
+ .b32 306
1051
+ .b32 0
1052
+ $L__pubTypes_end0:
1053
+ }
1054
+ .section .debug_loc { }
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1>
9
+ %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
10
+ %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2>
16
+ %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2>
17
+ %cst_10 = arith.constant 0.000000e+00 : f32
18
+ %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
19
+ %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
20
+ %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked>
21
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x256xbf16, #blocked>
22
+ %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1>
23
+ %c2_i32 = arith.constant 2 : i32
24
+ %0 = tt.get_program_id x : i32
25
+ %1 = arith.muli %0, %c2_i32 : i32
26
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
27
+ %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
28
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
29
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2>
30
+ %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
31
+ %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2>
32
+ %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
33
+ %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2>
34
+ %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
35
+ %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
36
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
37
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1>
38
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
39
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked2>
40
+ %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
41
+ %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked2>, tensor<2x1xi32, #blocked2>
42
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
43
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2>
44
+ %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
45
+ %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
46
+ %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1>
47
+ %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked>
48
+ %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked>
49
+ %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
50
+ %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked>
51
+ %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
52
+ %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi32, #blocked>
53
+ %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked>
54
+ %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
55
+ %31 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked>
56
+ %32 = tt.broadcast %31 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
57
+ %33 = arith.addi %24, %32 : tensor<2x256xi32, #blocked>
58
+ %34 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
59
+ %35 = tt.addptr %34, %33 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
60
+ %36 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked>
61
+ %37 = arith.extf %36 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked>
62
+ %38 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
63
+ %39 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2>
64
+ %40 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked>
65
+ %41 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2>
66
+ %42 = arith.select %40, %38, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
67
+ %43 = arith.select %41, %39, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2>
68
+ %44 = arith.cmpi sge, %43, %cst_8 : tensor<2x1xi64, #blocked2>
69
+ %45 = arith.cmpi slt, %43, %cst_9 : tensor<2x1xi64, #blocked2>
70
+ %46 = arith.andi %44, %45 : tensor<2x1xi1, #blocked2>
71
+ tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
72
+ %47 = arith.muli %42, %cst_5 : tensor<2x1xi64, #blocked>
73
+ %48 = tt.broadcast %47 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked>
74
+ %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
75
+ %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked>
76
+ %51 = arith.addi %50, %48 : tensor<2x256xi64, #blocked>
77
+ %52 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
78
+ %53 = tt.addptr %52, %51 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi64, #blocked>
79
+ %54 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
80
+ %55 = arith.addf %54, %30 : tensor<2x256xf32, #blocked>
81
+ %56 = arith.addf %55, %37 : tensor<2x256xf32, #blocked>
82
+ %57 = arith.addf %56, %cst_13 : tensor<2x256xf32, #blocked>
83
+ %58 = arith.subf %56, %57 : tensor<2x256xf32, #blocked>
84
+ %59 = arith.mulf %56, %58 : tensor<2x256xf32, #blocked>
85
+ %60 = arith.addf %59, %cst_13 : tensor<2x256xf32, #blocked>
86
+ %61 = arith.select %29, %57, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
87
+ %62 = arith.select %29, %60, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
88
+ %63 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
89
+ %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
90
+ %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({
91
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
92
+ %90 = arith.subf %arg11, %arg8 : f32
93
+ %91 = arith.addf %arg10, %arg13 : f32
94
+ %92 = arith.cmpf oeq, %91, %cst_10 : f32
95
+ %93 = arith.divf %arg13, %91 : f32
96
+ %94 = arith.select %92, %cst_10, %93 : f32
97
+ %95 = arith.mulf %90, %94 : f32
98
+ %96 = arith.addf %arg8, %95 : f32
99
+ %97 = arith.addf %arg9, %arg12 : f32
100
+ %98 = arith.mulf %90, %90 : f32
101
+ %99 = arith.mulf %98, %arg10 : f32
102
+ %100 = arith.mulf %99, %94 : f32
103
+ %101 = arith.addf %97, %100 : f32
104
+ tt.reduce.return %96, %101, %91 : f32, f32, f32
105
+ }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
106
+ %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
107
+ %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
108
+ %68 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
109
+ %69 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked>
110
+ %70 = arith.extf %69 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked>
111
+ %71 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked1>
112
+ %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked1>, tensor<1x256xi32, #blocked1>
113
+ %73 = tt.load %72, %22, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1>
114
+ tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
115
+ %74 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
116
+ %75 = arith.addf %74, %68 : tensor<2x256xf32, #blocked>
117
+ %76 = arith.addf %75, %70 : tensor<2x256xf32, #blocked>
118
+ %77 = tt.broadcast %66 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
119
+ %78 = arith.subf %76, %77 : tensor<2x256xf32, #blocked>
120
+ %79 = arith.divf %67, %cst_12 : tensor<2x1xf32, #blocked>
121
+ %80 = arith.addf %79, %cst_11 : tensor<2x1xf32, #blocked>
122
+ %81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
123
+ %82 = tt.broadcast %81 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
124
+ %83 = arith.mulf %78, %82 : tensor<2x256xf32, #blocked>
125
+ %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked>
126
+ %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
127
+ %86 = arith.mulf %83, %85 : tensor<2x256xf32, #blocked>
128
+ %87 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
129
+ %88 = tt.addptr %87, %33 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
130
+ %89 = arith.truncf %86 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked>
131
+ tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked>
132
+ tt.return
133
+ }
134
+ }
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<2x256xbf16>
4
+ %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
5
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
6
+ %cst_2 = arith.constant 0.000000e+00 : f32
7
+ %cst_3 = arith.constant dense<256> : tensor<2x1xi64>
8
+ %cst_4 = arith.constant dense<50257> : tensor<2x1xi64>
9
+ %cst_5 = arith.constant dense<0> : tensor<2x1xi64>
10
+ %cst_6 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32>
11
+ %cst_7 = arith.constant dense<2.560000e+02> : tensor<2x1xf32>
12
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<2x256xf32>
13
+ %cst_9 = arith.constant dense<256> : tensor<2x1xi32>
14
+ %cst_10 = arith.constant dense<256> : tensor<1x256xi32>
15
+ %cst_11 = arith.constant dense<512> : tensor<2x1xi32>
16
+ %c2_i32 = arith.constant 2 : i32
17
+ %0 = tt.get_program_id x : i32
18
+ %1 = arith.muli %0, %c2_i32 : i32
19
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32>
20
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32>
21
+ %4 = tt.splat %1 : (i32) -> tensor<2x1xi32>
22
+ %5 = arith.addi %4, %3 : tensor<2x1xi32>
23
+ %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
24
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
25
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>>
26
+ %9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr<i64, 1>>, tensor<2x1xi32>
27
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64>
28
+ %11 = arith.remsi %5, %cst_11 : tensor<2x1xi32>
29
+ %12 = arith.cmpi slt, %7, %cst_10 : tensor<1x256xi32>
30
+ %13 = arith.muli %11, %cst_9 : tensor<2x1xi32>
31
+ %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32>
32
+ %15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32>
33
+ %16 = arith.addi %14, %15 : tensor<2x256xi32>
34
+ %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
35
+ %18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi32>
36
+ %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1>
37
+ %20 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
38
+ %21 = arith.muli %5, %cst_9 : tensor<2x1xi32>
39
+ %22 = tt.broadcast %21 : (tensor<2x1xi32>) -> tensor<2x256xi32>
40
+ %23 = arith.addi %14, %22 : tensor<2x256xi32>
41
+ %24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
42
+ %25 = tt.addptr %24, %23 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
43
+ %26 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16>
44
+ %27 = arith.extf %26 : tensor<2x256xbf16> to tensor<2x256xf32>
45
+ %28 = arith.addi %10, %cst_4 : tensor<2x1xi64>
46
+ %29 = arith.cmpi slt, %10, %cst_5 : tensor<2x1xi64>
47
+ %30 = arith.select %29, %28, %10 : tensor<2x1xi1>, tensor<2x1xi64>
48
+ %31 = arith.cmpi sge, %30, %cst_5 : tensor<2x1xi64>
49
+ %32 = arith.cmpi slt, %30, %cst_4 : tensor<2x1xi64>
50
+ %33 = arith.andi %31, %32 : tensor<2x1xi1>
51
+ tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
52
+ %34 = arith.muli %30, %cst_3 : tensor<2x1xi64>
53
+ %35 = tt.broadcast %34 : (tensor<2x1xi64>) -> tensor<2x256xi64>
54
+ %36 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
55
+ %37 = tt.broadcast %36 : (tensor<1x256xi64>) -> tensor<2x256xi64>
56
+ %38 = arith.addi %37, %35 : tensor<2x256xi64>
57
+ %39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>>
58
+ %40 = tt.addptr %39, %38 : tensor<2x256x!tt.ptr<f32, 1>>, tensor<2x256xi64>
59
+ %41 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
60
+ %42 = arith.addf %41, %20 : tensor<2x256xf32>
61
+ %43 = arith.addf %42, %27 : tensor<2x256xf32>
62
+ %44 = arith.addf %43, %cst_8 : tensor<2x256xf32>
63
+ %45 = arith.subf %43, %44 : tensor<2x256xf32>
64
+ %46 = arith.mulf %43, %45 : tensor<2x256xf32>
65
+ %47 = arith.addf %46, %cst_8 : tensor<2x256xf32>
66
+ %48 = arith.select %19, %44, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32>
67
+ %49 = arith.select %19, %47, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32>
68
+ %50 = arith.select %12, %cst_0, %cst_1 : tensor<1x256xi1>, tensor<1x256xf32>
69
+ %51 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32>
70
+ %52:3 = "tt.reduce"(%48, %49, %51) <{axis = 1 : i32}> ({
71
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
72
+ %76 = arith.subf %arg11, %arg8 : f32
73
+ %77 = arith.addf %arg10, %arg13 : f32
74
+ %78 = arith.cmpf oeq, %77, %cst_2 : f32
75
+ %79 = arith.divf %arg13, %77 : f32
76
+ %80 = arith.select %78, %cst_2, %79 : f32
77
+ %81 = arith.mulf %76, %80 : f32
78
+ %82 = arith.addf %arg8, %81 : f32
79
+ %83 = arith.addf %arg9, %arg12 : f32
80
+ %84 = arith.mulf %76, %76 : f32
81
+ %85 = arith.mulf %84, %arg10 : f32
82
+ %86 = arith.mulf %85, %80 : f32
83
+ %87 = arith.addf %83, %86 : f32
84
+ tt.reduce.return %82, %87, %77 : f32, f32, f32
85
+ }) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>)
86
+ %53 = tt.expand_dims %52#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
87
+ %54 = tt.expand_dims %52#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32>
88
+ %55 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32>
89
+ %56 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16>
90
+ %57 = arith.extf %56 : tensor<2x256xbf16> to tensor<2x256xf32>
91
+ %58 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
92
+ %59 = tt.addptr %58, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
93
+ %60 = tt.load %59, %12, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
94
+ tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1>
95
+ %61 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32>
96
+ %62 = arith.addf %61, %55 : tensor<2x256xf32>
97
+ %63 = arith.addf %62, %57 : tensor<2x256xf32>
98
+ %64 = tt.broadcast %53 : (tensor<2x1xf32>) -> tensor<2x256xf32>
99
+ %65 = arith.subf %63, %64 : tensor<2x256xf32>
100
+ %66 = arith.divf %54, %cst_7 : tensor<2x1xf32>
101
+ %67 = arith.addf %66, %cst_6 : tensor<2x1xf32>
102
+ %68 = tt.extern_elementwise %67 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32>
103
+ %69 = tt.broadcast %68 : (tensor<2x1xf32>) -> tensor<2x256xf32>
104
+ %70 = arith.mulf %65, %69 : tensor<2x256xf32>
105
+ %71 = tt.broadcast %60 : (tensor<1x256xf32>) -> tensor<2x256xf32>
106
+ %72 = arith.mulf %70, %71 : tensor<2x256xf32>
107
+ %73 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>>
108
+ %74 = tt.addptr %73, %23 : tensor<2x256x!tt.ptr<bf16, 1>>, tensor<2x256xi32>
109
+ %75 = arith.truncf %72 : tensor<2x256xf32> to tensor<2x256xbf16>
110
+ tt.store %74, %75, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16>
111
+ tt.return
112
+ }
113
+ }
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.cubin ADDED
Binary file (12.2 kB). View file
 
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.cubin ADDED
Binary file (12 kB). View file
 
.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.398942292> : tensor<512xf32>
4
+ %cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32>
6
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32>
7
+ %cst_3 = arith.constant dense<0.707106769> : tensor<512xf32>
8
+ %c512_i32 = arith.constant 512 : i32
9
+ %0 = tt.get_program_id x : i32
10
+ %1 = arith.muli %0, %c512_i32 : i32
11
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
12
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
13
+ %4 = arith.addi %3, %2 : tensor<512xi32>
14
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
15
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
16
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
17
+ %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
18
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
19
+ %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
20
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
21
+ %12 = arith.extf %11 : tensor<512xbf16> to tensor<512xf32>
22
+ %13 = arith.mulf %12, %cst_3 : tensor<512xf32>
23
+ %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32>) -> tensor<512xf32>
24
+ %15 = arith.addf %14, %cst_2 : tensor<512xf32>
25
+ %16 = arith.mulf %15, %cst_1 : tensor<512xf32>
26
+ %17 = arith.mulf %12, %12 : tensor<512xf32>
27
+ %18 = arith.mulf %17, %cst_0 : tensor<512xf32>
28
+ %19 = math.exp %18 : tensor<512xf32>
29
+ %20 = arith.mulf %19, %cst : tensor<512xf32>
30
+ %21 = arith.mulf %12, %20 : tensor<512xf32>
31
+ %22 = arith.addf %16, %21 : tensor<512xf32>
32
+ %23 = arith.mulf %8, %22 : tensor<512xf32>
33
+ %24 = arith.truncf %23 : tensor<512xf32> to tensor<512xbf16>
34
+ tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
35
+ tt.return
36
+ }
37
+ }
.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir ADDED
@@ -0,0 +1,1211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
16
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %9 = lshr i32 %8, 3, !dbg !10
18
+ %10 = and i32 %9, 31, !dbg !10
19
+ %11 = and i32 %8, 63, !dbg !10
20
+ %12 = shl i32 %8, 3, !dbg !11
21
+ %13 = and i32 %12, 56, !dbg !11
22
+ %14 = or i32 %13, 4, !dbg !11
23
+ %15 = lshr i32 %8, 6, !dbg !12
24
+ %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
25
+ %17 = shl i32 %16, 6, !dbg !14
26
+ %18 = or i32 %17, %10, !dbg !15
27
+ %19 = or i32 %18, 32, !dbg !15
28
+ %20 = or i32 %17, %11, !dbg !15
29
+ %21 = sext i32 %18 to i64, !dbg !16
30
+ %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !16
31
+ %23 = sext i32 %19 to i64, !dbg !16
32
+ %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !16
33
+ %25 = sext i32 %20 to i64, !dbg !16
34
+ %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16
35
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
36
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
37
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
38
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
39
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
40
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
41
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
42
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
43
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
44
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
45
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
46
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
47
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
48
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
49
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
50
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
51
+ %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
52
+ %44 = srem i32 %18, 512, !dbg !18
53
+ %45 = srem i32 %19, 512, !dbg !18
54
+ %46 = shl nsw i32 %44, 8, !dbg !19
55
+ %47 = shl nsw i32 %45, 8, !dbg !19
56
+ %48 = add i64 %43, 50257, !dbg !20
57
+ %49 = icmp slt i64 %27, 0, !dbg !21
58
+ %50 = icmp slt i64 %35, 0, !dbg !21
59
+ %51 = icmp slt i64 %43, 0, !dbg !21
60
+ %52 = select i1 %51, i64 %48, i64 %43, !dbg !22
61
+ %53 = icmp ugt i64 %52, 50256, !dbg !23
62
+ %54 = shl i64 %27, 8, !dbg !24
63
+ %55 = add i64 %54, 12865792, !dbg !24
64
+ %56 = select i1 %49, i64 %55, i64 %54, !dbg !24
65
+ %57 = shl i64 %35, 8, !dbg !24
66
+ %58 = add i64 %57, 12865792, !dbg !24
67
+ %59 = select i1 %50, i64 %58, i64 %57, !dbg !24
68
+ %60 = getelementptr float, ptr addrspace(1) %1, i64 %56
69
+ %61 = getelementptr float, ptr addrspace(1) %1, i64 %59
70
+ br label %62, !dbg !12
71
+
72
+ 62: ; preds = %7, %179
73
+ %63 = phi float [ 0.000000e+00, %7 ], [ %254, %179 ]
74
+ %64 = phi float [ 0.000000e+00, %7 ], [ %255, %179 ]
75
+ %65 = phi float [ 0.000000e+00, %7 ], [ %256, %179 ]
76
+ %66 = phi float [ 0.000000e+00, %7 ], [ %257, %179 ]
77
+ %67 = phi float [ 0.000000e+00, %7 ], [ %258, %179 ]
78
+ %68 = phi float [ 0.000000e+00, %7 ], [ %259, %179 ]
79
+ %69 = phi float [ 0.000000e+00, %7 ], [ %260, %179 ]
80
+ %70 = phi float [ 0.000000e+00, %7 ], [ %261, %179 ]
81
+ %71 = phi float [ 0.000000e+00, %7 ], [ %262, %179 ]
82
+ %72 = phi float [ 0.000000e+00, %7 ], [ %263, %179 ]
83
+ %73 = phi float [ 0.000000e+00, %7 ], [ %264, %179 ]
84
+ %74 = phi float [ 0.000000e+00, %7 ], [ %265, %179 ]
85
+ %75 = phi float [ 0.000000e+00, %7 ], [ %266, %179 ]
86
+ %76 = phi float [ 0.000000e+00, %7 ], [ %267, %179 ]
87
+ %77 = phi float [ 0.000000e+00, %7 ], [ %268, %179 ]
88
+ %78 = phi float [ 0.000000e+00, %7 ], [ %269, %179 ]
89
+ %79 = phi float [ 0.000000e+00, %7 ], [ %270, %179 ]
90
+ %80 = phi float [ 0.000000e+00, %7 ], [ %271, %179 ]
91
+ %81 = phi float [ 0.000000e+00, %7 ], [ %272, %179 ]
92
+ %82 = phi float [ 0.000000e+00, %7 ], [ %273, %179 ]
93
+ %83 = phi float [ 0.000000e+00, %7 ], [ %274, %179 ]
94
+ %84 = phi float [ 0.000000e+00, %7 ], [ %275, %179 ]
95
+ %85 = phi float [ 0.000000e+00, %7 ], [ %276, %179 ]
96
+ %86 = phi float [ 0.000000e+00, %7 ], [ %277, %179 ]
97
+ %87 = phi float [ 0.000000e+00, %7 ], [ %278, %179 ]
98
+ %88 = phi float [ 0.000000e+00, %7 ], [ %279, %179 ]
99
+ %89 = phi float [ 0.000000e+00, %7 ], [ %280, %179 ]
100
+ %90 = phi float [ 0.000000e+00, %7 ], [ %281, %179 ]
101
+ %91 = phi float [ 0.000000e+00, %7 ], [ %282, %179 ]
102
+ %92 = phi float [ 0.000000e+00, %7 ], [ %283, %179 ]
103
+ %93 = phi float [ 0.000000e+00, %7 ], [ %284, %179 ]
104
+ %94 = phi float [ 0.000000e+00, %7 ], [ %285, %179 ]
105
+ %95 = phi float [ 0.000000e+00, %7 ], [ %350, %179 ]
106
+ %96 = phi float [ 0.000000e+00, %7 ], [ %351, %179 ]
107
+ %97 = phi float [ 0.000000e+00, %7 ], [ %352, %179 ]
108
+ %98 = phi float [ 0.000000e+00, %7 ], [ %353, %179 ]
109
+ %99 = phi float [ 0.000000e+00, %7 ], [ %354, %179 ]
110
+ %100 = phi float [ 0.000000e+00, %7 ], [ %355, %179 ]
111
+ %101 = phi float [ 0.000000e+00, %7 ], [ %356, %179 ]
112
+ %102 = phi float [ 0.000000e+00, %7 ], [ %357, %179 ]
113
+ %103 = phi float [ 0.000000e+00, %7 ], [ %358, %179 ]
114
+ %104 = phi float [ 0.000000e+00, %7 ], [ %359, %179 ]
115
+ %105 = phi float [ 0.000000e+00, %7 ], [ %360, %179 ]
116
+ %106 = phi float [ 0.000000e+00, %7 ], [ %361, %179 ]
117
+ %107 = phi float [ 0.000000e+00, %7 ], [ %362, %179 ]
118
+ %108 = phi float [ 0.000000e+00, %7 ], [ %363, %179 ]
119
+ %109 = phi float [ 0.000000e+00, %7 ], [ %364, %179 ]
120
+ %110 = phi float [ 0.000000e+00, %7 ], [ %365, %179 ]
121
+ %111 = phi float [ 0.000000e+00, %7 ], [ %302, %179 ]
122
+ %112 = phi float [ 0.000000e+00, %7 ], [ %303, %179 ]
123
+ %113 = phi float [ 0.000000e+00, %7 ], [ %304, %179 ]
124
+ %114 = phi float [ 0.000000e+00, %7 ], [ %305, %179 ]
125
+ %115 = phi float [ 0.000000e+00, %7 ], [ %306, %179 ]
126
+ %116 = phi float [ 0.000000e+00, %7 ], [ %307, %179 ]
127
+ %117 = phi float [ 0.000000e+00, %7 ], [ %308, %179 ]
128
+ %118 = phi float [ 0.000000e+00, %7 ], [ %309, %179 ]
129
+ %119 = phi float [ 0.000000e+00, %7 ], [ %310, %179 ]
130
+ %120 = phi float [ 0.000000e+00, %7 ], [ %311, %179 ]
131
+ %121 = phi float [ 0.000000e+00, %7 ], [ %312, %179 ]
132
+ %122 = phi float [ 0.000000e+00, %7 ], [ %313, %179 ]
133
+ %123 = phi float [ 0.000000e+00, %7 ], [ %314, %179 ]
134
+ %124 = phi float [ 0.000000e+00, %7 ], [ %315, %179 ]
135
+ %125 = phi float [ 0.000000e+00, %7 ], [ %316, %179 ]
136
+ %126 = phi float [ 0.000000e+00, %7 ], [ %317, %179 ]
137
+ %127 = phi i32 [ 0, %7 ], [ %366, %179 ]
138
+ %128 = or i32 %127, %13, !dbg !25
139
+ %129 = or i32 %127, %14, !dbg !25
140
+ %130 = add i32 %128, %46, !dbg !26
141
+ %131 = add i32 %129, %46, !dbg !26
142
+ %132 = add i32 %128, %47, !dbg !26
143
+ %133 = add i32 %129, %47, !dbg !26
144
+ %134 = sext i32 %130 to i64, !dbg !27
145
+ %135 = getelementptr float, ptr addrspace(1) %2, i64 %134, !dbg !27
146
+ %136 = sext i32 %131 to i64, !dbg !27
147
+ %137 = getelementptr float, ptr addrspace(1) %2, i64 %136, !dbg !27
148
+ %138 = sext i32 %132 to i64, !dbg !27
149
+ %139 = getelementptr float, ptr addrspace(1) %2, i64 %138, !dbg !27
150
+ %140 = sext i32 %133 to i64, !dbg !27
151
+ %141 = getelementptr float, ptr addrspace(1) %2, i64 %140, !dbg !27
152
+ %142 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %135, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
153
+ %143 = extractvalue { i32, i32, i32, i32 } %142, 0, !dbg !28
154
+ %144 = extractvalue { i32, i32, i32, i32 } %142, 1, !dbg !28
155
+ %145 = extractvalue { i32, i32, i32, i32 } %142, 2, !dbg !28
156
+ %146 = extractvalue { i32, i32, i32, i32 } %142, 3, !dbg !28
157
+ %147 = bitcast i32 %143 to float, !dbg !28
158
+ %148 = bitcast i32 %144 to float, !dbg !28
159
+ %149 = bitcast i32 %145 to float, !dbg !28
160
+ %150 = bitcast i32 %146 to float, !dbg !28
161
+ %151 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %137, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
162
+ %152 = extractvalue { i32, i32, i32, i32 } %151, 0, !dbg !28
163
+ %153 = extractvalue { i32, i32, i32, i32 } %151, 1, !dbg !28
164
+ %154 = extractvalue { i32, i32, i32, i32 } %151, 2, !dbg !28
165
+ %155 = extractvalue { i32, i32, i32, i32 } %151, 3, !dbg !28
166
+ %156 = bitcast i32 %152 to float, !dbg !28
167
+ %157 = bitcast i32 %153 to float, !dbg !28
168
+ %158 = bitcast i32 %154 to float, !dbg !28
169
+ %159 = bitcast i32 %155 to float, !dbg !28
170
+ %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %139, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
171
+ %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !28
172
+ %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !28
173
+ %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !28
174
+ %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !28
175
+ %165 = bitcast i32 %161 to float, !dbg !28
176
+ %166 = bitcast i32 %162 to float, !dbg !28
177
+ %167 = bitcast i32 %163 to float, !dbg !28
178
+ %168 = bitcast i32 %164 to float, !dbg !28
179
+ %169 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
180
+ %170 = extractvalue { i32, i32, i32, i32 } %169, 0, !dbg !28
181
+ %171 = extractvalue { i32, i32, i32, i32 } %169, 1, !dbg !28
182
+ %172 = extractvalue { i32, i32, i32, i32 } %169, 2, !dbg !28
183
+ %173 = extractvalue { i32, i32, i32, i32 } %169, 3, !dbg !28
184
+ %174 = bitcast i32 %170 to float, !dbg !28
185
+ %175 = bitcast i32 %171 to float, !dbg !28
186
+ %176 = bitcast i32 %172 to float, !dbg !28
187
+ %177 = bitcast i32 %173 to float, !dbg !28
188
+ br i1 %53, label %178, label %179, !dbg !29
189
+
190
+ 178: ; preds = %62
191
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29
192
+ br label %179, !dbg !29
193
+
194
+ 179: ; preds = %178, %62
195
+ %180 = zext nneg i32 %128 to i64, !dbg !30
196
+ %181 = zext nneg i32 %129 to i64, !dbg !30
197
+ %182 = getelementptr float, ptr addrspace(1) %60, i64 %180, !dbg !31
198
+ %183 = getelementptr float, ptr addrspace(1) %60, i64 %181, !dbg !31
199
+ %184 = getelementptr float, ptr addrspace(1) %61, i64 %180, !dbg !31
200
+ %185 = getelementptr float, ptr addrspace(1) %61, i64 %181, !dbg !31
201
+ %186 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %182, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
202
+ %187 = extractvalue { i32, i32, i32, i32 } %186, 0, !dbg !32
203
+ %188 = extractvalue { i32, i32, i32, i32 } %186, 1, !dbg !32
204
+ %189 = extractvalue { i32, i32, i32, i32 } %186, 2, !dbg !32
205
+ %190 = extractvalue { i32, i32, i32, i32 } %186, 3, !dbg !32
206
+ %191 = bitcast i32 %187 to float, !dbg !32
207
+ %192 = bitcast i32 %188 to float, !dbg !32
208
+ %193 = bitcast i32 %189 to float, !dbg !32
209
+ %194 = bitcast i32 %190 to float, !dbg !32
210
+ %195 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %183, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
211
+ %196 = extractvalue { i32, i32, i32, i32 } %195, 0, !dbg !32
212
+ %197 = extractvalue { i32, i32, i32, i32 } %195, 1, !dbg !32
213
+ %198 = extractvalue { i32, i32, i32, i32 } %195, 2, !dbg !32
214
+ %199 = extractvalue { i32, i32, i32, i32 } %195, 3, !dbg !32
215
+ %200 = bitcast i32 %196 to float, !dbg !32
216
+ %201 = bitcast i32 %197 to float, !dbg !32
217
+ %202 = bitcast i32 %198 to float, !dbg !32
218
+ %203 = bitcast i32 %199 to float, !dbg !32
219
+ %204 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
220
+ %205 = extractvalue { i32, i32, i32, i32 } %204, 0, !dbg !32
221
+ %206 = extractvalue { i32, i32, i32, i32 } %204, 1, !dbg !32
222
+ %207 = extractvalue { i32, i32, i32, i32 } %204, 2, !dbg !32
223
+ %208 = extractvalue { i32, i32, i32, i32 } %204, 3, !dbg !32
224
+ %209 = bitcast i32 %205 to float, !dbg !32
225
+ %210 = bitcast i32 %206 to float, !dbg !32
226
+ %211 = bitcast i32 %207 to float, !dbg !32
227
+ %212 = bitcast i32 %208 to float, !dbg !32
228
+ %213 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %185, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
229
+ %214 = extractvalue { i32, i32, i32, i32 } %213, 0, !dbg !32
230
+ %215 = extractvalue { i32, i32, i32, i32 } %213, 1, !dbg !32
231
+ %216 = extractvalue { i32, i32, i32, i32 } %213, 2, !dbg !32
232
+ %217 = extractvalue { i32, i32, i32, i32 } %213, 3, !dbg !32
233
+ %218 = bitcast i32 %214 to float, !dbg !32
234
+ %219 = bitcast i32 %215 to float, !dbg !32
235
+ %220 = bitcast i32 %216 to float, !dbg !32
236
+ %221 = bitcast i32 %217 to float, !dbg !32
237
+ %222 = fadd float %147, %191, !dbg !33
238
+ %223 = fadd float %148, %192, !dbg !33
239
+ %224 = fadd float %149, %193, !dbg !33
240
+ %225 = fadd float %150, %194, !dbg !33
241
+ %226 = fadd float %156, %200, !dbg !33
242
+ %227 = fadd float %157, %201, !dbg !33
243
+ %228 = fadd float %158, %202, !dbg !33
244
+ %229 = fadd float %159, %203, !dbg !33
245
+ %230 = fadd float %165, %209, !dbg !33
246
+ %231 = fadd float %166, %210, !dbg !33
247
+ %232 = fadd float %167, %211, !dbg !33
248
+ %233 = fadd float %168, %212, !dbg !33
249
+ %234 = fadd float %174, %218, !dbg !33
250
+ %235 = fadd float %175, %219, !dbg !33
251
+ %236 = fadd float %176, %220, !dbg !33
252
+ %237 = fadd float %177, %221, !dbg !33
253
+ %238 = fsub float %222, %111, !dbg !34
254
+ %239 = fsub float %223, %112, !dbg !34
255
+ %240 = fsub float %224, %113, !dbg !34
256
+ %241 = fsub float %225, %114, !dbg !34
257
+ %242 = fsub float %226, %115, !dbg !34
258
+ %243 = fsub float %227, %116, !dbg !34
259
+ %244 = fsub float %228, %117, !dbg !34
260
+ %245 = fsub float %229, %118, !dbg !34
261
+ %246 = fsub float %230, %119, !dbg !34
262
+ %247 = fsub float %231, %120, !dbg !34
263
+ %248 = fsub float %232, %121, !dbg !34
264
+ %249 = fsub float %233, %122, !dbg !34
265
+ %250 = fsub float %234, %123, !dbg !34
266
+ %251 = fsub float %235, %124, !dbg !34
267
+ %252 = fsub float %236, %125, !dbg !34
268
+ %253 = fsub float %237, %126, !dbg !34
269
+ %254 = fadd float %63, 1.000000e+00, !dbg !38
270
+ %255 = fadd float %64, 1.000000e+00, !dbg !38
271
+ %256 = fadd float %65, 1.000000e+00, !dbg !38
272
+ %257 = fadd float %66, 1.000000e+00, !dbg !38
273
+ %258 = fadd float %67, 1.000000e+00, !dbg !38
274
+ %259 = fadd float %68, 1.000000e+00, !dbg !38
275
+ %260 = fadd float %69, 1.000000e+00, !dbg !38
276
+ %261 = fadd float %70, 1.000000e+00, !dbg !38
277
+ %262 = fadd float %71, 1.000000e+00, !dbg !38
278
+ %263 = fadd float %72, 1.000000e+00, !dbg !38
279
+ %264 = fadd float %73, 1.000000e+00, !dbg !38
280
+ %265 = fadd float %74, 1.000000e+00, !dbg !38
281
+ %266 = fadd float %75, 1.000000e+00, !dbg !38
282
+ %267 = fadd float %76, 1.000000e+00, !dbg !38
283
+ %268 = fadd float %77, 1.000000e+00, !dbg !38
284
+ %269 = fadd float %78, 1.000000e+00, !dbg !38
285
+ %270 = fadd float %79, 1.000000e+00, !dbg !38
286
+ %271 = fadd float %80, 1.000000e+00, !dbg !38
287
+ %272 = fadd float %81, 1.000000e+00, !dbg !38
288
+ %273 = fadd float %82, 1.000000e+00, !dbg !38
289
+ %274 = fadd float %83, 1.000000e+00, !dbg !38
290
+ %275 = fadd float %84, 1.000000e+00, !dbg !38
291
+ %276 = fadd float %85, 1.000000e+00, !dbg !38
292
+ %277 = fadd float %86, 1.000000e+00, !dbg !38
293
+ %278 = fadd float %87, 1.000000e+00, !dbg !38
294
+ %279 = fadd float %88, 1.000000e+00, !dbg !38
295
+ %280 = fadd float %89, 1.000000e+00, !dbg !38
296
+ %281 = fadd float %90, 1.000000e+00, !dbg !38
297
+ %282 = fadd float %91, 1.000000e+00, !dbg !38
298
+ %283 = fadd float %92, 1.000000e+00, !dbg !38
299
+ %284 = fadd float %93, 1.000000e+00, !dbg !38
300
+ %285 = fadd float %94, 1.000000e+00, !dbg !38
301
+ %286 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %238, float %254) #6, !dbg !39
302
+ %287 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %239, float %255) #6, !dbg !39
303
+ %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %256) #6, !dbg !39
304
+ %289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %241, float %257) #6, !dbg !39
305
+ %290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %242, float %258) #6, !dbg !39
306
+ %291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %243, float %259) #6, !dbg !39
307
+ %292 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %244, float %260) #6, !dbg !39
308
+ %293 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float %261) #6, !dbg !39
309
+ %294 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %246, float %262) #6, !dbg !39
310
+ %295 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %247, float %263) #6, !dbg !39
311
+ %296 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %264) #6, !dbg !39
312
+ %297 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %249, float %265) #6, !dbg !39
313
+ %298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %266) #6, !dbg !39
314
+ %299 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %251, float %267) #6, !dbg !39
315
+ %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %268) #6, !dbg !39
316
+ %301 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %269) #6, !dbg !39
317
+ %302 = fadd float %111, %286, !dbg !40
318
+ %303 = fadd float %112, %287, !dbg !40
319
+ %304 = fadd float %113, %288, !dbg !40
320
+ %305 = fadd float %114, %289, !dbg !40
321
+ %306 = fadd float %115, %290, !dbg !40
322
+ %307 = fadd float %116, %291, !dbg !40
323
+ %308 = fadd float %117, %292, !dbg !40
324
+ %309 = fadd float %118, %293, !dbg !40
325
+ %310 = fadd float %119, %294, !dbg !40
326
+ %311 = fadd float %120, %295, !dbg !40
327
+ %312 = fadd float %121, %296, !dbg !40
328
+ %313 = fadd float %122, %297, !dbg !40
329
+ %314 = fadd float %123, %298, !dbg !40
330
+ %315 = fadd float %124, %299, !dbg !40
331
+ %316 = fadd float %125, %300, !dbg !40
332
+ %317 = fadd float %126, %301, !dbg !40
333
+ %318 = fsub float %222, %302, !dbg !41
334
+ %319 = fsub float %223, %303, !dbg !41
335
+ %320 = fsub float %224, %304, !dbg !41
336
+ %321 = fsub float %225, %305, !dbg !41
337
+ %322 = fsub float %226, %306, !dbg !41
338
+ %323 = fsub float %227, %307, !dbg !41
339
+ %324 = fsub float %228, %308, !dbg !41
340
+ %325 = fsub float %229, %309, !dbg !41
341
+ %326 = fsub float %230, %310, !dbg !41
342
+ %327 = fsub float %231, %311, !dbg !41
343
+ %328 = fsub float %232, %312, !dbg !41
344
+ %329 = fsub float %233, %313, !dbg !41
345
+ %330 = fsub float %234, %314, !dbg !41
346
+ %331 = fsub float %235, %315, !dbg !41
347
+ %332 = fsub float %236, %316, !dbg !41
348
+ %333 = fsub float %237, %317, !dbg !41
349
+ %334 = fmul float %238, %318, !dbg !42
350
+ %335 = fmul float %239, %319, !dbg !42
351
+ %336 = fmul float %240, %320, !dbg !42
352
+ %337 = fmul float %241, %321, !dbg !42
353
+ %338 = fmul float %242, %322, !dbg !42
354
+ %339 = fmul float %243, %323, !dbg !42
355
+ %340 = fmul float %244, %324, !dbg !42
356
+ %341 = fmul float %245, %325, !dbg !42
357
+ %342 = fmul float %246, %326, !dbg !42
358
+ %343 = fmul float %247, %327, !dbg !42
359
+ %344 = fmul float %248, %328, !dbg !42
360
+ %345 = fmul float %249, %329, !dbg !42
361
+ %346 = fmul float %250, %330, !dbg !42
362
+ %347 = fmul float %251, %331, !dbg !42
363
+ %348 = fmul float %252, %332, !dbg !42
364
+ %349 = fmul float %253, %333, !dbg !42
365
+ %350 = fadd float %95, %334, !dbg !43
366
+ %351 = fadd float %96, %335, !dbg !43
367
+ %352 = fadd float %97, %336, !dbg !43
368
+ %353 = fadd float %98, %337, !dbg !43
369
+ %354 = fadd float %99, %338, !dbg !43
370
+ %355 = fadd float %100, %339, !dbg !43
371
+ %356 = fadd float %101, %340, !dbg !43
372
+ %357 = fadd float %102, %341, !dbg !43
373
+ %358 = fadd float %103, %342, !dbg !43
374
+ %359 = fadd float %104, %343, !dbg !43
375
+ %360 = fadd float %105, %344, !dbg !43
376
+ %361 = fadd float %106, %345, !dbg !43
377
+ %362 = fadd float %107, %346, !dbg !43
378
+ %363 = fadd float %108, %347, !dbg !43
379
+ %364 = fadd float %109, %348, !dbg !43
380
+ %365 = fadd float %110, %349, !dbg !43
381
+ %366 = add nuw nsw i32 %127, 64, !dbg !12
382
+ %367 = icmp ult i32 %127, 192, !dbg !12
383
+ br i1 %367, label %62, label %368, !dbg !12
384
+
385
+ 368: ; preds = %179
386
+ %369 = and i32 %15, 3, !dbg !12
387
+ %370 = mul nuw nsw i32 %369, 72, !dbg !12
388
+ %371 = add nuw nsw i32 %370, %11, !dbg !12
389
+ %372 = zext nneg i32 %371 to i64, !dbg !12
390
+ %373 = getelementptr float, ptr addrspace(3) @global_smem, i64 %372, !dbg !12
391
+ %374 = insertelement <1 x float> undef, float %270, i64 0, !dbg !12
392
+ store <1 x float> %374, ptr addrspace(3) %373, align 4, !dbg !12
393
+ %375 = add nuw nsw i32 %11, 288, !dbg !12
394
+ %376 = add nuw nsw i32 %375, %370, !dbg !12
395
+ %377 = zext nneg i32 %376 to i64, !dbg !12
396
+ %378 = getelementptr float, ptr addrspace(3) @global_smem, i64 %377, !dbg !12
397
+ %379 = insertelement <1 x float> undef, float %271, i64 0, !dbg !12
398
+ store <1 x float> %379, ptr addrspace(3) %378, align 4, !dbg !12
399
+ %380 = or i32 %11, 576, !dbg !12
400
+ %381 = add nuw nsw i32 %380, %370, !dbg !12
401
+ %382 = zext nneg i32 %381 to i64, !dbg !12
402
+ %383 = getelementptr float, ptr addrspace(3) @global_smem, i64 %382, !dbg !12
403
+ %384 = insertelement <1 x float> undef, float %272, i64 0, !dbg !12
404
+ store <1 x float> %384, ptr addrspace(3) %383, align 4, !dbg !12
405
+ %385 = add nuw nsw i32 %11, 864, !dbg !12
406
+ %386 = add nuw nsw i32 %385, %370, !dbg !12
407
+ %387 = zext nneg i32 %386 to i64, !dbg !12
408
+ %388 = getelementptr float, ptr addrspace(3) @global_smem, i64 %387, !dbg !12
409
+ %389 = insertelement <1 x float> undef, float %273, i64 0, !dbg !12
410
+ store <1 x float> %389, ptr addrspace(3) %388, align 4, !dbg !12
411
+ %390 = or i32 %11, 1152, !dbg !12
412
+ %391 = add nuw nsw i32 %390, %370, !dbg !12
413
+ %392 = zext nneg i32 %391 to i64, !dbg !12
414
+ %393 = getelementptr float, ptr addrspace(3) @global_smem, i64 %392, !dbg !12
415
+ %394 = insertelement <1 x float> undef, float %274, i64 0, !dbg !12
416
+ store <1 x float> %394, ptr addrspace(3) %393, align 4, !dbg !12
417
+ %395 = add nuw nsw i32 %11, 1440, !dbg !12
418
+ %396 = add nuw nsw i32 %395, %370, !dbg !12
419
+ %397 = zext nneg i32 %396 to i64, !dbg !12
420
+ %398 = getelementptr float, ptr addrspace(3) @global_smem, i64 %397, !dbg !12
421
+ %399 = insertelement <1 x float> undef, float %275, i64 0, !dbg !12
422
+ store <1 x float> %399, ptr addrspace(3) %398, align 4, !dbg !12
423
+ %400 = or i32 %11, 1728, !dbg !12
424
+ %401 = add nuw nsw i32 %400, %370, !dbg !12
425
+ %402 = zext nneg i32 %401 to i64, !dbg !12
426
+ %403 = getelementptr float, ptr addrspace(3) @global_smem, i64 %402, !dbg !12
427
+ %404 = insertelement <1 x float> undef, float %276, i64 0, !dbg !12
428
+ store <1 x float> %404, ptr addrspace(3) %403, align 4, !dbg !12
429
+ %405 = add nuw nsw i32 %11, 2016, !dbg !12
430
+ %406 = add nuw nsw i32 %405, %370, !dbg !12
431
+ %407 = zext nneg i32 %406 to i64, !dbg !12
432
+ %408 = getelementptr float, ptr addrspace(3) @global_smem, i64 %407, !dbg !12
433
+ %409 = insertelement <1 x float> undef, float %277, i64 0, !dbg !12
434
+ store <1 x float> %409, ptr addrspace(3) %408, align 4, !dbg !12
435
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
436
+ %410 = mul nuw nsw i32 %10, 72, !dbg !12
437
+ %411 = add nuw nsw i32 %410, %13, !dbg !12
438
+ %412 = zext nneg i32 %411 to i64, !dbg !12
439
+ %413 = getelementptr float, ptr addrspace(3) @global_smem, i64 %412, !dbg !12
440
+ %414 = load float, ptr addrspace(3) %413, align 32, !dbg !12
441
+ %415 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 1, !dbg !12
442
+ %416 = load float, ptr addrspace(3) %415, align 4, !dbg !12
443
+ %417 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 2, !dbg !12
444
+ %418 = load float, ptr addrspace(3) %417, align 8, !dbg !12
445
+ %419 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 3, !dbg !12
446
+ %420 = load float, ptr addrspace(3) %419, align 4, !dbg !12
447
+ %421 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 4, !dbg !12
448
+ %422 = load float, ptr addrspace(3) %421, align 16, !dbg !12
449
+ %423 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 5, !dbg !12
450
+ %424 = load float, ptr addrspace(3) %423, align 4, !dbg !12
451
+ %425 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 6, !dbg !12
452
+ %426 = load float, ptr addrspace(3) %425, align 8, !dbg !12
453
+ %427 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 7, !dbg !12
454
+ %428 = load float, ptr addrspace(3) %427, align 4, !dbg !12
455
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
456
+ %429 = insertelement <1 x float> undef, float %278, i64 0, !dbg !12
457
+ store <1 x float> %429, ptr addrspace(3) %373, align 4, !dbg !12
458
+ %430 = insertelement <1 x float> undef, float %279, i64 0, !dbg !12
459
+ store <1 x float> %430, ptr addrspace(3) %378, align 4, !dbg !12
460
+ %431 = insertelement <1 x float> undef, float %280, i64 0, !dbg !12
461
+ store <1 x float> %431, ptr addrspace(3) %383, align 4, !dbg !12
462
+ %432 = insertelement <1 x float> undef, float %281, i64 0, !dbg !12
463
+ store <1 x float> %432, ptr addrspace(3) %388, align 4, !dbg !12
464
+ %433 = insertelement <1 x float> undef, float %282, i64 0, !dbg !12
465
+ store <1 x float> %433, ptr addrspace(3) %393, align 4, !dbg !12
466
+ %434 = insertelement <1 x float> undef, float %283, i64 0, !dbg !12
467
+ store <1 x float> %434, ptr addrspace(3) %398, align 4, !dbg !12
468
+ %435 = insertelement <1 x float> undef, float %284, i64 0, !dbg !12
469
+ store <1 x float> %435, ptr addrspace(3) %403, align 4, !dbg !12
470
+ %436 = insertelement <1 x float> undef, float %285, i64 0, !dbg !12
471
+ store <1 x float> %436, ptr addrspace(3) %408, align 4, !dbg !12
472
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
473
+ %437 = load float, ptr addrspace(3) %413, align 32, !dbg !12
474
+ %438 = load float, ptr addrspace(3) %415, align 4, !dbg !12
475
+ %439 = load float, ptr addrspace(3) %417, align 8, !dbg !12
476
+ %440 = load float, ptr addrspace(3) %419, align 4, !dbg !12
477
+ %441 = load float, ptr addrspace(3) %421, align 16, !dbg !12
478
+ %442 = load float, ptr addrspace(3) %423, align 4, !dbg !12
479
+ %443 = load float, ptr addrspace(3) %425, align 8, !dbg !12
480
+ %444 = load float, ptr addrspace(3) %427, align 4, !dbg !12
481
+ %445 = fsub float %303, %302, !dbg !44
482
+ %446 = fadd float %414, %416, !dbg !48
483
+ %447 = fcmp oeq float %446, 0.000000e+00, !dbg !49
484
+ %448 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %446) #6, !dbg !50
485
+ %449 = select i1 %447, float 0.000000e+00, float %448, !dbg !51
486
+ %450 = fmul float %445, %449, !dbg !52
487
+ %451 = fadd float %302, %450, !dbg !53
488
+ %452 = fadd float %350, %351, !dbg !54
489
+ %453 = fmul float %445, %445, !dbg !55
490
+ %454 = fmul float %453, %414, !dbg !56
491
+ %455 = fmul float %454, %449, !dbg !57
492
+ %456 = fadd float %452, %455, !dbg !58
493
+ %457 = fsub float %304, %451, !dbg !44
494
+ %458 = fadd float %418, %446, !dbg !48
495
+ %459 = fcmp oeq float %458, 0.000000e+00, !dbg !49
496
+ %460 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %418, float %458) #6, !dbg !50
497
+ %461 = select i1 %459, float 0.000000e+00, float %460, !dbg !51
498
+ %462 = fmul float %461, %457, !dbg !52
499
+ %463 = fadd float %451, %462, !dbg !53
500
+ %464 = fadd float %352, %456, !dbg !54
501
+ %465 = fmul float %457, %457, !dbg !55
502
+ %466 = fmul float %446, %465, !dbg !56
503
+ %467 = fmul float %461, %466, !dbg !57
504
+ %468 = fadd float %464, %467, !dbg !58
505
+ %469 = fsub float %305, %463, !dbg !44
506
+ %470 = fadd float %420, %458, !dbg !48
507
+ %471 = fcmp oeq float %470, 0.000000e+00, !dbg !49
508
+ %472 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %420, float %470) #6, !dbg !50
509
+ %473 = select i1 %471, float 0.000000e+00, float %472, !dbg !51
510
+ %474 = fmul float %473, %469, !dbg !52
511
+ %475 = fadd float %463, %474, !dbg !53
512
+ %476 = fadd float %353, %468, !dbg !54
513
+ %477 = fmul float %469, %469, !dbg !55
514
+ %478 = fmul float %458, %477, !dbg !56
515
+ %479 = fmul float %473, %478, !dbg !57
516
+ %480 = fadd float %476, %479, !dbg !58
517
+ %481 = fsub float %306, %475, !dbg !44
518
+ %482 = fadd float %422, %470, !dbg !48
519
+ %483 = fcmp oeq float %482, 0.000000e+00, !dbg !49
520
+ %484 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %422, float %482) #6, !dbg !50
521
+ %485 = select i1 %483, float 0.000000e+00, float %484, !dbg !51
522
+ %486 = fmul float %485, %481, !dbg !52
523
+ %487 = fadd float %475, %486, !dbg !53
524
+ %488 = fadd float %354, %480, !dbg !54
525
+ %489 = fmul float %481, %481, !dbg !55
526
+ %490 = fmul float %470, %489, !dbg !56
527
+ %491 = fmul float %485, %490, !dbg !57
528
+ %492 = fadd float %488, %491, !dbg !58
529
+ %493 = fsub float %307, %487, !dbg !44
530
+ %494 = fadd float %424, %482, !dbg !48
531
+ %495 = fcmp oeq float %494, 0.000000e+00, !dbg !49
532
+ %496 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %424, float %494) #6, !dbg !50
533
+ %497 = select i1 %495, float 0.000000e+00, float %496, !dbg !51
534
+ %498 = fmul float %497, %493, !dbg !52
535
+ %499 = fadd float %487, %498, !dbg !53
536
+ %500 = fadd float %355, %492, !dbg !54
537
+ %501 = fmul float %493, %493, !dbg !55
538
+ %502 = fmul float %482, %501, !dbg !56
539
+ %503 = fmul float %497, %502, !dbg !57
540
+ %504 = fadd float %500, %503, !dbg !58
541
+ %505 = fsub float %308, %499, !dbg !44
542
+ %506 = fadd float %426, %494, !dbg !48
543
+ %507 = fcmp oeq float %506, 0.000000e+00, !dbg !49
544
+ %508 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %426, float %506) #6, !dbg !50
545
+ %509 = select i1 %507, float 0.000000e+00, float %508, !dbg !51
546
+ %510 = fmul float %509, %505, !dbg !52
547
+ %511 = fadd float %499, %510, !dbg !53
548
+ %512 = fadd float %356, %504, !dbg !54
549
+ %513 = fmul float %505, %505, !dbg !55
550
+ %514 = fmul float %494, %513, !dbg !56
551
+ %515 = fmul float %509, %514, !dbg !57
552
+ %516 = fadd float %512, %515, !dbg !58
553
+ %517 = fsub float %309, %511, !dbg !44
554
+ %518 = fadd float %428, %506, !dbg !48
555
+ %519 = fcmp oeq float %518, 0.000000e+00, !dbg !49
556
+ %520 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float %518) #6, !dbg !50
557
+ %521 = select i1 %519, float 0.000000e+00, float %520, !dbg !51
558
+ %522 = fmul float %521, %517, !dbg !52
559
+ %523 = fadd float %511, %522, !dbg !53
560
+ %524 = fadd float %357, %516, !dbg !54
561
+ %525 = fmul float %517, %517, !dbg !55
562
+ %526 = fmul float %506, %525, !dbg !56
563
+ %527 = fmul float %521, %526, !dbg !57
564
+ %528 = fadd float %524, %527, !dbg !58
565
+ %529 = fsub float %311, %310, !dbg !44
566
+ %530 = fadd float %437, %438, !dbg !48
567
+ %531 = fcmp oeq float %530, 0.000000e+00, !dbg !49
568
+ %532 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %438, float %530) #6, !dbg !50
569
+ %533 = select i1 %531, float 0.000000e+00, float %532, !dbg !51
570
+ %534 = fmul float %529, %533, !dbg !52
571
+ %535 = fadd float %310, %534, !dbg !53
572
+ %536 = fadd float %358, %359, !dbg !54
573
+ %537 = fmul float %529, %529, !dbg !55
574
+ %538 = fmul float %537, %437, !dbg !56
575
+ %539 = fmul float %538, %533, !dbg !57
576
+ %540 = fadd float %536, %539, !dbg !58
577
+ %541 = fsub float %312, %535, !dbg !44
578
+ %542 = fadd float %439, %530, !dbg !48
579
+ %543 = fcmp oeq float %542, 0.000000e+00, !dbg !49
580
+ %544 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %439, float %542) #6, !dbg !50
581
+ %545 = select i1 %543, float 0.000000e+00, float %544, !dbg !51
582
+ %546 = fmul float %545, %541, !dbg !52
583
+ %547 = fadd float %535, %546, !dbg !53
584
+ %548 = fadd float %360, %540, !dbg !54
585
+ %549 = fmul float %541, %541, !dbg !55
586
+ %550 = fmul float %530, %549, !dbg !56
587
+ %551 = fmul float %545, %550, !dbg !57
588
+ %552 = fadd float %548, %551, !dbg !58
589
+ %553 = fsub float %313, %547, !dbg !44
590
+ %554 = fadd float %440, %542, !dbg !48
591
+ %555 = fcmp oeq float %554, 0.000000e+00, !dbg !49
592
+ %556 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %440, float %554) #6, !dbg !50
593
+ %557 = select i1 %555, float 0.000000e+00, float %556, !dbg !51
594
+ %558 = fmul float %557, %553, !dbg !52
595
+ %559 = fadd float %547, %558, !dbg !53
596
+ %560 = fadd float %361, %552, !dbg !54
597
+ %561 = fmul float %553, %553, !dbg !55
598
+ %562 = fmul float %542, %561, !dbg !56
599
+ %563 = fmul float %557, %562, !dbg !57
600
+ %564 = fadd float %560, %563, !dbg !58
601
+ %565 = fsub float %314, %559, !dbg !44
602
+ %566 = fadd float %441, %554, !dbg !48
603
+ %567 = fcmp oeq float %566, 0.000000e+00, !dbg !49
604
+ %568 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %441, float %566) #6, !dbg !50
605
+ %569 = select i1 %567, float 0.000000e+00, float %568, !dbg !51
606
+ %570 = fmul float %569, %565, !dbg !52
607
+ %571 = fadd float %559, %570, !dbg !53
608
+ %572 = fadd float %362, %564, !dbg !54
609
+ %573 = fmul float %565, %565, !dbg !55
610
+ %574 = fmul float %554, %573, !dbg !56
611
+ %575 = fmul float %569, %574, !dbg !57
612
+ %576 = fadd float %572, %575, !dbg !58
613
+ %577 = fsub float %315, %571, !dbg !44
614
+ %578 = fadd float %442, %566, !dbg !48
615
+ %579 = fcmp oeq float %578, 0.000000e+00, !dbg !49
616
+ %580 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %442, float %578) #6, !dbg !50
617
+ %581 = select i1 %579, float 0.000000e+00, float %580, !dbg !51
618
+ %582 = fmul float %581, %577, !dbg !52
619
+ %583 = fadd float %571, %582, !dbg !53
620
+ %584 = fadd float %363, %576, !dbg !54
621
+ %585 = fmul float %577, %577, !dbg !55
622
+ %586 = fmul float %566, %585, !dbg !56
623
+ %587 = fmul float %581, %586, !dbg !57
624
+ %588 = fadd float %584, %587, !dbg !58
625
+ %589 = fsub float %316, %583, !dbg !44
626
+ %590 = fadd float %443, %578, !dbg !48
627
+ %591 = fcmp oeq float %590, 0.000000e+00, !dbg !49
628
+ %592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %443, float %590) #6, !dbg !50
629
+ %593 = select i1 %591, float 0.000000e+00, float %592, !dbg !51
630
+ %594 = fmul float %593, %589, !dbg !52
631
+ %595 = fadd float %583, %594, !dbg !53
632
+ %596 = fadd float %364, %588, !dbg !54
633
+ %597 = fmul float %589, %589, !dbg !55
634
+ %598 = fmul float %578, %597, !dbg !56
635
+ %599 = fmul float %593, %598, !dbg !57
636
+ %600 = fadd float %596, %599, !dbg !58
637
+ %601 = fsub float %317, %595, !dbg !44
638
+ %602 = fadd float %444, %590, !dbg !48
639
+ %603 = fcmp oeq float %602, 0.000000e+00, !dbg !49
640
+ %604 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %444, float %602) #6, !dbg !50
641
+ %605 = select i1 %603, float 0.000000e+00, float %604, !dbg !51
642
+ %606 = fmul float %605, %601, !dbg !52
643
+ %607 = fadd float %595, %606, !dbg !53
644
+ %608 = fadd float %365, %600, !dbg !54
645
+ %609 = fmul float %601, %601, !dbg !55
646
+ %610 = fmul float %590, %609, !dbg !56
647
+ %611 = fmul float %605, %610, !dbg !57
648
+ %612 = fadd float %608, %611, !dbg !58
649
+ %613 = bitcast float %523 to i32, !dbg !59
650
+ %614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %613, i32 4, i32 31), !dbg !59
651
+ %615 = bitcast i32 %614 to float, !dbg !59
652
+ %616 = bitcast float %528 to i32, !dbg !59
653
+ %617 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %616, i32 4, i32 31), !dbg !59
654
+ %618 = bitcast i32 %617 to float, !dbg !59
655
+ %619 = bitcast float %518 to i32, !dbg !59
656
+ %620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %619, i32 4, i32 31), !dbg !59
657
+ %621 = bitcast i32 %620 to float, !dbg !59
658
+ %622 = fsub float %615, %523, !dbg !44
659
+ %623 = fadd float %518, %621, !dbg !48
660
+ %624 = fcmp oeq float %623, 0.000000e+00, !dbg !49
661
+ %625 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %621, float %623) #6, !dbg !50
662
+ %626 = select i1 %624, float 0.000000e+00, float %625, !dbg !51
663
+ %627 = fmul float %626, %622, !dbg !52
664
+ %628 = fadd float %523, %627, !dbg !53
665
+ %629 = fadd float %528, %618, !dbg !54
666
+ %630 = fmul float %622, %622, !dbg !55
667
+ %631 = fmul float %518, %630, !dbg !56
668
+ %632 = fmul float %626, %631, !dbg !57
669
+ %633 = fadd float %629, %632, !dbg !58
670
+ %634 = bitcast float %628 to i32, !dbg !59
671
+ %635 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %634, i32 2, i32 31), !dbg !59
672
+ %636 = bitcast i32 %635 to float, !dbg !59
673
+ %637 = bitcast float %633 to i32, !dbg !59
674
+ %638 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %637, i32 2, i32 31), !dbg !59
675
+ %639 = bitcast i32 %638 to float, !dbg !59
676
+ %640 = bitcast float %623 to i32, !dbg !59
677
+ %641 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 2, i32 31), !dbg !59
678
+ %642 = bitcast i32 %641 to float, !dbg !59
679
+ %643 = fsub float %636, %628, !dbg !44
680
+ %644 = fadd float %623, %642, !dbg !48
681
+ %645 = fcmp oeq float %644, 0.000000e+00, !dbg !49
682
+ %646 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %642, float %644) #6, !dbg !50
683
+ %647 = select i1 %645, float 0.000000e+00, float %646, !dbg !51
684
+ %648 = fmul float %647, %643, !dbg !52
685
+ %649 = fadd float %628, %648, !dbg !53
686
+ %650 = fadd float %633, %639, !dbg !54
687
+ %651 = fmul float %643, %643, !dbg !55
688
+ %652 = fmul float %623, %651, !dbg !56
689
+ %653 = fmul float %647, %652, !dbg !57
690
+ %654 = fadd float %650, %653, !dbg !58
691
+ %655 = bitcast float %649 to i32, !dbg !59
692
+ %656 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %655, i32 1, i32 31), !dbg !59
693
+ %657 = bitcast i32 %656 to float, !dbg !59
694
+ %658 = bitcast float %654 to i32, !dbg !59
695
+ %659 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %658, i32 1, i32 31), !dbg !59
696
+ %660 = bitcast i32 %659 to float, !dbg !59
697
+ %661 = bitcast float %644 to i32, !dbg !59
698
+ %662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %661, i32 1, i32 31), !dbg !59
699
+ %663 = bitcast i32 %662 to float, !dbg !59
700
+ %664 = fsub float %657, %649, !dbg !44
701
+ %665 = fadd float %644, %663, !dbg !48
702
+ %666 = fcmp oeq float %665, 0.000000e+00, !dbg !49
703
+ %667 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %663, float %665) #6, !dbg !50
704
+ %668 = select i1 %666, float 0.000000e+00, float %667, !dbg !51
705
+ %669 = fmul float %664, %668, !dbg !52
706
+ %670 = fadd float %649, %669, !dbg !53
707
+ %671 = fadd float %654, %660, !dbg !54
708
+ %672 = fmul float %664, %664, !dbg !55
709
+ %673 = fmul float %644, %672, !dbg !56
710
+ %674 = fmul float %668, %673, !dbg !57
711
+ %675 = fadd float %671, %674, !dbg !58
712
+ %676 = bitcast float %607 to i32, !dbg !59
713
+ %677 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 4, i32 31), !dbg !59
714
+ %678 = bitcast i32 %677 to float, !dbg !59
715
+ %679 = bitcast float %612 to i32, !dbg !59
716
+ %680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %679, i32 4, i32 31), !dbg !59
717
+ %681 = bitcast i32 %680 to float, !dbg !59
718
+ %682 = bitcast float %602 to i32, !dbg !59
719
+ %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %682, i32 4, i32 31), !dbg !59
720
+ %684 = bitcast i32 %683 to float, !dbg !59
721
+ %685 = fsub float %678, %607, !dbg !44
722
+ %686 = fadd float %602, %684, !dbg !48
723
+ %687 = fcmp oeq float %686, 0.000000e+00, !dbg !49
724
+ %688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %684, float %686) #6, !dbg !50
725
+ %689 = select i1 %687, float 0.000000e+00, float %688, !dbg !51
726
+ %690 = fmul float %685, %689, !dbg !52
727
+ %691 = fadd float %607, %690, !dbg !53
728
+ %692 = fadd float %612, %681, !dbg !54
729
+ %693 = fmul float %685, %685, !dbg !55
730
+ %694 = fmul float %602, %693, !dbg !56
731
+ %695 = fmul float %694, %689, !dbg !57
732
+ %696 = fadd float %692, %695, !dbg !58
733
+ %697 = bitcast float %691 to i32, !dbg !59
734
+ %698 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %697, i32 2, i32 31), !dbg !59
735
+ %699 = bitcast i32 %698 to float, !dbg !59
736
+ %700 = bitcast float %696 to i32, !dbg !59
737
+ %701 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %700, i32 2, i32 31), !dbg !59
738
+ %702 = bitcast i32 %701 to float, !dbg !59
739
+ %703 = bitcast float %686 to i32, !dbg !59
740
+ %704 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %703, i32 2, i32 31), !dbg !59
741
+ %705 = bitcast i32 %704 to float, !dbg !59
742
+ %706 = fsub float %699, %691, !dbg !44
743
+ %707 = fadd float %686, %705, !dbg !48
744
+ %708 = fcmp oeq float %707, 0.000000e+00, !dbg !49
745
+ %709 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %705, float %707) #6, !dbg !50
746
+ %710 = select i1 %708, float 0.000000e+00, float %709, !dbg !51
747
+ %711 = fmul float %706, %710, !dbg !52
748
+ %712 = fadd float %691, %711, !dbg !53
749
+ %713 = fadd float %696, %702, !dbg !54
750
+ %714 = fmul float %706, %706, !dbg !55
751
+ %715 = fmul float %686, %714, !dbg !56
752
+ %716 = fmul float %710, %715, !dbg !57
753
+ %717 = fadd float %713, %716, !dbg !58
754
+ %718 = bitcast float %712 to i32, !dbg !59
755
+ %719 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %718, i32 1, i32 31), !dbg !59
756
+ %720 = bitcast i32 %719 to float, !dbg !59
757
+ %721 = bitcast float %717 to i32, !dbg !59
758
+ %722 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %721, i32 1, i32 31), !dbg !59
759
+ %723 = bitcast i32 %722 to float, !dbg !59
760
+ %724 = bitcast float %707 to i32, !dbg !59
761
+ %725 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %724, i32 1, i32 31), !dbg !59
762
+ %726 = bitcast i32 %725 to float, !dbg !59
763
+ %727 = fsub float %720, %712, !dbg !44
764
+ %728 = fadd float %707, %726, !dbg !48
765
+ %729 = fcmp oeq float %728, 0.000000e+00, !dbg !49
766
+ %730 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %726, float %728) #6, !dbg !50
767
+ %731 = select i1 %729, float 0.000000e+00, float %730, !dbg !51
768
+ %732 = fmul float %727, %731, !dbg !52
769
+ %733 = fadd float %712, %732, !dbg !53
770
+ %734 = fadd float %717, %723, !dbg !54
771
+ %735 = fmul float %727, %727, !dbg !55
772
+ %736 = fmul float %707, %735, !dbg !56
773
+ %737 = fmul float %731, %736, !dbg !57
774
+ %738 = fadd float %734, %737, !dbg !58
775
+ %739 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
776
+ %740 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
777
+ %741 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
778
+ %742 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
779
+ %743 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
780
+ %744 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
781
+ %745 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
782
+ %746 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
783
+ %747 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
784
+ %748 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
785
+ %749 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
786
+ %750 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
787
+ %751 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
788
+ %752 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
789
+ %753 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
790
+ %754 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
791
+ %755 = fadd float %739, 0x3EE4F8B580000000, !dbg !62
792
+ %756 = fadd float %747, 0x3EE4F8B580000000, !dbg !62
793
+ %757 = shl i32 %18, 8, !dbg !63
794
+ %758 = shl i32 %19, 8, !dbg !63
795
+ br label %759, !dbg !64
796
+
797
+ 759: ; preds = %368, %__nv_rsqrtf.exit25
798
+ %760 = phi i32 [ 0, %368 ], [ %1009, %__nv_rsqrtf.exit25 ]
799
+ %761 = or i32 %760, %13, !dbg !65
800
+ %762 = or i32 %760, %14, !dbg !65
801
+ %763 = add i32 %761, %46, !dbg !66
802
+ %764 = add i32 %762, %46, !dbg !66
803
+ %765 = add i32 %761, %47, !dbg !66
804
+ %766 = add i32 %762, %47, !dbg !66
805
+ %767 = sext i32 %763 to i64, !dbg !67
806
+ %768 = getelementptr float, ptr addrspace(1) %2, i64 %767, !dbg !67
807
+ %769 = sext i32 %764 to i64, !dbg !67
808
+ %770 = getelementptr float, ptr addrspace(1) %2, i64 %769, !dbg !67
809
+ %771 = sext i32 %765 to i64, !dbg !67
810
+ %772 = getelementptr float, ptr addrspace(1) %2, i64 %771, !dbg !67
811
+ %773 = sext i32 %766 to i64, !dbg !67
812
+ %774 = getelementptr float, ptr addrspace(1) %2, i64 %773, !dbg !67
813
+ %775 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %768, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
814
+ %776 = extractvalue { i32, i32, i32, i32 } %775, 0, !dbg !68
815
+ %777 = extractvalue { i32, i32, i32, i32 } %775, 1, !dbg !68
816
+ %778 = extractvalue { i32, i32, i32, i32 } %775, 2, !dbg !68
817
+ %779 = extractvalue { i32, i32, i32, i32 } %775, 3, !dbg !68
818
+ %780 = bitcast i32 %776 to float, !dbg !68
819
+ %781 = bitcast i32 %777 to float, !dbg !68
820
+ %782 = bitcast i32 %778 to float, !dbg !68
821
+ %783 = bitcast i32 %779 to float, !dbg !68
822
+ %784 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %770, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
823
+ %785 = extractvalue { i32, i32, i32, i32 } %784, 0, !dbg !68
824
+ %786 = extractvalue { i32, i32, i32, i32 } %784, 1, !dbg !68
825
+ %787 = extractvalue { i32, i32, i32, i32 } %784, 2, !dbg !68
826
+ %788 = extractvalue { i32, i32, i32, i32 } %784, 3, !dbg !68
827
+ %789 = bitcast i32 %785 to float, !dbg !68
828
+ %790 = bitcast i32 %786 to float, !dbg !68
829
+ %791 = bitcast i32 %787 to float, !dbg !68
830
+ %792 = bitcast i32 %788 to float, !dbg !68
831
+ %793 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %772, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
832
+ %794 = extractvalue { i32, i32, i32, i32 } %793, 0, !dbg !68
833
+ %795 = extractvalue { i32, i32, i32, i32 } %793, 1, !dbg !68
834
+ %796 = extractvalue { i32, i32, i32, i32 } %793, 2, !dbg !68
835
+ %797 = extractvalue { i32, i32, i32, i32 } %793, 3, !dbg !68
836
+ %798 = bitcast i32 %794 to float, !dbg !68
837
+ %799 = bitcast i32 %795 to float, !dbg !68
838
+ %800 = bitcast i32 %796 to float, !dbg !68
839
+ %801 = bitcast i32 %797 to float, !dbg !68
840
+ %802 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %774, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
841
+ %803 = extractvalue { i32, i32, i32, i32 } %802, 0, !dbg !68
842
+ %804 = extractvalue { i32, i32, i32, i32 } %802, 1, !dbg !68
843
+ %805 = extractvalue { i32, i32, i32, i32 } %802, 2, !dbg !68
844
+ %806 = extractvalue { i32, i32, i32, i32 } %802, 3, !dbg !68
845
+ %807 = bitcast i32 %803 to float, !dbg !68
846
+ %808 = bitcast i32 %804 to float, !dbg !68
847
+ %809 = bitcast i32 %805 to float, !dbg !68
848
+ %810 = bitcast i32 %806 to float, !dbg !68
849
+ %811 = zext nneg i32 %761 to i64, !dbg !69
850
+ %812 = getelementptr float, ptr addrspace(1) %3, i64 %811, !dbg !69
851
+ %813 = zext nneg i32 %762 to i64, !dbg !69
852
+ %814 = getelementptr float, ptr addrspace(1) %3, i64 %813, !dbg !69
853
+ %815 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %812, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
854
+ %816 = extractvalue { i32, i32, i32, i32 } %815, 0, !dbg !70
855
+ %817 = extractvalue { i32, i32, i32, i32 } %815, 1, !dbg !70
856
+ %818 = extractvalue { i32, i32, i32, i32 } %815, 2, !dbg !70
857
+ %819 = extractvalue { i32, i32, i32, i32 } %815, 3, !dbg !70
858
+ %820 = bitcast i32 %816 to float, !dbg !70
859
+ %821 = bitcast i32 %817 to float, !dbg !70
860
+ %822 = bitcast i32 %818 to float, !dbg !70
861
+ %823 = bitcast i32 %819 to float, !dbg !70
862
+ %824 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %814, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
863
+ %825 = extractvalue { i32, i32, i32, i32 } %824, 0, !dbg !70
864
+ %826 = extractvalue { i32, i32, i32, i32 } %824, 1, !dbg !70
865
+ %827 = extractvalue { i32, i32, i32, i32 } %824, 2, !dbg !70
866
+ %828 = extractvalue { i32, i32, i32, i32 } %824, 3, !dbg !70
867
+ %829 = bitcast i32 %825 to float, !dbg !70
868
+ %830 = bitcast i32 %826 to float, !dbg !70
869
+ %831 = bitcast i32 %827 to float, !dbg !70
870
+ %832 = bitcast i32 %828 to float, !dbg !70
871
+ br i1 %53, label %833, label %834, !dbg !71
872
+
873
+ 833: ; preds = %759
874
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
875
+ br label %834, !dbg !71
876
+
877
+ 834: ; preds = %833, %759
878
+ %835 = getelementptr float, ptr addrspace(1) %60, i64 %811, !dbg !72
879
+ %836 = getelementptr float, ptr addrspace(1) %60, i64 %813, !dbg !72
880
+ %837 = getelementptr float, ptr addrspace(1) %61, i64 %811, !dbg !72
881
+ %838 = getelementptr float, ptr addrspace(1) %61, i64 %813, !dbg !72
882
+ %839 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
883
+ %840 = extractvalue { i32, i32, i32, i32 } %839, 0, !dbg !73
884
+ %841 = extractvalue { i32, i32, i32, i32 } %839, 1, !dbg !73
885
+ %842 = extractvalue { i32, i32, i32, i32 } %839, 2, !dbg !73
886
+ %843 = extractvalue { i32, i32, i32, i32 } %839, 3, !dbg !73
887
+ %844 = bitcast i32 %840 to float, !dbg !73
888
+ %845 = bitcast i32 %841 to float, !dbg !73
889
+ %846 = bitcast i32 %842 to float, !dbg !73
890
+ %847 = bitcast i32 %843 to float, !dbg !73
891
+ %848 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %836, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
892
+ %849 = extractvalue { i32, i32, i32, i32 } %848, 0, !dbg !73
893
+ %850 = extractvalue { i32, i32, i32, i32 } %848, 1, !dbg !73
894
+ %851 = extractvalue { i32, i32, i32, i32 } %848, 2, !dbg !73
895
+ %852 = extractvalue { i32, i32, i32, i32 } %848, 3, !dbg !73
896
+ %853 = bitcast i32 %849 to float, !dbg !73
897
+ %854 = bitcast i32 %850 to float, !dbg !73
898
+ %855 = bitcast i32 %851 to float, !dbg !73
899
+ %856 = bitcast i32 %852 to float, !dbg !73
900
+ %857 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
901
+ %858 = extractvalue { i32, i32, i32, i32 } %857, 0, !dbg !73
902
+ %859 = extractvalue { i32, i32, i32, i32 } %857, 1, !dbg !73
903
+ %860 = extractvalue { i32, i32, i32, i32 } %857, 2, !dbg !73
904
+ %861 = extractvalue { i32, i32, i32, i32 } %857, 3, !dbg !73
905
+ %862 = bitcast i32 %858 to float, !dbg !73
906
+ %863 = bitcast i32 %859 to float, !dbg !73
907
+ %864 = bitcast i32 %860 to float, !dbg !73
908
+ %865 = bitcast i32 %861 to float, !dbg !73
909
+ %866 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %838, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
910
+ %867 = extractvalue { i32, i32, i32, i32 } %866, 0, !dbg !73
911
+ %868 = extractvalue { i32, i32, i32, i32 } %866, 1, !dbg !73
912
+ %869 = extractvalue { i32, i32, i32, i32 } %866, 2, !dbg !73
913
+ %870 = extractvalue { i32, i32, i32, i32 } %866, 3, !dbg !73
914
+ %871 = bitcast i32 %867 to float, !dbg !73
915
+ %872 = bitcast i32 %868 to float, !dbg !73
916
+ %873 = bitcast i32 %869 to float, !dbg !73
917
+ %874 = bitcast i32 %870 to float, !dbg !73
918
+ %875 = fadd float %780, %844, !dbg !74
919
+ %876 = fadd float %781, %845, !dbg !74
920
+ %877 = fadd float %782, %846, !dbg !74
921
+ %878 = fadd float %783, %847, !dbg !74
922
+ %879 = fadd float %789, %853, !dbg !74
923
+ %880 = fadd float %790, %854, !dbg !74
924
+ %881 = fadd float %791, %855, !dbg !74
925
+ %882 = fadd float %792, %856, !dbg !74
926
+ %883 = fadd float %798, %862, !dbg !74
927
+ %884 = fadd float %799, %863, !dbg !74
928
+ %885 = fadd float %800, %864, !dbg !74
929
+ %886 = fadd float %801, %865, !dbg !74
930
+ %887 = fadd float %807, %871, !dbg !74
931
+ %888 = fadd float %808, %872, !dbg !74
932
+ %889 = fadd float %809, %873, !dbg !74
933
+ %890 = fadd float %810, %874, !dbg !74
934
+ %891 = fsub float %875, %670, !dbg !75
935
+ %892 = fsub float %876, %670, !dbg !75
936
+ %893 = fsub float %877, %670, !dbg !75
937
+ %894 = fsub float %878, %670, !dbg !75
938
+ %895 = fsub float %879, %670, !dbg !75
939
+ %896 = fsub float %880, %670, !dbg !75
940
+ %897 = fsub float %881, %670, !dbg !75
941
+ %898 = fsub float %882, %670, !dbg !75
942
+ %899 = fsub float %883, %733, !dbg !75
943
+ %900 = fsub float %884, %733, !dbg !75
944
+ %901 = fsub float %885, %733, !dbg !75
945
+ %902 = fsub float %886, %733, !dbg !75
946
+ %903 = fsub float %887, %733, !dbg !75
947
+ %904 = fsub float %888, %733, !dbg !75
948
+ %905 = fsub float %889, %733, !dbg !75
949
+ %906 = fsub float %890, %733, !dbg !75
950
+ %907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
951
+ %.not.i = icmp eq i32 %907, 0, !dbg !76
952
+ br i1 %.not.i, label %910, label %908, !dbg !76
953
+
954
+ 908: ; preds = %834
955
+ %909 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %755), !dbg !76
956
+ br label %__nv_rsqrtf.exit, !dbg !76
957
+
958
+ 910: ; preds = %834
959
+ %911 = tail call float @llvm.nvvm.rsqrt.approx.f(float %755), !dbg !76
960
+ br label %__nv_rsqrtf.exit, !dbg !76
961
+
962
+ __nv_rsqrtf.exit: ; preds = %908, %910
963
+ %.0.i = phi float [ %909, %908 ], [ %911, %910 ], !dbg !76
964
+ %912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
965
+ %913 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
966
+ %914 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
967
+ %915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
968
+ %916 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
969
+ %917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
970
+ %918 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
971
+ %919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
972
+ %.not.i23 = icmp eq i32 %919, 0, !dbg !76
973
+ br i1 %.not.i23, label %922, label %920, !dbg !76
974
+
975
+ 920: ; preds = %__nv_rsqrtf.exit
976
+ %921 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %756), !dbg !76
977
+ br label %__nv_rsqrtf.exit25, !dbg !76
978
+
979
+ 922: ; preds = %__nv_rsqrtf.exit
980
+ %923 = tail call float @llvm.nvvm.rsqrt.approx.f(float %756), !dbg !76
981
+ br label %__nv_rsqrtf.exit25, !dbg !76
982
+
983
+ __nv_rsqrtf.exit25: ; preds = %920, %922
984
+ %.0.i24 = phi float [ %921, %920 ], [ %923, %922 ], !dbg !76
985
+ %924 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
986
+ %925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
987
+ %926 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
988
+ %927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
989
+ %928 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
990
+ %929 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
991
+ %930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
992
+ %931 = fmul float %891, %.0.i, !dbg !77
993
+ %932 = fmul float %892, %.0.i, !dbg !77
994
+ %933 = fmul float %893, %.0.i, !dbg !77
995
+ %934 = fmul float %894, %.0.i, !dbg !77
996
+ %935 = fmul float %895, %.0.i, !dbg !77
997
+ %936 = fmul float %896, %.0.i, !dbg !77
998
+ %937 = fmul float %897, %.0.i, !dbg !77
999
+ %938 = fmul float %898, %.0.i, !dbg !77
1000
+ %939 = fmul float %899, %.0.i24, !dbg !77
1001
+ %940 = fmul float %900, %.0.i24, !dbg !77
1002
+ %941 = fmul float %901, %.0.i24, !dbg !77
1003
+ %942 = fmul float %902, %.0.i24, !dbg !77
1004
+ %943 = fmul float %903, %.0.i24, !dbg !77
1005
+ %944 = fmul float %904, %.0.i24, !dbg !77
1006
+ %945 = fmul float %905, %.0.i24, !dbg !77
1007
+ %946 = fmul float %906, %.0.i24, !dbg !77
1008
+ %947 = fmul float %931, %820, !dbg !78
1009
+ %948 = fmul float %932, %821, !dbg !78
1010
+ %949 = fmul float %933, %822, !dbg !78
1011
+ %950 = fmul float %934, %823, !dbg !78
1012
+ %951 = fmul float %935, %829, !dbg !78
1013
+ %952 = fmul float %936, %830, !dbg !78
1014
+ %953 = fmul float %937, %831, !dbg !78
1015
+ %954 = fmul float %938, %832, !dbg !78
1016
+ %955 = fmul float %939, %820, !dbg !78
1017
+ %956 = fmul float %940, %821, !dbg !78
1018
+ %957 = fmul float %941, %822, !dbg !78
1019
+ %958 = fmul float %942, %823, !dbg !78
1020
+ %959 = fmul float %943, %829, !dbg !78
1021
+ %960 = fmul float %944, %830, !dbg !78
1022
+ %961 = fmul float %945, %831, !dbg !78
1023
+ %962 = fmul float %946, %832, !dbg !78
1024
+ %963 = add i32 %761, %757, !dbg !79
1025
+ %964 = add i32 %761, %758, !dbg !79
1026
+ %965 = sext i32 %963 to i64, !dbg !80
1027
+ %966 = getelementptr i16, ptr addrspace(1) %4, i64 %965, !dbg !80
1028
+ %967 = sext i32 %964 to i64, !dbg !80
1029
+ %968 = getelementptr i16, ptr addrspace(1) %4, i64 %967, !dbg !80
1030
+ %969 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %947) #6, !dbg !81
1031
+ %970 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %948) #6, !dbg !81
1032
+ %971 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %949) #6, !dbg !81
1033
+ %972 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %950) #6, !dbg !81
1034
+ %973 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %951) #6, !dbg !81
1035
+ %974 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %952) #6, !dbg !81
1036
+ %975 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %953) #6, !dbg !81
1037
+ %976 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %954) #6, !dbg !81
1038
+ %977 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %955) #6, !dbg !81
1039
+ %978 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %956) #6, !dbg !81
1040
+ %979 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %957) #6, !dbg !81
1041
+ %980 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %958) #6, !dbg !81
1042
+ %981 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %959) #6, !dbg !81
1043
+ %982 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %960) #6, !dbg !81
1044
+ %983 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %961) #6, !dbg !81
1045
+ %984 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %962) #6, !dbg !81
1046
+ %985 = insertelement <2 x i16> undef, i16 %969, i64 0, !dbg !81
1047
+ %986 = insertelement <2 x i16> %985, i16 %970, i64 1, !dbg !81
1048
+ %987 = bitcast <2 x i16> %986 to i32, !dbg !81
1049
+ %988 = insertelement <2 x i16> undef, i16 %971, i64 0, !dbg !81
1050
+ %989 = insertelement <2 x i16> %988, i16 %972, i64 1, !dbg !81
1051
+ %990 = bitcast <2 x i16> %989 to i32, !dbg !81
1052
+ %991 = insertelement <2 x i16> undef, i16 %973, i64 0, !dbg !81
1053
+ %992 = insertelement <2 x i16> %991, i16 %974, i64 1, !dbg !81
1054
+ %993 = bitcast <2 x i16> %992 to i32, !dbg !81
1055
+ %994 = insertelement <2 x i16> undef, i16 %975, i64 0, !dbg !81
1056
+ %995 = insertelement <2 x i16> %994, i16 %976, i64 1, !dbg !81
1057
+ %996 = bitcast <2 x i16> %995 to i32, !dbg !81
1058
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %987, i32 %990, i32 %993, i32 %996, ptr addrspace(1) %966, i1 true) #6, !dbg !81
1059
+ %997 = insertelement <2 x i16> undef, i16 %977, i64 0, !dbg !81
1060
+ %998 = insertelement <2 x i16> %997, i16 %978, i64 1, !dbg !81
1061
+ %999 = bitcast <2 x i16> %998 to i32, !dbg !81
1062
+ %1000 = insertelement <2 x i16> undef, i16 %979, i64 0, !dbg !81
1063
+ %1001 = insertelement <2 x i16> %1000, i16 %980, i64 1, !dbg !81
1064
+ %1002 = bitcast <2 x i16> %1001 to i32, !dbg !81
1065
+ %1003 = insertelement <2 x i16> undef, i16 %981, i64 0, !dbg !81
1066
+ %1004 = insertelement <2 x i16> %1003, i16 %982, i64 1, !dbg !81
1067
+ %1005 = bitcast <2 x i16> %1004 to i32, !dbg !81
1068
+ %1006 = insertelement <2 x i16> undef, i16 %983, i64 0, !dbg !81
1069
+ %1007 = insertelement <2 x i16> %1006, i16 %984, i64 1, !dbg !81
1070
+ %1008 = bitcast <2 x i16> %1007 to i32, !dbg !81
1071
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %999, i32 %1002, i32 %1005, i32 %1008, ptr addrspace(1) %968, i1 true) #6, !dbg !81
1072
+ %1009 = add nuw nsw i32 %760, 64, !dbg !64
1073
+ %1010 = icmp ult i32 %760, 192, !dbg !64
1074
+ br i1 %1010, label %759, label %1011, !dbg !64
1075
+
1076
+ 1011: ; preds = %__nv_rsqrtf.exit25
1077
+ ret void, !dbg !82
1078
+ }
1079
+
1080
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1081
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
1082
+
1083
+ ; Function Attrs: convergent nocallback nounwind
1084
+ declare void @llvm.nvvm.barrier0() #1
1085
+
1086
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
1087
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
1088
+
1089
+ ; Function Attrs: alwaysinline nounwind
1090
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
1091
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
1092
+ %.not = icmp eq i32 %1, 0
1093
+ br i1 %.not, label %4, label %2
1094
+
1095
+ 2: ; preds = %0
1096
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
1097
+ br label %6
1098
+
1099
+ 4: ; preds = %0
1100
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
1101
+ br label %6
1102
+
1103
+ 6: ; preds = %4, %2
1104
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
1105
+ ret float %.0
1106
+ }
1107
+
1108
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
1109
+
1110
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1111
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
1112
+
1113
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1114
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
1115
+
1116
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1117
+ attributes #1 = { convergent nocallback nounwind }
1118
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1119
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1120
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1121
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
1122
+ attributes #6 = { nounwind }
1123
+
1124
+ !llvm.module.flags = !{!0, !1}
1125
+ !llvm.dbg.cu = !{!2}
1126
+ !nvvm.annotations = !{!4, !5, !5, !4}
1127
+ !llvm.ident = !{!6}
1128
+
1129
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
1130
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
1131
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
1132
+ !3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh")
1133
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
1134
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
1135
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
1136
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
1137
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
1138
+ !9 = !{}
1139
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
1140
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
1141
+ !12 = !DILocation(line: 31, column: 36, scope: !7)
1142
+ !13 = !DILocation(line: 21, column: 28, scope: !7)
1143
+ !14 = !DILocation(line: 21, column: 33, scope: !7)
1144
+ !15 = !DILocation(line: 22, column: 23, scope: !7)
1145
+ !16 = !DILocation(line: 26, column: 30, scope: !7)
1146
+ !17 = !DILocation(line: 26, column: 35, scope: !7)
1147
+ !18 = !DILocation(line: 27, column: 18, scope: !7)
1148
+ !19 = !DILocation(line: 35, column: 44, scope: !7)
1149
+ !20 = !DILocation(line: 36, column: 22, scope: !7)
1150
+ !21 = !DILocation(line: 37, column: 22, scope: !7)
1151
+ !22 = !DILocation(line: 38, column: 36, scope: !7)
1152
+ !23 = !DILocation(line: 39, column: 40, scope: !7)
1153
+ !24 = !DILocation(line: 40, column: 44, scope: !7)
1154
+ !25 = !DILocation(line: 32, column: 27, scope: !7)
1155
+ !26 = !DILocation(line: 35, column: 40, scope: !7)
1156
+ !27 = !DILocation(line: 35, column: 34, scope: !7)
1157
+ !28 = !DILocation(line: 35, column: 50, scope: !7)
1158
+ !29 = !DILocation(line: 39, column: 55, scope: !7)
1159
+ !30 = !DILocation(line: 40, column: 40, scope: !7)
1160
+ !31 = !DILocation(line: 40, column: 34, scope: !7)
1161
+ !32 = !DILocation(line: 40, column: 52, scope: !7)
1162
+ !33 = !DILocation(line: 41, column: 22, scope: !7)
1163
+ !34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
1164
+ !35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
1165
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
1166
+ !37 = !DILocation(line: 44, column: 38, scope: !35)
1167
+ !38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
1168
+ !39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
1169
+ !40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
1170
+ !41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
1171
+ !42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
1172
+ !43 = !DILocation(line: 47, column: 48, scope: !7)
1173
+ !44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
1174
+ !45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
1175
+ !46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
1176
+ !47 = !DILocation(line: 50, column: 41, scope: !45)
1177
+ !48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
1178
+ !49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
1179
+ !50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
1180
+ !51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
1181
+ !52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
1182
+ !53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
1183
+ !54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
1184
+ !55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
1185
+ !56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
1186
+ !57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
1187
+ !58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
1188
+ !59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
1189
+ !60 = !DILocation(line: 50, column: 41, scope: !35)
1190
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
1191
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
1192
+ !63 = !DILocation(line: 76, column: 39, scope: !7)
1193
+ !64 = !DILocation(line: 55, column: 36, scope: !7)
1194
+ !65 = !DILocation(line: 56, column: 27, scope: !7)
1195
+ !66 = !DILocation(line: 59, column: 41, scope: !7)
1196
+ !67 = !DILocation(line: 59, column: 35, scope: !7)
1197
+ !68 = !DILocation(line: 59, column: 51, scope: !7)
1198
+ !69 = !DILocation(line: 60, column: 35, scope: !7)
1199
+ !70 = !DILocation(line: 60, column: 40, scope: !7)
1200
+ !71 = !DILocation(line: 64, column: 57, scope: !7)
1201
+ !72 = !DILocation(line: 65, column: 35, scope: !7)
1202
+ !73 = !DILocation(line: 65, column: 54, scope: !7)
1203
+ !74 = !DILocation(line: 66, column: 24, scope: !7)
1204
+ !75 = !DILocation(line: 67, column: 24, scope: !7)
1205
+ !76 = !DILocation(line: 72, column: 30, scope: !7)
1206
+ !77 = !DILocation(line: 73, column: 24, scope: !7)
1207
+ !78 = !DILocation(line: 74, column: 24, scope: !7)
1208
+ !79 = !DILocation(line: 76, column: 35, scope: !7)
1209
+ !80 = !DILocation(line: 76, column: 29, scope: !7)
1210
+ !81 = !DILocation(line: 76, column: 52, scope: !7)
1211
+ !82 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant 0.000000e+00 : f32
4
+ %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf32>
5
+ %c256_i32 = arith.constant 256 : i32
6
+ %c64_i32 = arith.constant 64 : i32
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi64>
9
+ %cst_2 = arith.constant dense<0> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<50257> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
12
+ %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x64xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
15
+ %cst_8 = arith.constant dense<256> : tensor<64x1xi32>
16
+ %cst_9 = arith.constant dense<256> : tensor<1x64xi32>
17
+ %cst_10 = arith.constant dense<512> : tensor<64x1xi32>
18
+ %0 = tt.get_program_id x : i32
19
+ %1 = arith.muli %0, %c64_i32 : i32
20
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
21
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
22
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
23
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
24
+ %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
25
+ %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
26
+ %8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
27
+ %9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
28
+ %10 = arith.remsi %5, %cst_10 : tensor<64x1xi32>
29
+ %11 = arith.muli %10, %cst_8 : tensor<64x1xi32>
30
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32>
31
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
32
+ %14 = arith.addi %9, %cst_3 : tensor<64x1xi64>
33
+ %15 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64>
34
+ %16 = arith.select %15, %14, %9 : tensor<64x1xi1>, tensor<64x1xi64>
35
+ %17 = arith.cmpi sge, %16, %cst_2 : tensor<64x1xi64>
36
+ %18 = arith.cmpi slt, %16, %cst_3 : tensor<64x1xi64>
37
+ %19 = arith.andi %17, %18 : tensor<64x1xi1>
38
+ %20 = arith.muli %16, %cst_1 : tensor<64x1xi64>
39
+ %21 = tt.broadcast %20 : (tensor<64x1xi64>) -> tensor<64x64xi64>
40
+ %22 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
41
+ %23:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 {
42
+ %46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32>
43
+ %47 = arith.addi %46, %6 : tensor<1x64xi32>
44
+ %48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32>
45
+ %49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32>
46
+ %50 = arith.addi %49, %12 : tensor<64x64xi32>
47
+ %51 = tt.addptr %13, %50 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
48
+ %52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1>
49
+ %53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
50
+ tt.assert %19, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
51
+ %54 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64>
52
+ %55 = tt.broadcast %54 : (tensor<1x64xi64>) -> tensor<64x64xi64>
53
+ %56 = arith.addi %55, %21 : tensor<64x64xi64>
54
+ %57 = tt.addptr %22, %56 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
55
+ %58 = tt.load %57, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
56
+ %59 = arith.addf %58, %53 : tensor<64x64xf32>
57
+ %60 = arith.subf %59, %arg8 : tensor<64x64xf32>
58
+ %61 = arith.addf %arg10, %cst_0 : tensor<64x64xf32>
59
+ %62 = arith.divf %60, %61 : tensor<64x64xf32>
60
+ %63 = arith.addf %arg8, %62 : tensor<64x64xf32>
61
+ %64 = arith.subf %59, %63 : tensor<64x64xf32>
62
+ %65 = arith.mulf %60, %64 : tensor<64x64xf32>
63
+ %66 = arith.addf %arg9, %65 : tensor<64x64xf32>
64
+ %67 = arith.select %52, %63, %arg8 : tensor<64x64xi1>, tensor<64x64xf32>
65
+ %68 = arith.select %52, %66, %arg9 : tensor<64x64xi1>, tensor<64x64xf32>
66
+ %69 = arith.select %52, %61, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
67
+ scf.yield %67, %68, %69 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>
68
+ }
69
+ %24:3 = "tt.reduce"(%23#0, %23#1, %23#2) <{axis = 1 : i32}> ({
70
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
71
+ %46 = arith.subf %arg10, %arg7 : f32
72
+ %47 = arith.addf %arg9, %arg12 : f32
73
+ %48 = arith.cmpf oeq, %47, %cst : f32
74
+ %49 = arith.divf %arg12, %47 : f32
75
+ %50 = arith.select %48, %cst, %49 : f32
76
+ %51 = arith.mulf %46, %50 : f32
77
+ %52 = arith.addf %arg7, %51 : f32
78
+ %53 = arith.addf %arg8, %arg11 : f32
79
+ %54 = arith.mulf %46, %46 : f32
80
+ %55 = arith.mulf %54, %arg9 : f32
81
+ %56 = arith.mulf %55, %50 : f32
82
+ %57 = arith.addf %53, %56 : f32
83
+ tt.reduce.return %52, %57, %47 : f32, f32, f32
84
+ }) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
85
+ %25 = tt.expand_dims %24#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
86
+ %26 = tt.expand_dims %24#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
87
+ %27 = arith.muli %10, %cst_8 : tensor<64x1xi32>
88
+ %28 = tt.broadcast %27 : (tensor<64x1xi32>) -> tensor<64x64xi32>
89
+ %29 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
90
+ %30 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>>
91
+ %31 = arith.addi %9, %cst_3 : tensor<64x1xi64>
92
+ %32 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64>
93
+ %33 = arith.select %32, %31, %9 : tensor<64x1xi1>, tensor<64x1xi64>
94
+ %34 = arith.cmpi sge, %33, %cst_2 : tensor<64x1xi64>
95
+ %35 = arith.cmpi slt, %33, %cst_3 : tensor<64x1xi64>
96
+ %36 = arith.andi %34, %35 : tensor<64x1xi1>
97
+ %37 = arith.muli %33, %cst_1 : tensor<64x1xi64>
98
+ %38 = tt.broadcast %37 : (tensor<64x1xi64>) -> tensor<64x64xi64>
99
+ %39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
100
+ %40 = tt.broadcast %25 : (tensor<64x1xf32>) -> tensor<64x64xf32>
101
+ %41 = arith.divf %26, %cst_5 : tensor<64x1xf32>
102
+ %42 = arith.addf %41, %cst_4 : tensor<64x1xf32>
103
+ %43 = arith.muli %5, %cst_8 : tensor<64x1xi32>
104
+ %44 = tt.broadcast %43 : (tensor<64x1xi32>) -> tensor<64x64xi32>
105
+ %45 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
106
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
107
+ %46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32>
108
+ %47 = arith.addi %46, %6 : tensor<1x64xi32>
109
+ %48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32>
110
+ %49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32>
111
+ %50 = arith.addi %49, %28 : tensor<64x64xi32>
112
+ %51 = tt.addptr %29, %50 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
113
+ %52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1>
114
+ %53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
115
+ %54 = tt.addptr %30, %47 : tensor<1x64x!tt.ptr<f32, 1>>, tensor<1x64xi32>
116
+ %55 = tt.load %54, %48, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32>
117
+ tt.assert %36, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
118
+ %56 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64>
119
+ %57 = tt.broadcast %56 : (tensor<1x64xi64>) -> tensor<64x64xi64>
120
+ %58 = arith.addi %57, %38 : tensor<64x64xi64>
121
+ %59 = tt.addptr %39, %58 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
122
+ %60 = tt.load %59, %52, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
123
+ %61 = arith.addf %60, %53 : tensor<64x64xf32>
124
+ %62 = arith.subf %61, %40 : tensor<64x64xf32>
125
+ %63 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
126
+ %64 = tt.broadcast %63 : (tensor<64x1xf32>) -> tensor<64x64xf32>
127
+ %65 = arith.mulf %62, %64 : tensor<64x64xf32>
128
+ %66 = tt.broadcast %55 : (tensor<1x64xf32>) -> tensor<64x64xf32>
129
+ %67 = arith.mulf %65, %66 : tensor<64x64xf32>
130
+ %68 = arith.addi %49, %44 : tensor<64x64xi32>
131
+ %69 = tt.addptr %45, %68 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
132
+ %70 = arith.truncf %67 : tensor<64x64xf32> to tensor<64x64xbf16>
133
+ tt.store %69, %70, %52 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
134
+ }
135
+ tt.return
136
+ }
137
+ }
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin ADDED
Binary file (36.4 kB). View file
 
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx ADDED
@@ -0,0 +1,1154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[8] = {60, 109, 111, 100, 117, 108, 101, 62};
20
+ .global .align 1 .b8 assertFile_1[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[8] = {60, 109, 111, 100, 117, 108, 101, 62};
23
+ .global .align 1 .b8 assertFile_0[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 128, 1, 1
39
+ {
40
+ .reg .pred %p<65>;
41
+ .reg .b16 %rs<13>;
42
+ .reg .b32 %r<188>;
43
+ .reg .f32 %f<166>;
44
+ .reg .b64 %rd<99>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_3];
50
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_2];
51
+ ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_0];
52
+ $L__tmp0:
53
+ .loc 1 22 44
54
+ mov.u32 %r1, %tid.x;
55
+ ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6de7de_param_1];
56
+ bfe.u32 %r3, %r1, 6, 1;
57
+ and.b32 %r4, %r1, 1;
58
+ .loc 1 24 33
59
+ shl.b32 %r23, %r1, 1;
60
+ and.b32 %r5, %r23, 126;
61
+ .loc 1 21 28
62
+ mov.u32 %r14, %ctaid.x;
63
+ .loc 1 21 33
64
+ shl.b32 %r24, %r14, 1;
65
+ .loc 1 22 23
66
+ or.b32 %r25, %r24, %r3;
67
+ or.b32 %r26, %r24, %r4;
68
+ .loc 1 26 30
69
+ mul.wide.s32 %rd26, %r25, 8;
70
+ add.s64 %rd17, %rd24, %rd26;
71
+ mul.wide.s32 %rd27, %r26, 8;
72
+ add.s64 %rd21, %rd24, %rd27;
73
+ mov.pred %p61, -1;
74
+ .loc 1 26 35
75
+ mov.u64 %rd16, 0x0;
76
+ @%p61 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ];
77
+ mov.u64 %rd18, 0x0;
78
+ @%p61 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd17 + 0 ];
79
+ mov.u64 %rd20, 0x0;
80
+ @%p61 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
81
+ .loc 1 27 18
82
+ bfe.s32 %r27, %r14, 30, 1;
83
+ shr.u32 %r28, %r27, 23;
84
+ add.s32 %r29, %r25, %r28;
85
+ and.b32 %r30, %r29, 16776704;
86
+ sub.s32 %r31, %r25, %r30;
87
+ .loc 1 35 44
88
+ shl.b32 %r6, %r31, 8;
89
+ .loc 1 36 44
90
+ shl.b32 %r7, %r25, 8;
91
+ .loc 1 37 22
92
+ add.s64 %rd28, %rd20, 50257;
93
+ .loc 1 38 22
94
+ setp.lt.s64 %p9, %rd16, 0;
95
+ setp.lt.s64 %p10, %rd20, 0;
96
+ .loc 1 39 36
97
+ selp.b64 %rd1, %rd28, %rd20, %p10;
98
+ .loc 1 40 40
99
+ setp.lt.u64 %p11, %rd1, 50257;
100
+ .loc 1 41 44
101
+ shl.b64 %rd29, %rd16, 8;
102
+ add.s64 %rd30, %rd29, 12865792;
103
+ selp.b64 %rd31, %rd30, %rd29, %p9;
104
+ shl.b64 %rd32, %rd31, 2;
105
+ add.s64 %rd2, %rd25, %rd32;
106
+ .loc 1 35 40
107
+ or.b32 %r32, %r5, %r6;
108
+ .loc 1 35 34
109
+ mul.wide.s32 %rd33, %r32, 4;
110
+ add.s64 %rd62, %rd12, %rd33;
111
+ mov.b32 %r179, 0;
112
+ .loc 1 35 50
113
+ mov.u32 %r15, 0x0;
114
+ mov.u32 %r16, 0x0;
115
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r15, %r16 }, [ %rd62 + 0 ];
116
+ @!%p61 mov.u32 %r15, %r179;
117
+ @!%p61 mov.u32 %r16, %r179;
118
+ mov.b32 %f2, %r16;
119
+ mov.b32 %f1, %r15;
120
+ .loc 1 36 40
121
+ or.b32 %r33, %r5, %r7;
122
+ .loc 1 36 34
123
+ mul.wide.s32 %rd34, %r33, 2;
124
+ add.s64 %rd63, %rd13, %rd34;
125
+ .loc 1 36 50
126
+ mov.u32 %r19, 0x0;
127
+ @%p61 ld.global.L1::evict_last.b32 { %r19 }, [ %rd63 + 0 ];
128
+ @!%p61 mov.u32 %r19, %r179;
129
+ cvt.u16.u32 %rs1, %r19;
130
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r19; }
131
+ .loc 1 36 101
132
+ cvt.f32.bf16 %r21, %rs1;
133
+ mov.b32 %f3, %r21;
134
+ cvt.f32.bf16 %r22, %rs2;
135
+ mov.b32 %f4, %r22;
136
+ mov.u64 %rd95, assertMessage_0;
137
+ mov.u64 %rd96, assertFile_0;
138
+ mov.u64 %rd97, assertFunc_0;
139
+ mov.b32 %r187, 1892;
140
+ mov.u64 %rd98, 1;
141
+ .loc 1 40 55
142
+ @%p11 bra $L__BB0_2;
143
+ cvta.global.u64 %rd36, %rd95;
144
+ cvta.global.u64 %rd38, %rd96;
145
+ cvta.global.u64 %rd40, %rd97;
146
+ { // callseq 2, 0
147
+ .reg .b32 temp_param_reg;
148
+ .param .b64 param0;
149
+ st.param.b64 [param0+0], %rd36;
150
+ .param .b64 param1;
151
+ st.param.b64 [param1+0], %rd38;
152
+ .param .b32 param2;
153
+ st.param.b32 [param2+0], %r187;
154
+ .param .b64 param3;
155
+ st.param.b64 [param3+0], %rd40;
156
+ .param .b64 param4;
157
+ st.param.b64 [param4+0], %rd98;
158
+ call.uni
159
+ __assertfail,
160
+ (
161
+ param0,
162
+ param1,
163
+ param2,
164
+ param3,
165
+ param4
166
+ );
167
+ } // callseq 2
168
+ $L__BB0_2:
169
+ .loc 1 0 55
170
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_4];
171
+ and.b32 %r2, %r1, 31;
172
+ .loc 1 41 40
173
+ cvt.u64.u32 %rd45, %r5;
174
+ .loc 1 41 34
175
+ mul.wide.u32 %rd46, %r5, 4;
176
+ add.s64 %rd73, %rd2, %rd46;
177
+ .loc 1 41 52
178
+ mov.u32 %r35, 0x0;
179
+ mov.u32 %r36, 0x0;
180
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r35, %r36 }, [ %rd73 + 0 ];
181
+ @!%p61 mov.u32 %r35, %r179;
182
+ @!%p61 mov.u32 %r36, %r179;
183
+ mov.b32 %f21, %r36;
184
+ mov.b32 %f22, %r35;
185
+ .loc 1 42 22
186
+ add.f32 %f23, %f1, %f22;
187
+ add.f32 %f24, %f2, %f21;
188
+ .loc 1 44 22
189
+ add.f32 %f25, %f4, %f24;
190
+ mov.b32 %r43, %f25;
191
+ add.f32 %f26, %f3, %f23;
192
+ mov.b32 %r40, %f26;
193
+ mov.b32 %r41, 1065353216;
194
+ $L__tmp1:
195
+ .loc 2 98 30
196
+ div.full.f32 %r39, %r40, %r41;
197
+ mov.b32 %f27, %r39;
198
+ div.full.f32 %r42, %r43, %r41;
199
+ mov.b32 %f28, %r42;
200
+ .loc 2 98 22
201
+ add.f32 %f6, %f28, 0f00000000;
202
+ add.f32 %f5, %f27, 0f00000000;
203
+ .loc 2 101 30
204
+ sub.f32 %f29, %f26, %f5;
205
+ sub.f32 %f30, %f25, %f6;
206
+ $L__tmp2:
207
+ .loc 1 50 50
208
+ fma.rn.f32 %f8, %f25, %f30, 0f00000000;
209
+ fma.rn.f32 %f7, %f26, %f29, 0f00000000;
210
+ .loc 1 35 34
211
+ cvt.s64.s32 %rd47, %r6;
212
+ add.s64 %rd48, %rd45, %rd47;
213
+ shl.b64 %rd49, %rd48, 2;
214
+ add.s64 %rd50, %rd12, %rd49;
215
+ add.s64 %rd75, %rd50, 512;
216
+ .loc 1 35 50
217
+ mov.u32 %r45, 0x0;
218
+ mov.u32 %r46, 0x0;
219
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r45, %r46 }, [ %rd75 + 0 ];
220
+ @!%p61 mov.u32 %r45, %r179;
221
+ @!%p61 mov.u32 %r46, %r179;
222
+ mov.b32 %f10, %r46;
223
+ mov.b32 %f9, %r45;
224
+ .loc 1 36 34
225
+ cvt.s64.s32 %rd51, %r7;
226
+ add.s64 %rd8, %rd45, %rd51;
227
+ shl.b64 %rd52, %rd8, 1;
228
+ add.s64 %rd53, %rd13, %rd52;
229
+ add.s64 %rd76, %rd53, 256;
230
+ .loc 1 36 50
231
+ mov.u32 %r49, 0x0;
232
+ @%p61 ld.global.L1::evict_last.b32 { %r49 }, [ %rd76 + 0 ];
233
+ @!%p61 mov.u32 %r49, %r179;
234
+ cvt.u16.u32 %rs3, %r49;
235
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r49; }
236
+ .loc 1 36 101
237
+ cvt.f32.bf16 %r51, %rs3;
238
+ mov.b32 %f11, %r51;
239
+ cvt.f32.bf16 %r52, %rs4;
240
+ mov.b32 %f12, %r52;
241
+ .loc 1 40 55
242
+ @%p11 bra $L__BB0_4;
243
+ cvta.global.u64 %rd55, %rd95;
244
+ cvta.global.u64 %rd57, %rd96;
245
+ cvta.global.u64 %rd59, %rd97;
246
+ { // callseq 3, 0
247
+ .reg .b32 temp_param_reg;
248
+ .param .b64 param0;
249
+ st.param.b64 [param0+0], %rd55;
250
+ .param .b64 param1;
251
+ st.param.b64 [param1+0], %rd57;
252
+ .param .b32 param2;
253
+ st.param.b32 [param2+0], %r187;
254
+ .param .b64 param3;
255
+ st.param.b64 [param3+0], %rd59;
256
+ .param .b64 param4;
257
+ st.param.b64 [param4+0], %rd98;
258
+ call.uni
259
+ __assertfail,
260
+ (
261
+ param0,
262
+ param1,
263
+ param2,
264
+ param3,
265
+ param4
266
+ );
267
+ } // callseq 3
268
+ $L__BB0_4:
269
+ .loc 1 0 55
270
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_5];
271
+ cvt.s64.s32 %rd4, %r33;
272
+ .loc 1 41 34
273
+ add.s64 %rd86, %rd73, 512;
274
+ .loc 1 41 52
275
+ mov.u32 %r54, 0x0;
276
+ mov.u32 %r55, 0x0;
277
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r54, %r55 }, [ %rd86 + 0 ];
278
+ @!%p61 mov.u32 %r54, %r179;
279
+ @!%p61 mov.u32 %r55, %r179;
280
+ mov.b32 %f31, %r54;
281
+ mov.b32 %f32, %r55;
282
+ .loc 1 42 22
283
+ add.f32 %f33, %f10, %f32;
284
+ add.f32 %f34, %f9, %f31;
285
+ .loc 1 44 22
286
+ add.f32 %f35, %f11, %f34;
287
+ add.f32 %f36, %f12, %f33;
288
+ $L__tmp3:
289
+ .loc 2 96 20
290
+ sub.f32 %f37, %f36, %f6;
291
+ mov.b32 %r62, %f37;
292
+ sub.f32 %f38, %f35, %f5;
293
+ mov.b32 %r59, %f38;
294
+ mov.b32 %r60, 1073741824;
295
+ .loc 2 98 30
296
+ div.full.f32 %r58, %r59, %r60;
297
+ mov.b32 %f39, %r58;
298
+ div.full.f32 %r61, %r62, %r60;
299
+ mov.b32 %f40, %r61;
300
+ .loc 2 98 22
301
+ add.f32 %f41, %f6, %f40;
302
+ add.f32 %f42, %f5, %f39;
303
+ .loc 2 101 30
304
+ sub.f32 %f43, %f35, %f42;
305
+ sub.f32 %f44, %f36, %f41;
306
+ $L__tmp4:
307
+ .loc 1 50 50
308
+ fma.rn.f32 %f45, %f37, %f44, %f8;
309
+ fma.rn.f32 %f46, %f38, %f43, %f7;
310
+ .loc 1 24 33
311
+ and.b32 %r119, %r1, 127;
312
+ .loc 1 31 36
313
+ shl.b32 %r120, %r119, 2;
314
+ mov.u32 %r121, global_smem;
315
+ add.s32 %r8, %r121, %r120;
316
+ st.shared.u32 [%r8], %r60;
317
+ st.shared.u32 [%r8+520], %r60;
318
+ bar.sync 0;
319
+ mad.lo.s32 %r122, %r3, 130, %r5;
320
+ shl.b32 %r123, %r122, 2;
321
+ add.s32 %r124, %r121, %r123;
322
+ ld.shared.v2.f32 {%f47, %f48}, [%r124];
323
+ $L__tmp5:
324
+ .loc 2 120 46
325
+ bar.sync 0;
326
+ $L__tmp6:
327
+ .loc 2 108 21
328
+ sub.f32 %f49, %f41, %f42;
329
+ .loc 2 109 28
330
+ add.f32 %f50, %f47, %f48;
331
+ .loc 2 110 39
332
+ setp.eq.f32 %p41, %f50, 0f00000000;
333
+ .loc 2 110 60
334
+ mov.b32 %r65, %f48;
335
+ mov.b32 %r66, %f50;
336
+ div.full.f32 %r64, %r65, %r66;
337
+ mov.b32 %f51, %r64;
338
+ .loc 2 110 49
339
+ selp.f32 %f52, 0f00000000, %f51, %p41;
340
+ .loc 2 112 17
341
+ fma.rn.f32 %f53, %f49, %f52, %f42;
342
+ .loc 2 113 15
343
+ add.f32 %f54, %f46, %f45;
344
+ .loc 2 113 30
345
+ mul.f32 %f55, %f49, %f49;
346
+ .loc 2 113 38
347
+ mul.f32 %f56, %f55, %f47;
348
+ .loc 2 113 22
349
+ fma.rn.f32 %f57, %f56, %f52, %f54;
350
+ $L__tmp7:
351
+ .loc 2 120 46
352
+ mov.b32 %r125, %f53;
353
+ shfl.sync.bfly.b32 %r126, %r125, 16, 31, -1;
354
+ mov.b32 %f58, %r126;
355
+ mov.b32 %r127, %f57;
356
+ shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1;
357
+ mov.b32 %f59, %r128;
358
+ shfl.sync.bfly.b32 %r68, %r66, 16, 31, -1;
359
+ mov.b32 %f60, %r68;
360
+ $L__tmp8:
361
+ .loc 2 108 21
362
+ sub.f32 %f61, %f58, %f53;
363
+ .loc 2 109 28
364
+ add.f32 %f62, %f50, %f60;
365
+ .loc 2 110 39
366
+ setp.eq.f32 %p42, %f62, 0f00000000;
367
+ .loc 2 110 60
368
+ mov.b32 %r69, %f62;
369
+ div.full.f32 %r67, %r68, %r69;
370
+ mov.b32 %f63, %r67;
371
+ .loc 2 110 49
372
+ selp.f32 %f64, 0f00000000, %f63, %p42;
373
+ .loc 2 112 17
374
+ fma.rn.f32 %f65, %f61, %f64, %f53;
375
+ .loc 2 113 15
376
+ add.f32 %f66, %f57, %f59;
377
+ .loc 2 113 30
378
+ mul.f32 %f67, %f61, %f61;
379
+ .loc 2 113 38
380
+ mul.f32 %f68, %f50, %f67;
381
+ .loc 2 113 22
382
+ fma.rn.f32 %f69, %f68, %f64, %f66;
383
+ $L__tmp9:
384
+ .loc 2 120 46
385
+ mov.b32 %r129, %f65;
386
+ shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1;
387
+ mov.b32 %f70, %r130;
388
+ mov.b32 %r131, %f69;
389
+ shfl.sync.bfly.b32 %r132, %r131, 8, 31, -1;
390
+ mov.b32 %f71, %r132;
391
+ shfl.sync.bfly.b32 %r71, %r69, 8, 31, -1;
392
+ mov.b32 %f72, %r71;
393
+ $L__tmp10:
394
+ .loc 2 108 21
395
+ sub.f32 %f73, %f70, %f65;
396
+ .loc 2 109 28
397
+ add.f32 %f74, %f62, %f72;
398
+ .loc 2 110 39
399
+ setp.eq.f32 %p43, %f74, 0f00000000;
400
+ .loc 2 110 60
401
+ mov.b32 %r72, %f74;
402
+ div.full.f32 %r70, %r71, %r72;
403
+ mov.b32 %f75, %r70;
404
+ .loc 2 110 49
405
+ selp.f32 %f76, 0f00000000, %f75, %p43;
406
+ .loc 2 112 17
407
+ fma.rn.f32 %f77, %f73, %f76, %f65;
408
+ .loc 2 113 15
409
+ add.f32 %f78, %f69, %f71;
410
+ .loc 2 113 30
411
+ mul.f32 %f79, %f73, %f73;
412
+ .loc 2 113 38
413
+ mul.f32 %f80, %f62, %f79;
414
+ .loc 2 113 22
415
+ fma.rn.f32 %f81, %f76, %f80, %f78;
416
+ $L__tmp11:
417
+ .loc 2 120 46
418
+ mov.b32 %r133, %f77;
419
+ shfl.sync.bfly.b32 %r134, %r133, 4, 31, -1;
420
+ mov.b32 %f82, %r134;
421
+ mov.b32 %r135, %f81;
422
+ shfl.sync.bfly.b32 %r136, %r135, 4, 31, -1;
423
+ mov.b32 %f83, %r136;
424
+ shfl.sync.bfly.b32 %r74, %r72, 4, 31, -1;
425
+ mov.b32 %f84, %r74;
426
+ $L__tmp12:
427
+ .loc 2 108 21
428
+ sub.f32 %f85, %f82, %f77;
429
+ .loc 2 109 28
430
+ add.f32 %f86, %f74, %f84;
431
+ .loc 2 110 39
432
+ setp.eq.f32 %p44, %f86, 0f00000000;
433
+ .loc 2 110 60
434
+ mov.b32 %r75, %f86;
435
+ div.full.f32 %r73, %r74, %r75;
436
+ mov.b32 %f87, %r73;
437
+ .loc 2 110 49
438
+ selp.f32 %f88, 0f00000000, %f87, %p44;
439
+ .loc 2 112 17
440
+ fma.rn.f32 %f89, %f85, %f88, %f77;
441
+ .loc 2 113 15
442
+ add.f32 %f90, %f81, %f83;
443
+ .loc 2 113 30
444
+ mul.f32 %f91, %f85, %f85;
445
+ .loc 2 113 38
446
+ mul.f32 %f92, %f74, %f91;
447
+ .loc 2 113 22
448
+ fma.rn.f32 %f93, %f88, %f92, %f90;
449
+ $L__tmp13:
450
+ .loc 2 120 46
451
+ mov.b32 %r137, %f89;
452
+ shfl.sync.bfly.b32 %r138, %r137, 2, 31, -1;
453
+ mov.b32 %f94, %r138;
454
+ mov.b32 %r139, %f93;
455
+ shfl.sync.bfly.b32 %r140, %r139, 2, 31, -1;
456
+ mov.b32 %f95, %r140;
457
+ shfl.sync.bfly.b32 %r77, %r75, 2, 31, -1;
458
+ mov.b32 %f96, %r77;
459
+ $L__tmp14:
460
+ .loc 2 108 21
461
+ sub.f32 %f97, %f94, %f89;
462
+ .loc 2 109 28
463
+ add.f32 %f98, %f86, %f96;
464
+ .loc 2 110 39
465
+ setp.eq.f32 %p45, %f98, 0f00000000;
466
+ .loc 2 110 60
467
+ mov.b32 %r78, %f98;
468
+ div.full.f32 %r76, %r77, %r78;
469
+ mov.b32 %f99, %r76;
470
+ .loc 2 110 49
471
+ selp.f32 %f100, 0f00000000, %f99, %p45;
472
+ .loc 2 112 17
473
+ fma.rn.f32 %f101, %f97, %f100, %f89;
474
+ .loc 2 113 15
475
+ add.f32 %f102, %f93, %f95;
476
+ .loc 2 113 30
477
+ mul.f32 %f103, %f97, %f97;
478
+ .loc 2 113 38
479
+ mul.f32 %f104, %f86, %f103;
480
+ .loc 2 113 22
481
+ fma.rn.f32 %f105, %f100, %f104, %f102;
482
+ $L__tmp15:
483
+ .loc 2 120 46
484
+ mov.b32 %r141, %f101;
485
+ shfl.sync.bfly.b32 %r142, %r141, 1, 31, -1;
486
+ mov.b32 %f106, %r142;
487
+ mov.b32 %r143, %f105;
488
+ shfl.sync.bfly.b32 %r144, %r143, 1, 31, -1;
489
+ mov.b32 %f107, %r144;
490
+ shfl.sync.bfly.b32 %r80, %r78, 1, 31, -1;
491
+ mov.b32 %f108, %r80;
492
+ $L__tmp16:
493
+ .loc 2 108 21
494
+ sub.f32 %f109, %f106, %f101;
495
+ .loc 2 109 28
496
+ add.f32 %f110, %f98, %f108;
497
+ .loc 2 110 39
498
+ setp.eq.f32 %p46, %f110, 0f00000000;
499
+ .loc 2 110 60
500
+ mov.b32 %r81, %f110;
501
+ div.full.f32 %r79, %r80, %r81;
502
+ mov.b32 %f111, %r79;
503
+ .loc 2 110 49
504
+ selp.f32 %f112, 0f00000000, %f111, %p46;
505
+ .loc 2 112 17
506
+ fma.rn.f32 %f113, %f109, %f112, %f101;
507
+ .loc 2 113 15
508
+ add.f32 %f114, %f105, %f107;
509
+ .loc 2 113 30
510
+ mul.f32 %f115, %f109, %f109;
511
+ .loc 2 113 38
512
+ mul.f32 %f116, %f98, %f115;
513
+ .loc 2 113 22
514
+ fma.rn.f32 %f117, %f112, %f116, %f114;
515
+ $L__tmp17:
516
+ .loc 2 120 46
517
+ setp.eq.s32 %p24, %r2, 0;
518
+ shr.u32 %r145, %r1, 3;
519
+ and.b32 %r146, %r145, 4;
520
+ shl.b32 %r147, %r3, 3;
521
+ or.b32 %r148, %r147, %r146;
522
+ add.s32 %r82, %r121, %r148;
523
+ mov.b32 %r83, %f113;
524
+ @%p24 st.shared.b32 [ %r82 + 0 ], %r83;
525
+ add.s32 %r149, %r121, 16;
526
+ add.s32 %r84, %r149, %r148;
527
+ mov.b32 %r85, %f117;
528
+ @%p24 st.shared.b32 [ %r84 + 0 ], %r85;
529
+ add.s32 %r150, %r121, 32;
530
+ add.s32 %r86, %r150, %r148;
531
+ @%p24 st.shared.b32 [ %r86 + 0 ], %r81;
532
+ bar.sync 0;
533
+ setp.lt.s32 %p27, %r1, 4;
534
+ shl.b32 %r151, %r1, 2;
535
+ add.s32 %r89, %r121, %r151;
536
+ @%p27 ld.shared.b32 %r88, [ %r89 + 0 ];
537
+ mov.b32 %f118, %r88;
538
+ add.s32 %r91, %r149, %r151;
539
+ @%p27 ld.shared.b32 %r90, [ %r91 + 0 ];
540
+ mov.b32 %f119, %r90;
541
+ add.s32 %r93, %r150, %r151;
542
+ @%p27 ld.shared.b32 %r92, [ %r93 + 0 ];
543
+ mov.b32 %f120, %r92;
544
+ shfl.sync.bfly.b32 %r152, %r88, 1, 31, -1;
545
+ mov.b32 %f121, %r152;
546
+ shfl.sync.bfly.b32 %r153, %r90, 1, 31, -1;
547
+ mov.b32 %f122, %r153;
548
+ shfl.sync.bfly.b32 %r95, %r92, 1, 31, -1;
549
+ mov.b32 %f123, %r95;
550
+ $L__tmp18:
551
+ .loc 2 108 21
552
+ sub.f32 %f124, %f121, %f118;
553
+ .loc 2 109 28
554
+ add.f32 %f125, %f120, %f123;
555
+ .loc 2 110 39
556
+ setp.eq.f32 %p47, %f125, 0f00000000;
557
+ .loc 2 110 60
558
+ mov.b32 %r96, %f125;
559
+ div.full.f32 %r94, %r95, %r96;
560
+ mov.b32 %f126, %r94;
561
+ .loc 2 110 49
562
+ selp.f32 %f127, 0f00000000, %f126, %p47;
563
+ .loc 2 112 17
564
+ fma.rn.f32 %f128, %f124, %f127, %f118;
565
+ .loc 2 113 15
566
+ add.f32 %f129, %f119, %f122;
567
+ .loc 2 113 30
568
+ mul.f32 %f130, %f124, %f124;
569
+ .loc 2 113 38
570
+ mul.f32 %f131, %f120, %f130;
571
+ .loc 2 113 22
572
+ fma.rn.f32 %f132, %f131, %f127, %f129;
573
+ $L__tmp19:
574
+ .loc 2 120 46
575
+ setp.eq.s32 %p48, %r4, 0;
576
+ and.pred %p30, %p27, %p48;
577
+ mov.b32 %r98, %f128;
578
+ @%p30 st.shared.b32 [ %r89 + 0 ], %r98;
579
+ mov.b32 %r100, %f132;
580
+ @%p30 st.shared.b32 [ %r91 + 0 ], %r100;
581
+ @%p30 st.shared.b32 [ %r93 + 0 ], %r96;
582
+ bar.sync 0;
583
+ add.s32 %r154, %r121, %r147;
584
+ ld.shared.f32 %f13, [%r154];
585
+ add.s32 %r155, %r149, %r147;
586
+ $L__tmp20:
587
+ .loc 1 75 24
588
+ ld.shared.u32 %r104, [%r155];
589
+ mov.b32 %r105, 1132462080;
590
+ div.full.f32 %r103, %r104, %r105;
591
+ mov.b32 %f133, %r103;
592
+ .loc 1 77 24
593
+ add.f32 %f14, %f133, 0f3727C5AC;
594
+ shl.b32 %r156, %r5, 2;
595
+ add.s32 %r9, %r121, %r156;
596
+ .loc 1 62 51
597
+ mov.u32 %r109, 0x0;
598
+ mov.u32 %r110, 0x0;
599
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r109, %r110 }, [ %rd62 + 0 ];
600
+ @!%p61 mov.u32 %r109, %r179;
601
+ @!%p61 mov.u32 %r110, %r179;
602
+ mov.b32 %f15, %r109;
603
+ mov.b32 %f16, %r110;
604
+ .loc 1 63 51
605
+ mov.u32 %r113, 0x0;
606
+ @%p61 ld.global.L1::evict_first.b32 { %r113 }, [ %rd63 + 0 ];
607
+ @!%p61 mov.u32 %r113, %r179;
608
+ cvt.u16.u32 %rs5, %r113;
609
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r113; }
610
+ .loc 1 63 103
611
+ cvt.f32.bf16 %r115, %rs5;
612
+ mov.b32 %f17, %r115;
613
+ cvt.f32.bf16 %r116, %rs6;
614
+ mov.b32 %f18, %r116;
615
+ .loc 1 64 35
616
+ mul.wide.u32 %rd65, %r119, 4;
617
+ add.s64 %rd64, %rd14, %rd65;
618
+ .loc 1 64 40
619
+ mov.u32 %r117, 0x0;
620
+ @%p61 ld.global.L1::evict_last.b32 { %r117 }, [ %rd64 + 0 ];
621
+ @!%p61 mov.u32 %r117, %r179;
622
+ mov.u64 %rd90, assertMessage_1;
623
+ mov.u64 %rd91, assertFile_1;
624
+ mov.u64 %rd92, assertFunc_1;
625
+ .loc 1 68 57
626
+ @%p11 bra $L__BB0_6;
627
+ cvta.global.u64 %rd67, %rd90;
628
+ cvta.global.u64 %rd69, %rd91;
629
+ cvta.global.u64 %rd71, %rd92;
630
+ { // callseq 4, 0
631
+ .reg .b32 temp_param_reg;
632
+ .param .b64 param0;
633
+ st.param.b64 [param0+0], %rd67;
634
+ .param .b64 param1;
635
+ st.param.b64 [param1+0], %rd69;
636
+ .param .b32 param2;
637
+ st.param.b32 [param2+0], %r187;
638
+ .param .b64 param3;
639
+ st.param.b64 [param3+0], %rd71;
640
+ .param .b64 param4;
641
+ st.param.b64 [param4+0], %rd98;
642
+ call.uni
643
+ __assertfail,
644
+ (
645
+ param0,
646
+ param1,
647
+ param2,
648
+ param3,
649
+ param4
650
+ );
651
+ } // callseq 4
652
+ $L__BB0_6:
653
+ .loc 1 69 54
654
+ mov.u32 %r158, 0x0;
655
+ mov.u32 %r159, 0x0;
656
+ @%p61 ld.global.L1::evict_first.v2.b32 { %r158, %r159 }, [ %rd73 + 0 ];
657
+ @!%p61 mov.u32 %r158, %r179;
658
+ @!%p61 mov.u32 %r159, %r179;
659
+ mov.b32 %f134, %r158;
660
+ mov.b32 %f135, %r159;
661
+ .loc 1 70 24
662
+ add.f32 %f136, %f15, %f134;
663
+ add.f32 %f137, %f16, %f135;
664
+ .loc 1 72 24
665
+ add.f32 %f138, %f17, %f136;
666
+ add.f32 %f139, %f18, %f137;
667
+ .loc 1 73 24
668
+ sub.f32 %f140, %f138, %f13;
669
+ sub.f32 %f141, %f139, %f13;
670
+ .loc 1 78 30
671
+ rsqrt.approx.ftz.f32 %f142, %f14;
672
+ .loc 1 79 24
673
+ mul.f32 %f143, %f140, %f142;
674
+ mul.f32 %f144, %f141, %f142;
675
+ .loc 1 80 24
676
+ bar.sync 0;
677
+ st.shared.u32 [%r8], %r117;
678
+ bar.sync 0;
679
+ ld.shared.v2.f32 {%f145, %f146}, [%r9];
680
+ mul.f32 %f147, %f143, %f145;
681
+ mul.f32 %f148, %f144, %f146;
682
+ .loc 1 82 29
683
+ shl.b64 %rd78, %rd4, 1;
684
+ add.s64 %rd74, %rd15, %rd78;
685
+ .loc 1 82 52
686
+ mov.b32 %r162, %f147;
687
+ cvt.rn.bf16.f32 %rs7, %r162;
688
+ mov.b32 %r163, %f148;
689
+ cvt.rn.bf16.f32 %rs8, %r163;
690
+ mov.b32 %r175, {%rs7, %rs8};
691
+ @%p61 st.global.b32 [ %rd74 + 0 ], { %r175 };
692
+ .loc 1 62 51
693
+ mov.u32 %r165, 0x0;
694
+ mov.u32 %r166, 0x0;
695
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r165, %r166 }, [ %rd75 + 0 ];
696
+ @!%p61 mov.u32 %r165, %r179;
697
+ @!%p61 mov.u32 %r166, %r179;
698
+ .loc 1 63 51
699
+ mov.u32 %r169, 0x0;
700
+ @%p61 ld.global.L1::evict_first.b32 { %r169 }, [ %rd76 + 0 ];
701
+ @!%p61 mov.u32 %r169, %r179;
702
+ cvt.u16.u32 %rs9, %r169;
703
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r169; }
704
+ .loc 1 63 103
705
+ cvt.f32.bf16 %r171, %rs9;
706
+ mov.b32 %f19, %r171;
707
+ cvt.f32.bf16 %r172, %rs10;
708
+ mov.b32 %f20, %r172;
709
+ .loc 1 64 35
710
+ add.s64 %rd77, %rd64, 512;
711
+ .loc 1 64 40
712
+ mov.u32 %r173, 0x0;
713
+ @%p61 ld.global.L1::evict_last.b32 { %r173 }, [ %rd77 + 0 ];
714
+ @!%p61 mov.u32 %r173, %r179;
715
+ .loc 1 68 57
716
+ @%p11 bra $L__BB0_8;
717
+ cvta.global.u64 %rd80, %rd90;
718
+ cvta.global.u64 %rd82, %rd91;
719
+ cvta.global.u64 %rd84, %rd92;
720
+ { // callseq 5, 0
721
+ .reg .b32 temp_param_reg;
722
+ .param .b64 param0;
723
+ st.param.b64 [param0+0], %rd80;
724
+ .param .b64 param1;
725
+ st.param.b64 [param1+0], %rd82;
726
+ .param .b32 param2;
727
+ st.param.b32 [param2+0], %r187;
728
+ .param .b64 param3;
729
+ st.param.b64 [param3+0], %rd84;
730
+ .param .b64 param4;
731
+ st.param.b64 [param4+0], %rd98;
732
+ call.uni
733
+ __assertfail,
734
+ (
735
+ param0,
736
+ param1,
737
+ param2,
738
+ param3,
739
+ param4
740
+ );
741
+ } // callseq 5
742
+ $L__BB0_8:
743
+ .loc 1 69 54
744
+ mov.u32 %r177, 0x0;
745
+ mov.u32 %r178, 0x0;
746
+ @%p61 ld.global.L1::evict_first.v2.b32 { %r177, %r178 }, [ %rd86 + 0 ];
747
+ @!%p61 mov.u32 %r177, %r179;
748
+ @!%p61 mov.u32 %r178, %r179;
749
+ .loc 1 62 51
750
+ mov.b32 %f150, %r166;
751
+ .loc 1 69 54
752
+ mov.b32 %f151, %r178;
753
+ .loc 1 70 24
754
+ add.f32 %f152, %f150, %f151;
755
+ .loc 1 72 24
756
+ add.f32 %f153, %f20, %f152;
757
+ .loc 1 73 24
758
+ sub.f32 %f154, %f153, %f13;
759
+ .loc 1 62 51
760
+ mov.b32 %f155, %r165;
761
+ .loc 1 69 54
762
+ mov.b32 %f156, %r177;
763
+ .loc 1 70 24
764
+ add.f32 %f157, %f155, %f156;
765
+ .loc 1 72 24
766
+ add.f32 %f158, %f19, %f157;
767
+ .loc 1 73 24
768
+ sub.f32 %f159, %f158, %f13;
769
+ .loc 1 79 24
770
+ mul.f32 %f160, %f159, %f142;
771
+ mul.f32 %f161, %f154, %f142;
772
+ .loc 1 80 24
773
+ bar.sync 0;
774
+ st.shared.u32 [%r8], %r173;
775
+ bar.sync 0;
776
+ ld.shared.v2.f32 {%f162, %f163}, [%r9];
777
+ mul.f32 %f164, %f160, %f162;
778
+ mul.f32 %f165, %f161, %f163;
779
+ .loc 1 82 29
780
+ add.s64 %rd89, %rd15, %rd52;
781
+ add.s64 %rd87, %rd89, 256;
782
+ .loc 1 82 52
783
+ mov.b32 %r181, %f164;
784
+ cvt.rn.bf16.f32 %rs11, %r181;
785
+ mov.b32 %r182, %f165;
786
+ cvt.rn.bf16.f32 %rs12, %r182;
787
+ mov.b32 %r184, {%rs11, %rs12};
788
+ @%p61 st.global.b32 [ %rd87 + 0 ], { %r184 };
789
+ .loc 1 58 4
790
+ ret;
791
+ $L__tmp21:
792
+ $L__func_end0:
793
+
794
+ }
795
+ // .globl __nv_rsqrtf
796
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
797
+ .param .b32 __nv_rsqrtf_param_0
798
+ )
799
+ {
800
+ .reg .f32 %f<3>;
801
+ $L__func_begin1:
802
+
803
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
804
+ rsqrt.approx.ftz.f32 %f2, %f1;
805
+ st.param.f32 [func_retval0+0], %f2;
806
+ ret;
807
+ $L__func_end1:
808
+
809
+ }
810
+ .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py"
811
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
812
+ .section .debug_abbrev
813
+ {
814
+ .b8 1
815
+ .b8 17
816
+ .b8 1
817
+ .b8 37
818
+ .b8 8
819
+ .b8 19
820
+ .b8 5
821
+ .b8 3
822
+ .b8 8
823
+ .b8 16
824
+ .b8 6
825
+ .b8 27
826
+ .b8 8
827
+ .b8 180
828
+ .b8 66
829
+ .b8 12
830
+ .b8 17
831
+ .b8 1
832
+ .b8 18
833
+ .b8 1
834
+ .b8 0
835
+ .b8 0
836
+ .b8 2
837
+ .b8 46
838
+ .b8 0
839
+ .b8 135
840
+ .b8 64
841
+ .b8 8
842
+ .b8 3
843
+ .b8 8
844
+ .b8 58
845
+ .b8 11
846
+ .b8 59
847
+ .b8 11
848
+ .b8 63
849
+ .b8 12
850
+ .b8 32
851
+ .b8 11
852
+ .b8 0
853
+ .b8 0
854
+ .b8 3
855
+ .b8 46
856
+ .b8 1
857
+ .b8 17
858
+ .b8 1
859
+ .b8 18
860
+ .b8 1
861
+ .b8 64
862
+ .b8 10
863
+ .b8 49
864
+ .b8 19
865
+ .b8 0
866
+ .b8 0
867
+ .b8 4
868
+ .b8 29
869
+ .b8 0
870
+ .b8 49
871
+ .b8 19
872
+ .b8 17
873
+ .b8 1
874
+ .b8 18
875
+ .b8 1
876
+ .b8 88
877
+ .b8 11
878
+ .b8 89
879
+ .b8 11
880
+ .b8 87
881
+ .b8 11
882
+ .b8 0
883
+ .b8 0
884
+ .b8 5
885
+ .b8 29
886
+ .b8 1
887
+ .b8 49
888
+ .b8 19
889
+ .b8 17
890
+ .b8 1
891
+ .b8 18
892
+ .b8 1
893
+ .b8 88
894
+ .b8 11
895
+ .b8 89
896
+ .b8 11
897
+ .b8 87
898
+ .b8 11
899
+ .b8 0
900
+ .b8 0
901
+ .b8 0
902
+ }
903
+ .section .debug_info
904
+ {
905
+ .b32 302
906
+ .b8 2
907
+ .b8 0
908
+ .b32 .debug_abbrev
909
+ .b8 8
910
+ .b8 1
911
+ .b8 116
912
+ .b8 114
913
+ .b8 105
914
+ .b8 116
915
+ .b8 111
916
+ .b8 110
917
+ .b8 0
918
+ .b8 2
919
+ .b8 0
920
+ .b8 99
921
+ .b8 99
922
+ .b8 105
923
+ .b8 103
924
+ .b8 54
925
+ .b8 102
926
+ .b8 107
927
+ .b8 105
928
+ .b8 54
929
+ .b8 112
930
+ .b8 52
931
+ .b8 108
932
+ .b8 120
933
+ .b8 114
934
+ .b8 100
935
+ .b8 109
936
+ .b8 103
937
+ .b8 103
938
+ .b8 54
939
+ .b8 101
940
+ .b8 117
941
+ .b8 100
942
+ .b8 97
943
+ .b8 104
944
+ .b8 105
945
+ .b8 101
946
+ .b8 120
947
+ .b8 99
948
+ .b8 118
949
+ .b8 117
950
+ .b8 101
951
+ .b8 101
952
+ .b8 111
953
+ .b8 108
954
+ .b8 50
955
+ .b8 112
956
+ .b8 52
957
+ .b8 113
958
+ .b8 112
959
+ .b8 53
960
+ .b8 51
961
+ .b8 50
962
+ .b8 112
963
+ .b8 118
964
+ .b8 118
965
+ .b8 101
966
+ .b8 50
967
+ .b8 121
968
+ .b8 52
969
+ .b8 54
970
+ .b8 51
971
+ .b8 121
972
+ .b8 46
973
+ .b8 112
974
+ .b8 121
975
+ .b8 0
976
+ .b32 .debug_line
977
+ .b8 47
978
+ .b8 116
979
+ .b8 109
980
+ .b8 112
981
+ .b8 47
982
+ .b8 116
983
+ .b8 111
984
+ .b8 114
985
+ .b8 99
986
+ .b8 104
987
+ .b8 105
988
+ .b8 110
989
+ .b8 100
990
+ .b8 117
991
+ .b8 99
992
+ .b8 116
993
+ .b8 111
994
+ .b8 114
995
+ .b8 95
996
+ .b8 114
997
+ .b8 111
998
+ .b8 111
999
+ .b8 116
1000
+ .b8 47
1001
+ .b8 99
1002
+ .b8 105
1003
+ .b8 0
1004
+ .b8 1
1005
+ .b64 $L__func_begin0
1006
+ .b64 $L__func_end0
1007
+ .b8 2
1008
+ .b8 116
1009
+ .b8 114
1010
+ .b8 105
1011
+ .b8 116
1012
+ .b8 111
1013
+ .b8 110
1014
+ .b8 95
1015
+ .b8 95
1016
+ .b8 48
1017
+ .b8 100
1018
+ .b8 49
1019
+ .b8 100
1020
+ .b8 50
1021
+ .b8 100
1022
+ .b8 51
1023
+ .b8 100
1024
+ .b8 52
1025
+ .b8 100
1026
+ .b8 53
1027
+ .b8 100
1028
+ .b8 54
1029
+ .b8 100
1030
+ .b8 101
1031
+ .b8 55
1032
+ .b8 100
1033
+ .b8 101
1034
+ .b8 0
1035
+ .b8 116
1036
+ .b8 114
1037
+ .b8 105
1038
+ .b8 116
1039
+ .b8 111
1040
+ .b8 110
1041
+ .b8 95
1042
+ .b8 95
1043
+ .b8 48
1044
+ .b8 100
1045
+ .b8 49
1046
+ .b8 100
1047
+ .b8 50
1048
+ .b8 100
1049
+ .b8 51
1050
+ .b8 100
1051
+ .b8 52
1052
+ .b8 100
1053
+ .b8 53
1054
+ .b8 100
1055
+ .b8 54
1056
+ .b8 100
1057
+ .b8 101
1058
+ .b8 55
1059
+ .b8 100
1060
+ .b8 101
1061
+ .b8 0
1062
+ .b8 1
1063
+ .b8 18
1064
+ .b8 1
1065
+ .b8 1
1066
+ .b8 3
1067
+ .b64 $L__func_begin0
1068
+ .b64 $L__func_end0
1069
+ .b8 1
1070
+ .b8 156
1071
+ .b32 125
1072
+ .b8 4
1073
+ .b32 125
1074
+ .b64 $L__tmp1
1075
+ .b64 $L__tmp4
1076
+ .b8 2
1077
+ .b8 47
1078
+ .b8 41
1079
+ .b8 4
1080
+ .b32 125
1081
+ .b64 $L__tmp5
1082
+ .b64 $L__tmp20
1083
+ .b8 2
1084
+ .b8 53
1085
+ .b8 44
1086
+ .b8 5
1087
+ .b32 125
1088
+ .b64 $L__tmp6
1089
+ .b64 $L__tmp19
1090
+ .b8 2
1091
+ .b8 53
1092
+ .b8 44
1093
+ .b8 4
1094
+ .b32 125
1095
+ .b64 $L__tmp6
1096
+ .b64 $L__tmp19
1097
+ .b8 2
1098
+ .b8 120
1099
+ .b8 46
1100
+ .b8 0
1101
+ .b8 0
1102
+ .b8 0
1103
+ }
1104
+ .section .debug_pubnames
1105
+ {
1106
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1107
+ $L__pubNames_start0:
1108
+ .b8 2
1109
+ .b8 0
1110
+ .b32 .debug_info
1111
+ .b32 306
1112
+ .b32 125
1113
+ .b8 116
1114
+ .b8 114
1115
+ .b8 105
1116
+ .b8 116
1117
+ .b8 111
1118
+ .b8 110
1119
+ .b8 95
1120
+ .b8 95
1121
+ .b8 48
1122
+ .b8 100
1123
+ .b8 49
1124
+ .b8 100
1125
+ .b8 50
1126
+ .b8 100
1127
+ .b8 51
1128
+ .b8 100
1129
+ .b8 52
1130
+ .b8 100
1131
+ .b8 53
1132
+ .b8 100
1133
+ .b8 54
1134
+ .b8 100
1135
+ .b8 101
1136
+ .b8 55
1137
+ .b8 100
1138
+ .b8 101
1139
+ .b8 0
1140
+ .b32 0
1141
+ $L__pubNames_end0:
1142
+ }
1143
+ .section .debug_pubtypes
1144
+ {
1145
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1146
+ $L__pubTypes_start0:
1147
+ .b8 2
1148
+ .b8 0
1149
+ .b32 .debug_info
1150
+ .b32 306
1151
+ .b32 0
1152
+ $L__pubTypes_end0:
1153
+ }
1154
+ .section .debug_loc { }
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin ADDED
Binary file (15.2 kB). View file
 
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx ADDED
@@ -0,0 +1,758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
14
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
15
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
16
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
17
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
18
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
19
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
20
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
21
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
22
+ )
23
+ .maxntid 64, 1, 1
24
+ {
25
+ .reg .pred %p<29>;
26
+ .reg .b16 %rs<17>;
27
+ .reg .b32 %r<100>;
28
+ .reg .f32 %f<86>;
29
+ .reg .b64 %rd<16>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_0];
35
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_1];
36
+ $L__tmp0:
37
+ .loc 1 26 26
38
+ mov.u32 %r66, %tid.x;
39
+ and.b32 %r67, %r66, 31;
40
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_2];
41
+ ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6de7de_param_3];
42
+ ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_4];
43
+ shl.b32 %r68, %r66, 2;
44
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_5];
45
+ and.b32 %r69, %r68, 252;
46
+ .loc 1 23 28
47
+ mov.u32 %r1, %ctaid.x;
48
+ .loc 1 30 40
49
+ shl.b32 %r70, %r1, 8;
50
+ .loc 1 30 36
51
+ or.b32 %r71, %r70, %r69;
52
+ .loc 1 30 30
53
+ mul.wide.s32 %rd13, %r71, 4;
54
+ add.s64 %rd1, %rd7, %rd13;
55
+ mov.b32 %r6, 0;
56
+ mov.pred %p1, -1;
57
+ .loc 1 30 46
58
+ mov.u32 %r2, 0x0;
59
+ mov.u32 %r3, 0x0;
60
+ mov.u32 %r4, 0x0;
61
+ mov.u32 %r5, 0x0;
62
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
63
+ @!%p1 mov.u32 %r2, %r6;
64
+ @!%p1 mov.u32 %r3, %r6;
65
+ @!%p1 mov.u32 %r4, %r6;
66
+ @!%p1 mov.u32 %r5, %r6;
67
+ mov.b32 %f1, %r4;
68
+ mov.b32 %f2, %r5;
69
+ .loc 1 31 30
70
+ mul.wide.s32 %rd14, %r71, 2;
71
+ add.s64 %rd2, %rd8, %rd14;
72
+ .loc 1 31 46
73
+ mov.u32 %r10, 0x0;
74
+ mov.u32 %r11, 0x0;
75
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
76
+ @!%p1 mov.u32 %r10, %r6;
77
+ @!%p1 mov.u32 %r11, %r6;
78
+ cvt.u16.u32 %rs1, %r10;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
80
+ cvt.u16.u32 %rs3, %r11;
81
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
82
+ .loc 1 31 67
83
+ cvt.f32.bf16 %r14, %rs1;
84
+ mov.b32 %f3, %r14;
85
+ cvt.f32.bf16 %r15, %rs2;
86
+ mov.b32 %f4, %r15;
87
+ cvt.f32.bf16 %r16, %rs3;
88
+ mov.b32 %f5, %r16;
89
+ cvt.f32.bf16 %r17, %rs4;
90
+ mov.b32 %f6, %r17;
91
+ .loc 1 32 30
92
+ add.s64 %rd3, %rd9, %rd14;
93
+ .loc 1 32 46
94
+ mov.u32 %r18, 0x0;
95
+ mov.u32 %r19, 0x0;
96
+ @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
97
+ @!%p1 mov.u32 %r18, %r6;
98
+ @!%p1 mov.u32 %r19, %r6;
99
+ cvt.u16.u32 %rs5, %r18;
100
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
101
+ cvt.u16.u32 %rs7, %r19;
102
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
103
+ .loc 1 32 67
104
+ cvt.f32.bf16 %r22, %rs5;
105
+ mov.b32 %f7, %r22;
106
+ cvt.f32.bf16 %r23, %rs6;
107
+ mov.b32 %f8, %r23;
108
+ cvt.f32.bf16 %r24, %rs7;
109
+ mov.b32 %f9, %r24;
110
+ cvt.f32.bf16 %r25, %rs8;
111
+ mov.b32 %f10, %r25;
112
+ .loc 1 33 30
113
+ add.s64 %rd4, %rd10, %rd14;
114
+ .loc 1 33 46
115
+ mov.u32 %r26, 0x0;
116
+ mov.u32 %r27, 0x0;
117
+ @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
118
+ @!%p1 mov.u32 %r26, %r6;
119
+ @!%p1 mov.u32 %r27, %r6;
120
+ cvt.u16.u32 %rs9, %r26;
121
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
122
+ cvt.u16.u32 %rs11, %r27;
123
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
124
+ .loc 1 33 67
125
+ cvt.f32.bf16 %r30, %rs9;
126
+ mov.b32 %f11, %r30;
127
+ cvt.f32.bf16 %r31, %rs10;
128
+ mov.b32 %f12, %r31;
129
+ cvt.f32.bf16 %r32, %rs11;
130
+ mov.b32 %f13, %r32;
131
+ cvt.f32.bf16 %r33, %rs12;
132
+ mov.b32 %f14, %r33;
133
+ .loc 1 34 31
134
+ mul.wide.u32 %rd15, %r69, 4;
135
+ add.s64 %rd5, %rd11, %rd15;
136
+ .loc 1 34 36
137
+ mov.u32 %r34, 0x0;
138
+ mov.u32 %r35, 0x0;
139
+ mov.u32 %r36, 0x0;
140
+ mov.u32 %r37, 0x0;
141
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ];
142
+ @!%p1 mov.u32 %r34, %r6;
143
+ @!%p1 mov.u32 %r35, %r6;
144
+ @!%p1 mov.u32 %r36, %r6;
145
+ @!%p1 mov.u32 %r37, %r6;
146
+ .loc 1 36 18
147
+ add.f32 %f15, %f5, %f1;
148
+ add.f32 %f16, %f6, %f2;
149
+ .loc 1 38 18
150
+ add.f32 %f17, %f15, %f9;
151
+ add.f32 %f18, %f16, %f10;
152
+ .loc 1 30 46
153
+ mov.b32 %f19, %r2;
154
+ mov.b32 %f20, %r3;
155
+ .loc 1 36 18
156
+ add.f32 %f21, %f4, %f20;
157
+ add.f32 %f22, %f3, %f19;
158
+ .loc 1 38 18
159
+ add.f32 %f23, %f22, %f7;
160
+ add.f32 %f24, %f21, %f8;
161
+ .loc 1 40 18
162
+ add.f32 %f25, %f24, %f12;
163
+ add.f32 %f26, %f23, %f11;
164
+ add.f32 %f27, %f17, %f13;
165
+ add.f32 %f28, %f18, %f14;
166
+ $L__tmp1:
167
+ .loc 2 233 15
168
+ add.f32 %f29, %f26, %f25;
169
+ add.f32 %f30, %f29, %f27;
170
+ add.f32 %f31, %f30, %f28;
171
+ $L__tmp2:
172
+ .loc 2 243 36
173
+ mov.b32 %r72, %f31;
174
+ shfl.sync.bfly.b32 %r73, %r72, 16, 31, -1;
175
+ mov.b32 %f32, %r73;
176
+ $L__tmp3:
177
+ .loc 2 233 15
178
+ add.f32 %f33, %f31, %f32;
179
+ $L__tmp4:
180
+ .loc 2 243 36
181
+ mov.b32 %r74, %f33;
182
+ shfl.sync.bfly.b32 %r75, %r74, 8, 31, -1;
183
+ mov.b32 %f34, %r75;
184
+ $L__tmp5:
185
+ .loc 2 233 15
186
+ add.f32 %f35, %f33, %f34;
187
+ $L__tmp6:
188
+ .loc 2 243 36
189
+ mov.b32 %r76, %f35;
190
+ shfl.sync.bfly.b32 %r77, %r76, 4, 31, -1;
191
+ mov.b32 %f36, %r77;
192
+ $L__tmp7:
193
+ .loc 2 233 15
194
+ add.f32 %f37, %f35, %f36;
195
+ $L__tmp8:
196
+ .loc 2 243 36
197
+ mov.b32 %r78, %f37;
198
+ shfl.sync.bfly.b32 %r79, %r78, 2, 31, -1;
199
+ mov.b32 %f38, %r79;
200
+ $L__tmp9:
201
+ .loc 2 233 15
202
+ add.f32 %f39, %f37, %f38;
203
+ $L__tmp10:
204
+ .loc 2 243 36
205
+ mov.b32 %r80, %f39;
206
+ shfl.sync.bfly.b32 %r81, %r80, 1, 31, -1;
207
+ mov.b32 %f40, %r81;
208
+ $L__tmp11:
209
+ .loc 2 233 15
210
+ add.f32 %f41, %f39, %f40;
211
+ $L__tmp12:
212
+ .loc 2 243 36
213
+ setp.eq.s32 %p20, %r67, 0;
214
+ shr.u32 %r82, %r66, 3;
215
+ and.b32 %r83, %r82, 4;
216
+ mov.u32 %r84, global_smem;
217
+ add.s32 %r42, %r84, %r83;
218
+ mov.b32 %r43, %f41;
219
+ @%p20 st.shared.b32 [ %r42 + 0 ], %r43;
220
+ bar.sync 0;
221
+ setp.lt.s32 %p21, %r66, 2;
222
+ add.s32 %r45, %r84, %r68;
223
+ @%p21 ld.shared.b32 %r44, [ %r45 + 0 ];
224
+ mov.b32 %f42, %r44;
225
+ shfl.sync.bfly.b32 %r85, %r44, 1, 31, -1;
226
+ mov.b32 %f43, %r85;
227
+ $L__tmp13:
228
+ .loc 2 233 15
229
+ add.f32 %f44, %f42, %f43;
230
+ $L__tmp14:
231
+ .loc 2 243 36
232
+ and.b32 %r86, %r66, 1;
233
+ setp.eq.b32 %p27, %r86, 1;
234
+ not.pred %p28, %p27;
235
+ and.pred %p22, %p21, %p28;
236
+ mov.b32 %r47, %f44;
237
+ @%p22 st.shared.b32 [ %r45 + 0 ], %r47;
238
+ bar.sync 0;
239
+ ld.shared.f32 %f45, [global_smem];
240
+ $L__tmp15:
241
+ .loc 3 8 15
242
+ add.f32 %f46, %f45, 0f00000000;
243
+ $L__tmp16:
244
+ .loc 1 48 20
245
+ mov.b32 %r49, %f46;
246
+ mov.b32 %r50, 1132462080;
247
+ div.full.f32 %r48, %r49, %r50;
248
+ mov.b32 %f47, %r48;
249
+ .loc 1 49 20
250
+ sub.f32 %f48, %f26, %f47;
251
+ sub.f32 %f49, %f25, %f47;
252
+ sub.f32 %f50, %f27, %f47;
253
+ sub.f32 %f51, %f28, %f47;
254
+ .loc 1 50 20
255
+ mul.f32 %f52, %f49, %f49;
256
+ $L__tmp17:
257
+ .loc 2 243 36
258
+ bar.sync 0;
259
+ $L__tmp18:
260
+ .loc 2 233 15
261
+ fma.rn.f32 %f53, %f48, %f48, %f52;
262
+ fma.rn.f32 %f54, %f50, %f50, %f53;
263
+ fma.rn.f32 %f55, %f51, %f51, %f54;
264
+ $L__tmp19:
265
+ .loc 2 243 36
266
+ mov.b32 %r87, %f55;
267
+ shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1;
268
+ mov.b32 %f56, %r88;
269
+ $L__tmp20:
270
+ .loc 2 233 15
271
+ add.f32 %f57, %f55, %f56;
272
+ $L__tmp21:
273
+ .loc 2 243 36
274
+ mov.b32 %r89, %f57;
275
+ shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1;
276
+ mov.b32 %f58, %r90;
277
+ $L__tmp22:
278
+ .loc 2 233 15
279
+ add.f32 %f59, %f57, %f58;
280
+ $L__tmp23:
281
+ .loc 2 243 36
282
+ mov.b32 %r91, %f59;
283
+ shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1;
284
+ mov.b32 %f60, %r92;
285
+ $L__tmp24:
286
+ .loc 2 233 15
287
+ add.f32 %f61, %f59, %f60;
288
+ $L__tmp25:
289
+ .loc 2 243 36
290
+ mov.b32 %r93, %f61;
291
+ shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
292
+ mov.b32 %f62, %r94;
293
+ $L__tmp26:
294
+ .loc 2 233 15
295
+ add.f32 %f63, %f61, %f62;
296
+ $L__tmp27:
297
+ .loc 2 243 36
298
+ mov.b32 %r95, %f63;
299
+ shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
300
+ mov.b32 %f64, %r96;
301
+ $L__tmp28:
302
+ .loc 2 233 15
303
+ add.f32 %f65, %f63, %f64;
304
+ $L__tmp29:
305
+ .loc 2 243 36
306
+ mov.b32 %r52, %f65;
307
+ @%p20 st.shared.b32 [ %r42 + 0 ], %r52;
308
+ bar.sync 0;
309
+ @%p21 ld.shared.b32 %r53, [ %r45 + 0 ];
310
+ mov.b32 %f66, %r53;
311
+ shfl.sync.bfly.b32 %r97, %r53, 1, 31, -1;
312
+ mov.b32 %f67, %r97;
313
+ $L__tmp30:
314
+ .loc 2 233 15
315
+ add.f32 %f68, %f66, %f67;
316
+ $L__tmp31:
317
+ .loc 2 243 36
318
+ mov.b32 %r56, %f68;
319
+ @%p22 st.shared.b32 [ %r45 + 0 ], %r56;
320
+ bar.sync 0;
321
+ ld.shared.f32 %f69, [global_smem];
322
+ $L__tmp32:
323
+ .loc 3 8 15
324
+ add.f32 %f70, %f69, 0f00000000;
325
+ $L__tmp33:
326
+ .loc 1 56 20
327
+ mov.b32 %r58, %f70;
328
+ div.full.f32 %r57, %r58, %r50;
329
+ mov.b32 %f71, %r57;
330
+ .loc 1 58 20
331
+ add.f32 %f72, %f71, 0f3727C5AC;
332
+ .loc 1 59 26
333
+ rsqrt.approx.ftz.f32 %f73, %f72;
334
+ .loc 1 34 36
335
+ mov.b32 %f74, %r37;
336
+ mov.b32 %f75, %r36;
337
+ mov.b32 %f76, %r35;
338
+ mov.b32 %f77, %r34;
339
+ .loc 1 60 20
340
+ mul.f32 %f78, %f48, %f73;
341
+ mul.f32 %f79, %f49, %f73;
342
+ mul.f32 %f80, %f50, %f73;
343
+ mul.f32 %f81, %f51, %f73;
344
+ .loc 1 61 20
345
+ mul.f32 %f82, %f78, %f77;
346
+ mul.f32 %f83, %f79, %f76;
347
+ mul.f32 %f84, %f80, %f75;
348
+ mul.f32 %f85, %f81, %f74;
349
+ .loc 1 63 25
350
+ add.s64 %rd6, %rd12, %rd14;
351
+ .loc 1 63 48
352
+ mov.b32 %r60, %f82;
353
+ cvt.rn.bf16.f32 %rs13, %r60;
354
+ mov.b32 %r61, %f83;
355
+ cvt.rn.bf16.f32 %rs14, %r61;
356
+ mov.b32 %r62, %f84;
357
+ cvt.rn.bf16.f32 %rs15, %r62;
358
+ mov.b32 %r63, %f85;
359
+ cvt.rn.bf16.f32 %rs16, %r63;
360
+ mov.b32 %r98, {%rs13, %rs14};
361
+ mov.b32 %r99, {%rs15, %rs16};
362
+ @%p1 st.global.v2.b32 [ %rd6 + 0 ], { %r98, %r99 };
363
+ .loc 1 63 4
364
+ ret;
365
+ $L__tmp34:
366
+ $L__func_end0:
367
+
368
+ }
369
+ // .globl __nv_rsqrtf
370
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
371
+ .param .b32 __nv_rsqrtf_param_0
372
+ )
373
+ {
374
+ .reg .f32 %f<3>;
375
+ $L__func_begin1:
376
+
377
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
378
+ rsqrt.approx.ftz.f32 %f2, %f1;
379
+ st.param.f32 [func_retval0+0], %f2;
380
+ ret;
381
+ $L__func_end1:
382
+
383
+ }
384
+ .file 1 "/tmp/torchinductor_root/4q/c4qmi2qsgi5mnuig7w3wx5jmjnmvktjlgcv4c6q7w2vaw3bk6qzb.py"
385
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
386
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
387
+ .section .debug_abbrev
388
+ {
389
+ .b8 1
390
+ .b8 17
391
+ .b8 1
392
+ .b8 37
393
+ .b8 8
394
+ .b8 19
395
+ .b8 5
396
+ .b8 3
397
+ .b8 8
398
+ .b8 16
399
+ .b8 6
400
+ .b8 27
401
+ .b8 8
402
+ .b8 180
403
+ .b8 66
404
+ .b8 12
405
+ .b8 17
406
+ .b8 1
407
+ .b8 18
408
+ .b8 1
409
+ .b8 0
410
+ .b8 0
411
+ .b8 2
412
+ .b8 46
413
+ .b8 0
414
+ .b8 135
415
+ .b8 64
416
+ .b8 8
417
+ .b8 3
418
+ .b8 8
419
+ .b8 58
420
+ .b8 11
421
+ .b8 59
422
+ .b8 11
423
+ .b8 63
424
+ .b8 12
425
+ .b8 32
426
+ .b8 11
427
+ .b8 0
428
+ .b8 0
429
+ .b8 3
430
+ .b8 46
431
+ .b8 1
432
+ .b8 17
433
+ .b8 1
434
+ .b8 18
435
+ .b8 1
436
+ .b8 64
437
+ .b8 10
438
+ .b8 49
439
+ .b8 19
440
+ .b8 0
441
+ .b8 0
442
+ .b8 4
443
+ .b8 29
444
+ .b8 1
445
+ .b8 49
446
+ .b8 19
447
+ .b8 17
448
+ .b8 1
449
+ .b8 18
450
+ .b8 1
451
+ .b8 88
452
+ .b8 11
453
+ .b8 89
454
+ .b8 11
455
+ .b8 87
456
+ .b8 11
457
+ .b8 0
458
+ .b8 0
459
+ .b8 5
460
+ .b8 29
461
+ .b8 0
462
+ .b8 49
463
+ .b8 19
464
+ .b8 17
465
+ .b8 1
466
+ .b8 18
467
+ .b8 1
468
+ .b8 88
469
+ .b8 11
470
+ .b8 89
471
+ .b8 11
472
+ .b8 87
473
+ .b8 11
474
+ .b8 0
475
+ .b8 0
476
+ .b8 0
477
+ }
478
+ .section .debug_info
479
+ {
480
+ .b32 399
481
+ .b8 2
482
+ .b8 0
483
+ .b32 .debug_abbrev
484
+ .b8 8
485
+ .b8 1
486
+ .b8 116
487
+ .b8 114
488
+ .b8 105
489
+ .b8 116
490
+ .b8 111
491
+ .b8 110
492
+ .b8 0
493
+ .b8 2
494
+ .b8 0
495
+ .b8 99
496
+ .b8 52
497
+ .b8 113
498
+ .b8 109
499
+ .b8 105
500
+ .b8 50
501
+ .b8 113
502
+ .b8 115
503
+ .b8 103
504
+ .b8 105
505
+ .b8 53
506
+ .b8 109
507
+ .b8 110
508
+ .b8 117
509
+ .b8 105
510
+ .b8 103
511
+ .b8 55
512
+ .b8 119
513
+ .b8 51
514
+ .b8 119
515
+ .b8 120
516
+ .b8 53
517
+ .b8 106
518
+ .b8 109
519
+ .b8 106
520
+ .b8 110
521
+ .b8 109
522
+ .b8 118
523
+ .b8 107
524
+ .b8 116
525
+ .b8 106
526
+ .b8 108
527
+ .b8 103
528
+ .b8 99
529
+ .b8 118
530
+ .b8 52
531
+ .b8 99
532
+ .b8 54
533
+ .b8 113
534
+ .b8 55
535
+ .b8 119
536
+ .b8 50
537
+ .b8 118
538
+ .b8 97
539
+ .b8 119
540
+ .b8 51
541
+ .b8 98
542
+ .b8 107
543
+ .b8 54
544
+ .b8 113
545
+ .b8 122
546
+ .b8 98
547
+ .b8 46
548
+ .b8 112
549
+ .b8 121
550
+ .b8 0
551
+ .b32 .debug_line
552
+ .b8 47
553
+ .b8 116
554
+ .b8 109
555
+ .b8 112
556
+ .b8 47
557
+ .b8 116
558
+ .b8 111
559
+ .b8 114
560
+ .b8 99
561
+ .b8 104
562
+ .b8 105
563
+ .b8 110
564
+ .b8 100
565
+ .b8 117
566
+ .b8 99
567
+ .b8 116
568
+ .b8 111
569
+ .b8 114
570
+ .b8 95
571
+ .b8 114
572
+ .b8 111
573
+ .b8 111
574
+ .b8 116
575
+ .b8 47
576
+ .b8 52
577
+ .b8 113
578
+ .b8 0
579
+ .b8 1
580
+ .b64 $L__func_begin0
581
+ .b64 $L__func_end0
582
+ .b8 2
583
+ .b8 116
584
+ .b8 114
585
+ .b8 105
586
+ .b8 116
587
+ .b8 111
588
+ .b8 110
589
+ .b8 95
590
+ .b8 95
591
+ .b8 48
592
+ .b8 100
593
+ .b8 49
594
+ .b8 100
595
+ .b8 50
596
+ .b8 100
597
+ .b8 51
598
+ .b8 100
599
+ .b8 52
600
+ .b8 100
601
+ .b8 53
602
+ .b8 100
603
+ .b8 54
604
+ .b8 100
605
+ .b8 101
606
+ .b8 55
607
+ .b8 100
608
+ .b8 101
609
+ .b8 0
610
+ .b8 116
611
+ .b8 114
612
+ .b8 105
613
+ .b8 116
614
+ .b8 111
615
+ .b8 110
616
+ .b8 95
617
+ .b8 95
618
+ .b8 48
619
+ .b8 100
620
+ .b8 49
621
+ .b8 100
622
+ .b8 50
623
+ .b8 100
624
+ .b8 51
625
+ .b8 100
626
+ .b8 52
627
+ .b8 100
628
+ .b8 53
629
+ .b8 100
630
+ .b8 54
631
+ .b8 100
632
+ .b8 101
633
+ .b8 55
634
+ .b8 100
635
+ .b8 101
636
+ .b8 0
637
+ .b8 1
638
+ .b8 18
639
+ .b8 1
640
+ .b8 1
641
+ .b8 3
642
+ .b64 $L__func_begin0
643
+ .b64 $L__func_end0
644
+ .b8 1
645
+ .b8 156
646
+ .b32 125
647
+ .b8 4
648
+ .b32 125
649
+ .b64 $L__tmp1
650
+ .b64 $L__tmp14
651
+ .b8 2
652
+ .b8 45
653
+ .b8 59
654
+ .b8 5
655
+ .b32 125
656
+ .b64 $L__tmp1
657
+ .b64 $L__tmp14
658
+ .b8 2
659
+ .b8 243
660
+ .b8 36
661
+ .b8 0
662
+ .b8 5
663
+ .b32 125
664
+ .b64 $L__tmp2
665
+ .b64 $L__tmp15
666
+ .b8 2
667
+ .b8 45
668
+ .b8 59
669
+ .b8 5
670
+ .b32 125
671
+ .b64 $L__tmp15
672
+ .b64 $L__tmp16
673
+ .b8 3
674
+ .b8 45
675
+ .b8 45
676
+ .b8 5
677
+ .b32 125
678
+ .b64 $L__tmp17
679
+ .b64 $L__tmp32
680
+ .b8 2
681
+ .b8 53
682
+ .b8 59
683
+ .b8 4
684
+ .b32 125
685
+ .b64 $L__tmp18
686
+ .b64 $L__tmp31
687
+ .b8 2
688
+ .b8 53
689
+ .b8 59
690
+ .b8 5
691
+ .b32 125
692
+ .b64 $L__tmp18
693
+ .b64 $L__tmp31
694
+ .b8 2
695
+ .b8 243
696
+ .b8 36
697
+ .b8 0
698
+ .b8 5
699
+ .b32 125
700
+ .b64 $L__tmp32
701
+ .b64 $L__tmp33
702
+ .b8 3
703
+ .b8 53
704
+ .b8 45
705
+ .b8 0
706
+ .b8 0
707
+ }
708
+ .section .debug_pubnames
709
+ {
710
+ .b32 $L__pubNames_end0-$L__pubNames_start0
711
+ $L__pubNames_start0:
712
+ .b8 2
713
+ .b8 0
714
+ .b32 .debug_info
715
+ .b32 403
716
+ .b32 125
717
+ .b8 116
718
+ .b8 114
719
+ .b8 105
720
+ .b8 116
721
+ .b8 111
722
+ .b8 110
723
+ .b8 95
724
+ .b8 95
725
+ .b8 48
726
+ .b8 100
727
+ .b8 49
728
+ .b8 100
729
+ .b8 50
730
+ .b8 100
731
+ .b8 51
732
+ .b8 100
733
+ .b8 52
734
+ .b8 100
735
+ .b8 53
736
+ .b8 100
737
+ .b8 54
738
+ .b8 100
739
+ .b8 101
740
+ .b8 55
741
+ .b8 100
742
+ .b8 101
743
+ .b8 0
744
+ .b32 0
745
+ $L__pubNames_end0:
746
+ }
747
+ .section .debug_pubtypes
748
+ {
749
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
750
+ $L__pubTypes_start0:
751
+ .b8 2
752
+ .b8 0
753
+ .b32 .debug_info
754
+ .b32 403
755
+ .b32 0
756
+ $L__pubTypes_end0:
757
+ }
758
+ .section .debug_loc { }
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant 9.99999974E-6 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 0.000000e+00 : f32
8
+ %c256_i32 = arith.constant 256 : i32
9
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
20
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
21
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
22
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
23
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
27
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
28
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
29
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
30
+ %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
31
+ %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
32
+ %21 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
33
+ %22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
34
+ %23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
35
+ %24 = arith.addf %8, %12 : tensor<256xf32, #blocked>
36
+ %25 = arith.addf %24, %16 : tensor<256xf32, #blocked>
37
+ %26 = arith.addf %25, %20 : tensor<256xf32, #blocked>
38
+ %27 = arith.select %2, %26, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
39
+ %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
40
+ ^bb0(%arg8: f32, %arg9: f32):
41
+ %46 = arith.addf %arg8, %arg9 : f32
42
+ tt.reduce.return %46 : f32
43
+ }) : (tensor<256xf32, #blocked>) -> f32
44
+ %29 = arith.addf %28, %cst_2 : f32
45
+ %30 = arith.divf %29, %cst_1 : f32
46
+ %31 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
47
+ %32 = arith.subf %26, %31 : tensor<256xf32, #blocked>
48
+ %33 = arith.mulf %32, %32 : tensor<256xf32, #blocked>
49
+ %34 = arith.select %2, %33, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
50
+ %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
51
+ ^bb0(%arg8: f32, %arg9: f32):
52
+ %46 = arith.addf %arg8, %arg9 : f32
53
+ tt.reduce.return %46 : f32
54
+ }) : (tensor<256xf32, #blocked>) -> f32
55
+ %36 = arith.addf %35, %cst_2 : f32
56
+ %37 = arith.divf %36, %cst_1 : f32
57
+ %38 = arith.addf %37, %cst_0 : f32
58
+ %39 = tt.extern_elementwise %38 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
59
+ %40 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
60
+ %41 = arith.mulf %32, %40 : tensor<256xf32, #blocked>
61
+ %42 = arith.mulf %41, %23 : tensor<256xf32, #blocked>
62
+ %43 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
63
+ %44 = tt.addptr %43, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
64
+ %45 = arith.truncf %42 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
65
+ tt.store %44, %45, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
66
+ tt.return
67
+ }
68
+ }
.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttir ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
26
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
27
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
28
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
29
+ %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
30
+ %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
31
+ %21 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
32
+ %22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
33
+ %23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
34
+ %24 = arith.addf %8, %12 : tensor<256xf32>
35
+ %25 = arith.addf %24, %16 : tensor<256xf32>
36
+ %26 = arith.addf %25, %20 : tensor<256xf32>
37
+ %27 = arith.select %2, %26, %cst_3 : tensor<256xi1>, tensor<256xf32>
38
+ %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
39
+ ^bb0(%arg8: f32, %arg9: f32):
40
+ %46 = arith.addf %arg8, %arg9 : f32
41
+ tt.reduce.return %46 : f32
42
+ }) : (tensor<256xf32>) -> f32
43
+ %29 = arith.addf %28, %cst_0 : f32
44
+ %30 = arith.divf %29, %cst_1 : f32
45
+ %31 = tt.splat %30 : (f32) -> tensor<256xf32>
46
+ %32 = arith.subf %26, %31 : tensor<256xf32>
47
+ %33 = arith.mulf %32, %32 : tensor<256xf32>
48
+ %34 = arith.select %2, %33, %cst_3 : tensor<256xi1>, tensor<256xf32>
49
+ %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
50
+ ^bb0(%arg8: f32, %arg9: f32):
51
+ %46 = arith.addf %arg8, %arg9 : f32
52
+ tt.reduce.return %46 : f32
53
+ }) : (tensor<256xf32>) -> f32
54
+ %36 = arith.addf %35, %cst_0 : f32
55
+ %37 = arith.divf %36, %cst_1 : f32
56
+ %38 = arith.addf %37, %cst_2 : f32
57
+ %39 = tt.extern_elementwise %38 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
58
+ %40 = tt.splat %39 : (f32) -> tensor<256xf32>
59
+ %41 = arith.mulf %32, %40 : tensor<256xf32>
60
+ %42 = arith.mulf %41, %23 : tensor<256xf32>
61
+ %43 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
62
+ %44 = tt.addptr %43, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
63
+ %45 = arith.truncf %42 : tensor<256xf32> to tensor<256xbf16>
64
+ tt.store %44, %45, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
65
+ tt.return
66
+ }
67
+ }
.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin ADDED
Binary file (26 kB). View file