diff --git a/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.cubin b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..08bd50cb922751e8619df4605235e68580b56281 Binary files /dev/null and b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.cubin differ diff --git a/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..5c25aed3c1d0756632199dd2e15fd7447081a1ca --- /dev/null +++ b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir @@ -0,0 +1,523 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %10 = and i32 %9, 31, !dbg !10 + %11 = lshr i32 %9, 5, !dbg !10 + %12 = and i32 %11, 3, !dbg !10 + %13 = lshr i32 %10, 1, !dbg !10 + %14 = shl nuw nsw i32 %12, 4, !dbg !10 + %15 = or i32 %14, %13, !dbg !10 + %16 = and i32 %9, 63, !dbg !10 + %17 = shl i32 %9, 2, !dbg !11 + %18 = and i32 %17, 4, !dbg !11 + %19 = and i32 %9, 7, !dbg !11 + %20 = shl nuw nsw i32 %12, 2, !dbg !12 + %21 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13 + %22 = shl i32 %21, 6, !dbg !14 + %23 = or i32 %22, %15, !dbg !15 + %24 = or i32 %22, %16, !dbg !15 + %25 = sext i32 %23 to i64, !dbg !16 + %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16 + %27 = sext i32 %24 to i64, !dbg !16 + %28 = getelementptr i64, ptr addrspace(1) %0, i64 %27, !dbg !16 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #6, !dbg !17 + %34 = srem i32 %23, 512, !dbg !18 + %35 = shl nsw i32 %34, 8, !dbg !19 + %36 = shl i32 %23, 8, !dbg !20 + %37 = add i64 %33, 50257, !dbg !21 + %38 = icmp slt i64 %29, 0, !dbg !22 + %39 = icmp slt i64 %33, 0, !dbg !22 + %40 = select i1 %39, i64 %37, i64 %33, !dbg !23 +
%41 = icmp ugt i64 %40, 50256, !dbg !24 + %42 = shl i64 %29, 8, !dbg !25 + %43 = add i64 %42, 12865792, !dbg !25 + %44 = select i1 %38, i64 %43, i64 %42, !dbg !25 + %45 = getelementptr float, ptr addrspace(1) %1, i64 %44 + br label %46, !dbg !12 + +46: ; preds = %8, %92 + %47 = phi float [ 0.000000e+00, %8 ], [ %116, %92 ] + %48 = phi float [ 0.000000e+00, %8 ], [ %117, %92 ] + %49 = phi float [ 0.000000e+00, %8 ], [ %118, %92 ] + %50 = phi float [ 0.000000e+00, %8 ], [ %119, %92 ] + %51 = phi float [ 0.000000e+00, %8 ], [ %120, %92 ] + %52 = phi float [ 0.000000e+00, %8 ], [ %121, %92 ] + %53 = phi float [ 0.000000e+00, %8 ], [ %122, %92 ] + %54 = phi float [ 0.000000e+00, %8 ], [ %123, %92 ] + %55 = phi float [ 0.000000e+00, %8 ], [ %140, %92 ] + %56 = phi float [ 0.000000e+00, %8 ], [ %141, %92 ] + %57 = phi float [ 0.000000e+00, %8 ], [ %142, %92 ] + %58 = phi float [ 0.000000e+00, %8 ], [ %143, %92 ] + %59 = phi float [ 0.000000e+00, %8 ], [ %128, %92 ] + %60 = phi float [ 0.000000e+00, %8 ], [ %129, %92 ] + %61 = phi float [ 0.000000e+00, %8 ], [ %130, %92 ] + %62 = phi float [ 0.000000e+00, %8 ], [ %131, %92 ] + %63 = phi i32 [ 0, %8 ], [ %144, %92 ] + %64 = or i32 %63, %18, !dbg !26 + %65 = add i32 %64, %35, !dbg !27 + %66 = sext i32 %65 to i64, !dbg !28 + %67 = getelementptr float, ptr addrspace(1) %2, i64 %66, !dbg !28 + %68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %67, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29 + %69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !29 + %70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !29 + %71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !29 + %72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !29 + %73 = bitcast i32 %69 to float, !dbg !29 + %74 = bitcast i32 %70 to float, !dbg !29 + %75 = bitcast i32 %71 to float, !dbg !29 + %76 = bitcast i32 %72 to float, !dbg !29 + %77 = add i32 %64, %36, !dbg !30 + %78 = sext i32 %77 to i64, !dbg !31 + %79 = getelementptr i16, ptr addrspace(1) %3, i64 %78, !dbg !31 + %80 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %81 = extractvalue { i32, i32 } %80, 0, !dbg !32 + %82 = extractvalue { i32, i32 } %80, 1, !dbg !32 + %83 = trunc i32 %81 to i16, !dbg !32 + %extelt.offset3 = lshr i32 %81, 16, !dbg !32 + %84 = trunc i32 %extelt.offset3 to i16, !dbg !32 + %85 = trunc i32 %82 to i16, !dbg !32 + %extelt.offset4 = lshr i32 %82, 16, !dbg !32 + %86 = trunc i32 %extelt.offset4 to i16, !dbg !32 + %87 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %83) #6, !dbg !33 + %88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #6, !dbg !33 + %89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #6, !dbg !33 + %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #6, !dbg !33 + br i1 %41, label %91, label %92, !dbg !34 + +91: ; preds = %46 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull 
@assertFunc_0, i64 1), !dbg !34 + br label %92, !dbg !34 + +92: ; preds = %91, %46 + %93 = zext nneg i32 %64 to i64, !dbg !35 + %94 = getelementptr float, ptr addrspace(1) %45, i64 %93, !dbg !36 + %95 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %94, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37 + %96 = extractvalue { i32, i32, i32, i32 } %95, 0, !dbg !37 + %97 = extractvalue { i32, i32, i32, i32 } %95, 1, !dbg !37 + %98 = extractvalue { i32, i32, i32, i32 } %95, 2, !dbg !37 + %99 = extractvalue { i32, i32, i32, i32 } %95, 3, !dbg !37 + %100 = bitcast i32 %96 to float, !dbg !37 + %101 = bitcast i32 %97 to float, !dbg !37 + %102 = bitcast i32 %98 to float, !dbg !37 + %103 = bitcast i32 %99 to float, !dbg !37 + %104 = fadd float %73, %100, !dbg !38 + %105 = fadd float %74, %101, !dbg !38 + %106 = fadd float %75, %102, !dbg !38 + %107 = fadd float %76, %103, !dbg !38 + %108 = fadd float %87, %104, !dbg !39 + %109 = fadd float %88, %105, !dbg !39 + %110 = fadd float %89, %106, !dbg !39 + %111 = fadd float %90, %107, !dbg !39 + %112 = fsub float %108, %59, !dbg !40 + %113 = fsub float %109, %60, !dbg !40 + %114 = fsub float %110, %61, !dbg !40 + %115 = fsub float %111, %62, !dbg !40 + %116 = fadd float %47, 1.000000e+00, !dbg !44 + %117 = fadd float %48, 1.000000e+00, !dbg !44 + %118 = fadd float %49, 1.000000e+00, !dbg !44 + %119 = fadd float %50, 1.000000e+00, !dbg !44 + %120 = fadd float %51, 1.000000e+00, !dbg !44 + %121 = fadd float %52, 1.000000e+00, !dbg !44 + %122 = fadd float %53, 1.000000e+00, !dbg !44 + %123 = fadd float %54, 1.000000e+00, !dbg !44 + %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float %116) #6, !dbg !45 + %125 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %113, float %117) #6, !dbg !45 + %126 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float %118) #6, !dbg !45 + %127 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %115, float %119) #6, !dbg !45 + %128 = fadd float %59, %124, !dbg !46 + %129 = fadd float %60, %125, !dbg !46 + %130 = fadd float %61, %126, !dbg !46 + %131 = fadd float %62, %127, !dbg !46 + %132 = fsub float %108, %128, !dbg !47 + %133 = fsub float %109, %129, !dbg !47 + %134 = fsub float %110, %130, !dbg !47 + %135 = fsub float %111, %131, !dbg !47 + %136 = fmul float %112, %132, !dbg !48 + %137 = fmul float %113, %133, !dbg !48 + %138 = fmul float %114, %134, !dbg !48 + %139 = fmul float %115, %135, !dbg !48 + %140 = fadd float %55, %136, !dbg !49 + %141 = fadd float %56, %137, !dbg !49 + %142 = fadd float %57, %138, !dbg !49 + %143 = fadd float %58, %139, !dbg !49 + %144 = add nuw nsw i32 %63, 8, !dbg !12 + %145 = icmp ult i32 %63, 248, !dbg !12 + br i1 %145, label %46, label %146, !dbg !12 + +146: ; preds = %92 + %147 = lshr i32 %10, 3, !dbg !12 + %148 = or i32 %20, %147, !dbg !12 + %149 = mul nuw nsw i32 %148, 12, !dbg !12 + %150 = add nuw nsw i32 %149, %19, !dbg !12 + %151 = zext nneg i32 %150 to i64, !dbg !12 + %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12 + %153 = insertelement <1 x float> undef, float %120, i64 0, !dbg !12 + store <1 x float> %153, ptr addrspace(3) 
%152, align 4, !dbg !12 + %154 = or i32 %19, 192, !dbg !12 + %155 = add nuw nsw i32 %154, %149, !dbg !12 + %156 = zext nneg i32 %155 to i64, !dbg !12 + %157 = getelementptr float, ptr addrspace(3) @global_smem, i64 %156, !dbg !12 + %158 = insertelement <1 x float> undef, float %121, i64 0, !dbg !12 + store <1 x float> %158, ptr addrspace(3) %157, align 4, !dbg !12 + %159 = or i32 %19, 384, !dbg !12 + %160 = add nuw nsw i32 %159, %149, !dbg !12 + %161 = zext nneg i32 %160 to i64, !dbg !12 + %162 = getelementptr float, ptr addrspace(3) @global_smem, i64 %161, !dbg !12 + %163 = insertelement <1 x float> undef, float %122, i64 0, !dbg !12 + store <1 x float> %163, ptr addrspace(3) %162, align 4, !dbg !12 + %164 = or i32 %19, 576, !dbg !12 + %165 = add nuw nsw i32 %164, %149, !dbg !12 + %166 = zext nneg i32 %165 to i64, !dbg !12 + %167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !12 + %168 = insertelement <1 x float> undef, float %123, i64 0, !dbg !12 + store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %169 = mul nuw nsw i32 %15, 12, !dbg !12 + %170 = add nuw nsw i32 %169, %18, !dbg !12 + %171 = zext nneg i32 %170 to i64, !dbg !12 + %172 = getelementptr float, ptr addrspace(3) @global_smem, i64 %171, !dbg !12 + %173 = load float, ptr addrspace(3) %172, align 16, !dbg !12 + %174 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 1, !dbg !12 + %175 = load float, ptr addrspace(3) %174, align 4, !dbg !12 + %176 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 2, !dbg !12 + %177 = load float, ptr addrspace(3) %176, align 8, !dbg !12 + %178 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 3, !dbg !12 + %179 = load float, ptr addrspace(3) %178, align 4, !dbg !12 + %180 = fsub float %129, %128, !dbg !50 + %181 = fadd float %173, %175, !dbg !54 + %182 = fcmp oeq float %181, 0.000000e+00, !dbg !55 + %183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %181) #6, !dbg !56 + %184 = select i1 %182, float 0.000000e+00, float %183, !dbg !57 + %185 = fmul float %180, %184, !dbg !58 + %186 = fadd float %128, %185, !dbg !59 + %187 = fadd float %140, %141, !dbg !60 + %188 = fmul float %180, %180, !dbg !61 + %189 = fmul float %188, %173, !dbg !62 + %190 = fmul float %189, %184, !dbg !63 + %191 = fadd float %187, %190, !dbg !64 + %192 = fsub float %130, %186, !dbg !50 + %193 = fadd float %177, %181, !dbg !54 + %194 = fcmp oeq float %193, 0.000000e+00, !dbg !55 + %195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %193) #6, !dbg !56 + %196 = select i1 %194, float 0.000000e+00, float %195, !dbg !57 + %197 = fmul float %196, %192, !dbg !58 + %198 = fadd float %186, %197, !dbg !59 + %199 = fadd float %142, %191, !dbg !60 + %200 = fmul float %192, %192, !dbg !61 + %201 = fmul float %181, %200, !dbg !62 + %202 = fmul float %196, %201, !dbg !63 + %203 = fadd float %199, %202, !dbg !64 + %204 = fsub float %131, %198, !dbg !50 + %205 = fadd float %179, %193, !dbg !54 + %206 = fcmp oeq float %205, 0.000000e+00, !dbg !55 + %207 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %179, float %205) #6, !dbg !56 + %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !57 + %209 = fmul float %208, %204, !dbg !58 + %210 = fadd float %198, %209, !dbg !59 + %211 = fadd float %143, %203, !dbg !60 + %212 = fmul float %204, %204, !dbg !61 + %213 = fmul float %193, %212, !dbg !62 + %214 = fmul float %208, 
%213, !dbg !63 + %215 = fadd float %211, %214, !dbg !64 + %216 = bitcast float %210 to i32, !dbg !65 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !65 + %218 = bitcast i32 %217 to float, !dbg !65 + %219 = bitcast float %215 to i32, !dbg !65 + %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !65 + %221 = bitcast i32 %220 to float, !dbg !65 + %222 = bitcast float %205 to i32, !dbg !65 + %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !65 + %224 = bitcast i32 %223 to float, !dbg !65 + %225 = fsub float %218, %210, !dbg !50 + %226 = fadd float %205, %224, !dbg !54 + %227 = fcmp oeq float %226, 0.000000e+00, !dbg !55 + %228 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %224, float %226) #6, !dbg !56 + %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !57 + %230 = fmul float %229, %225, !dbg !58 + %231 = fadd float %210, %230, !dbg !59 + %232 = fadd float %215, %221, !dbg !60 + %233 = fmul float %225, %225, !dbg !61 + %234 = fmul float %205, %233, !dbg !62 + %235 = fmul float %229, %234, !dbg !63 + %236 = fadd float %232, %235, !dbg !64 + %237 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67 + %238 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67 + %239 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67 + %240 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67 + %241 = fadd float %237, 0x3EE4F8B580000000, !dbg !68 + br label %242, !dbg !69 + +242: ; preds = %146, %__nv_rsqrtf.exit + %243 = phi i32 [ 0, %146 ], [ %333, %__nv_rsqrtf.exit ] + %244 = or i32 %243, %18, !dbg !70 + %245 = add i32 %244, %35, !dbg !71 + %246 = sext i32 %245 to i64, !dbg !72 + %247 = getelementptr float, ptr addrspace(1) %2, i64 %246, !dbg !72 + %248 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %247, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %249 = extractvalue { i32, i32, i32, i32 } %248, 0, !dbg !73 + %250 = extractvalue { i32, i32, i32, i32 } %248, 1, !dbg !73 + %251 = extractvalue { i32, i32, i32, i32 } %248, 2, !dbg !73 + %252 = extractvalue { i32, i32, i32, i32 } %248, 3, !dbg !73 + %253 = bitcast i32 %249 to float, !dbg !73 + %254 = bitcast i32 %250 to float, !dbg !73 + %255 = bitcast i32 %251 to float, !dbg !73 + %256 = bitcast i32 %252 to float, !dbg !73 + %257 = add i32 %244, %36, !dbg !74 + %258 = sext i32 %257 to i64, !dbg !75 + %259 = getelementptr i16, ptr addrspace(1) %3, i64 %258, !dbg !75 + %260 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %259, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76 + %261 = extractvalue { i32, i32 } %260, 0, !dbg !76 + %262 = extractvalue { i32, i32 } %260, 1, !dbg !76 + %263 = trunc i32 %261 to i16, !dbg !76 + %extelt.offset = lshr i32 %261, 16, !dbg !76 
+ %264 = trunc i32 %extelt.offset to i16, !dbg !76 + %265 = trunc i32 %262 to i16, !dbg !76 + %extelt.offset2 = lshr i32 %262, 16, !dbg !76 + %266 = trunc i32 %extelt.offset2 to i16, !dbg !76 + %267 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %263) #6, !dbg !77 + %268 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %264) #6, !dbg !77 + %269 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %265) #6, !dbg !77 + %270 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %266) #6, !dbg !77 + %271 = zext nneg i32 %244 to i64, !dbg !78 + %272 = getelementptr float, ptr addrspace(1) %4, i64 %271, !dbg !78 + %273 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %272, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79 + %274 = extractvalue { i32, i32, i32, i32 } %273, 0, !dbg !79 + %275 = extractvalue { i32, i32, i32, i32 } %273, 1, !dbg !79 + %276 = extractvalue { i32, i32, i32, i32 } %273, 2, !dbg !79 + %277 = extractvalue { i32, i32, i32, i32 } %273, 3, !dbg !79 + %278 = bitcast i32 %274 to float, !dbg !79 + %279 = bitcast i32 %275 to float, !dbg !79 + %280 = bitcast i32 %276 to float, !dbg !79 + %281 = bitcast i32 %277 to float, !dbg !79 + br i1 %41, label %282, label %283, !dbg !80 + +282: ; preds = %242 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80 + br label %283, !dbg !80 + +283: ; preds = %282, %242 + %284 = getelementptr float, ptr addrspace(1) %45, i64 %271, !dbg !81 + %285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %284, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82 + %286 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !82 + %287 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !82 + %288 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !82 + %289 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !82 + %290 = bitcast i32 %286 to float, !dbg !82 + %291 = bitcast i32 %287 to float, !dbg !82 + %292 = bitcast i32 %288 to float, !dbg !82 + %293 = bitcast i32 %289 to float, !dbg !82 + %294 = fadd float %253, %290, !dbg !83 + %295 = fadd float %254, %291, !dbg !83 + %296 = fadd float %255, %292, !dbg !83 + %297 = fadd float %256, %293, !dbg !83 + %298 = fadd float %267, %294, !dbg !84 + %299 = fadd float %268, %295, !dbg !84 + %300 = fadd float %269, %296, !dbg !84 + %301 = fadd float %270, %297, !dbg !84 + %302 = fsub float %298, %231, !dbg !85 + %303 = fsub float %299, %231, !dbg !85 + %304 = fsub float %300, %231, !dbg !85 + %305 = fsub float %301, %231, !dbg !85 + %306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %.not.i = icmp eq i32 %306, 0, !dbg !86 + br i1 %.not.i, label %309, label %307, !dbg !86 + +307: ; preds = %283 + %308 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %241), !dbg !86 + br label 
%__nv_rsqrtf.exit, !dbg !86 + +309: ; preds = %283 + %310 = tail call float @llvm.nvvm.rsqrt.approx.f(float %241), !dbg !86 + br label %__nv_rsqrtf.exit, !dbg !86 + +__nv_rsqrtf.exit: ; preds = %307, %309 + %.0.i = phi float [ %308, %307 ], [ %310, %309 ], !dbg !86 + %311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86 + %314 = fmul float %302, %.0.i, !dbg !87 + %315 = fmul float %303, %.0.i, !dbg !87 + %316 = fmul float %304, %.0.i, !dbg !87 + %317 = fmul float %305, %.0.i, !dbg !87 + %318 = fmul float %314, %278, !dbg !88 + %319 = fmul float %315, %279, !dbg !88 + %320 = fmul float %316, %280, !dbg !88 + %321 = fmul float %317, %281, !dbg !88 + %322 = getelementptr i16, ptr addrspace(1) %5, i64 %258, !dbg !89 + %323 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %318) #6, !dbg !90 + %324 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %319) #6, !dbg !90 + %325 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %320) #6, !dbg !90 + %326 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !90 + %327 = insertelement <2 x i16> undef, i16 %323, i64 0, !dbg !90 + %328 = insertelement <2 x i16> %327, i16 %324, i64 1, !dbg !90 + %329 = bitcast <2 x i16> %328 to i32, !dbg !90 + %330 = insertelement <2 x i16> undef, i16 %325, i64 0, !dbg !90 + %331 = insertelement <2 x i16> %330, i16 %326, i64 1, !dbg !90 + %332 = bitcast <2 x i16> %331 to i32, !dbg !90 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %329, i32 %332, ptr addrspace(1) %322, i1 true) #6, !dbg !90 + %333 = add nuw nsw i32 %243, 8, !dbg !69 + %334 = icmp ult i32 %243, 248, !dbg !69 + br i1 %334, label %242, label %335, !dbg !69 + +335: ; preds = %__nv_rsqrtf.exit + ret void, !dbg !91 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", directory: "/tmp/torchinductor_root/ci") +!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 31, column: 36, scope: !7) +!13 = !DILocation(line: 21, column: 28, scope: !7) +!14 = !DILocation(line: 21, column: 33, scope: !7) +!15 = !DILocation(line: 22, column: 23, scope: !7) +!16 = !DILocation(line: 26, column: 30, scope: !7) +!17 = !DILocation(line: 26, column: 35, scope: !7) +!18 = !DILocation(line: 27, column: 18, scope: !7) +!19 = !DILocation(line: 35, column: 44, scope: !7) +!20 = !DILocation(line: 36, column: 44, scope: !7) +!21 = !DILocation(line: 37, column: 22, scope: !7) +!22 = !DILocation(line: 38, column: 22, scope: !7) +!23 = !DILocation(line: 39, column: 36, scope: !7) +!24 = !DILocation(line: 40, column: 40, scope: !7) +!25 = !DILocation(line: 41, column: 44, scope: !7) +!26 = !DILocation(line: 32, column: 27, scope: !7) +!27 = !DILocation(line: 35, column: 40, scope: !7) +!28 = !DILocation(line: 35, column: 34, scope: !7) +!29 = !DILocation(line: 35, column: 50, scope: !7) +!30 = !DILocation(line: 36, column: 40, scope: !7) +!31 = !DILocation(line: 36, column: 34, scope: !7) +!32 = !DILocation(line: 36, column: 50, scope: !7) +!33 = !DILocation(line: 36, column: 101, scope: !7) +!34 = !DILocation(line: 40, column: 55, scope: !7) +!35 = !DILocation(line: 41, column: 40, scope: !7) +!36 = !DILocation(line: 41, column: 34, scope: !7) +!37 = !DILocation(line: 41, column: 52, scope: !7) +!38 = !DILocation(line: 42, column: 22, scope: !7) +!39 = !DILocation(line: 44, column: 22, scope: !7) +!40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43) +!41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0) +!42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!43 = !DILocation(line: 47, column: 41, scope: !41) +!44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43) +!45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43) +!46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43) +!47 = !DILocation(line: 101, 
column: 30, scope: !41, inlinedAt: !43) +!48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43) +!49 = !DILocation(line: 50, column: 50, scope: !7) +!50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52) +!51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0) +!52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53) +!53 = !DILocation(line: 53, column: 44, scope: !51) +!54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52) +!55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52) +!56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52) +!57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52) +!58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52) +!59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52) +!60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52) +!61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52) +!62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52) +!63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52) +!64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52) +!65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66) +!66 = !DILocation(line: 53, column: 44, scope: !41) +!67 = !DILocation(line: 75, column: 24, scope: !7) +!68 = !DILocation(line: 77, column: 24, scope: !7) +!69 = !DILocation(line: 58, column: 36, scope: !7) +!70 = !DILocation(line: 59, column: 27, scope: !7) +!71 = !DILocation(line: 62, column: 41, scope: !7) +!72 = !DILocation(line: 62, column: 35, scope: !7) +!73 = !DILocation(line: 62, column: 51, scope: !7) +!74 = !DILocation(line: 63, column: 41, scope: !7) +!75 = !DILocation(line: 63, column: 35, scope: !7) +!76 = !DILocation(line: 63, column: 51, scope: !7) +!77 = !DILocation(line: 63, column: 103, scope: !7) +!78 = !DILocation(line: 64, column: 35, scope: !7) +!79 = !DILocation(line: 64, column: 40, scope: !7) +!80 = !DILocation(line: 68, column: 57, scope: !7) +!81 = !DILocation(line: 69, column: 35, scope: !7) +!82 = !DILocation(line: 69, column: 54, scope: !7) +!83 = !DILocation(line: 70, column: 24, scope: !7) +!84 = !DILocation(line: 72, column: 24, scope: !7) +!85 = !DILocation(line: 73, column: 24, scope: !7) +!86 = !DILocation(line: 78, column: 30, scope: !7) +!87 = !DILocation(line: 79, column: 24, scope: !7) +!88 = !DILocation(line: 80, column: 24, scope: !7) +!89 = !DILocation(line: 82, column: 29, scope: !7) +!90 = !DILocation(line: 82, column: 52, scope: !7) +!91 = !DILocation(line: 58, column: 4, scope: !7) diff --git a/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ptx b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..5524e17d9798834a0b85a936005c9959a6017442 --- /dev/null +++ b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ptx @@ -0,0 +1,951 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 
105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<48>; + .reg .b16 %rs<13>; + .reg .b32 %r<158>; + .reg .f32 %f<164>; + .reg .b64 %rd<73>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6de7de_param_5]; + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_4]; + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_3]; + ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6de7de_param_0]; + ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6de7de_param_1]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r13, %tid.x; + and.b32 %r1, %r13, 31; + ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6de7de_param_2]; + bfe.u32 %r14, %r13, 5, 2; + bfe.u32 %r15, %r13, 1, 4; + shl.b32 %r16, %r14, 4; + or.b32 %r2, %r16, %r15; + and.b32 %r17, %r13, 63; + .loc 1 24 33 + shl.b32 %r18, %r13, 2; + and.b32 %r3, %r18, 4; + and.b32 %r4, %r13, 7; + .loc 1 31 36 + shl.b32 %r5, %r14, 2; + .loc 1 21 28 + mov.u32 %r11, %ctaid.x; + .loc 1 21 33 + shl.b32 %r19, %r11, 6; + .loc 1 22 23 + or.b32 %r20, %r19, %r2; + or.b32 %r21, %r19, %r17; + .loc 1 26 30 + mul.wide.s32 %rd33, %r20, 8; + add.s64 %rd21, %rd30, %rd33; + mul.wide.s32 %rd34, %r21, 8; + add.s64 %rd29, %rd30, %rd34; + mov.pred %p1, -1; + .loc 1 26 35 + mov.u64 %rd20, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ]; + mov.u64 %rd22, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd21 + 0 ]; + mov.u64 %rd24, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd21 + 0 ]; + mov.u64 %rd26, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd21 + 0 ]; + mov.u64 %rd28, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd29 + 0 ]; + .loc 1 27 18 + bfe.s32 %r22, %r11, 25, 1; + shr.u32 %r23, %r22, 23; + add.s32 %r24, %r20, %r23; + and.b32 %r25, %r24, 16776704; + sub.s32 %r26, %r20, %r25; + .loc 1 35 44 + shl.b32 %r27, %r26, 8; + .loc 1 37 22 + add.s64 %rd35, %rd28, 50257; + .loc 1 38 22 + setp.lt.s64 %p6, %rd20, 0; + setp.lt.s64 %p7, %rd28, 0; + .loc 1 39 36 + selp.b64 %rd1, %rd35, %rd28, %p7; + .loc 1 41 44 + shl.b64 %rd36, 
%rd20, 8; + add.s64 %rd37, %rd36, 12865792; + selp.b64 %rd38, %rd37, %rd36, %p6; + .loc 1 31 36 + and.b32 %r28, %r13, 1; + mul.wide.u32 %rd2, %r28, 16; + shl.b64 %rd39, %rd38, 2; + or.b64 %rd40, %rd2, %rd39; + add.s64 %rd72, %rd31, %rd40; + shl.b32 %r29, %r11, 14; + shl.b32 %r30, %r14, 12; + or.b32 %r31, %r29, %r30; + shl.b32 %r32, %r15, 8; + or.b32 %r33, %r31, %r32; + or.b32 %r6, %r33, %r3; + or.b32 %r34, %r27, %r3; + mul.wide.s32 %rd41, %r34, 4; + add.s64 %rd70, %rd32, %rd41; + mov.f32 %f148, 0f00000000; + mov.b32 %r156, -8; + mov.u64 %rd68, %rd70; + mov.u64 %rd69, %rd72; + mov.f32 %f149, %f148; + mov.f32 %f150, %f148; + mov.f32 %f151, %f148; + mov.f32 %f152, %f148; + mov.f32 %f153, %f148; + mov.f32 %f154, %f148; + mov.f32 %f155, %f148; + mov.f32 %f156, %f148; + mov.f32 %f157, %f148; + mov.f32 %f158, %f148; + mov.f32 %f159, %f148; + mov.f32 %f160, %f148; + mov.f32 %f161, %f148; + mov.f32 %f162, %f148; + mov.f32 %f163, %f148; + bra.uni $L__BB0_1; +$L__BB0_3: + .loc 1 0 0 + mov.b32 %f17, %r35; + mov.b32 %f18, %r36; + mov.b32 %f19, %r37; + mov.b32 %f20, %r38; + cvt.u16.u32 %rs1, %r43; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r43; } + cvt.u16.u32 %rs3, %r44; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r44; } + cvt.f32.bf16 %r47, %rs1; + mov.b32 %f21, %r47; + cvt.f32.bf16 %r48, %rs2; + mov.b32 %f22, %r48; + cvt.f32.bf16 %r49, %rs3; + mov.b32 %f23, %r49; + cvt.f32.bf16 %r50, %rs4; + mov.b32 %f24, %r50; + .loc 1 41 52 + mov.u32 %r54, 0x0; + mov.u32 %r55, 0x0; + mov.u32 %r56, 0x0; + mov.u32 %r57, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd69 + 0 ]; + @!%p1 mov.u32 %r54, %r143; + @!%p1 mov.u32 %r55, %r143; + @!%p1 mov.u32 %r56, %r143; + @!%p1 mov.u32 %r57, %r143; + mov.b32 %f56, %r54; + mov.b32 %f57, %r55; + mov.b32 %f58, %r56; + mov.b32 %f59, %r57; + .loc 1 42 22 + add.f32 %f60, %f17, %f56; + add.f32 %f61, %f18, %f57; + add.f32 %f62, %f19, %f58; + add.f32 %f63, %f20, %f59; + .loc 1 44 22 + add.f32 %f64, %f21, %f60; + add.f32 %f65, %f22, %f61; + add.f32 %f66, %f23, %f62; + add.f32 %f67, %f24, %f63; +$L__tmp1: + .loc 2 96 20 + sub.f32 %f68, %f64, %f160; + sub.f32 %f69, %f65, %f161; + sub.f32 %f70, %f66, %f162; + sub.f32 %f71, %f67, %f163; + .loc 2 97 26 + add.f32 %f148, %f148, 0f3F800000; + add.f32 %f149, %f149, 0f3F800000; + add.f32 %f150, %f150, 0f3F800000; + add.f32 %f151, %f151, 0f3F800000; + add.f32 %f152, %f152, 0f3F800000; + add.f32 %f153, %f153, 0f3F800000; + add.f32 %f154, %f154, 0f3F800000; + add.f32 %f155, %f155, 0f3F800000; + .loc 2 98 30 + mov.b32 %r63, %f68; + mov.b32 %r64, %f148; + div.full.f32 %r62, %r63, %r64; + mov.b32 %f72, %r62; + mov.b32 %r66, %f69; + mov.b32 %r67, %f149; + div.full.f32 %r65, %r66, %r67; + mov.b32 %f73, %r65; + mov.b32 %r69, %f70; + mov.b32 %r70, %f150; + div.full.f32 %r68, %r69, %r70; + mov.b32 %f74, %r68; + mov.b32 %r72, %f71; + mov.b32 %r73, %f151; + div.full.f32 %r71, %r72, %r73; + mov.b32 %f75, %r71; + .loc 2 98 22 + add.f32 %f160, %f160, %f72; + add.f32 %f161, %f161, %f73; + add.f32 %f162, %f162, %f74; + add.f32 %f163, %f163, %f75; + .loc 2 101 30 + sub.f32 %f76, %f64, %f160; + sub.f32 %f77, %f65, %f161; + sub.f32 %f78, %f66, %f162; + sub.f32 %f79, %f67, %f163; +$L__tmp2: + .loc 1 50 50 + fma.rn.f32 %f156, %f68, %f76, %f156; + fma.rn.f32 %f157, %f69, %f77, %f157; + fma.rn.f32 %f158, %f70, %f78, %f158; + fma.rn.f32 %f159, %f71, %f79, %f159; + .loc 1 31 36 + add.s32 %r156, %r156, 8; + add.s64 %rd69, %rd69, 32; + add.s64 %rd68, %rd68, 32; + setp.lt.u32 %p22, %r156, 248; + @%p22 bra $L__BB0_1; + bra.uni $L__BB0_4; +$L__BB0_1: + 
.loc 1 40 40 + setp.lt.u64 %p16, %rd1, 50257; + mov.b32 %r143, 0; + .loc 1 35 50 + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + mov.u32 %r38, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd68 + 0 ]; + @!%p1 mov.u32 %r35, %r143; + @!%p1 mov.u32 %r36, %r143; + @!%p1 mov.u32 %r37, %r143; + @!%p1 mov.u32 %r38, %r143; + .loc 1 36 34 + add.s32 %r51, %r6, %r156; + add.s32 %r52, %r51, 8; + mul.wide.s32 %rd44, %r52, 2; + add.s64 %rd43, %rd17, %rd44; + .loc 1 36 50 + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + @%p1 ld.global.L1::evict_last.v2.b32 { %r43, %r44 }, [ %rd43 + 0 ]; + @!%p1 mov.u32 %r43, %r143; + @!%p1 mov.u32 %r44, %r143; + mov.b32 %r155, 883; + mov.u64 %rd67, 1; + .loc 1 40 55 + @%p16 bra $L__BB0_3; + mov.u64 %rd45, assertMessage_0; + cvta.global.u64 %rd46, %rd45; + mov.u64 %rd47, assertFile_0; + cvta.global.u64 %rd48, %rd47; + mov.u64 %rd49, assertFunc_0; + cvta.global.u64 %rd50, %rd49; + { // callseq 2, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd46; + .param .b64 param1; + st.param.b64 [param1+0], %rd48; + .param .b32 param2; + st.param.b32 [param2+0], %r155; + .param .b64 param3; + st.param.b64 [param3+0], %rd50; + .param .b64 param4; + st.param.b64 [param4+0], %rd67; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 2 + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 31 36 + shr.u32 %r99, %r1, 3; + or.b32 %r100, %r5, %r99; + mad.lo.s32 %r101, %r100, 12, %r4; + shl.b32 %r102, %r101, 2; + mov.u32 %r103, global_smem; + add.s32 %r104, %r103, %r102; + st.shared.f32 [%r104], %f152; + st.shared.f32 [%r104+768], %f153; + st.shared.f32 [%r104+1536], %f154; + st.shared.f32 [%r104+2304], %f155; + bar.sync 0; + mad.lo.s32 %r105, %r2, 12, %r3; + shl.b32 %r106, %r105, 2; + add.s32 %r107, %r103, %r106; + ld.shared.v4.f32 {%f80, %f81, %f82, %f83}, [%r107]; +$L__tmp3: + .loc 2 108 21 + sub.f32 %f84, %f161, %f160; + .loc 2 109 28 + add.f32 %f85, %f80, %f81; + .loc 2 110 39 + setp.eq.f32 %p23, %f85, 0f00000000; + .loc 2 110 60 + mov.b32 %r75, %f81; + mov.b32 %r76, %f85; + div.full.f32 %r74, %r75, %r76; + mov.b32 %f86, %r74; + .loc 2 110 49 + selp.f32 %f87, 0f00000000, %f86, %p23; + .loc 2 112 17 + fma.rn.f32 %f88, %f84, %f87, %f160; + .loc 2 113 15 + add.f32 %f89, %f156, %f157; + .loc 2 113 30 + mul.f32 %f90, %f84, %f84; + .loc 2 113 38 + mul.f32 %f91, %f90, %f80; + .loc 2 113 22 + fma.rn.f32 %f92, %f91, %f87, %f89; + .loc 2 108 21 + sub.f32 %f93, %f162, %f88; + .loc 2 109 28 + add.f32 %f94, %f82, %f85; + .loc 2 110 39 + setp.eq.f32 %p24, %f94, 0f00000000; + .loc 2 110 60 + mov.b32 %r79, %f94; + mov.b32 %r78, %f82; + div.full.f32 %r77, %r78, %r79; + mov.b32 %f95, %r77; + .loc 2 110 49 + selp.f32 %f96, 0f00000000, %f95, %p24; + .loc 2 112 17 + fma.rn.f32 %f97, %f96, %f93, %f88; + .loc 2 113 15 + add.f32 %f98, %f158, %f92; + .loc 2 113 30 + mul.f32 %f99, %f93, %f93; + .loc 2 113 38 + mul.f32 %f100, %f85, %f99; + .loc 2 113 22 + fma.rn.f32 %f101, %f96, %f100, %f98; + .loc 2 108 21 + sub.f32 %f102, %f163, %f97; + .loc 2 109 28 + add.f32 %f103, %f83, %f94; + .loc 2 110 39 + setp.eq.f32 %p25, %f103, 0f00000000; + .loc 2 110 60 + mov.b32 %r82, %f103; + mov.b32 %r81, %f83; + div.full.f32 %r80, %r81, %r82; + mov.b32 %f104, %r80; + .loc 2 110 49 + selp.f32 %f105, 0f00000000, %f104, %p25; + .loc 2 112 17 + fma.rn.f32 %f106, %f105, %f102, %f97; + .loc 2 113 15 + add.f32 %f107, %f159, %f101; + .loc 2 113 30 + mul.f32 %f108, %f102, %f102; + .loc 2 113 38 + mul.f32 %f109, %f94, %f108; + .loc 
2 113 22 + fma.rn.f32 %f110, %f105, %f109, %f107; +$L__tmp4: + .loc 2 120 46 + mov.b32 %r108, %f106; + shfl.sync.bfly.b32 %r109, %r108, 1, 31, -1; + mov.b32 %f111, %r109; + mov.b32 %r110, %f110; + shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1; + mov.b32 %f112, %r111; + shfl.sync.bfly.b32 %r84, %r82, 1, 31, -1; + mov.b32 %f113, %r84; +$L__tmp5: + .loc 2 108 21 + sub.f32 %f114, %f111, %f106; + .loc 2 109 28 + add.f32 %f115, %f103, %f113; + .loc 2 110 39 + setp.eq.f32 %p26, %f115, 0f00000000; + .loc 2 110 60 + mov.b32 %r85, %f115; + div.full.f32 %r83, %r84, %r85; + mov.b32 %f116, %r83; + .loc 2 110 49 + selp.f32 %f117, 0f00000000, %f116, %p26; + .loc 2 112 17 + fma.rn.f32 %f41, %f117, %f114, %f106; + .loc 2 113 15 + add.f32 %f118, %f110, %f112; + .loc 2 113 30 + mul.f32 %f119, %f114, %f114; + .loc 2 113 38 + mul.f32 %f120, %f103, %f119; + .loc 2 113 22 + fma.rn.f32 %f121, %f117, %f120, %f118; +$L__tmp6: + .loc 1 75 24 + mov.b32 %r87, %f121; + mov.b32 %r88, 1132462080; + div.full.f32 %r86, %r87, %r88; + mov.b32 %f122, %r86; + .loc 1 77 24 + add.f32 %f42, %f122, 0f3727C5AC; + .loc 1 58 36 + add.s64 %rd71, %rd18, %rd2; + mov.b32 %r157, -8; + rsqrt.approx.ftz.f32 %f139, %f42; + bra.uni $L__BB0_5; +$L__BB0_7: + .loc 1 0 0 + mov.b32 %f43, %r112; + mov.b32 %f44, %r113; + mov.b32 %f45, %r114; + mov.b32 %f46, %r115; + cvt.s64.s32 %rd13, %r137; + mov.b32 %f47, %r124; + mov.b32 %f48, %r125; + mov.b32 %f49, %r126; + mov.b32 %f50, %r127; + mov.b32 %f51, %r128; + mov.b32 %f52, %r129; + mov.b32 %f53, %r130; + mov.b32 %f54, %r131; + .loc 1 69 54 + mov.u32 %r139, 0x0; + mov.u32 %r140, 0x0; + mov.u32 %r141, 0x0; + mov.u32 %r142, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r139, %r140, %r141, %r142 }, [ %rd72 + 0 ]; + @!%p1 mov.u32 %r139, %r143; + @!%p1 mov.u32 %r140, %r143; + @!%p1 mov.u32 %r141, %r143; + @!%p1 mov.u32 %r142, %r143; + mov.b32 %f123, %r139; + mov.b32 %f124, %r140; + mov.b32 %f125, %r141; + mov.b32 %f126, %r142; + .loc 1 70 24 + add.f32 %f127, %f43, %f123; + add.f32 %f128, %f44, %f124; + add.f32 %f129, %f45, %f125; + add.f32 %f130, %f46, %f126; + .loc 1 72 24 + add.f32 %f131, %f47, %f127; + add.f32 %f132, %f48, %f128; + add.f32 %f133, %f49, %f129; + add.f32 %f134, %f50, %f130; + .loc 1 73 24 + sub.f32 %f135, %f131, %f41; + sub.f32 %f136, %f132, %f41; + sub.f32 %f137, %f133, %f41; + sub.f32 %f138, %f134, %f41; + .loc 1 79 24 + mul.f32 %f140, %f135, %f139; + mul.f32 %f141, %f136, %f139; + mul.f32 %f142, %f137, %f139; + mul.f32 %f143, %f138, %f139; + .loc 1 80 24 + mul.f32 %f144, %f140, %f51; + mul.f32 %f145, %f141, %f52; + mul.f32 %f146, %f142, %f53; + mul.f32 %f147, %f143, %f54; + .loc 1 82 29 + shl.b64 %rd66, %rd13, 1; + add.s64 %rd65, %rd19, %rd66; + .loc 1 82 52 + mov.b32 %r147, %f144; + cvt.rn.bf16.f32 %rs9, %r147; + mov.b32 %r148, %f145; + cvt.rn.bf16.f32 %rs10, %r148; + mov.b32 %r149, %f146; + cvt.rn.bf16.f32 %rs11, %r149; + mov.b32 %r150, %f147; + cvt.rn.bf16.f32 %rs12, %r150; + mov.b32 %r153, {%rs9, %rs10}; + mov.b32 %r154, {%rs11, %rs12}; + @%p1 st.global.v2.b32 [ %rd65 + 0 ], { %r153, %r154 }; + .loc 1 58 36 + add.s32 %r157, %r157, 8; + add.s64 %rd72, %rd72, 32; + add.s64 %rd71, %rd71, 32; + add.s64 %rd70, %rd70, 32; + setp.lt.u32 %p47, %r157, 248; + @%p47 bra $L__BB0_5; + bra.uni $L__BB0_8; +$L__BB0_5: + .loc 1 62 51 + mov.u32 %r112, 0x0; + mov.u32 %r113, 0x0; + mov.u32 %r114, 0x0; + mov.u32 %r115, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r112, %r113, %r114, %r115 }, [ %rd70 + 0 ]; + @!%p1 mov.u32 %r112, %r143; + @!%p1 mov.u32 %r113, %r143; + @!%p1 mov.u32 %r114, %r143; + @!%p1 
mov.u32 %r115, %r143; + .loc 1 63 35 + add.s32 %r136, %r6, %r157; + add.s32 %r137, %r136, 8; + mul.wide.s32 %rd56, %r137, 2; + add.s64 %rd54, %rd17, %rd56; + .loc 1 63 51 + mov.u32 %r120, 0x0; + mov.u32 %r121, 0x0; + @%p1 ld.global.L1::evict_first.v2.b32 { %r120, %r121 }, [ %rd54 + 0 ]; + @!%p1 mov.u32 %r120, %r143; + @!%p1 mov.u32 %r121, %r143; + cvt.u16.u32 %rs5, %r120; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r120; } + cvt.u16.u32 %rs7, %r121; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r121; } + .loc 1 63 103 + cvt.f32.bf16 %r124, %rs5; + cvt.f32.bf16 %r125, %rs6; + cvt.f32.bf16 %r126, %rs7; + cvt.f32.bf16 %r127, %rs8; + .loc 1 64 40 + mov.u32 %r128, 0x0; + mov.u32 %r129, 0x0; + mov.u32 %r130, 0x0; + mov.u32 %r131, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r128, %r129, %r130, %r131 }, [ %rd71 + 0 ]; + @!%p1 mov.u32 %r128, %r143; + @!%p1 mov.u32 %r129, %r143; + @!%p1 mov.u32 %r130, %r143; + @!%p1 mov.u32 %r131, %r143; + .loc 1 68 57 + @%p16 bra $L__BB0_7; + mov.u64 %rd57, assertMessage_1; + cvta.global.u64 %rd58, %rd57; + mov.u64 %rd59, assertFile_1; + cvta.global.u64 %rd60, %rd59; + mov.u64 %rd61, assertFunc_1; + cvta.global.u64 %rd62, %rd61; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd58; + .param .b64 param1; + st.param.b64 [param1+0], %rd60; + .param .b32 param2; + st.param.b32 [param2+0], %r155; + .param .b64 param3; + st.param.b64 [param3+0], %rd62; + .param .b64 param4; + st.param.b64 [param4+0], %rd67; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 3 + bra.uni $L__BB0_7; +$L__BB0_8: + .loc 1 58 4 + ret; +$L__tmp7: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 99 +.b8 105 +.b8 103 +.b8 54 +.b8 102 +.b8 107 +.b8 105 +.b8 54 +.b8 112 +.b8 52 +.b8 108 +.b8 120 +.b8 114 +.b8 100 +.b8 109 +.b8 103 +.b8 103 +.b8 54 +.b8 101 +.b8 117 +.b8 100 +.b8 97 +.b8 104 +.b8 105 +.b8 101 +.b8 120 +.b8 99 +.b8 118 +.b8 117 +.b8 101 +.b8 101 +.b8 111 +.b8 108 +.b8 50 +.b8 112 +.b8 52 +.b8 113 +.b8 112 +.b8 53 +.b8 51 +.b8 50 +.b8 112 +.b8 118 +.b8 118 +.b8 101 +.b8 50 +.b8 121 +.b8 52 +.b8 54 +.b8 51 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 
+.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 99 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 47 +.b8 41 +.b8 5 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp6 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp6 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp4 +.b64 $L__tmp5 +.b8 2 +.b8 53 +.b8 44 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..eca3f80d15a8f035a7a5beb8debd0947f148f760 --- /dev/null +++ b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir @@ -0,0 +1,165 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<1x8xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_2 = arith.constant dense<0.000000e+00> :
tensor<64x8xf32, #blocked> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked> + %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked> + %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked> + %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked> + %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked> + %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1> + %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c8_i32 = arith.constant 8 : i32 + %c256_i32 = arith.constant 256 : i32 + %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked2> + %cst_11 = arith.constant 0.000000e+00 : f32 + %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked2> + %cst_13 = arith.constant dense<256> : tensor<1x8xi32, #blocked2> + %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked> + %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked> + %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1> + %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> + %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked> + %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x8xi32, #blocked2> + %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked> + %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1> + %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked> + %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1> + %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked> + %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1> + %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked> + %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked> + %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked> + %24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked> + %25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked> + %27 = arith.addi %18, %cst_7 : tensor<64x1xi64,
#blocked> + %28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1> + %29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked> + %30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1> + %31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> + %32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1> + %33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1> + %34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1> + %35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1> + %36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked> + %37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x8xi64, #blocked> + %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked> + %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>) : i32 { + %49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked> + %50 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked2> + %51 = arith.addi %49, %12 : tensor<1x8xi32, #blocked> + %52 = arith.addi %50, %13 : tensor<1x8xi32, #blocked2> + %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x8xi32, #blocked> + %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x8xi32, #blocked2> + %55 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %56 = arith.addi %55, %22 : tensor<64x8xi32, #blocked> + %57 = tt.addptr %23, %56 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked> + %58 = tt.broadcast %53 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked> + %59 = tt.broadcast %54 : (tensor<1x8xi1, #blocked2>) -> tensor<64x8xi1, #blocked2> + %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %61 = arith.addi %55, %25 : tensor<64x8xi32, #blocked> + %62 = tt.addptr %26, %61 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked> + %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %64 = arith.extf %63 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> + tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1> + %65 = arith.extsi %51 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked> + %66 = tt.broadcast %65 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked> + %67 = arith.addi %66, %37 : tensor<64x8xi64, #blocked> + %68 = tt.addptr %38, %67 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked> + %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %70 = arith.addf %69, %60 : tensor<64x8xf32, #blocked> + %71 = arith.addf %70, %64 : tensor<64x8xf32, #blocked> + %72 = arith.subf %71, %arg9 : tensor<64x8xf32, #blocked> + %73 = arith.addf %arg12, %cst_4 : tensor<64x8xf32, #blocked> + %74 = arith.addf %arg11, %cst_10 : tensor<64x8xf32, #blocked2> + %75 = arith.divf %72, %73 : tensor<64x8xf32, #blocked> + %76 = arith.addf %arg9, %75 : tensor<64x8xf32, #blocked> + %77 = arith.subf %71, %76 : tensor<64x8xf32, #blocked> + %78 = arith.mulf %72, %77 : tensor<64x8xf32, #blocked> + %79 = arith.addf %arg10, %78 : tensor<64x8xf32, #blocked> + %80 = arith.select %58, %76, %arg9 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> + %81 = arith.select %58, %79, %arg10
: tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> + %82 = arith.select %58, %73, %arg12 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> + %83 = arith.select %59, %74, %arg11 : tensor<64x8xi1, #blocked2>, tensor<64x8xf32, #blocked2> + scf.yield %80, %81, %83, %82 : tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked> + } + %40 = triton_gpu.convert_layout %39#2 : (tensor<64x8xf32, #blocked2>) -> tensor<64x8xf32, #blocked> + %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %49 = arith.subf %arg11, %arg8 : f32 + %50 = arith.addf %arg10, %arg13 : f32 + %51 = arith.cmpf oeq, %50, %cst_11 : f32 + %52 = arith.divf %arg13, %50 : f32 + %53 = arith.select %51, %cst_11, %52 : f32 + %54 = arith.mulf %49, %53 : f32 + %55 = arith.addf %arg8, %54 : f32 + %56 = arith.addf %arg9, %arg12 : f32 + %57 = arith.mulf %49, %49 : f32 + %58 = arith.mulf %57, %arg10 : f32 + %59 = arith.mulf %58, %53 : f32 + %60 = arith.addf %56, %59 : f32 + tt.reduce.return %55, %60, %50 : f32, f32, f32 + }) : (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %44 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x8x!tt.ptr, #blocked> + %45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked> + %46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked> + %47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked> + %48 = tt.splat %arg5 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 : i32 { + %49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked> + %50 = arith.addi %49, %12 : tensor<1x8xi32, #blocked> + %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x8xi32, #blocked> + %52 = tt.broadcast %50 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %53 = arith.addi %52, %22 : tensor<64x8xi32, #blocked> + %54 = tt.addptr %23, %53 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %55 = tt.broadcast %51 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked> + %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %57 = arith.addi %52, %25 : tensor<64x8xi32, #blocked> + %58 = tt.addptr %26, %57 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %60 = arith.extf %59 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> + %61 = tt.addptr %44, %50 : tensor<1x8x!tt.ptr, #blocked>, tensor<1x8xi32, #blocked> + %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32, #blocked> + tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1> + %63 = arith.extsi %50 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked> + %64 = tt.broadcast %63 : (tensor<1x8xi64, #blocked>) -> 
tensor<64x8xi64, #blocked> + %65 = arith.addi %64, %37 : tensor<64x8xi64, #blocked> + %66 = tt.addptr %38, %65 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi64, #blocked> + %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %68 = arith.addf %67, %56 : tensor<64x8xf32, #blocked> + %69 = arith.addf %68, %60 : tensor<64x8xf32, #blocked> + %70 = arith.subf %69, %45 : tensor<64x8xf32, #blocked> + %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked> + %72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked> + %73 = arith.mulf %70, %72 : tensor<64x8xf32, #blocked> + %74 = tt.broadcast %62 : (tensor<1x8xf32, #blocked>) -> tensor<64x8xf32, #blocked> + %75 = arith.mulf %73, %74 : tensor<64x8xf32, #blocked> + %76 = tt.addptr %48, %57 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %77 = arith.truncf %75 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked> + tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16, #blocked> + } + tt.return + } +} diff --git a/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttir b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..3acd2ad2b8e981c68165a73924f7fb678f459df8 --- /dev/null +++ b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttir @@ -0,0 +1,153 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x8xf32> + %c256_i32 = arith.constant 256 : i32 + %c8_i32 = arith.constant 8 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_2 = arith.constant dense<256> : tensor<64x1xi64> + %cst_3 = arith.constant dense<0> : tensor<64x1xi64> + %cst_4 = arith.constant dense<50257> : tensor<64x1xi64> + %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32> + %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x8xf32> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> + %cst_9 = arith.constant dense<256> : tensor<64x1xi32> + %cst_10 = arith.constant dense<256> : tensor<1x8xi32> + %cst_11 = arith.constant dense<512> : tensor<64x1xi32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32> + %8 = tt.splat %arg0 : (!tt.ptr) 
-> tensor<64x1x!tt.ptr> + %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %11 = arith.remsi %5, %cst_11 : tensor<64x1xi32> + %12 = arith.muli %11, %cst_9 : tensor<64x1xi32> + %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %15 = arith.muli %5, %cst_9 : tensor<64x1xi32> + %16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %18 = arith.addi %10, %cst_4 : tensor<64x1xi64> + %19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64> + %20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64> + %21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64> + %22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64> + %23 = arith.andi %21, %22 : tensor<64x1xi1> + %24 = arith.muli %20, %cst_2 : tensor<64x1xi64> + %25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x8xi64> + %26 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) : i32 { + %51 = tt.splat %arg8 : (i32) -> tensor<1x8xi32> + %52 = arith.addi %51, %7 : tensor<1x8xi32> + %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x8xi32> + %54 = tt.broadcast %52 : (tensor<1x8xi32>) -> tensor<64x8xi32> + %55 = arith.addi %54, %13 : tensor<64x8xi32> + %56 = tt.addptr %14, %55 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %57 = tt.broadcast %53 : (tensor<1x8xi1>) -> tensor<64x8xi1> + %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32> + %59 = arith.addi %54, %16 : tensor<64x8xi32> + %60 = tt.addptr %17, %59 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16> + %62 = arith.extf %61 : tensor<64x8xbf16> to tensor<64x8xf32> + tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %63 = arith.extsi %52 : tensor<1x8xi32> to tensor<1x8xi64> + %64 = tt.broadcast %63 : (tensor<1x8xi64>) -> tensor<64x8xi64> + %65 = arith.addi %64, %25 : tensor<64x8xi64> + %66 = tt.addptr %26, %65 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> + %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32> + %68 = arith.addf %67, %58 : tensor<64x8xf32> + %69 = arith.addf %68, %62 : tensor<64x8xf32> + %70 = arith.subf %69, %arg9 : tensor<64x8xf32> + %71 = arith.addf %arg11, %cst_1 : tensor<64x8xf32> + %72 = arith.divf %70, %71 : tensor<64x8xf32> + %73 = arith.addf %arg9, %72 : tensor<64x8xf32> + %74 = arith.subf %69, %73 : tensor<64x8xf32> + %75 = arith.mulf %70, %74 : tensor<64x8xf32> + %76 = arith.addf %arg10, %75 : tensor<64x8xf32> + %77 = arith.select %57, %73, %arg9 : tensor<64x8xi1>, tensor<64x8xf32> + %78 = arith.select %57, %76, %arg10 : tensor<64x8xi1>, tensor<64x8xf32> + %79 = arith.select %57, %71, %arg11 : tensor<64x8xi1>, tensor<64x8xf32> + scf.yield %77, %78, %79 : tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32> + } + %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %51 = arith.subf %arg11, %arg8 : f32 + %52 = arith.addf %arg10, %arg13 : f32 + %53 = arith.cmpf oeq, %52, %cst_0 : f32 + %54 = arith.divf 
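// Annotation (not compiler output): this combine region of the tt.reduce
// merges two Welford triples (mean %arg8/%arg11, M2 %arg9/%arg12, count
// %arg10/%arg13) with the standard parallel-variance formula:
// delta = mean_b - mean_a, count = count_a + count_b,
// mean = mean_a + delta * count_b / count,
// M2 = M2_a + M2_b + delta^2 * count_a * count_b / count,
// with the arith.select guarding the count == 0 (fully masked) case.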
%arg13, %52 : f32 + %55 = arith.select %53, %cst_0, %54 : f32 + %56 = arith.mulf %51, %55 : f32 + %57 = arith.addf %arg8, %56 : f32 + %58 = arith.addf %arg9, %arg12 : f32 + %59 = arith.mulf %51, %51 : f32 + %60 = arith.mulf %59, %arg10 : f32 + %61 = arith.mulf %60, %55 : f32 + %62 = arith.addf %58, %61 : f32 + tt.reduce.return %57, %62, %52 : f32, f32, f32 + }) : (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %31 = arith.muli %11, %cst_9 : tensor<64x1xi32> + %32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %33 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %34 = arith.muli %5, %cst_9 : tensor<64x1xi32> + %35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %36 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %37 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x8x!tt.ptr> + %38 = arith.addi %10, %cst_4 : tensor<64x1xi64> + %39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64> + %40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64> + %41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64> + %42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64> + %43 = arith.andi %41, %42 : tensor<64x1xi1> + %44 = arith.muli %40, %cst_2 : tensor<64x1xi64> + %45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x8xi64> + %46 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x8xf32> + %48 = arith.divf %30, %cst_6 : tensor<64x1xf32> + %49 = arith.addf %48, %cst_5 : tensor<64x1xf32> + %50 = tt.splat %arg5 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 : i32 { + %51 = tt.splat %arg8 : (i32) -> tensor<1x8xi32> + %52 = arith.addi %51, %7 : tensor<1x8xi32> + %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x8xi32> + %54 = tt.broadcast %52 : (tensor<1x8xi32>) -> tensor<64x8xi32> + %55 = arith.addi %54, %32 : tensor<64x8xi32> + %56 = tt.addptr %33, %55 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %57 = tt.broadcast %53 : (tensor<1x8xi1>) -> tensor<64x8xi1> + %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32> + %59 = arith.addi %54, %35 : tensor<64x8xi32> + %60 = tt.addptr %36, %59 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16> + %62 = arith.extf %61 : tensor<64x8xbf16> to tensor<64x8xf32> + %63 = tt.addptr %37, %52 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> + %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32> + tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %65 = arith.extsi %52 : tensor<1x8xi32> to tensor<1x8xi64> + %66 = tt.broadcast %65 : (tensor<1x8xi64>) -> tensor<64x8xi64> + %67 = arith.addi %66, %45 : tensor<64x8xi64> + %68 = tt.addptr %46, %67 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> + %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32> + %70 = arith.addf %69, %58 : tensor<64x8xf32> + %71 = arith.addf %70, %62 : tensor<64x8xf32> + %72 = arith.subf %71, %47 : tensor<64x8xf32> + %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = 
"/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> + %74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x8xf32> + %75 = arith.mulf %72, %74 : tensor<64x8xf32> + %76 = tt.broadcast %64 : (tensor<1x8xf32>) -> tensor<64x8xf32> + %77 = arith.mulf %75, %76 : tensor<64x8xf32> + %78 = tt.addptr %50, %59 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %79 = arith.truncf %77 : tensor<64x8xf32> to tensor<64x8xbf16> + tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16> + } + tt.return + } +} diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..3fda91123587ab581270a7e78cdf8f7a68b6f21c Binary files /dev/null and b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin differ diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0a13f05ba11e7af3195bafdd7703f65963e5d35d --- /dev/null +++ b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir @@ -0,0 +1,75 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile 
= false} : tensor<256xbf16> + %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32> + %21 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr> + %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32> + %25 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr> + %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %28 = arith.addf %8, %12 : tensor<256xf32> + %29 = arith.addf %28, %16 : tensor<256xf32> + %30 = arith.addf %29, %20 : tensor<256xf32> + %31 = arith.addf %30, %24 : tensor<256xf32> + %32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32> + %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %53 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %53 : f32 + }) : (tensor<256xf32>) -> f32 + %34 = arith.addf %33, %cst_0 : f32 + %35 = arith.divf %34, %cst_1 : f32 + %36 = tt.splat %35 : (f32) -> tensor<256xf32> + %37 = arith.subf %31, %36 : tensor<256xf32> + %38 = arith.mulf %37, %37 : tensor<256xf32> + %39 = arith.select %2, %38, %cst_3 : tensor<256xi1>, tensor<256xf32> + %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %53 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %53 : f32 + }) : (tensor<256xf32>) -> f32 + %41 = arith.addf %40, %cst_0 : f32 + %42 = arith.divf %41, %cst_1 : f32 + %43 = arith.addf %42, %cst_2 : f32 + %44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %45 = tt.splat %44 : (f32) -> tensor<256xf32> + %46 = arith.mulf %37, %45 : tensor<256xf32> + %47 = arith.mulf %46, %27 : tensor<256xf32> + %48 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr> + %49 = tt.addptr %48, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %50 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr> + %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %52 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16> + tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..525f1e8103855f655a6bfdf4a15d32ccdcd98dc7 Binary files /dev/null and b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin differ diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..1ff4ff6d6cde3e1fe19fda37de707aec8cbcaf3e --- /dev/null +++ b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir @@ -0,0 +1,283 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2d3d4de5de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5) local_unnamed_addr !dbg !7 { + %7 = tail call i32 
@llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %8 = and i32 %7, 31, !dbg !10 + %9 = lshr i32 %7, 5, !dbg !10 + %10 = and i32 %9, 1, !dbg !10 + %urem = shl i32 %7, 2, !dbg !10 + %11 = and i32 %urem, 252, !dbg !10 + %12 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %13 = shl i32 %12, 8, !dbg !12 + %14 = or i32 %13, %11, !dbg !13 + %15 = sext i32 %14 to i64, !dbg !14 + %16 = getelementptr float, ptr addrspace(1) %0, i64 %15, !dbg !14 + %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %16, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 + %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !15 + %19 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !15 + %20 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !15 + %21 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !15 + %22 = bitcast i32 %18 to float, !dbg !15 + %23 = bitcast i32 %19 to float, !dbg !15 + %24 = bitcast i32 %20 to float, !dbg !15 + %25 = bitcast i32 %21 to float, !dbg !15 + %26 = getelementptr i16, ptr addrspace(1) %1, i64 %15, !dbg !16 + %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 + %28 = extractvalue { i32, i32 } %27, 0, !dbg !17 + %29 = extractvalue { i32, i32 } %27, 1, !dbg !17 + %30 = trunc i32 %28 to i16, !dbg !17 + %extelt.offset = lshr i32 %28, 16, !dbg !17 + %31 = trunc i32 %extelt.offset to i16, !dbg !17 + %32 = trunc i32 %29 to i16, !dbg !17 + %extelt.offset1 = lshr i32 %29, 16, !dbg !17 + %33 = trunc i32 %extelt.offset1 to i16, !dbg !17 + %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18 + %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18 + %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18 + %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18 + %38 = zext nneg i32 %11 to i64, !dbg !19 + %39 = getelementptr float, ptr addrspace(1) %2, i64 %38, !dbg !19 + %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 + %41 = fadd float %34, %22, !dbg !21 + %42 = fadd float %35, %23, !dbg !21 + %43 = fadd float %36, %24, !dbg !21 + %44 = fadd float %37, %25, !dbg !21 + %45 = fadd float %41, %42, !dbg !22 + %46 = fadd float %45, %43, !dbg !22 + %47 = fadd float %46, %44, !dbg !22 + %48 = bitcast float %47 to i32, !dbg !28 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 16, i32 31), !dbg !28 + %50 = bitcast i32 %49 to float, !dbg !28 + %51 = fadd float %47, %50, !dbg !22 + %52 = bitcast float %51 to i32, !dbg !28 + %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 8, i32 
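; Annotation (not compiler output): the shfl.sync.bfly.i32 cascade here is a
; butterfly reduction -- xor offsets 16, 8, 4, 2, 1 sum a value across all 32
; lanes of a warp in five rounds; lane 0 of each warp then publishes its
; partial to shared memory, and the first two threads run one more shuffle
; round to combine the per-warp sums.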
31), !dbg !28 + %54 = bitcast i32 %53 to float, !dbg !28 + %55 = fadd float %51, %54, !dbg !22 + %56 = bitcast float %55 to i32, !dbg !28 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !28 + %58 = bitcast i32 %57 to float, !dbg !28 + %59 = fadd float %55, %58, !dbg !22 + %60 = bitcast float %59 to i32, !dbg !28 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !28 + %62 = bitcast i32 %61 to float, !dbg !28 + %63 = fadd float %59, %62, !dbg !22 + %64 = bitcast float %63 to i32, !dbg !28 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 1, i32 31), !dbg !28 + %66 = bitcast i32 %65 to float, !dbg !28 + %67 = fadd float %63, %66, !dbg !22 + %68 = icmp eq i32 %8, 0, !dbg !28 + %69 = zext nneg i32 %10 to i64, !dbg !28 + %70 = getelementptr float, ptr addrspace(3) @global_smem, i64 %69, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %67, i1 %68) #6, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %71 = icmp slt i32 %7, 2, !dbg !28 + %72 = sext i32 %7 to i64, !dbg !28 + %73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !28 + %74 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !28 + %75 = bitcast float %74 to i32, !dbg !28 + %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 1, i32 31), !dbg !28 + %77 = bitcast i32 %76 to float, !dbg !28 + %78 = fadd float %74, %77, !dbg !22 + %79 = and i32 %7, 1, !dbg !28 + %80 = icmp eq i32 %79, 0, !dbg !28 + %81 = and i1 %71, %80, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %78, i1 %81) #6, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %82 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !28 + %83 = fadd float %82, 0.000000e+00, !dbg !30 + %84 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %83, float 2.560000e+02) #6, !dbg !34 + %85 = fsub float %41, %84, !dbg !35 + %86 = fsub float %42, %84, !dbg !35 + %87 = fsub float %43, %84, !dbg !35 + %88 = fsub float %44, %84, !dbg !35 + %89 = fmul float %85, %85, !dbg !36 + %90 = fmul float %86, %86, !dbg !36 + %91 = fmul float %87, %87, !dbg !36 + %92 = fmul float %88, %88, !dbg !36 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %93 = fadd float %89, %90, !dbg !39 + %94 = fadd float %91, %93, !dbg !39 + %95 = fadd float %92, %94, !dbg !39 + %96 = bitcast float %95 to i32, !dbg !37 + %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !37 + %98 = bitcast i32 %97 to float, !dbg !37 + %99 = fadd float %95, %98, !dbg !39 + %100 = bitcast float %99 to i32, !dbg !37 + %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !37 + %102 = bitcast i32 %101 to float, !dbg !37 + %103 = fadd float %99, %102, !dbg !39 + %104 = bitcast float %103 to i32, !dbg !37 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !37 + %106 = bitcast i32 %105 to float, !dbg !37 + %107 = fadd float %103, %106, !dbg !39 + %108 = bitcast float %107 to i32, !dbg !37 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !37 + %110 = bitcast i32 %109 to float, !dbg !37 + %111 = fadd float %107, %110, !dbg !39 + %112 = bitcast float %111 to i32, !dbg !37 + %113 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !37 + %114 = bitcast i32 %113 to float, !dbg !37 + %115 = fadd float %111, %114, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %115, i1 %68) #6, !dbg !37 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !37 + %117 = bitcast float %116 to i32, !dbg !37 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37 + %119 = bitcast i32 %118 to float, !dbg !37 + %120 = fadd float %116, %119, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %120, i1 %81) #6, !dbg !37 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %121 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37 + %122 = fadd float %121, 0.000000e+00, !dbg !42 + %123 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %122, float 2.560000e+02) #6, !dbg !44 + %124 = fadd float %123, 0x3EE4F8B580000000, !dbg !45 + %125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not.i = icmp eq i32 %125, 0, !dbg !46 + br i1 %.not.i, label %128, label %126, !dbg !46 + +126: ; preds = %6 + %127 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %124), !dbg !46 + br label %__nv_rsqrtf.exit, !dbg !46 + +128: ; preds = %6 + %129 = tail call float @llvm.nvvm.rsqrt.approx.f(float %124), !dbg !46 + br label %__nv_rsqrtf.exit, !dbg !46 + +__nv_rsqrtf.exit: ; preds = %126, %128 + %.0.i = phi float [ %127, %126 ], [ %129, %128 ], !dbg !46 + %130 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !20 + %131 = bitcast i32 %130 to float, !dbg !20 + %132 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !20 + %133 = bitcast i32 %132 to float, !dbg !20 + %134 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !20 + %135 = bitcast i32 %134 to float, !dbg !20 + %136 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !20 + %137 = bitcast i32 %136 to float, !dbg !20 + %138 = fmul float %85, %.0.i, !dbg !47 + %139 = fmul float %86, %.0.i, !dbg !47 + %140 = fmul float %87, %.0.i, !dbg !47 + %141 = fmul float %88, %.0.i, !dbg !47 + %142 = fmul float %138, %137, !dbg !48 + %143 = fmul float %139, %135, !dbg !48 + %144 = fmul float %140, %133, !dbg !48 + %145 = fmul float %141, %131, !dbg !48 + %146 = getelementptr i16, ptr addrspace(1) %3, i64 %15, !dbg !49 + %147 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %142) #6, !dbg !50 + %148 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %143) #6, !dbg !50 + %149 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %144) #6, !dbg !50 + %150 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %145) #6, !dbg !50 + %151 = insertelement <2 x i16> undef, i16 %147, i64 0, !dbg !50 + %152 = insertelement <2 x i16> %151, i16 %148, i64 1, !dbg !50 + %153 = bitcast <2 x i16> %152 to i32, !dbg !50 + %154 = insertelement <2 x i16> undef, i16 %149, i64 0, !dbg !50 + %155 = insertelement <2 x i16> %154, i16 %150, i64 1, !dbg !50 + %156 = bitcast <2 x i16> %155 to i32, !dbg !50 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %153, i32 %156, ptr addrspace(1) %146, i1 true) #6, !dbg !50 + ret void, !dbg !51 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 
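; Annotation (not compiler output): __nv_rsqrtf below is the libdevice routine
; behind the tt.extern_elementwise "__nv_rsqrtf" call in the TTIR. The
; __nvvm_reflect("__CUDA_FTZ") query is resolved at compile time (the
; nvvm-reflect-ftz module flag further down sets it to 1), so only one of
; rsqrt.approx.ftz.f32 / rsqrt.approx.f32 survives into the final PTX.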
@llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "ce5cemaf763zop7tgmdl7oghweh4i2o3g632qnkrhju2cthbxnfd.py", directory: "/tmp/torchinductor_root/e5") +!4 = !{ptr @triton__0d1d2d3d4de5de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4de5de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4de5de", linkageName: "triton__0d1d2d3d4de5de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 40, scope: !7) +!13 = !DILocation(line: 30, column: 36, scope: !7) +!14 = !DILocation(line: 30, column: 30, scope: !7) +!15 = !DILocation(line: 30, column: 46, scope: !7) +!16 = !DILocation(line: 31, column: 30, scope: !7) +!17 = !DILocation(line: 31, column: 46, scope: !7) +!18 = !DILocation(line: 31, column: 67, scope: !7) +!19 = !DILocation(line: 32, column: 31, scope: !7) +!20 = !DILocation(line: 32, column: 36, scope: !7) +!21 = !DILocation(line: 34, column: 18, scope: !7) +!22 = 
!DILocation(line: 233, column: 15, scope: !23, inlinedAt: !26) +!23 = distinct !DILexicalBlockFile(scope: !25, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!25 = distinct !DILexicalBlockFile(scope: !7, file: !24, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !27) +!27 = !DILocation(line: 39, column: 58, scope: !23) +!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29) +!29 = !DILocation(line: 39, column: 58, scope: !25) +!30 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !33) +!31 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0) +!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!33 = !DILocation(line: 39, column: 45, scope: !31) +!34 = !DILocation(line: 42, column: 20, scope: !7) +!35 = !DILocation(line: 43, column: 19, scope: !7) +!36 = !DILocation(line: 44, column: 20, scope: !7) +!37 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !38) +!38 = !DILocation(line: 47, column: 59, scope: !25) +!39 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !40) +!40 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !41) +!41 = !DILocation(line: 47, column: 59, scope: !23) +!42 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !43) +!43 = !DILocation(line: 47, column: 45, scope: !31) +!44 = !DILocation(line: 50, column: 20, scope: !7) +!45 = !DILocation(line: 52, column: 20, scope: !7) +!46 = !DILocation(line: 53, column: 26, scope: !7) +!47 = !DILocation(line: 54, column: 20, scope: !7) +!48 = !DILocation(line: 55, column: 20, scope: !7) +!49 = !DILocation(line: 57, column: 25, scope: !7) +!50 = !DILocation(line: 57, column: 48, scope: !7) +!51 = !DILocation(line: 57, column: 4, scope: !7) diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..db211bf28e227bec36f645e9aa6789e91cd4c6dc --- /dev/null +++ b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx @@ -0,0 +1,687 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4de5de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4de5de( + .param .u64 triton__0d1d2d3d4de5de_param_0, + .param .u64 triton__0d1d2d3d4de5de_param_1, + .param .u64 triton__0d1d2d3d4de5de_param_2, + .param .u64 triton__0d1d2d3d4de5de_param_3, + .param .u32 triton__0d1d2d3d4de5de_param_4, + .param .u32 triton__0d1d2d3d4de5de_param_5 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<23>; + .reg .b16 %rs<9>; + .reg .b32 %r<84>; + .reg .f32 %f<70>; + .reg .b64 %rd<12>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd5, [triton__0d1d2d3d4de5de_param_0]; + ld.param.u64 %rd6, [triton__0d1d2d3d4de5de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r50, %tid.x; + and.b32 %r51, %r50, 31; + ld.param.u64 %rd7, [triton__0d1d2d3d4de5de_param_2]; + ld.param.u64 %rd8, [triton__0d1d2d3d4de5de_param_3]; + shl.b32 %r52, %r50, 2; + and.b32 %r53, %r52, 252; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r54, %r1, 8; + .loc 1 30 36 + or.b32 %r55, %r54, %r53; + .loc 1 30 30 + mul.wide.s32 %rd9, %r55, 4; + add.s64 %rd1, %rd5, %rd9; + mov.b32 %r6, 0; + mov.pred %p1, 
-1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + .loc 1 31 30 + mul.wide.s32 %rd10, %r55, 2; + add.s64 %rd2, %rd6, %rd10; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f5, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f6, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f7, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f8, %r17; + .loc 1 32 31 + mul.wide.u32 %rd11, %r53, 4; + add.s64 %rd3, %rd7, %rd11; + .loc 1 32 36 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + @!%p1 mov.u32 %r20, %r6; + @!%p1 mov.u32 %r21, %r6; + .loc 1 34 18 + add.f32 %f9, %f5, %f1; + add.f32 %f10, %f6, %f2; + add.f32 %f11, %f7, %f3; + add.f32 %f12, %f8, %f4; +$L__tmp1: + .loc 2 233 15 + add.f32 %f13, %f9, %f10; + add.f32 %f14, %f13, %f11; + add.f32 %f15, %f14, %f12; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r56, %f15; + shfl.sync.bfly.b32 %r57, %r56, 16, 31, -1; + mov.b32 %f16, %r57; +$L__tmp3: + .loc 2 233 15 + add.f32 %f17, %f15, %f16; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r58, %f17; + shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1; + mov.b32 %f18, %r59; +$L__tmp5: + .loc 2 233 15 + add.f32 %f19, %f17, %f18; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r60, %f19; + shfl.sync.bfly.b32 %r61, %r60, 4, 31, -1; + mov.b32 %f20, %r61; +$L__tmp7: + .loc 2 233 15 + add.f32 %f21, %f19, %f20; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r62, %f21; + shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1; + mov.b32 %f22, %r63; +$L__tmp9: + .loc 2 233 15 + add.f32 %f23, %f21, %f22; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r64, %f23; + shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1; + mov.b32 %f24, %r65; +$L__tmp11: + .loc 2 233 15 + add.f32 %f25, %f23, %f24; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p14, %r51, 0; + shr.u32 %r66, %r50, 3; + and.b32 %r67, %r66, 4; + mov.u32 %r68, global_smem; + add.s32 %r26, %r68, %r67; + mov.b32 %r27, %f25; + @%p14 st.shared.b32 [ %r26 + 0 ], %r27; + bar.sync 0; + setp.lt.s32 %p15, %r50, 2; + add.s32 %r29, %r68, %r52; + @%p15 ld.shared.b32 %r28, [ %r29 + 0 ]; + mov.b32 %f26, %r28; + shfl.sync.bfly.b32 %r69, %r28, 1, 31, -1; + mov.b32 %f27, %r69; +$L__tmp13: + .loc 2 233 15 + add.f32 %f28, %f26, %f27; +$L__tmp14: + .loc 2 243 36 + and.b32 %r70, %r50, 1; + setp.eq.b32 %p21, %r70, 1; + not.pred %p22, %p21; + and.pred %p16, %p15, %p22; + mov.b32 %r31, %f28; + @%p16 st.shared.b32 [ %r29 + 0 ], %r31; + bar.sync 0; + ld.shared.f32 %f29, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f30, %f29, 0f00000000; +$L__tmp16: + .loc 1 42 20 + mov.b32 %r33, %f30; + mov.b32 %r34, 1132462080; + div.full.f32 %r32, %r33, %r34; + mov.b32 %f31, %r32; + .loc 1 43 19 + sub.f32 %f32, %f9, %f31; + sub.f32 %f33, %f10, %f31; + sub.f32 %f34, %f11, %f31; + sub.f32 %f35, %f12, %f31; + .loc 1 44 20 + mul.f32 %f36, %f33, %f33; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + 
fma.rn.f32 %f37, %f32, %f32, %f36; + fma.rn.f32 %f38, %f34, %f34, %f37; + fma.rn.f32 %f39, %f35, %f35, %f38; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r71, %f39; + shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1; + mov.b32 %f40, %r72; +$L__tmp20: + .loc 2 233 15 + add.f32 %f41, %f39, %f40; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r73, %f41; + shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1; + mov.b32 %f42, %r74; +$L__tmp22: + .loc 2 233 15 + add.f32 %f43, %f41, %f42; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r75, %f43; + shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1; + mov.b32 %f44, %r76; +$L__tmp24: + .loc 2 233 15 + add.f32 %f45, %f43, %f44; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r77, %f45; + shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1; + mov.b32 %f46, %r78; +$L__tmp26: + .loc 2 233 15 + add.f32 %f47, %f45, %f46; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r79, %f47; + shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1; + mov.b32 %f48, %r80; +$L__tmp28: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r36, %f49; + @%p14 st.shared.b32 [ %r26 + 0 ], %r36; + bar.sync 0; + @%p15 ld.shared.b32 %r37, [ %r29 + 0 ]; + mov.b32 %f50, %r37; + shfl.sync.bfly.b32 %r81, %r37, 1, 31, -1; + mov.b32 %f51, %r81; +$L__tmp30: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r40, %f52; + @%p16 st.shared.b32 [ %r29 + 0 ], %r40; + bar.sync 0; + ld.shared.f32 %f53, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f54, %f53, 0f00000000; +$L__tmp33: + .loc 1 50 20 + mov.b32 %r42, %f54; + div.full.f32 %r41, %r42, %r34; + mov.b32 %f55, %r41; + .loc 1 52 20 + add.f32 %f56, %f55, 0f3727C5AC; + .loc 1 53 26 + rsqrt.approx.ftz.f32 %f57, %f56; + .loc 1 32 36 + mov.b32 %f58, %r21; + mov.b32 %f59, %r20; + mov.b32 %f60, %r19; + mov.b32 %f61, %r18; + .loc 1 54 20 + mul.f32 %f62, %f32, %f57; + mul.f32 %f63, %f33, %f57; + mul.f32 %f64, %f34, %f57; + mul.f32 %f65, %f35, %f57; + .loc 1 55 20 + mul.f32 %f66, %f62, %f61; + mul.f32 %f67, %f63, %f60; + mul.f32 %f68, %f64, %f59; + mul.f32 %f69, %f65, %f58; + .loc 1 57 25 + add.s64 %rd4, %rd8, %rd10; + .loc 1 57 48 + mov.b32 %r44, %f66; + cvt.rn.bf16.f32 %rs5, %r44; + mov.b32 %r45, %f67; + cvt.rn.bf16.f32 %rs6, %r45; + mov.b32 %r46, %f68; + cvt.rn.bf16.f32 %rs7, %r46; + mov.b32 %r47, %f69; + cvt.rn.bf16.f32 %rs8, %r47; + mov.b32 %r82, {%rs5, %rs6}; + mov.b32 %r83, {%rs7, %rs8}; + @%p1 st.global.v2.b32 [ %rd4 + 0 ], { %r82, %r83 }; + .loc 1 57 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/e5/ce5cemaf763zop7tgmdl7oghweh4i2o3g632qnkrhju2cthbxnfd.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 
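// Annotation (not compiler output): the .b8 byte tables in this .debug_abbrev
// section, and the .debug_info / .debug_pubnames sections after it, are raw
// DWARF records; they encode the Inductor-generated source file under
// /tmp/torchinductor_root and the kernel symbol triton__0d1d2d3d4de5de so
// device debuggers can map the PTX back to the generating Python.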
+.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 391 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 101 +.b8 53 +.b8 99 +.b8 101 +.b8 109 +.b8 97 +.b8 102 +.b8 55 +.b8 54 +.b8 51 +.b8 122 +.b8 111 +.b8 112 +.b8 55 +.b8 116 +.b8 103 +.b8 109 +.b8 100 +.b8 108 +.b8 55 +.b8 111 +.b8 103 +.b8 104 +.b8 119 +.b8 101 +.b8 104 +.b8 52 +.b8 105 +.b8 50 +.b8 111 +.b8 51 +.b8 103 +.b8 54 +.b8 51 +.b8 50 +.b8 113 +.b8 110 +.b8 107 +.b8 114 +.b8 104 +.b8 106 +.b8 117 +.b8 50 +.b8 99 +.b8 116 +.b8 104 +.b8 98 +.b8 120 +.b8 110 +.b8 102 +.b8 100 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 101 +.b8 53 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 53 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 53 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 39 +.b8 58 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 39 +.b8 58 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 39 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 47 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 47 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 47 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 395 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 53 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 395 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..baeef93f1bde8390ba5b72235755ba68556b3cd1 --- /dev/null +++ b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir @@ -0,0 +1,58 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr {tt.divisibility 
= 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant 9.99999974E-6 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = arith.addf %8, %12 : tensor<256xf32, #blocked> + %17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({ + ^bb0(%arg6: f32, %arg7: f32): + %36 = arith.addf %arg6, %arg7 : f32 + tt.reduce.return %36 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %19 = arith.addf %18, %cst_2 : f32 + %20 = arith.divf %19, %cst_1 : f32 + %21 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked> + %22 = arith.subf %16, %21 : tensor<256xf32, #blocked> + %23 = arith.mulf %22, %22 : tensor<256xf32, #blocked> + %24 = arith.select %2, %23, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({ + ^bb0(%arg6: f32, %arg7: f32): + %36 = arith.addf %arg6, %arg7 : f32 + tt.reduce.return %36 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %26 = arith.addf %25, %cst_2 : f32 + %27 = arith.divf %26, %cst_1 : f32 + %28 = arith.addf %27, %cst_0 : f32 + %29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %30 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked> + %31 = arith.mulf %22, %30 : tensor<256xf32, #blocked> + %32 = arith.mulf %31, %15 : tensor<256xf32, #blocked> + %33 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %35 = arith.truncf %32 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %34, %35, %2 {cache = 1 : i32, 
evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.llir b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..05b967d0f0faec4cddb151f963e070b40d565f08 --- /dev/null +++ b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.llir @@ -0,0 +1,1121 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %10 = lshr i32 %9, 5, !dbg !10 + %11 = and i32 %10, 7, !dbg !10 + %12 = and i32 %9, 15, !dbg !10 + %13 = shl i32 %9, 3, !dbg !11 + %14 = and i32 %13, 248, !dbg !11 + %15 = or i32 %14, 4, !dbg !11 + %urem = and i32 %9, 255, !dbg !11 + %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12 + %17 = shl i32 %16, 4, !dbg !13 + %18 = or i32 %17, %11, !dbg !14 + %19 = or i32 %18, 8, !dbg !14 + %20 = or i32 %17, %12, !dbg !14 + %21 = sext i32 %18 to i64, !dbg !15 + %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15 + %23 = sext i32 %19 to i64, !dbg !15 + %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15 + %25 = sext i32 %20 to i64, !dbg !15 + %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 
0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16 + %44 = srem i32 %18, 512, !dbg !17 + %45 = srem i32 %19, 512, !dbg !17 + %46 = shl nsw i32 %44, 8, !dbg !18 + %47 = shl nsw i32 %45, 8, !dbg !18 + %48 = or i32 %46, %14, !dbg !19 + %49 = or i32 %46, %15, !dbg !19 + %50 = or i32 %47, %14, !dbg !19 + %51 = or i32 %47, %15, !dbg !19 + %52 = sext i32 %48 to i64, !dbg !20 + %53 = getelementptr float, ptr addrspace(1) %2, i64 %52, !dbg !20 + %54 = sext i32 %49 to i64, !dbg !20 + %55 = getelementptr float, ptr addrspace(1) %2, i64 %54, !dbg !20 + %56 = sext i32 %50 to i64, !dbg !20 + %57 = getelementptr float, ptr addrspace(1) %2, i64 %56, !dbg !20 + %58 = sext i32 %51 to i64, !dbg !20 + %59 = getelementptr float, ptr addrspace(1) %2, i64 %58, !dbg !20 + %60 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 + %61 = extractvalue { i32, i32, i32, i32 } %60, 0, !dbg !21 + %62 = extractvalue { i32, i32, i32, i32 } %60, 1, !dbg !21 + %63 = extractvalue { i32, i32, i32, i32 } %60, 2, !dbg !21 + %64 = extractvalue { i32, i32, i32, i32 } %60, 3, !dbg !21 + %65 = bitcast i32 %61 to float, !dbg !21 + %66 = bitcast i32 %62 to float, !dbg !21 + %67 = bitcast i32 %63 to float, !dbg !21 + %68 = bitcast i32 %64 to float, !dbg !21 + %69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 + %70 = extractvalue { i32, i32, i32, 
i32 } %69, 0, !dbg !21 + %71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !21 + %72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !21 + %73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !21 + %74 = bitcast i32 %70 to float, !dbg !21 + %75 = bitcast i32 %71 to float, !dbg !21 + %76 = bitcast i32 %72 to float, !dbg !21 + %77 = bitcast i32 %73 to float, !dbg !21 + %78 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 + %79 = extractvalue { i32, i32, i32, i32 } %78, 0, !dbg !21 + %80 = extractvalue { i32, i32, i32, i32 } %78, 1, !dbg !21 + %81 = extractvalue { i32, i32, i32, i32 } %78, 2, !dbg !21 + %82 = extractvalue { i32, i32, i32, i32 } %78, 3, !dbg !21 + %83 = bitcast i32 %79 to float, !dbg !21 + %84 = bitcast i32 %80 to float, !dbg !21 + %85 = bitcast i32 %81 to float, !dbg !21 + %86 = bitcast i32 %82 to float, !dbg !21 + %87 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 + %88 = extractvalue { i32, i32, i32, i32 } %87, 0, !dbg !21 + %89 = extractvalue { i32, i32, i32, i32 } %87, 1, !dbg !21 + %90 = extractvalue { i32, i32, i32, i32 } %87, 2, !dbg !21 + %91 = extractvalue { i32, i32, i32, i32 } %87, 3, !dbg !21 + %92 = bitcast i32 %88 to float, !dbg !21 + %93 = bitcast i32 %89 to float, !dbg !21 + %94 = bitcast i32 %90 to float, !dbg !21 + %95 = bitcast i32 %91 to float, !dbg !21 + %96 = shl i32 %18, 8, !dbg !22 + %97 = shl i32 %19, 8, !dbg !22 + %98 = or i32 %96, %14, !dbg !23 + %99 = or i32 %97, %14, !dbg !23 + %100 = sext i32 %98 to i64, !dbg !24 + %101 = getelementptr i16, ptr addrspace(1) %3, i64 %100, !dbg !24 + %102 = sext i32 %99 to i64, !dbg !24 + %103 = getelementptr i16, ptr addrspace(1) %3, i64 %102, !dbg !24 + %104 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25 + %105 = extractvalue { i32, i32, i32, i32 } %104, 0, !dbg !25 + %106 = extractvalue { i32, i32, i32, i32 } %104, 1, !dbg !25 + %107 = extractvalue { i32, i32, i32, i32 } %104, 2, !dbg !25 + %108 = extractvalue { i32, i32, i32, i32 } %104, 3, !dbg !25 + %109 = trunc i32 %105 to i16, !dbg !25 + %extelt.offset = lshr i32 %105, 16, !dbg !25 + %110 = trunc i32 %extelt.offset to i16, !dbg !25 + %111 = trunc i32 %106 to i16, !dbg !25 + %extelt.offset1 = lshr i32 %106, 16, !dbg !25 + %112 = trunc i32 %extelt.offset1 to i16, !dbg !25 + %113 = trunc i32 %107 to i16, !dbg !25 + %extelt.offset2 = lshr 
i32 %107, 16, !dbg !25 + %114 = trunc i32 %extelt.offset2 to i16, !dbg !25 + %115 = trunc i32 %108 to i16, !dbg !25 + %extelt.offset3 = lshr i32 %108, 16, !dbg !25 + %116 = trunc i32 %extelt.offset3 to i16, !dbg !25 + %117 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25 + %118 = extractvalue { i32, i32, i32, i32 } %117, 0, !dbg !25 + %119 = extractvalue { i32, i32, i32, i32 } %117, 1, !dbg !25 + %120 = extractvalue { i32, i32, i32, i32 } %117, 2, !dbg !25 + %121 = extractvalue { i32, i32, i32, i32 } %117, 3, !dbg !25 + %122 = trunc i32 %118 to i16, !dbg !25 + %extelt.offset4 = lshr i32 %118, 16, !dbg !25 + %123 = trunc i32 %extelt.offset4 to i16, !dbg !25 + %124 = trunc i32 %119 to i16, !dbg !25 + %extelt.offset5 = lshr i32 %119, 16, !dbg !25 + %125 = trunc i32 %extelt.offset5 to i16, !dbg !25 + %126 = trunc i32 %120 to i16, !dbg !25 + %extelt.offset6 = lshr i32 %120, 16, !dbg !25 + %127 = trunc i32 %extelt.offset6 to i16, !dbg !25 + %128 = trunc i32 %121 to i16, !dbg !25 + %extelt.offset7 = lshr i32 %121, 16, !dbg !25 + %129 = trunc i32 %extelt.offset7 to i16, !dbg !25 + %130 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %109) #6, !dbg !26 + %131 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %110) #6, !dbg !26 + %132 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #6, !dbg !26 + %133 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #6, !dbg !26 + %134 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #6, !dbg !26 + %135 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #6, !dbg !26 + %136 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %115) #6, !dbg !26 + %137 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #6, !dbg !26 + %138 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %122) #6, !dbg !26 + %139 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %123) #6, !dbg !26 + %140 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %124) #6, !dbg !26 + %141 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %125) #6, !dbg !26 + %142 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %126) #6, !dbg !26 + %143 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %127) #6, !dbg !26 + %144 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %128) #6, !dbg !26 + %145 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %129) #6, !dbg !26 + %146 = add i64 %43, 50257, !dbg !27 + %147 = icmp slt i64 %27, 0, !dbg !28 + %148 = icmp slt i64 %35, 0, !dbg !28 + %149 = icmp slt i64 %43, 0, !dbg !28 + %150 = select i1 %149, i64 %146, i64 %43, !dbg !29 + %151 = icmp ugt i64 %150, 50256, !dbg !30 + br i1 %151, label %152, label %153, !dbg !31 + +152: ; preds = %8 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31 + br label %153, !dbg !31 + +153: ; preds = %152, %8 + %154 = shl i64 %27, 8, !dbg !32 + %155 = add i64 %154, 12865792, !dbg !32 + %156 = select i1 %147, i64 %155, i64 %154, !dbg !32 + %157 = shl i64 %35, 8, !dbg !32 + %158 = add i64 %157, 12865792, !dbg !32 + %159 
= select i1 %148, i64 %158, i64 %157, !dbg !32 + %160 = zext nneg i32 %14 to i64 + %161 = zext nneg i32 %15 to i64 + %162 = or i64 %156, %160, !dbg !33 + %163 = or i64 %156, %161, !dbg !33 + %164 = or i64 %159, %160, !dbg !33 + %165 = or i64 %159, %161, !dbg !33 + %166 = getelementptr float, ptr addrspace(1) %1, i64 %162, !dbg !34 + %167 = getelementptr float, ptr addrspace(1) %1, i64 %163, !dbg !34 + %168 = getelementptr float, ptr addrspace(1) %1, i64 %164, !dbg !34 + %169 = getelementptr float, ptr addrspace(1) %1, i64 %165, !dbg !34 + %170 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35 + %171 = extractvalue { i32, i32, i32, i32 } %170, 0, !dbg !35 + %172 = extractvalue { i32, i32, i32, i32 } %170, 1, !dbg !35 + %173 = extractvalue { i32, i32, i32, i32 } %170, 2, !dbg !35 + %174 = extractvalue { i32, i32, i32, i32 } %170, 3, !dbg !35 + %175 = bitcast i32 %171 to float, !dbg !35 + %176 = bitcast i32 %172 to float, !dbg !35 + %177 = bitcast i32 %173 to float, !dbg !35 + %178 = bitcast i32 %174 to float, !dbg !35 + %179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35 + %180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !35 + %181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !35 + %182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !35 + %183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !35 + %184 = bitcast i32 %180 to float, !dbg !35 + %185 = bitcast i32 %181 to float, !dbg !35 + %186 = bitcast i32 %182 to float, !dbg !35 + %187 = bitcast i32 %183 to float, !dbg !35 + %188 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35 + %189 = extractvalue { i32, i32, i32, i32 } %188, 0, !dbg !35 + %190 = extractvalue { i32, i32, i32, i32 } %188, 1, !dbg !35 + %191 = extractvalue { i32, i32, i32, i32 } %188, 2, !dbg !35 + %192 = extractvalue { i32, i32, i32, i32 } %188, 3, !dbg !35 + %193 = bitcast i32 %189 to float, !dbg !35 + %194 = bitcast i32 %190 to float, !dbg !35 + %195 = bitcast i32 %191 to float, !dbg !35 + %196 = bitcast i32 %192 to float, !dbg !35 + %197 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 
$3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35 + %198 = extractvalue { i32, i32, i32, i32 } %197, 0, !dbg !35 + %199 = extractvalue { i32, i32, i32, i32 } %197, 1, !dbg !35 + %200 = extractvalue { i32, i32, i32, i32 } %197, 2, !dbg !35 + %201 = extractvalue { i32, i32, i32, i32 } %197, 3, !dbg !35 + %202 = bitcast i32 %198 to float, !dbg !35 + %203 = bitcast i32 %199 to float, !dbg !35 + %204 = bitcast i32 %200 to float, !dbg !35 + %205 = bitcast i32 %201 to float, !dbg !35 + %206 = fadd float %65, %175, !dbg !36 + %207 = fadd float %66, %176, !dbg !36 + %208 = fadd float %67, %177, !dbg !36 + %209 = fadd float %68, %178, !dbg !36 + %210 = fadd float %74, %184, !dbg !36 + %211 = fadd float %75, %185, !dbg !36 + %212 = fadd float %76, %186, !dbg !36 + %213 = fadd float %77, %187, !dbg !36 + %214 = fadd float %83, %193, !dbg !36 + %215 = fadd float %84, %194, !dbg !36 + %216 = fadd float %85, %195, !dbg !36 + %217 = fadd float %86, %196, !dbg !36 + %218 = fadd float %92, %202, !dbg !36 + %219 = fadd float %93, %203, !dbg !36 + %220 = fadd float %94, %204, !dbg !36 + %221 = fadd float %95, %205, !dbg !36 + %222 = fadd float %130, %206, !dbg !37 + %223 = fadd float %131, %207, !dbg !37 + %224 = fadd float %132, %208, !dbg !37 + %225 = fadd float %133, %209, !dbg !37 + %226 = fadd float %134, %210, !dbg !37 + %227 = fadd float %135, %211, !dbg !37 + %228 = fadd float %136, %212, !dbg !37 + %229 = fadd float %137, %213, !dbg !37 + %230 = fadd float %138, %214, !dbg !37 + %231 = fadd float %139, %215, !dbg !37 + %232 = fadd float %140, %216, !dbg !37 + %233 = fadd float %141, %217, !dbg !37 + %234 = fadd float %142, %218, !dbg !37 + %235 = fadd float %143, %219, !dbg !37 + %236 = fadd float %144, %220, !dbg !37 + %237 = fadd float %145, %221, !dbg !37 + %238 = fadd float %222, 0.000000e+00, !dbg !38 + %239 = fadd float %223, 0.000000e+00, !dbg !38 + %240 = fadd float %224, 0.000000e+00, !dbg !38 + %241 = fadd float %225, 0.000000e+00, !dbg !38 + %242 = fadd float %226, 0.000000e+00, !dbg !38 + %243 = fadd float %227, 0.000000e+00, !dbg !38 + %244 = fadd float %228, 0.000000e+00, !dbg !38 + %245 = fadd float %229, 0.000000e+00, !dbg !38 + %246 = fadd float %230, 0.000000e+00, !dbg !38 + %247 = fadd float %231, 0.000000e+00, !dbg !38 + %248 = fadd float %232, 0.000000e+00, !dbg !38 + %249 = fadd float %233, 0.000000e+00, !dbg !38 + %250 = fadd float %234, 0.000000e+00, !dbg !38 + %251 = fadd float %235, 0.000000e+00, !dbg !38 + %252 = fadd float %236, 0.000000e+00, !dbg !38 + %253 = fadd float %237, 0.000000e+00, !dbg !38 + %254 = fsub float %222, %238, !dbg !42 + %255 = fsub float %223, %239, !dbg !42 + %256 = fsub float %224, %240, !dbg !42 + %257 = fsub float %225, %241, !dbg !42 + %258 = fsub float %226, %242, !dbg !42 + %259 = fsub float %227, %243, !dbg !42 + %260 = fsub float %228, %244, !dbg !42 + %261 = fsub float %229, %245, !dbg !42 + %262 = fsub float %230, %246, !dbg !42 + %263 = fsub float %231, %247, !dbg !42 + %264 = fsub float %232, %248, !dbg !42 + %265 = fsub float %233, %249, !dbg !42 + %266 = fsub float %234, %250, !dbg !42 + %267 = fsub float %235, %251, !dbg !42 + %268 = fsub float %236, %252, !dbg !42 + %269 = fsub float %237, %253, !dbg !42 + %270 = fmul float %222, %254, !dbg !43 + %271 = fmul float %223, %255, !dbg !43 + %272 = fmul float %224, %256, !dbg !43 + %273 = fmul float %225, %257, !dbg !43 + %274 = fmul float %226, %258, !dbg !43 + %275 = fmul float %227, 
%259, !dbg !43 + %276 = fmul float %228, %260, !dbg !43 + %277 = fmul float %229, %261, !dbg !43 + %278 = fmul float %230, %262, !dbg !43 + %279 = fmul float %231, %263, !dbg !43 + %280 = fmul float %232, %264, !dbg !43 + %281 = fmul float %233, %265, !dbg !43 + %282 = fmul float %234, %266, !dbg !43 + %283 = fmul float %235, %267, !dbg !43 + %284 = fmul float %236, %268, !dbg !43 + %285 = fmul float %237, %269, !dbg !43 + %286 = fadd float %270, 0.000000e+00, !dbg !44 + %287 = fadd float %271, 0.000000e+00, !dbg !44 + %288 = fadd float %272, 0.000000e+00, !dbg !44 + %289 = fadd float %273, 0.000000e+00, !dbg !44 + %290 = fadd float %274, 0.000000e+00, !dbg !44 + %291 = fadd float %275, 0.000000e+00, !dbg !44 + %292 = fadd float %276, 0.000000e+00, !dbg !44 + %293 = fadd float %277, 0.000000e+00, !dbg !44 + %294 = fadd float %278, 0.000000e+00, !dbg !44 + %295 = fadd float %279, 0.000000e+00, !dbg !44 + %296 = fadd float %280, 0.000000e+00, !dbg !44 + %297 = fadd float %281, 0.000000e+00, !dbg !44 + %298 = fadd float %282, 0.000000e+00, !dbg !44 + %299 = fadd float %283, 0.000000e+00, !dbg !44 + %300 = fadd float %284, 0.000000e+00, !dbg !44 + %301 = fadd float %285, 0.000000e+00, !dbg !44 + %302 = fsub float %239, %238, !dbg !45 + %303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49 + %304 = fmul float %303, %302, !dbg !50 + %305 = fadd float %238, %304, !dbg !51 + %306 = fadd float %286, %287, !dbg !52 + %307 = fmul float %302, %302, !dbg !53 + %308 = fmul float %303, %307, !dbg !54 + %309 = fadd float %308, %306, !dbg !55 + %310 = fsub float %240, %305, !dbg !45 + %311 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49 + %312 = fmul float %311, %310, !dbg !50 + %313 = fadd float %305, %312, !dbg !51 + %314 = fadd float %288, %309, !dbg !52 + %315 = fmul float %310, %310, !dbg !53 + %316 = fmul float %315, 2.000000e+00, !dbg !56 + %317 = fmul float %311, %316, !dbg !54 + %318 = fadd float %314, %317, !dbg !55 + %319 = fsub float %241, %313, !dbg !45 + %320 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49 + %321 = fmul float %320, %319, !dbg !50 + %322 = fadd float %313, %321, !dbg !51 + %323 = fadd float %289, %318, !dbg !52 + %324 = fmul float %319, %319, !dbg !53 + %325 = fmul float %324, 3.000000e+00, !dbg !56 + %326 = fmul float %320, %325, !dbg !54 + %327 = fadd float %323, %326, !dbg !55 + %328 = fsub float %242, %322, !dbg !45 + %329 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49 + %330 = fmul float %329, %328, !dbg !50 + %331 = fadd float %322, %330, !dbg !51 + %332 = fadd float %290, %327, !dbg !52 + %333 = fmul float %328, %328, !dbg !53 + %334 = fmul float %333, 4.000000e+00, !dbg !56 + %335 = fmul float %329, %334, !dbg !54 + %336 = fadd float %332, %335, !dbg !55 + %337 = fsub float %243, %331, !dbg !45 + %338 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49 + %339 = fmul float %338, %337, !dbg !50 + %340 = fadd float %331, %339, !dbg !51 + %341 = fadd float %291, %336, !dbg !52 + %342 = fmul float %337, %337, !dbg !53 + %343 = fmul float %342, 5.000000e+00, !dbg !56 + %344 = fmul float %338, %343, !dbg !54 + %345 = fadd float %341, %344, !dbg !55 + %346 = fsub float %244, %340, !dbg !45 + %347 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 
1.000000e+00, float 7.000000e+00) #6, !dbg !49 + %348 = fmul float %347, %346, !dbg !50 + %349 = fadd float %340, %348, !dbg !51 + %350 = fadd float %292, %345, !dbg !52 + %351 = fmul float %346, %346, !dbg !53 + %352 = fmul float %351, 6.000000e+00, !dbg !56 + %353 = fmul float %347, %352, !dbg !54 + %354 = fadd float %350, %353, !dbg !55 + %355 = fsub float %245, %349, !dbg !45 + %356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49 + %357 = fmul float %356, %355, !dbg !50 + %358 = fadd float %349, %357, !dbg !51 + %359 = fadd float %293, %354, !dbg !52 + %360 = fmul float %355, %355, !dbg !53 + %361 = fmul float %360, 7.000000e+00, !dbg !56 + %362 = fmul float %356, %361, !dbg !54 + %363 = fadd float %359, %362, !dbg !55 + %364 = fsub float %247, %246, !dbg !45 + %365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49 + %366 = fmul float %364, %365, !dbg !50 + %367 = fadd float %246, %366, !dbg !51 + %368 = fadd float %294, %295, !dbg !52 + %369 = fmul float %364, %364, !dbg !53 + %370 = fmul float %369, %365, !dbg !54 + %371 = fadd float %368, %370, !dbg !55 + %372 = fsub float %248, %367, !dbg !45 + %373 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49 + %374 = fmul float %373, %372, !dbg !50 + %375 = fadd float %367, %374, !dbg !51 + %376 = fadd float %296, %371, !dbg !52 + %377 = fmul float %372, %372, !dbg !53 + %378 = fmul float %377, 2.000000e+00, !dbg !56 + %379 = fmul float %373, %378, !dbg !54 + %380 = fadd float %376, %379, !dbg !55 + %381 = fsub float %249, %375, !dbg !45 + %382 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49 + %383 = fmul float %382, %381, !dbg !50 + %384 = fadd float %375, %383, !dbg !51 + %385 = fadd float %297, %380, !dbg !52 + %386 = fmul float %381, %381, !dbg !53 + %387 = fmul float %386, 3.000000e+00, !dbg !56 + %388 = fmul float %382, %387, !dbg !54 + %389 = fadd float %385, %388, !dbg !55 + %390 = fsub float %250, %384, !dbg !45 + %391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 5.000000e+00) #6, !dbg !49 + %392 = fmul float %391, %390, !dbg !50 + %393 = fadd float %384, %392, !dbg !51 + %394 = fadd float %298, %389, !dbg !52 + %395 = fmul float %390, %390, !dbg !53 + %396 = fmul float %395, 4.000000e+00, !dbg !56 + %397 = fmul float %391, %396, !dbg !54 + %398 = fadd float %394, %397, !dbg !55 + %399 = fsub float %251, %393, !dbg !45 + %400 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 6.000000e+00) #6, !dbg !49 + %401 = fmul float %400, %399, !dbg !50 + %402 = fadd float %393, %401, !dbg !51 + %403 = fadd float %299, %398, !dbg !52 + %404 = fmul float %399, %399, !dbg !53 + %405 = fmul float %404, 5.000000e+00, !dbg !56 + %406 = fmul float %400, %405, !dbg !54 + %407 = fadd float %403, %406, !dbg !55 + %408 = fsub float %252, %402, !dbg !45 + %409 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 7.000000e+00) #6, !dbg !49 + %410 = fmul float %409, %408, !dbg !50 + %411 = fadd float %402, %410, !dbg !51 + %412 = fadd float %300, %407, !dbg !52 + %413 = fmul float %408, %408, !dbg !53 + %414 = fmul float %413, 6.000000e+00, !dbg !56 + %415 = fmul float %409, %414, !dbg !54 + %416 = fadd float %412, %415, !dbg !55 + %417 = fsub float %253, %411, !dbg !45 + %418 = tail call float asm 
"div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 8.000000e+00) #6, !dbg !49 + %419 = fmul float %418, %417, !dbg !50 + %420 = fadd float %411, %419, !dbg !51 + %421 = fadd float %301, %416, !dbg !52 + %422 = fmul float %417, %417, !dbg !53 + %423 = fmul float %422, 7.000000e+00, !dbg !56 + %424 = fmul float %418, %423, !dbg !54 + %425 = fadd float %421, %424, !dbg !55 + %426 = bitcast float %358 to i32, !dbg !57 + %427 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %426, i32 16, i32 31), !dbg !57 + %428 = bitcast i32 %427 to float, !dbg !57 + %429 = bitcast float %363 to i32, !dbg !57 + %430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 16, i32 31), !dbg !57 + %431 = bitcast i32 %430 to float, !dbg !57 + %432 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57 + %433 = bitcast i32 %432 to float, !dbg !57 + %434 = fsub float %428, %358, !dbg !45 + %435 = fadd float %433, 8.000000e+00, !dbg !59 + %436 = fcmp oeq float %435, 0.000000e+00, !dbg !60 + %437 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %433, float %435) #6, !dbg !49 + %438 = select i1 %436, float 0.000000e+00, float %437, !dbg !61 + %439 = fmul float %438, %434, !dbg !50 + %440 = fadd float %358, %439, !dbg !51 + %441 = fadd float %363, %431, !dbg !52 + %442 = fmul float %434, %434, !dbg !53 + %443 = fmul float %442, 8.000000e+00, !dbg !56 + %444 = fmul float %438, %443, !dbg !54 + %445 = fadd float %441, %444, !dbg !55 + %446 = bitcast float %440 to i32, !dbg !57 + %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 8, i32 31), !dbg !57 + %448 = bitcast i32 %447 to float, !dbg !57 + %449 = bitcast float %445 to i32, !dbg !57 + %450 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 8, i32 31), !dbg !57 + %451 = bitcast i32 %450 to float, !dbg !57 + %452 = bitcast float %435 to i32, !dbg !57 + %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 8, i32 31), !dbg !57 + %454 = bitcast i32 %453 to float, !dbg !57 + %455 = fsub float %448, %440, !dbg !45 + %456 = fadd float %435, %454, !dbg !59 + %457 = fcmp oeq float %456, 0.000000e+00, !dbg !60 + %458 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %454, float %456) #6, !dbg !49 + %459 = select i1 %457, float 0.000000e+00, float %458, !dbg !61 + %460 = fmul float %459, %455, !dbg !50 + %461 = fadd float %440, %460, !dbg !51 + %462 = fadd float %445, %451, !dbg !52 + %463 = fmul float %455, %455, !dbg !53 + %464 = fmul float %435, %463, !dbg !56 + %465 = fmul float %459, %464, !dbg !54 + %466 = fadd float %462, %465, !dbg !55 + %467 = bitcast float %461 to i32, !dbg !57 + %468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %467, i32 4, i32 31), !dbg !57 + %469 = bitcast i32 %468 to float, !dbg !57 + %470 = bitcast float %466 to i32, !dbg !57 + %471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 4, i32 31), !dbg !57 + %472 = bitcast i32 %471 to float, !dbg !57 + %473 = bitcast float %456 to i32, !dbg !57 + %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 4, i32 31), !dbg !57 + %475 = bitcast i32 %474 to float, !dbg !57 + %476 = fsub float %469, %461, !dbg !45 + %477 = fadd float %456, %475, !dbg !59 + %478 = fcmp oeq float %477, 0.000000e+00, !dbg !60 + %479 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %475, float %477) #6, !dbg !49 + %480 = select i1 %478, float 0.000000e+00, float %479, !dbg !61 + %481 = fmul float %480, 
%476, !dbg !50 + %482 = fadd float %461, %481, !dbg !51 + %483 = fadd float %466, %472, !dbg !52 + %484 = fmul float %476, %476, !dbg !53 + %485 = fmul float %456, %484, !dbg !56 + %486 = fmul float %480, %485, !dbg !54 + %487 = fadd float %483, %486, !dbg !55 + %488 = bitcast float %482 to i32, !dbg !57 + %489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 2, i32 31), !dbg !57 + %490 = bitcast i32 %489 to float, !dbg !57 + %491 = bitcast float %487 to i32, !dbg !57 + %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 2, i32 31), !dbg !57 + %493 = bitcast i32 %492 to float, !dbg !57 + %494 = bitcast float %477 to i32, !dbg !57 + %495 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 2, i32 31), !dbg !57 + %496 = bitcast i32 %495 to float, !dbg !57 + %497 = fsub float %490, %482, !dbg !45 + %498 = fadd float %477, %496, !dbg !59 + %499 = fcmp oeq float %498, 0.000000e+00, !dbg !60 + %500 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %496, float %498) #6, !dbg !49 + %501 = select i1 %499, float 0.000000e+00, float %500, !dbg !61 + %502 = fmul float %497, %501, !dbg !50 + %503 = fadd float %482, %502, !dbg !51 + %504 = fadd float %487, %493, !dbg !52 + %505 = fmul float %497, %497, !dbg !53 + %506 = fmul float %477, %505, !dbg !56 + %507 = fmul float %501, %506, !dbg !54 + %508 = fadd float %504, %507, !dbg !55 + %509 = bitcast float %503 to i32, !dbg !57 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 1, i32 31), !dbg !57 + %511 = bitcast float %508 to i32, !dbg !57 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 1, i32 31), !dbg !57 + %513 = bitcast float %498 to i32, !dbg !57 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 1, i32 31), !dbg !57 + %515 = bitcast i32 %514 to float, !dbg !57 + %516 = fadd float %498, %515, !dbg !59 + %517 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %515, float %516) #6, !dbg !49 + %518 = bitcast float %420 to i32, !dbg !57 + %519 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %518, i32 16, i32 31), !dbg !57 + %520 = bitcast i32 %519 to float, !dbg !57 + %521 = bitcast float %425 to i32, !dbg !57 + %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 16, i32 31), !dbg !57 + %523 = bitcast i32 %522 to float, !dbg !57 + %524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1090519040, i32 16, i32 31), !dbg !57 + %525 = bitcast i32 %524 to float, !dbg !57 + %526 = fsub float %520, %420, !dbg !45 + %527 = fadd float %525, 8.000000e+00, !dbg !59 + %528 = fcmp oeq float %527, 0.000000e+00, !dbg !60 + %529 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %525, float %527) #6, !dbg !49 + %530 = select i1 %528, float 0.000000e+00, float %529, !dbg !61 + %531 = fmul float %526, %530, !dbg !50 + %532 = fadd float %420, %531, !dbg !51 + %533 = fadd float %425, %523, !dbg !52 + %534 = fmul float %526, %526, !dbg !53 + %535 = fmul float %534, 8.000000e+00, !dbg !56 + %536 = fmul float %535, %530, !dbg !54 + %537 = fadd float %533, %536, !dbg !55 + %538 = bitcast float %532 to i32, !dbg !57 + %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 8, i32 31), !dbg !57 + %540 = bitcast i32 %539 to float, !dbg !57 + %541 = bitcast float %537 to i32, !dbg !57 + %542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 8, i32 31), !dbg !57 + %543 = bitcast i32 %542 to float, !dbg !57 + %544 = bitcast float %527 to 
i32, !dbg !57 + %545 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %544, i32 8, i32 31), !dbg !57 + %546 = bitcast i32 %545 to float, !dbg !57 + %547 = fsub float %540, %532, !dbg !45 + %548 = fadd float %527, %546, !dbg !59 + %549 = fcmp oeq float %548, 0.000000e+00, !dbg !60 + %550 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %546, float %548) #6, !dbg !49 + %551 = select i1 %549, float 0.000000e+00, float %550, !dbg !61 + %552 = fmul float %547, %551, !dbg !50 + %553 = fadd float %532, %552, !dbg !51 + %554 = fadd float %537, %543, !dbg !52 + %555 = fmul float %547, %547, !dbg !53 + %556 = fmul float %527, %555, !dbg !56 + %557 = fmul float %551, %556, !dbg !54 + %558 = fadd float %554, %557, !dbg !55 + %559 = bitcast float %553 to i32, !dbg !57 + %560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %559, i32 4, i32 31), !dbg !57 + %561 = bitcast i32 %560 to float, !dbg !57 + %562 = bitcast float %558 to i32, !dbg !57 + %563 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %562, i32 4, i32 31), !dbg !57 + %564 = bitcast i32 %563 to float, !dbg !57 + %565 = bitcast float %548 to i32, !dbg !57 + %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 4, i32 31), !dbg !57 + %567 = bitcast i32 %566 to float, !dbg !57 + %568 = fsub float %561, %553, !dbg !45 + %569 = fadd float %548, %567, !dbg !59 + %570 = fcmp oeq float %569, 0.000000e+00, !dbg !60 + %571 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %567, float %569) #6, !dbg !49 + %572 = select i1 %570, float 0.000000e+00, float %571, !dbg !61 + %573 = fmul float %568, %572, !dbg !50 + %574 = fadd float %553, %573, !dbg !51 + %575 = fadd float %558, %564, !dbg !52 + %576 = fmul float %568, %568, !dbg !53 + %577 = fmul float %548, %576, !dbg !56 + %578 = fmul float %572, %577, !dbg !54 + %579 = fadd float %575, %578, !dbg !55 + %580 = bitcast float %574 to i32, !dbg !57 + %581 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %580, i32 2, i32 31), !dbg !57 + %582 = bitcast i32 %581 to float, !dbg !57 + %583 = bitcast float %579 to i32, !dbg !57 + %584 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %583, i32 2, i32 31), !dbg !57 + %585 = bitcast i32 %584 to float, !dbg !57 + %586 = bitcast float %569 to i32, !dbg !57 + %587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 2, i32 31), !dbg !57 + %588 = bitcast i32 %587 to float, !dbg !57 + %589 = fsub float %582, %574, !dbg !45 + %590 = fadd float %569, %588, !dbg !59 + %591 = fcmp oeq float %590, 0.000000e+00, !dbg !60 + %592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %588, float %590) #6, !dbg !49 + %593 = select i1 %591, float 0.000000e+00, float %592, !dbg !61 + %594 = fmul float %589, %593, !dbg !50 + %595 = fadd float %574, %594, !dbg !51 + %596 = fadd float %579, %585, !dbg !52 + %597 = fmul float %589, %589, !dbg !53 + %598 = fmul float %569, %597, !dbg !56 + %599 = fmul float %593, %598, !dbg !54 + %600 = fadd float %596, %599, !dbg !55 + %601 = bitcast float %595 to i32, !dbg !57 + %602 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !57 + %603 = bitcast float %600 to i32, !dbg !57 + %604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %603, i32 1, i32 31), !dbg !57 + %605 = bitcast float %590 to i32, !dbg !57 + %606 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %605, i32 1, i32 31), !dbg !57 + %607 = bitcast i32 %606 to float, !dbg !57 + %608 = fadd float %590, %607, !dbg !59 + 
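; Annotation (hedged reading of the generated IR, not part of the module): the two
; interleaved shfl.sync.bfly chains here (offsets 16, 8, 4, 2, 1) are warp-level
; butterfly reductions, one per row handled by this program. The fsub /
; div.full.f32 / fmul / fadd pattern at each step matches a Welford-style parallel
; combine of (mean, m2, count) partials — delta = mean_b - mean_a,
; mean += delta * n_b/(n_a + n_b), m2 = m2_a + m2_b + delta^2 * n_a*n_b/(n_a + n_b) —
; with the fcmp oeq / select pairs guarding the division when the combined count is
; zero. This is consistent with the !dbg references into triton_helpers.py (lines
; 108-113). 1090519040 bitcasts to 8.0f: each lane enters the reduction carrying the
; eight elements it accumulated sequentially in the unrolled chain above.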
%609 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %607, float %608) #6, !dbg !49 + %610 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62 + %611 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62 + %612 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62 + %613 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %59, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62 + %614 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %101, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63 + %615 = extractvalue { i32, i32, i32, i32 } %614, 0, !dbg !63 + %616 = extractvalue { i32, i32, i32, i32 } %614, 1, !dbg !63 + %617 = extractvalue { i32, i32, i32, i32 } %614, 2, !dbg !63 + %618 = extractvalue { i32, i32, i32, i32 } %614, 3, !dbg !63 + %619 = trunc i32 %615 to i16, !dbg !63 + %extelt.offset8 = lshr i32 %615, 16, !dbg !63 + %620 = trunc i32 %extelt.offset8 to i16, !dbg !63 + %621 = trunc i32 %616 to i16, !dbg !63 + %extelt.offset9 = lshr i32 %616, 16, !dbg !63 + %622 = trunc i32 %extelt.offset9 to i16, !dbg !63 + %623 = trunc i32 %617 to i16, !dbg !63 + %extelt.offset10 = lshr i32 %617, 16, !dbg !63 + %624 = trunc i32 %extelt.offset10 to i16, !dbg !63 + %625 = trunc i32 %618 to i16, !dbg !63 + %extelt.offset11 = lshr i32 %618, 16, !dbg !63 + %626 = trunc i32 %extelt.offset11 to i16, !dbg !63 + %627 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, 
$10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %103, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63 + %628 = extractvalue { i32, i32, i32, i32 } %627, 0, !dbg !63 + %629 = extractvalue { i32, i32, i32, i32 } %627, 1, !dbg !63 + %630 = extractvalue { i32, i32, i32, i32 } %627, 2, !dbg !63 + %631 = extractvalue { i32, i32, i32, i32 } %627, 3, !dbg !63 + %632 = trunc i32 %628 to i16, !dbg !63 + %extelt.offset12 = lshr i32 %628, 16, !dbg !63 + %633 = trunc i32 %extelt.offset12 to i16, !dbg !63 + %634 = trunc i32 %629 to i16, !dbg !63 + %extelt.offset13 = lshr i32 %629, 16, !dbg !63 + %635 = trunc i32 %extelt.offset13 to i16, !dbg !63 + %636 = trunc i32 %630 to i16, !dbg !63 + %extelt.offset14 = lshr i32 %630, 16, !dbg !63 + %637 = trunc i32 %extelt.offset14 to i16, !dbg !63 + %638 = trunc i32 %631 to i16, !dbg !63 + %extelt.offset15 = lshr i32 %631, 16, !dbg !63 + %639 = trunc i32 %extelt.offset15 to i16, !dbg !63 + %640 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %619) #6, !dbg !64 + %641 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %620) #6, !dbg !64 + %642 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %621) #6, !dbg !64 + %643 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %622) #6, !dbg !64 + %644 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %623) #6, !dbg !64 + %645 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %624) #6, !dbg !64 + %646 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %625) #6, !dbg !64 + %647 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %626) #6, !dbg !64 + %648 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %632) #6, !dbg !64 + %649 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %633) #6, !dbg !64 + %650 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %634) #6, !dbg !64 + %651 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %635) #6, !dbg !64 + %652 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %636) #6, !dbg !64 + %653 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %637) #6, !dbg !64 + %654 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %638) #6, !dbg !64 + %655 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %639) #6, !dbg !64 + %656 = zext nneg i32 %urem to i64, !dbg !65 + %657 = getelementptr float, ptr addrspace(1) %4, i64 %656, !dbg !65 + %658 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %657, i1 true, i32 0, i1 true) #6, !dbg !66 + br i1 %151, label %659, label %660, !dbg !67 + +659: ; preds = %153 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67 + br label %660, !dbg !67 + +660: ; preds = %659, %153 + %661 = bitcast i32 %604 to float, !dbg !57 + %662 = fadd float %600, %661, !dbg !52 + %663 = bitcast i32 %602 to float, !dbg !57 + %664 = fsub float %663, %595, !dbg !45 + %665 = fmul float %664, %664, !dbg !53 + %666 = fmul float %590, %665, !dbg !56 + %667 = fcmp oeq float %608, 0.000000e+00, !dbg !60 + %668 = select i1 %667, float 0.000000e+00, float %609, !dbg !61 + %669 = fmul float %668, %666, !dbg !54 + %670 = fadd float %662, %669, !dbg !55 + %671 = bitcast i32 %512 to float, !dbg !57 + %672 = fadd float %508, %671, !dbg !52 + %673 = bitcast i32 %510 to float, !dbg !57 + %674 = fsub float %673, %503, !dbg !45 
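; Annotation (hedged reading): block %660 folds in the final butterfly step for both
; rows — %734 and %792 come out as the per-row means, %670 and %680 as the per-row
; m2 sums. The div.full.f32 calls by 2.560000e+02 just below divide m2 by 256,
; evidently the row length (the tensors are strided by 256 floats throughout), to
; obtain the variance; 0x3EE4F8B580000000 is 1e-05, the layernorm epsilon added
; before the rsqrt dispatch that follows.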
+ %675 = fmul float %674, %674, !dbg !53 + %676 = fmul float %498, %675, !dbg !56 + %677 = fcmp oeq float %516, 0.000000e+00, !dbg !60 + %678 = select i1 %677, float 0.000000e+00, float %517, !dbg !61 + %679 = fmul float %678, %676, !dbg !54 + %680 = fadd float %672, %679, !dbg !55 + %681 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %166, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %682 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %167, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %683 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %168, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %684 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %169, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %685 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 + %686 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 + %687 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 + %688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 + %689 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 + %690 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 + %691 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 + %692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %680, float 2.560000e+02) #6, !dbg !69 + %693 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 + %694 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 + %695 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 + %696 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 + %697 = tail call float asm "div.full.f32 $0, $1, $2;", 
"=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 + %698 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 + %699 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 + %700 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %670, float 2.560000e+02) #6, !dbg !69 + %701 = fadd float %685, 0x3EE4F8B580000000, !dbg !70 + %702 = fadd float %693, 0x3EE4F8B580000000, !dbg !70 + %703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %.not.i = icmp eq i32 %703, 0, !dbg !71 + br i1 %.not.i, label %706, label %704, !dbg !71 + +704: ; preds = %660 + %705 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %701), !dbg !71 + br label %__nv_rsqrtf.exit, !dbg !71 + +706: ; preds = %660 + %707 = tail call float @llvm.nvvm.rsqrt.approx.f(float %701), !dbg !71 + br label %__nv_rsqrtf.exit, !dbg !71 + +__nv_rsqrtf.exit: ; preds = %704, %706 + %.0.i = phi float [ %705, %704 ], [ %707, %706 ], !dbg !71 + %708 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %709 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %715 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %.not.i37 = icmp eq i32 %715, 0, !dbg !71 + br i1 %.not.i37, label %718, label %716, !dbg !71 + +716: ; preds = %__nv_rsqrtf.exit + %717 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %702), !dbg !71 + br label %__nv_rsqrtf.exit39, !dbg !71 + +718: ; preds = %__nv_rsqrtf.exit + %719 = tail call float @llvm.nvvm.rsqrt.approx.f(float %702), !dbg !71 + br label %__nv_rsqrtf.exit39, !dbg !71 + +__nv_rsqrtf.exit39: ; preds = %716, %718 + %.0.i38 = phi float [ %717, %716 ], [ %719, %718 ], !dbg !71 + %720 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71 + %727 = extractvalue { i32, i32, i32, i32 } %684, 3, !dbg !68 + %728 = bitcast i32 %727 to float, !dbg !68 + %729 = extractvalue { i32, i32, i32, i32 } %613, 3, !dbg !62 + %730 = bitcast i32 %729 to float, !dbg !62 + %731 = fadd float %730, %728, !dbg !72 + %732 = fadd float %655, %731, !dbg !73 + %733 = fmul float %664, %668, !dbg !50 + %734 = fadd float %595, %733, !dbg !51 + %735 = fsub float %732, %734, !dbg !74 + %736 = extractvalue { i32, i32, i32, i32 } %684, 2, !dbg !68 + %737 = bitcast i32 %736 to float, !dbg !68 + %738 = extractvalue { i32, i32, i32, i32 } %613, 2, !dbg !62 + %739 = bitcast i32 %738 to float, !dbg !62 + %740 = fadd float %739, %737, !dbg !72 + %741 = fadd float %654, %740, !dbg !73 + %742 = fsub float %741, %734, !dbg !74 + %743 = extractvalue { i32, i32, i32, i32 } %684, 1, !dbg !68 + %744 = bitcast i32 %743 to float, !dbg !68 + %745 = extractvalue { i32, i32, i32, i32 } %613, 1, !dbg !62 + %746 = 
bitcast i32 %745 to float, !dbg !62 + %747 = fadd float %746, %744, !dbg !72 + %748 = fadd float %653, %747, !dbg !73 + %749 = fsub float %748, %734, !dbg !74 + %750 = extractvalue { i32, i32, i32, i32 } %684, 0, !dbg !68 + %751 = bitcast i32 %750 to float, !dbg !68 + %752 = extractvalue { i32, i32, i32, i32 } %613, 0, !dbg !62 + %753 = bitcast i32 %752 to float, !dbg !62 + %754 = fadd float %753, %751, !dbg !72 + %755 = fadd float %652, %754, !dbg !73 + %756 = fsub float %755, %734, !dbg !74 + %757 = extractvalue { i32, i32, i32, i32 } %683, 3, !dbg !68 + %758 = bitcast i32 %757 to float, !dbg !68 + %759 = extractvalue { i32, i32, i32, i32 } %612, 3, !dbg !62 + %760 = bitcast i32 %759 to float, !dbg !62 + %761 = fadd float %760, %758, !dbg !72 + %762 = fadd float %651, %761, !dbg !73 + %763 = fsub float %762, %734, !dbg !74 + %764 = extractvalue { i32, i32, i32, i32 } %683, 2, !dbg !68 + %765 = bitcast i32 %764 to float, !dbg !68 + %766 = extractvalue { i32, i32, i32, i32 } %612, 2, !dbg !62 + %767 = bitcast i32 %766 to float, !dbg !62 + %768 = fadd float %767, %765, !dbg !72 + %769 = fadd float %650, %768, !dbg !73 + %770 = fsub float %769, %734, !dbg !74 + %771 = extractvalue { i32, i32, i32, i32 } %683, 1, !dbg !68 + %772 = bitcast i32 %771 to float, !dbg !68 + %773 = extractvalue { i32, i32, i32, i32 } %612, 1, !dbg !62 + %774 = bitcast i32 %773 to float, !dbg !62 + %775 = fadd float %774, %772, !dbg !72 + %776 = fadd float %649, %775, !dbg !73 + %777 = fsub float %776, %734, !dbg !74 + %778 = extractvalue { i32, i32, i32, i32 } %683, 0, !dbg !68 + %779 = bitcast i32 %778 to float, !dbg !68 + %780 = extractvalue { i32, i32, i32, i32 } %612, 0, !dbg !62 + %781 = bitcast i32 %780 to float, !dbg !62 + %782 = fadd float %781, %779, !dbg !72 + %783 = fadd float %648, %782, !dbg !73 + %784 = fsub float %783, %734, !dbg !74 + %785 = extractvalue { i32, i32, i32, i32 } %682, 3, !dbg !68 + %786 = bitcast i32 %785 to float, !dbg !68 + %787 = extractvalue { i32, i32, i32, i32 } %611, 3, !dbg !62 + %788 = bitcast i32 %787 to float, !dbg !62 + %789 = fadd float %788, %786, !dbg !72 + %790 = fadd float %647, %789, !dbg !73 + %791 = fmul float %674, %678, !dbg !50 + %792 = fadd float %503, %791, !dbg !51 + %793 = fsub float %790, %792, !dbg !74 + %794 = extractvalue { i32, i32, i32, i32 } %682, 2, !dbg !68 + %795 = bitcast i32 %794 to float, !dbg !68 + %796 = extractvalue { i32, i32, i32, i32 } %611, 2, !dbg !62 + %797 = bitcast i32 %796 to float, !dbg !62 + %798 = fadd float %797, %795, !dbg !72 + %799 = fadd float %646, %798, !dbg !73 + %800 = fsub float %799, %792, !dbg !74 + %801 = extractvalue { i32, i32, i32, i32 } %682, 1, !dbg !68 + %802 = bitcast i32 %801 to float, !dbg !68 + %803 = extractvalue { i32, i32, i32, i32 } %611, 1, !dbg !62 + %804 = bitcast i32 %803 to float, !dbg !62 + %805 = fadd float %804, %802, !dbg !72 + %806 = fadd float %645, %805, !dbg !73 + %807 = fsub float %806, %792, !dbg !74 + %808 = extractvalue { i32, i32, i32, i32 } %682, 0, !dbg !68 + %809 = bitcast i32 %808 to float, !dbg !68 + %810 = extractvalue { i32, i32, i32, i32 } %611, 0, !dbg !62 + %811 = bitcast i32 %810 to float, !dbg !62 + %812 = fadd float %811, %809, !dbg !72 + %813 = fadd float %644, %812, !dbg !73 + %814 = fsub float %813, %792, !dbg !74 + %815 = extractvalue { i32, i32, i32, i32 } %681, 3, !dbg !68 + %816 = bitcast i32 %815 to float, !dbg !68 + %817 = extractvalue { i32, i32, i32, i32 } %610, 3, !dbg !62 + %818 = bitcast i32 %817 to float, !dbg !62 + %819 = fadd float %818, %816, !dbg !72 + 
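; Annotation (hedged reading): from here to the stores, each lane rebuilds the fused
; sum of the embedding row (arg %1), the fp32 input (arg %2), and the bf16 input
; (arg %3) from the evict_first/evict_last reloads, subtracts its row mean (%734 or
; %792), and the fmul by %.0.i / %.0.i38 applies rstd = rsqrt(var + eps). What looks
; like the layernorm weight (arg %4) is staged through @global_smem so each lane can
; read the eight coefficients matching its columns; the results are then packed via
; cvt.rn.bf16.f32 into v4.b32 global stores to arg %5.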
%820 = fadd float %643, %819, !dbg !73 + %821 = fsub float %820, %792, !dbg !74 + %822 = extractvalue { i32, i32, i32, i32 } %681, 2, !dbg !68 + %823 = bitcast i32 %822 to float, !dbg !68 + %824 = extractvalue { i32, i32, i32, i32 } %610, 2, !dbg !62 + %825 = bitcast i32 %824 to float, !dbg !62 + %826 = fadd float %825, %823, !dbg !72 + %827 = fadd float %642, %826, !dbg !73 + %828 = fsub float %827, %792, !dbg !74 + %829 = extractvalue { i32, i32, i32, i32 } %681, 1, !dbg !68 + %830 = bitcast i32 %829 to float, !dbg !68 + %831 = extractvalue { i32, i32, i32, i32 } %610, 1, !dbg !62 + %832 = bitcast i32 %831 to float, !dbg !62 + %833 = fadd float %832, %830, !dbg !72 + %834 = fadd float %641, %833, !dbg !73 + %835 = fsub float %834, %792, !dbg !74 + %836 = extractvalue { i32, i32, i32, i32 } %681, 0, !dbg !68 + %837 = bitcast i32 %836 to float, !dbg !68 + %838 = extractvalue { i32, i32, i32, i32 } %610, 0, !dbg !62 + %839 = bitcast i32 %838 to float, !dbg !62 + %840 = fadd float %839, %837, !dbg !72 + %841 = fadd float %640, %840, !dbg !73 + %842 = fsub float %841, %792, !dbg !74 + %843 = fmul float %842, %.0.i, !dbg !75 + %844 = fmul float %835, %.0.i, !dbg !75 + %845 = fmul float %828, %.0.i, !dbg !75 + %846 = fmul float %821, %.0.i, !dbg !75 + %847 = fmul float %814, %.0.i, !dbg !75 + %848 = fmul float %807, %.0.i, !dbg !75 + %849 = fmul float %800, %.0.i, !dbg !75 + %850 = fmul float %793, %.0.i, !dbg !75 + %851 = fmul float %784, %.0.i38, !dbg !75 + %852 = fmul float %777, %.0.i38, !dbg !75 + %853 = fmul float %770, %.0.i38, !dbg !75 + %854 = fmul float %763, %.0.i38, !dbg !75 + %855 = fmul float %756, %.0.i38, !dbg !75 + %856 = fmul float %749, %.0.i38, !dbg !75 + %857 = fmul float %742, %.0.i38, !dbg !75 + %858 = fmul float %735, %.0.i38, !dbg !75 + %859 = getelementptr float, ptr addrspace(3) @global_smem, i64 %656, !dbg !76 + store i32 %658, ptr addrspace(3) %859, align 4, !dbg !76 + tail call void @llvm.nvvm.barrier0(), !dbg !76 + %860 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !76 + %861 = load float, ptr addrspace(3) %860, align 32, !dbg !76 + %862 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 1, !dbg !76 + %863 = load float, ptr addrspace(3) %862, align 4, !dbg !76 + %864 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 2, !dbg !76 + %865 = load float, ptr addrspace(3) %864, align 8, !dbg !76 + %866 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 3, !dbg !76 + %867 = load float, ptr addrspace(3) %866, align 4, !dbg !76 + %868 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 4, !dbg !76 + %869 = load float, ptr addrspace(3) %868, align 16, !dbg !76 + %870 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 5, !dbg !76 + %871 = load float, ptr addrspace(3) %870, align 4, !dbg !76 + %872 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 6, !dbg !76 + %873 = load float, ptr addrspace(3) %872, align 8, !dbg !76 + %874 = getelementptr inbounds <8 x float>, ptr addrspace(3) %860, i64 0, i64 7, !dbg !76 + %875 = load float, ptr addrspace(3) %874, align 4, !dbg !76 + %876 = fmul float %843, %861, !dbg !76 + %877 = fmul float %844, %863, !dbg !76 + %878 = fmul float %845, %865, !dbg !76 + %879 = fmul float %846, %867, !dbg !76 + %880 = fmul float %847, %869, !dbg !76 + %881 = fmul float %848, %871, !dbg !76 + %882 = fmul float %849, %873, !dbg !76 + %883 = fmul float %850, %875, !dbg !76 + %884 = fmul float %851, %861, !dbg 
!76 + %885 = fmul float %852, %863, !dbg !76 + %886 = fmul float %853, %865, !dbg !76 + %887 = fmul float %854, %867, !dbg !76 + %888 = fmul float %855, %869, !dbg !76 + %889 = fmul float %856, %871, !dbg !76 + %890 = fmul float %857, %873, !dbg !76 + %891 = fmul float %858, %875, !dbg !76 + %892 = getelementptr i16, ptr addrspace(1) %5, i64 %100, !dbg !77 + %893 = getelementptr i16, ptr addrspace(1) %5, i64 %102, !dbg !77 + %894 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %876) #6, !dbg !78 + %895 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %877) #6, !dbg !78 + %896 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %878) #6, !dbg !78 + %897 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %879) #6, !dbg !78 + %898 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %880) #6, !dbg !78 + %899 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %881) #6, !dbg !78 + %900 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %882) #6, !dbg !78 + %901 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %883) #6, !dbg !78 + %902 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %884) #6, !dbg !78 + %903 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %885) #6, !dbg !78 + %904 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %886) #6, !dbg !78 + %905 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %887) #6, !dbg !78 + %906 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %888) #6, !dbg !78 + %907 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %889) #6, !dbg !78 + %908 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %890) #6, !dbg !78 + %909 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %891) #6, !dbg !78 + %910 = insertelement <2 x i16> undef, i16 %894, i64 0, !dbg !78 + %911 = insertelement <2 x i16> %910, i16 %895, i64 1, !dbg !78 + %912 = bitcast <2 x i16> %911 to i32, !dbg !78 + %913 = insertelement <2 x i16> undef, i16 %896, i64 0, !dbg !78 + %914 = insertelement <2 x i16> %913, i16 %897, i64 1, !dbg !78 + %915 = bitcast <2 x i16> %914 to i32, !dbg !78 + %916 = insertelement <2 x i16> undef, i16 %898, i64 0, !dbg !78 + %917 = insertelement <2 x i16> %916, i16 %899, i64 1, !dbg !78 + %918 = bitcast <2 x i16> %917 to i32, !dbg !78 + %919 = insertelement <2 x i16> undef, i16 %900, i64 0, !dbg !78 + %920 = insertelement <2 x i16> %919, i16 %901, i64 1, !dbg !78 + %921 = bitcast <2 x i16> %920 to i32, !dbg !78 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %912, i32 %915, i32 %918, i32 %921, ptr addrspace(1) %892, i1 true) #6, !dbg !78 + %922 = insertelement <2 x i16> undef, i16 %902, i64 0, !dbg !78 + %923 = insertelement <2 x i16> %922, i16 %903, i64 1, !dbg !78 + %924 = bitcast <2 x i16> %923 to i32, !dbg !78 + %925 = insertelement <2 x i16> undef, i16 %904, i64 0, !dbg !78 + %926 = insertelement <2 x i16> %925, i16 %905, i64 1, !dbg !78 + %927 = bitcast <2 x i16> %926 to i32, !dbg !78 + %928 = insertelement <2 x i16> undef, i16 %906, i64 0, !dbg !78 + %929 = insertelement <2 x i16> %928, i16 %907, i64 1, !dbg !78 + %930 = bitcast <2 x i16> %929 to i32, !dbg !78 + %931 = insertelement <2 x i16> undef, i16 %908, i64 0, !dbg !78 + %932 = insertelement <2 x i16> %931, i16 %909, i64 1, !dbg !78 + %933 = bitcast <2 x i16> %932 to i32, !dbg !78 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %924, i32 %927, 
i32 %930, i32 %933, ptr addrspace(1) %893, i1 true) #6, !dbg !78 + ret void, !dbg !79 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", directory: "/tmp/torchinductor_root/ci") +!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 21, column: 28, scope: !7) +!13 = !DILocation(line: 21, column: 33, scope: !7) +!14 = !DILocation(line: 22, column: 23, scope: !7) +!15 = !DILocation(line: 26, column: 30, scope: !7) +!16 = !DILocation(line: 26, column: 35, scope: !7) +!17 = !DILocation(line: 27, column: 18, scope: 
!7) +!18 = !DILocation(line: 35, column: 44, scope: !7) +!19 = !DILocation(line: 35, column: 40, scope: !7) +!20 = !DILocation(line: 35, column: 34, scope: !7) +!21 = !DILocation(line: 35, column: 50, scope: !7) +!22 = !DILocation(line: 36, column: 44, scope: !7) +!23 = !DILocation(line: 36, column: 40, scope: !7) +!24 = !DILocation(line: 36, column: 34, scope: !7) +!25 = !DILocation(line: 36, column: 50, scope: !7) +!26 = !DILocation(line: 36, column: 101, scope: !7) +!27 = !DILocation(line: 37, column: 22, scope: !7) +!28 = !DILocation(line: 38, column: 22, scope: !7) +!29 = !DILocation(line: 39, column: 36, scope: !7) +!30 = !DILocation(line: 40, column: 40, scope: !7) +!31 = !DILocation(line: 40, column: 55, scope: !7) +!32 = !DILocation(line: 41, column: 44, scope: !7) +!33 = !DILocation(line: 41, column: 40, scope: !7) +!34 = !DILocation(line: 41, column: 34, scope: !7) +!35 = !DILocation(line: 41, column: 52, scope: !7) +!36 = !DILocation(line: 42, column: 22, scope: !7) +!37 = !DILocation(line: 44, column: 22, scope: !7) +!38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0) +!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!41 = !DILocation(line: 47, column: 41, scope: !39) +!42 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41) +!43 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41) +!44 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41) +!45 = !DILocation(line: 108, column: 21, scope: !46, inlinedAt: !47) +!46 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0) +!47 = !DILocation(line: 120, column: 46, scope: !46, inlinedAt: !48) +!48 = !DILocation(line: 53, column: 44, scope: !46) +!49 = !DILocation(line: 110, column: 60, scope: !46, inlinedAt: !47) +!50 = !DILocation(line: 112, column: 25, scope: !46, inlinedAt: !47) +!51 = !DILocation(line: 112, column: 17, scope: !46, inlinedAt: !47) +!52 = !DILocation(line: 113, column: 15, scope: !46, inlinedAt: !47) +!53 = !DILocation(line: 113, column: 30, scope: !46, inlinedAt: !47) +!54 = !DILocation(line: 113, column: 49, scope: !46, inlinedAt: !47) +!55 = !DILocation(line: 113, column: 22, scope: !46, inlinedAt: !47) +!56 = !DILocation(line: 113, column: 38, scope: !46, inlinedAt: !47) +!57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58) +!58 = !DILocation(line: 53, column: 44, scope: !39) +!59 = !DILocation(line: 109, column: 28, scope: !46, inlinedAt: !47) +!60 = !DILocation(line: 110, column: 39, scope: !46, inlinedAt: !47) +!61 = !DILocation(line: 110, column: 49, scope: !46, inlinedAt: !47) +!62 = !DILocation(line: 62, column: 51, scope: !7) +!63 = !DILocation(line: 63, column: 51, scope: !7) +!64 = !DILocation(line: 63, column: 103, scope: !7) +!65 = !DILocation(line: 64, column: 35, scope: !7) +!66 = !DILocation(line: 64, column: 40, scope: !7) +!67 = !DILocation(line: 68, column: 57, scope: !7) +!68 = !DILocation(line: 69, column: 54, scope: !7) +!69 = !DILocation(line: 75, column: 24, scope: !7) +!70 = !DILocation(line: 77, column: 24, scope: !7) +!71 = !DILocation(line: 78, column: 30, scope: !7) +!72 = !DILocation(line: 70, column: 24, scope: !7) +!73 = !DILocation(line: 72, column: 24, scope: !7) +!74 = !DILocation(line: 73, column: 24, scope: !7) +!75 = !DILocation(line: 79, column: 24, scope: !7) +!76 = !DILocation(line: 80, column: 24, scope: !7) +!77 = !DILocation(line: 82, column: 29, 
scope: !7) +!78 = !DILocation(line: 82, column: 52, scope: !7) +!79 = !DILocation(line: 58, column: 4, scope: !7) diff --git a/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..01750644dc980d239a3288f3712733b3ff9371d0 --- /dev/null +++ b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx @@ -0,0 +1,1854 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<137>; + .reg .b16 %rs<49>; + .reg .b32 %r<439>; + .reg .f32 %f<487>; + .reg .b64 %rd<124>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_4]; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_1]; + ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6de7de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r89, %tid.x; + ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6de7de_param_2]; + bfe.u32 %r90, %r89, 5, 3; + ld.param.u64 %rd61, [triton__0d1d2d3d4d5d6de7de_param_3]; + and.b32 %r91, %r89, 15; + .loc 1 24 33 + shl.b32 %r92, %r89, 3; + and.b32 %r1, %r92, 248; + and.b32 %r2, %r89, 255; + .loc 1 21 28 + mov.u32 %r24, %ctaid.x; + .loc 1 21 33 + shl.b32 %r93, %r24, 4; + .loc 1 22 23 + or.b32 %r94, %r93, %r90; + or.b32 %r95, %r94, 8; + or.b32 %r96, %r93, %r91; + .loc 1 26 30 + mul.wide.s32 %rd62, %r94, 8; + add.s64 %rd20, %rd59, %rd62; + add.s64 %rd36, %rd20, 64; + mul.wide.s32 
%rd63, %r96, 8; + add.s64 %rd52, %rd59, %rd63; + mov.pred %p113, -1; + .loc 1 26 35 + mov.u64 %rd19, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd20 + 0 ]; + mov.u64 %rd21, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd20 + 0 ]; + mov.u64 %rd23, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd20 + 0 ]; + mov.u64 %rd25, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd20 + 0 ]; + mov.u64 %rd27, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd20 + 0 ]; + mov.u64 %rd29, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd20 + 0 ]; + mov.u64 %rd31, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd20 + 0 ]; + mov.u64 %rd33, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd20 + 0 ]; + mov.u64 %rd35, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd36 + 0 ]; + mov.u64 %rd37, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd36 + 0 ]; + mov.u64 %rd39, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd36 + 0 ]; + mov.u64 %rd41, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd36 + 0 ]; + mov.u64 %rd43, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd36 + 0 ]; + mov.u64 %rd45, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd36 + 0 ]; + mov.u64 %rd47, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd36 + 0 ]; + mov.u64 %rd49, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd36 + 0 ]; + mov.u64 %rd51, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd51 }, [ %rd52 + 0 ]; + .loc 1 27 18 + bfe.s32 %r97, %r24, 27, 1; + shr.u32 %r98, %r97, 23; + add.s32 %r99, %r94, %r98; + and.b32 %r100, %r99, 16776704; + sub.s32 %r101, %r94, %r100; + add.s32 %r102, %r95, %r98; + and.b32 %r103, %r102, 16776704; + sub.s32 %r104, %r95, %r103; + .loc 1 35 44 + shl.b32 %r105, %r101, 8; + shl.b32 %r106, %r104, 8; + .loc 1 35 40 + or.b32 %r107, %r105, %r1; + or.b32 %r108, %r106, %r1; + .loc 1 35 34 + mul.wide.s32 %rd64, %r107, 4; + add.s64 %rd89, %rd60, %rd64; + cvt.s64.s32 %rd65, %r105; + cvt.u64.u32 %rd66, %r1; + or.b64 %rd67, %rd65, %rd66; + shl.b64 %rd68, %rd67, 2; + add.s64 %rd69, %rd60, %rd68; + add.s64 %rd90, %rd69, 16; + mul.wide.s32 %rd70, %r108, 4; + add.s64 %rd91, %rd60, %rd70; + cvt.s64.s32 %rd71, %r106; + or.b64 %rd72, %rd71, %rd66; + shl.b64 %rd73, %rd72, 2; + add.s64 %rd74, %rd60, %rd73; + add.s64 %rd92, %rd74, 16; + mov.b32 %r325, 0; + .loc 1 35 50 + mov.u32 %r25, 0x0; + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd89 + 0 ]; + @!%p113 mov.u32 %r25, %r325; + @!%p113 mov.u32 %r26, %r325; + @!%p113 mov.u32 %r27, %r325; + @!%p113 mov.u32 %r28, %r325; + mov.b32 %f1, %r25; + mov.b32 %f2, %r26; + mov.b32 %f3, %r27; + mov.b32 %f4, %r28; + mov.u32 %r33, 0x0; + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd90 + 0 ]; + @!%p113 mov.u32 %r33, %r325; + @!%p113 mov.u32 %r34, %r325; + @!%p113 mov.u32 %r35, %r325; + @!%p113 mov.u32 %r36, %r325; + mov.b32 %f5, %r33; + mov.b32 %f6, %r34; + mov.b32 %f7, %r35; + mov.b32 %f8, %r36; + mov.u32 %r41, 0x0; + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd91 + 0 ]; + @!%p113 mov.u32 %r41, %r325; + @!%p113 mov.u32 %r42, %r325; + @!%p113 mov.u32 %r43, %r325; + @!%p113 mov.u32 %r44, %r325; + mov.b32 %f9, %r41; + mov.b32 %f10, %r42; + mov.b32 %f11, %r43; + mov.b32 %f12, 
%r44; + mov.u32 %r49, 0x0; + mov.u32 %r50, 0x0; + mov.u32 %r51, 0x0; + mov.u32 %r52, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r49, %r50, %r51, %r52 }, [ %rd92 + 0 ]; + @!%p113 mov.u32 %r49, %r325; + @!%p113 mov.u32 %r50, %r325; + @!%p113 mov.u32 %r51, %r325; + @!%p113 mov.u32 %r52, %r325; + mov.b32 %f13, %r49; + mov.b32 %f14, %r50; + mov.b32 %f15, %r51; + mov.b32 %f16, %r52; + .loc 1 36 44 + shl.b32 %r109, %r94, 8; + shl.b32 %r110, %r95, 8; + .loc 1 36 40 + or.b32 %r111, %r109, %r1; + or.b32 %r112, %r110, %r1; + .loc 1 36 34 + mul.wide.s32 %rd75, %r111, 2; + add.s64 %rd93, %rd61, %rd75; + mul.wide.s32 %rd76, %r112, 2; + add.s64 %rd94, %rd61, %rd76; + .loc 1 36 50 + mov.u32 %r57, 0x0; + mov.u32 %r58, 0x0; + mov.u32 %r59, 0x0; + mov.u32 %r60, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r57, %r58, %r59, %r60 }, [ %rd93 + 0 ]; + @!%p113 mov.u32 %r57, %r325; + @!%p113 mov.u32 %r58, %r325; + @!%p113 mov.u32 %r59, %r325; + @!%p113 mov.u32 %r60, %r325; + cvt.u16.u32 %rs1, %r57; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r57; } + cvt.u16.u32 %rs3, %r58; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r58; } + cvt.u16.u32 %rs5, %r59; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r59; } + cvt.u16.u32 %rs7, %r60; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r60; } + mov.u32 %r65, 0x0; + mov.u32 %r66, 0x0; + mov.u32 %r67, 0x0; + mov.u32 %r68, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd94 + 0 ]; + @!%p113 mov.u32 %r65, %r325; + @!%p113 mov.u32 %r66, %r325; + @!%p113 mov.u32 %r67, %r325; + @!%p113 mov.u32 %r68, %r325; + cvt.u16.u32 %rs9, %r65; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r65; } + cvt.u16.u32 %rs11, %r66; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r66; } + cvt.u16.u32 %rs13, %r67; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r67; } + cvt.u16.u32 %rs15, %r68; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r68; } + .loc 1 36 101 + cvt.f32.bf16 %r73, %rs1; + mov.b32 %f17, %r73; + cvt.f32.bf16 %r74, %rs2; + mov.b32 %f18, %r74; + cvt.f32.bf16 %r75, %rs3; + mov.b32 %f19, %r75; + cvt.f32.bf16 %r76, %rs4; + mov.b32 %f20, %r76; + cvt.f32.bf16 %r77, %rs5; + mov.b32 %f21, %r77; + cvt.f32.bf16 %r78, %rs6; + mov.b32 %f22, %r78; + cvt.f32.bf16 %r79, %rs7; + mov.b32 %f23, %r79; + cvt.f32.bf16 %r80, %rs8; + mov.b32 %f24, %r80; + cvt.f32.bf16 %r81, %rs9; + mov.b32 %f25, %r81; + cvt.f32.bf16 %r82, %rs10; + mov.b32 %f26, %r82; + cvt.f32.bf16 %r83, %rs11; + mov.b32 %f27, %r83; + cvt.f32.bf16 %r84, %rs12; + mov.b32 %f28, %r84; + cvt.f32.bf16 %r85, %rs13; + mov.b32 %f29, %r85; + cvt.f32.bf16 %r86, %rs14; + mov.b32 %f30, %r86; + cvt.f32.bf16 %r87, %rs15; + mov.b32 %f31, %r87; + cvt.f32.bf16 %r88, %rs16; + mov.b32 %f32, %r88; + .loc 1 37 22 + add.s64 %rd77, %rd51, 50257; + .loc 1 38 22 + setp.lt.s64 %p48, %rd51, 0; + .loc 1 39 36 + selp.b64 %rd11, %rd77, %rd51, %p48; + .loc 1 40 40 + setp.lt.u64 %p49, %rd11, 50257; + mov.b32 %r438, 883; + mov.u64 %rd123, 1; + .loc 1 40 55 + @%p49 bra $L__BB0_2; + mov.u64 %rd78, assertMessage_0; + cvta.global.u64 %rd79, %rd78; + mov.u64 %rd80, assertFile_0; + cvta.global.u64 %rd81, %rd80; + mov.u64 %rd82, assertFunc_0; + cvta.global.u64 %rd83, %rd82; + { // callseq 8, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd79; + .param .b64 param1; + st.param.b64 [param1+0], %rd81; + .param .b32 param2; + st.param.b32 [param2+0], %r438; + .param .b64 param3; + st.param.b64 [param3+0], %rd83; + .param .b64 param4; + st.param.b64 [param4+0], %rd123; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } 
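+	// Annotation: negative token indices were wrapped by +50257 above
+	// (add.s64 / selp.b64); lanes whose index already lies in [0, 50257)
+	// jumped past this call (@%p49 bra $L__BB0_2), so any lane reaching the
+	// __assertfail above trips the device-side bounds assert before the
+	// stride-256 row gather from param_1 (presumably a 50257 x 256 embedding
+	// table, given the 12865792 = 50257*256 wrap offset used below).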
// callseq 8 +$L__BB0_2: + .loc 1 0 55 + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_5]; + cvt.s64.s32 %rd7, %r111; + cvt.s64.s32 %rd9, %r112; + .loc 1 38 22 + setp.lt.s64 %p103, %rd35, 0; + setp.lt.s64 %p104, %rd19, 0; + .loc 1 41 44 + shl.b64 %rd96, %rd19, 8; + add.s64 %rd97, %rd96, 12865792; + selp.b64 %rd98, %rd97, %rd96, %p104; + shl.b64 %rd99, %rd35, 8; + add.s64 %rd100, %rd99, 12865792; + selp.b64 %rd101, %rd100, %rd99, %p103; + .loc 1 41 40 + or.b64 %rd103, %rd98, %rd66; + or.b64 %rd104, %rd101, %rd66; + .loc 1 41 34 + shl.b64 %rd105, %rd103, 2; + add.s64 %rd115, %rd16, %rd105; + add.s64 %rd116, %rd115, 16; + shl.b64 %rd106, %rd104, 2; + add.s64 %rd117, %rd16, %rd106; + add.s64 %rd118, %rd117, 16; + .loc 1 41 52 + mov.u32 %r114, 0x0; + mov.u32 %r115, 0x0; + mov.u32 %r116, 0x0; + mov.u32 %r117, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd115 + 0 ]; + @!%p113 mov.u32 %r114, %r325; + @!%p113 mov.u32 %r115, %r325; + @!%p113 mov.u32 %r116, %r325; + @!%p113 mov.u32 %r117, %r325; + mov.b32 %f59, %r114; + mov.b32 %f60, %r115; + mov.b32 %f61, %r116; + mov.b32 %f62, %r117; + mov.u32 %r122, 0x0; + mov.u32 %r123, 0x0; + mov.u32 %r124, 0x0; + mov.u32 %r125, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd116 + 0 ]; + @!%p113 mov.u32 %r122, %r325; + @!%p113 mov.u32 %r123, %r325; + @!%p113 mov.u32 %r124, %r325; + @!%p113 mov.u32 %r125, %r325; + mov.b32 %f63, %r122; + mov.b32 %f64, %r123; + mov.b32 %f65, %r124; + mov.b32 %f66, %r125; + mov.u32 %r130, 0x0; + mov.u32 %r131, 0x0; + mov.u32 %r132, 0x0; + mov.u32 %r133, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r130, %r131, %r132, %r133 }, [ %rd117 + 0 ]; + @!%p113 mov.u32 %r130, %r325; + @!%p113 mov.u32 %r131, %r325; + @!%p113 mov.u32 %r132, %r325; + @!%p113 mov.u32 %r133, %r325; + mov.b32 %f67, %r130; + mov.b32 %f68, %r131; + mov.b32 %f69, %r132; + mov.b32 %f70, %r133; + mov.u32 %r138, 0x0; + mov.u32 %r139, 0x0; + mov.u32 %r140, 0x0; + mov.u32 %r141, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r138, %r139, %r140, %r141 }, [ %rd118 + 0 ]; + @!%p113 mov.u32 %r138, %r325; + @!%p113 mov.u32 %r139, %r325; + @!%p113 mov.u32 %r140, %r325; + @!%p113 mov.u32 %r141, %r325; + mov.b32 %f71, %r138; + mov.b32 %f72, %r139; + mov.b32 %f73, %r140; + mov.b32 %f74, %r141; + .loc 1 42 22 + add.f32 %f75, %f1, %f59; + add.f32 %f76, %f2, %f60; + add.f32 %f77, %f3, %f61; + add.f32 %f78, %f4, %f62; + add.f32 %f79, %f5, %f63; + add.f32 %f80, %f6, %f64; + add.f32 %f81, %f7, %f65; + add.f32 %f82, %f8, %f66; + add.f32 %f83, %f9, %f67; + add.f32 %f84, %f10, %f68; + add.f32 %f85, %f11, %f69; + add.f32 %f86, %f12, %f70; + add.f32 %f87, %f13, %f71; + add.f32 %f88, %f14, %f72; + add.f32 %f89, %f15, %f73; + add.f32 %f90, %f16, %f74; + .loc 1 44 22 + add.f32 %f91, %f17, %f75; + add.f32 %f92, %f18, %f76; + add.f32 %f93, %f19, %f77; + add.f32 %f94, %f20, %f78; + add.f32 %f95, %f21, %f79; + add.f32 %f96, %f22, %f80; + add.f32 %f97, %f23, %f81; + add.f32 %f98, %f24, %f82; + add.f32 %f99, %f25, %f83; + add.f32 %f100, %f26, %f84; + add.f32 %f101, %f27, %f85; + add.f32 %f102, %f28, %f86; + add.f32 %f103, %f29, %f87; + add.f32 %f104, %f30, %f88; + add.f32 %f105, %f31, %f89; + add.f32 %f106, %f32, %f90; +$L__tmp1: + .loc 2 98 22 + add.f32 %f107, %f91, 0f00000000; + add.f32 %f108, %f92, 0f00000000; + add.f32 %f109, %f93, 0f00000000; + add.f32 %f110, %f94, 0f00000000; + add.f32 %f111, %f95, 0f00000000; + add.f32 %f112, %f96, 0f00000000; + add.f32 %f113, %f97, 0f00000000; + add.f32 %f114, %f98, 
0f00000000; + add.f32 %f115, %f99, 0f00000000; + add.f32 %f116, %f100, 0f00000000; + add.f32 %f117, %f101, 0f00000000; + add.f32 %f118, %f102, 0f00000000; + add.f32 %f119, %f103, 0f00000000; + add.f32 %f120, %f104, 0f00000000; + add.f32 %f121, %f105, 0f00000000; + add.f32 %f122, %f106, 0f00000000; + .loc 2 101 30 + sub.f32 %f123, %f91, %f107; + sub.f32 %f124, %f92, %f108; + sub.f32 %f125, %f93, %f109; + sub.f32 %f126, %f94, %f110; + sub.f32 %f127, %f95, %f111; + sub.f32 %f128, %f96, %f112; + sub.f32 %f129, %f97, %f113; + sub.f32 %f130, %f98, %f114; + sub.f32 %f131, %f99, %f115; + sub.f32 %f132, %f100, %f116; + sub.f32 %f133, %f101, %f117; + sub.f32 %f134, %f102, %f118; + sub.f32 %f135, %f103, %f119; + sub.f32 %f136, %f104, %f120; + sub.f32 %f137, %f105, %f121; + sub.f32 %f138, %f106, %f122; + .loc 2 101 13 + fma.rn.f32 %f139, %f91, %f123, 0f00000000; + fma.rn.f32 %f140, %f92, %f124, 0f00000000; + fma.rn.f32 %f141, %f93, %f125, 0f00000000; + fma.rn.f32 %f142, %f94, %f126, 0f00000000; + fma.rn.f32 %f143, %f95, %f127, 0f00000000; + fma.rn.f32 %f144, %f96, %f128, 0f00000000; + fma.rn.f32 %f145, %f97, %f129, 0f00000000; + fma.rn.f32 %f146, %f98, %f130, 0f00000000; + fma.rn.f32 %f147, %f99, %f131, 0f00000000; + fma.rn.f32 %f148, %f100, %f132, 0f00000000; + fma.rn.f32 %f149, %f101, %f133, 0f00000000; + fma.rn.f32 %f150, %f102, %f134, 0f00000000; + fma.rn.f32 %f151, %f103, %f135, 0f00000000; + fma.rn.f32 %f152, %f104, %f136, 0f00000000; + fma.rn.f32 %f153, %f105, %f137, 0f00000000; + fma.rn.f32 %f154, %f106, %f138, 0f00000000; +$L__tmp2: + .loc 2 108 21 + sub.f32 %f155, %f108, %f107; + mov.b32 %r147, 1065353216; + mov.b32 %r148, 1073741824; + .loc 2 110 60 + div.full.f32 %r146, %r147, %r148; + mov.b32 %f156, %r146; + .loc 2 112 17 + fma.rn.f32 %f157, %f156, %f155, %f107; + .loc 2 113 15 + add.f32 %f158, %f139, %f140; + .loc 2 113 30 + mul.f32 %f159, %f155, %f155; + .loc 2 113 22 + fma.rn.f32 %f160, %f156, %f159, %f158; + .loc 2 108 21 + sub.f32 %f161, %f109, %f157; + mov.b32 %r151, 1077936128; + .loc 2 110 60 + div.full.f32 %r149, %r147, %r151; + mov.b32 %f162, %r149; + .loc 2 112 17 + fma.rn.f32 %f163, %f162, %f161, %f157; + .loc 2 113 15 + add.f32 %f164, %f141, %f160; + .loc 2 113 30 + mul.f32 %f165, %f161, %f161; + .loc 2 113 38 + fma.rn.f32 %f166, %f161, %f161, %f165; + .loc 2 113 22 + fma.rn.f32 %f167, %f162, %f166, %f164; + .loc 2 108 21 + sub.f32 %f168, %f110, %f163; + mov.b32 %r154, 1082130432; + .loc 2 110 60 + div.full.f32 %r152, %r147, %r154; + mov.b32 %f169, %r152; + .loc 2 112 17 + fma.rn.f32 %f170, %f169, %f168, %f163; + .loc 2 113 15 + add.f32 %f171, %f142, %f167; + .loc 2 113 30 + mul.f32 %f172, %f168, %f168; + .loc 2 113 38 + mul.f32 %f173, %f172, 0f40400000; + .loc 2 113 22 + fma.rn.f32 %f174, %f169, %f173, %f171; + .loc 2 108 21 + sub.f32 %f175, %f111, %f170; + mov.b32 %r157, 1084227584; + .loc 2 110 60 + div.full.f32 %r155, %r147, %r157; + mov.b32 %f176, %r155; + .loc 2 112 17 + fma.rn.f32 %f177, %f176, %f175, %f170; + .loc 2 113 15 + add.f32 %f178, %f143, %f174; + .loc 2 113 30 + mul.f32 %f179, %f175, %f175; + .loc 2 113 38 + mul.f32 %f180, %f179, 0f40800000; + .loc 2 113 22 + fma.rn.f32 %f181, %f176, %f180, %f178; + .loc 2 108 21 + sub.f32 %f182, %f112, %f177; + mov.b32 %r160, 1086324736; + .loc 2 110 60 + div.full.f32 %r158, %r147, %r160; + mov.b32 %f183, %r158; + .loc 2 112 17 + fma.rn.f32 %f184, %f183, %f182, %f177; + .loc 2 113 15 + add.f32 %f185, %f144, %f181; + .loc 2 113 30 + mul.f32 %f186, %f182, %f182; + .loc 2 113 38 + mul.f32 %f187, %f186, 0f40A00000; + .loc 2 113 
22 + fma.rn.f32 %f188, %f183, %f187, %f185; + .loc 2 108 21 + sub.f32 %f189, %f113, %f184; + mov.b32 %r163, 1088421888; + .loc 2 110 60 + div.full.f32 %r161, %r147, %r163; + mov.b32 %f190, %r161; + .loc 2 112 17 + fma.rn.f32 %f191, %f190, %f189, %f184; + .loc 2 113 15 + add.f32 %f192, %f145, %f188; + .loc 2 113 30 + mul.f32 %f193, %f189, %f189; + .loc 2 113 38 + mul.f32 %f194, %f193, 0f40C00000; + .loc 2 113 22 + fma.rn.f32 %f195, %f190, %f194, %f192; + .loc 2 108 21 + sub.f32 %f196, %f114, %f191; + mov.b32 %r166, 1090519040; + .loc 2 110 60 + div.full.f32 %r164, %r147, %r166; + mov.b32 %f197, %r164; + .loc 2 112 17 + fma.rn.f32 %f198, %f197, %f196, %f191; + .loc 2 113 15 + add.f32 %f199, %f146, %f195; + .loc 2 113 30 + mul.f32 %f200, %f196, %f196; + .loc 2 113 38 + mul.f32 %f201, %f200, 0f40E00000; + .loc 2 113 22 + fma.rn.f32 %f202, %f197, %f201, %f199; + .loc 2 108 21 + sub.f32 %f203, %f116, %f115; + .loc 2 110 60 + div.full.f32 %r167, %r147, %r148; + mov.b32 %f204, %r167; + .loc 2 112 17 + fma.rn.f32 %f205, %f203, %f204, %f115; + .loc 2 113 15 + add.f32 %f206, %f147, %f148; + .loc 2 113 30 + mul.f32 %f207, %f203, %f203; + .loc 2 113 22 + fma.rn.f32 %f208, %f207, %f204, %f206; + .loc 2 108 21 + sub.f32 %f209, %f117, %f205; + .loc 2 110 60 + div.full.f32 %r170, %r147, %r151; + mov.b32 %f210, %r170; + .loc 2 112 17 + fma.rn.f32 %f211, %f210, %f209, %f205; + .loc 2 113 15 + add.f32 %f212, %f149, %f208; + .loc 2 113 30 + mul.f32 %f213, %f209, %f209; + .loc 2 113 38 + fma.rn.f32 %f214, %f209, %f209, %f213; + .loc 2 113 22 + fma.rn.f32 %f215, %f210, %f214, %f212; + .loc 2 108 21 + sub.f32 %f216, %f118, %f211; + .loc 2 110 60 + div.full.f32 %r173, %r147, %r154; + mov.b32 %f217, %r173; + .loc 2 112 17 + fma.rn.f32 %f218, %f217, %f216, %f211; + .loc 2 113 15 + add.f32 %f219, %f150, %f215; + .loc 2 113 30 + mul.f32 %f220, %f216, %f216; + .loc 2 113 38 + mul.f32 %f221, %f220, 0f40400000; + .loc 2 113 22 + fma.rn.f32 %f222, %f217, %f221, %f219; + .loc 2 108 21 + sub.f32 %f223, %f119, %f218; + .loc 2 110 60 + div.full.f32 %r176, %r147, %r157; + mov.b32 %f224, %r176; + .loc 2 112 17 + fma.rn.f32 %f225, %f224, %f223, %f218; + .loc 2 113 15 + add.f32 %f226, %f151, %f222; + .loc 2 113 30 + mul.f32 %f227, %f223, %f223; + .loc 2 113 38 + mul.f32 %f228, %f227, 0f40800000; + .loc 2 113 22 + fma.rn.f32 %f229, %f224, %f228, %f226; + .loc 2 108 21 + sub.f32 %f230, %f120, %f225; + .loc 2 110 60 + div.full.f32 %r179, %r147, %r160; + mov.b32 %f231, %r179; + .loc 2 112 17 + fma.rn.f32 %f232, %f231, %f230, %f225; + .loc 2 113 15 + add.f32 %f233, %f152, %f229; + .loc 2 113 30 + mul.f32 %f234, %f230, %f230; + .loc 2 113 38 + mul.f32 %f235, %f234, 0f40A00000; + .loc 2 113 22 + fma.rn.f32 %f236, %f231, %f235, %f233; + .loc 2 108 21 + sub.f32 %f237, %f121, %f232; + .loc 2 110 60 + div.full.f32 %r182, %r147, %r163; + mov.b32 %f238, %r182; + .loc 2 112 17 + fma.rn.f32 %f239, %f238, %f237, %f232; + .loc 2 113 15 + add.f32 %f240, %f153, %f236; + .loc 2 113 30 + mul.f32 %f241, %f237, %f237; + .loc 2 113 38 + mul.f32 %f242, %f241, 0f40C00000; + .loc 2 113 22 + fma.rn.f32 %f243, %f238, %f242, %f240; + .loc 2 108 21 + sub.f32 %f244, %f122, %f239; + .loc 2 110 60 + div.full.f32 %r185, %r147, %r166; + mov.b32 %f245, %r185; + .loc 2 112 17 + fma.rn.f32 %f246, %f245, %f244, %f239; + .loc 2 113 15 + add.f32 %f247, %f154, %f243; + .loc 2 113 30 + mul.f32 %f248, %f244, %f244; + .loc 2 113 38 + mul.f32 %f249, %f248, 0f40E00000; + .loc 2 113 22 + fma.rn.f32 %f250, %f245, %f249, %f247; +$L__tmp3: + .loc 2 120 46 + mov.b32 %r284, %f198; + 
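+	// Annotation: cross-lane Welford combine, inlined from triton_helpers.py
+	// (.loc 2 108-113). The shfl.sync.bfly.b32 sequence below at offsets
+	// 16, 8, 4, 2, 1 merges per-lane partials (mean, m2, weight); each step
+	// computes, in effect:
+	//   delta = mean_b - mean_a;          n = n_a + n_b
+	//   w     = (n == 0.0) ? 0.0 : n_b / n
+	//   mean  = mean_a + delta * w
+	//   m2    = m2_a + m2_b + delta * delta * n_a * w
+	// The first step starts from n_a = n_b = 8.0 (0f41000000), the eight
+	// columns each thread accumulated serially above.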
shfl.sync.bfly.b32 %r285, %r284, 16, 31, -1; + mov.b32 %f251, %r285; + mov.b32 %r286, %f202; + shfl.sync.bfly.b32 %r287, %r286, 16, 31, -1; + mov.b32 %f252, %r287; + shfl.sync.bfly.b32 %r189, %r166, 16, 31, -1; + mov.b32 %f253, %r189; +$L__tmp4: + .loc 2 108 21 + sub.f32 %f254, %f251, %f198; + .loc 2 109 28 + add.f32 %f255, %f253, 0f41000000; + .loc 2 110 39 + setp.eq.f32 %p105, %f255, 0f00000000; + .loc 2 110 60 + mov.b32 %r190, %f255; + div.full.f32 %r188, %r189, %r190; + mov.b32 %f256, %r188; + .loc 2 110 49 + selp.f32 %f257, 0f00000000, %f256, %p105; + .loc 2 112 17 + fma.rn.f32 %f258, %f257, %f254, %f198; + .loc 2 113 15 + add.f32 %f259, %f202, %f252; + .loc 2 113 30 + mul.f32 %f260, %f254, %f254; + .loc 2 113 38 + mul.f32 %f261, %f260, 0f41000000; + .loc 2 113 22 + fma.rn.f32 %f262, %f257, %f261, %f259; +$L__tmp5: + .loc 2 120 46 + mov.b32 %r288, %f258; + shfl.sync.bfly.b32 %r289, %r288, 8, 31, -1; + mov.b32 %f263, %r289; + mov.b32 %r290, %f262; + shfl.sync.bfly.b32 %r291, %r290, 8, 31, -1; + mov.b32 %f264, %r291; + shfl.sync.bfly.b32 %r192, %r190, 8, 31, -1; + mov.b32 %f265, %r192; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f266, %f263, %f258; + .loc 2 109 28 + add.f32 %f267, %f255, %f265; + .loc 2 110 39 + setp.eq.f32 %p106, %f267, 0f00000000; + .loc 2 110 60 + mov.b32 %r193, %f267; + div.full.f32 %r191, %r192, %r193; + mov.b32 %f268, %r191; + .loc 2 110 49 + selp.f32 %f269, 0f00000000, %f268, %p106; + .loc 2 112 17 + fma.rn.f32 %f270, %f269, %f266, %f258; + .loc 2 113 15 + add.f32 %f271, %f262, %f264; + .loc 2 113 30 + mul.f32 %f272, %f266, %f266; + .loc 2 113 38 + mul.f32 %f273, %f255, %f272; + .loc 2 113 22 + fma.rn.f32 %f274, %f269, %f273, %f271; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r292, %f270; + shfl.sync.bfly.b32 %r293, %r292, 4, 31, -1; + mov.b32 %f275, %r293; + mov.b32 %r294, %f274; + shfl.sync.bfly.b32 %r295, %r294, 4, 31, -1; + mov.b32 %f276, %r295; + shfl.sync.bfly.b32 %r195, %r193, 4, 31, -1; + mov.b32 %f277, %r195; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f278, %f275, %f270; + .loc 2 109 28 + add.f32 %f279, %f267, %f277; + .loc 2 110 39 + setp.eq.f32 %p107, %f279, 0f00000000; + .loc 2 110 60 + mov.b32 %r196, %f279; + div.full.f32 %r194, %r195, %r196; + mov.b32 %f280, %r194; + .loc 2 110 49 + selp.f32 %f281, 0f00000000, %f280, %p107; + .loc 2 112 17 + fma.rn.f32 %f282, %f281, %f278, %f270; + .loc 2 113 15 + add.f32 %f283, %f274, %f276; + .loc 2 113 30 + mul.f32 %f284, %f278, %f278; + .loc 2 113 38 + mul.f32 %f285, %f267, %f284; + .loc 2 113 22 + fma.rn.f32 %f286, %f281, %f285, %f283; +$L__tmp9: + .loc 2 120 46 + mov.b32 %r296, %f282; + shfl.sync.bfly.b32 %r297, %r296, 2, 31, -1; + mov.b32 %f287, %r297; + mov.b32 %r298, %f286; + shfl.sync.bfly.b32 %r299, %r298, 2, 31, -1; + mov.b32 %f288, %r299; + shfl.sync.bfly.b32 %r198, %r196, 2, 31, -1; + mov.b32 %f289, %r198; +$L__tmp10: + .loc 2 108 21 + sub.f32 %f290, %f287, %f282; + .loc 2 109 28 + add.f32 %f33, %f279, %f289; + .loc 2 110 39 + setp.eq.f32 %p108, %f33, 0f00000000; + .loc 2 110 60 + mov.b32 %r199, %f33; + div.full.f32 %r197, %r198, %r199; + mov.b32 %f291, %r197; + .loc 2 110 49 + selp.f32 %f292, 0f00000000, %f291, %p108; + .loc 2 112 17 + fma.rn.f32 %f34, %f290, %f292, %f282; + .loc 2 113 15 + add.f32 %f293, %f286, %f288; + .loc 2 113 30 + mul.f32 %f294, %f290, %f290; + .loc 2 113 38 + mul.f32 %f295, %f279, %f294; + .loc 2 113 22 + fma.rn.f32 %f35, %f292, %f295, %f293; +$L__tmp11: + .loc 2 120 46 + mov.b32 %r300, %f34; + shfl.sync.bfly.b32 %r3, %r300, 1, 31, -1; + mov.b32 %r301, %f35; + shfl.sync.bfly.b32 %r4, %r301, 1, 
31, -1; + shfl.sync.bfly.b32 %r201, %r199, 1, 31, -1; + mov.b32 %f296, %r201; +$L__tmp12: + .loc 2 109 28 + add.f32 %f36, %f33, %f296; + .loc 2 110 60 + mov.b32 %r202, %f36; + div.full.f32 %r200, %r201, %r202; + mov.b32 %f37, %r200; +$L__tmp13: + .loc 2 120 46 + mov.b32 %r302, %f246; + shfl.sync.bfly.b32 %r303, %r302, 16, 31, -1; + mov.b32 %f297, %r303; + mov.b32 %r304, %f250; + shfl.sync.bfly.b32 %r305, %r304, 16, 31, -1; + mov.b32 %f298, %r305; + shfl.sync.bfly.b32 %r204, %r166, 16, 31, -1; + mov.b32 %f299, %r204; +$L__tmp14: + .loc 2 108 21 + sub.f32 %f300, %f297, %f246; + .loc 2 109 28 + add.f32 %f301, %f299, 0f41000000; + .loc 2 110 39 + setp.eq.f32 %p109, %f301, 0f00000000; + .loc 2 110 60 + mov.b32 %r205, %f301; + div.full.f32 %r203, %r204, %r205; + mov.b32 %f302, %r203; + .loc 2 110 49 + selp.f32 %f303, 0f00000000, %f302, %p109; + .loc 2 112 17 + fma.rn.f32 %f304, %f300, %f303, %f246; + .loc 2 113 15 + add.f32 %f305, %f250, %f298; + .loc 2 113 30 + mul.f32 %f306, %f300, %f300; + .loc 2 113 38 + mul.f32 %f307, %f306, 0f41000000; + .loc 2 113 22 + fma.rn.f32 %f308, %f307, %f303, %f305; +$L__tmp15: + .loc 2 120 46 + mov.b32 %r306, %f304; + shfl.sync.bfly.b32 %r307, %r306, 8, 31, -1; + mov.b32 %f309, %r307; + mov.b32 %r308, %f308; + shfl.sync.bfly.b32 %r309, %r308, 8, 31, -1; + mov.b32 %f310, %r309; + shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1; + mov.b32 %f311, %r207; +$L__tmp16: + .loc 2 108 21 + sub.f32 %f312, %f309, %f304; + .loc 2 109 28 + add.f32 %f313, %f301, %f311; + .loc 2 110 39 + setp.eq.f32 %p110, %f313, 0f00000000; + .loc 2 110 60 + mov.b32 %r208, %f313; + div.full.f32 %r206, %r207, %r208; + mov.b32 %f314, %r206; + .loc 2 110 49 + selp.f32 %f315, 0f00000000, %f314, %p110; + .loc 2 112 17 + fma.rn.f32 %f316, %f312, %f315, %f304; + .loc 2 113 15 + add.f32 %f317, %f308, %f310; + .loc 2 113 30 + mul.f32 %f318, %f312, %f312; + .loc 2 113 38 + mul.f32 %f319, %f301, %f318; + .loc 2 113 22 + fma.rn.f32 %f320, %f315, %f319, %f317; +$L__tmp17: + .loc 2 120 46 + mov.b32 %r310, %f316; + shfl.sync.bfly.b32 %r311, %r310, 4, 31, -1; + mov.b32 %f321, %r311; + mov.b32 %r312, %f320; + shfl.sync.bfly.b32 %r313, %r312, 4, 31, -1; + mov.b32 %f322, %r313; + shfl.sync.bfly.b32 %r210, %r208, 4, 31, -1; + mov.b32 %f323, %r210; +$L__tmp18: + .loc 2 108 21 + sub.f32 %f324, %f321, %f316; + .loc 2 109 28 + add.f32 %f325, %f313, %f323; + .loc 2 110 39 + setp.eq.f32 %p111, %f325, 0f00000000; + .loc 2 110 60 + mov.b32 %r211, %f325; + div.full.f32 %r209, %r210, %r211; + mov.b32 %f326, %r209; + .loc 2 110 49 + selp.f32 %f327, 0f00000000, %f326, %p111; + .loc 2 112 17 + fma.rn.f32 %f328, %f324, %f327, %f316; + .loc 2 113 15 + add.f32 %f329, %f320, %f322; + .loc 2 113 30 + mul.f32 %f330, %f324, %f324; + .loc 2 113 38 + mul.f32 %f331, %f313, %f330; + .loc 2 113 22 + fma.rn.f32 %f332, %f327, %f331, %f329; +$L__tmp19: + .loc 2 120 46 + mov.b32 %r314, %f328; + shfl.sync.bfly.b32 %r315, %r314, 2, 31, -1; + mov.b32 %f333, %r315; + mov.b32 %r316, %f332; + shfl.sync.bfly.b32 %r317, %r316, 2, 31, -1; + mov.b32 %f334, %r317; + shfl.sync.bfly.b32 %r213, %r211, 2, 31, -1; + mov.b32 %f335, %r213; +$L__tmp20: + .loc 2 108 21 + sub.f32 %f336, %f333, %f328; + .loc 2 109 28 + add.f32 %f38, %f325, %f335; + .loc 2 110 39 + setp.eq.f32 %p112, %f38, 0f00000000; + .loc 2 110 60 + mov.b32 %r214, %f38; + div.full.f32 %r212, %r213, %r214; + mov.b32 %f337, %r212; + .loc 2 110 49 + selp.f32 %f338, 0f00000000, %f337, %p112; + .loc 2 112 17 + fma.rn.f32 %f39, %f336, %f338, %f328; + .loc 2 113 15 + add.f32 %f339, %f332, %f334; + .loc 2 
113 30 + mul.f32 %f340, %f336, %f336; + .loc 2 113 38 + mul.f32 %f341, %f325, %f340; + .loc 2 113 22 + fma.rn.f32 %f40, %f338, %f341, %f339; +$L__tmp21: + .loc 2 120 46 + mov.b32 %r318, %f39; + shfl.sync.bfly.b32 %r5, %r318, 1, 31, -1; + mov.b32 %r319, %f40; + shfl.sync.bfly.b32 %r6, %r319, 1, 31, -1; + shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1; + mov.b32 %f342, %r216; +$L__tmp22: + .loc 2 109 28 + add.f32 %f41, %f38, %f342; + .loc 2 110 60 + mov.b32 %r217, %f41; + div.full.f32 %r215, %r216, %r217; + mov.b32 %f42, %r215; +$L__tmp23: + .loc 1 62 51 + mov.u32 %r218, 0x0; + mov.u32 %r219, 0x0; + mov.u32 %r220, 0x0; + mov.u32 %r221, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r218, %r219, %r220, %r221 }, [ %rd89 + 0 ]; + @!%p113 mov.u32 %r218, %r325; + @!%p113 mov.u32 %r219, %r325; + @!%p113 mov.u32 %r220, %r325; + @!%p113 mov.u32 %r221, %r325; + mov.u32 %r226, 0x0; + mov.u32 %r227, 0x0; + mov.u32 %r228, 0x0; + mov.u32 %r229, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r226, %r227, %r228, %r229 }, [ %rd90 + 0 ]; + @!%p113 mov.u32 %r226, %r325; + @!%p113 mov.u32 %r227, %r325; + @!%p113 mov.u32 %r228, %r325; + @!%p113 mov.u32 %r229, %r325; + mov.u32 %r234, 0x0; + mov.u32 %r235, 0x0; + mov.u32 %r236, 0x0; + mov.u32 %r237, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r234, %r235, %r236, %r237 }, [ %rd91 + 0 ]; + @!%p113 mov.u32 %r234, %r325; + @!%p113 mov.u32 %r235, %r325; + @!%p113 mov.u32 %r236, %r325; + @!%p113 mov.u32 %r237, %r325; + mov.u32 %r242, 0x0; + mov.u32 %r243, 0x0; + mov.u32 %r244, 0x0; + mov.u32 %r245, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r242, %r243, %r244, %r245 }, [ %rd92 + 0 ]; + @!%p113 mov.u32 %r242, %r325; + @!%p113 mov.u32 %r243, %r325; + @!%p113 mov.u32 %r244, %r325; + @!%p113 mov.u32 %r245, %r325; + .loc 1 63 51 + mov.u32 %r250, 0x0; + mov.u32 %r251, 0x0; + mov.u32 %r252, 0x0; + mov.u32 %r253, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r250, %r251, %r252, %r253 }, [ %rd93 + 0 ]; + @!%p113 mov.u32 %r250, %r325; + @!%p113 mov.u32 %r251, %r325; + @!%p113 mov.u32 %r252, %r325; + @!%p113 mov.u32 %r253, %r325; + cvt.u16.u32 %rs17, %r250; + { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r250; } + cvt.u16.u32 %rs19, %r251; + { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r251; } + cvt.u16.u32 %rs21, %r252; + { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r252; } + cvt.u16.u32 %rs23, %r253; + { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r253; } + mov.u32 %r258, 0x0; + mov.u32 %r259, 0x0; + mov.u32 %r260, 0x0; + mov.u32 %r261, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r258, %r259, %r260, %r261 }, [ %rd94 + 0 ]; + @!%p113 mov.u32 %r258, %r325; + @!%p113 mov.u32 %r259, %r325; + @!%p113 mov.u32 %r260, %r325; + @!%p113 mov.u32 %r261, %r325; + cvt.u16.u32 %rs25, %r258; + { .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r258; } + cvt.u16.u32 %rs27, %r259; + { .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r259; } + cvt.u16.u32 %rs29, %r260; + { .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r260; } + cvt.u16.u32 %rs31, %r261; + { .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r261; } + .loc 1 63 103 + cvt.f32.bf16 %r266, %rs17; + mov.b32 %f43, %r266; + cvt.f32.bf16 %r267, %rs18; + mov.b32 %f44, %r267; + cvt.f32.bf16 %r268, %rs19; + mov.b32 %f45, %r268; + cvt.f32.bf16 %r269, %rs20; + mov.b32 %f46, %r269; + cvt.f32.bf16 %r270, %rs21; + mov.b32 %f47, %r270; + cvt.f32.bf16 %r271, %rs22; + mov.b32 %f48, %r271; + cvt.f32.bf16 %r272, %rs23; + mov.b32 %f49, %r272; + cvt.f32.bf16 %r273, %rs24; + mov.b32 %f50, %r273; + cvt.f32.bf16 %r274, %rs25; + mov.b32 %f51, %r274; + cvt.f32.bf16 %r275, %rs26; + mov.b32 %f52, 
%r275; + cvt.f32.bf16 %r276, %rs27; + mov.b32 %f53, %r276; + cvt.f32.bf16 %r277, %rs28; + mov.b32 %f54, %r277; + cvt.f32.bf16 %r278, %rs29; + mov.b32 %f55, %r278; + cvt.f32.bf16 %r279, %rs30; + mov.b32 %f56, %r279; + cvt.f32.bf16 %r280, %rs31; + mov.b32 %f57, %r280; + cvt.f32.bf16 %r281, %rs32; + mov.b32 %f58, %r281; + .loc 1 64 35 + mul.wide.u32 %rd107, %r2, 4; + add.s64 %rd95, %rd17, %rd107; + .loc 1 64 40 + mov.u32 %r282, 0x0; + @%p113 ld.global.L1::evict_last.b32 { %r282 }, [ %rd95 + 0 ]; + @!%p113 mov.u32 %r282, %r325; + .loc 1 68 57 + @%p49 bra $L__BB0_4; + mov.u64 %rd108, assertMessage_1; + cvta.global.u64 %rd109, %rd108; + mov.u64 %rd110, assertFile_1; + cvta.global.u64 %rd111, %rd110; + mov.u64 %rd112, assertFunc_1; + cvta.global.u64 %rd113, %rd112; + { // callseq 9, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd109; + .param .b64 param1; + st.param.b64 [param1+0], %rd111; + .param .b32 param2; + st.param.b32 [param2+0], %r438; + .param .b64 param3; + st.param.b64 [param3+0], %rd113; + .param .b64 param4; + st.param.b64 [param4+0], %rd123; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 9 +$L__BB0_4: +$L__tmp24: + .loc 2 120 46 + mov.b32 %f343, %r6; +$L__tmp25: + .loc 2 113 15 + add.f32 %f344, %f40, %f343; +$L__tmp26: + .loc 2 120 46 + mov.b32 %f345, %r5; +$L__tmp27: + .loc 2 108 21 + sub.f32 %f346, %f345, %f39; + .loc 2 113 30 + mul.f32 %f347, %f346, %f346; + .loc 2 113 38 + mul.f32 %f348, %f38, %f347; + .loc 2 110 39 + setp.eq.f32 %p135, %f41, 0f00000000; + .loc 2 110 49 + selp.f32 %f349, 0f00000000, %f42, %p135; + .loc 2 113 22 + fma.rn.f32 %f350, %f349, %f348, %f344; +$L__tmp28: + .loc 2 120 46 + mov.b32 %f351, %r4; +$L__tmp29: + .loc 2 113 15 + add.f32 %f352, %f35, %f351; +$L__tmp30: + .loc 2 120 46 + mov.b32 %f353, %r3; +$L__tmp31: + .loc 2 108 21 + sub.f32 %f354, %f353, %f34; + .loc 2 113 30 + mul.f32 %f355, %f354, %f354; + .loc 2 113 38 + mul.f32 %f356, %f33, %f355; + .loc 2 110 39 + setp.eq.f32 %p136, %f36, 0f00000000; + .loc 2 110 49 + selp.f32 %f357, 0f00000000, %f37, %p136; + .loc 2 113 22 + fma.rn.f32 %f358, %f357, %f356, %f352; +$L__tmp32: + .loc 1 69 54 + mov.u32 %r321, 0x0; + mov.u32 %r322, 0x0; + mov.u32 %r323, 0x0; + mov.u32 %r324, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd115 + 0 ]; + @!%p113 mov.u32 %r321, %r325; + @!%p113 mov.u32 %r322, %r325; + @!%p113 mov.u32 %r323, %r325; + @!%p113 mov.u32 %r324, %r325; + mov.u32 %r329, 0x0; + mov.u32 %r330, 0x0; + mov.u32 %r331, 0x0; + mov.u32 %r332, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd116 + 0 ]; + @!%p113 mov.u32 %r329, %r325; + @!%p113 mov.u32 %r330, %r325; + @!%p113 mov.u32 %r331, %r325; + @!%p113 mov.u32 %r332, %r325; + mov.u32 %r337, 0x0; + mov.u32 %r338, 0x0; + mov.u32 %r339, 0x0; + mov.u32 %r340, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r337, %r338, %r339, %r340 }, [ %rd117 + 0 ]; + @!%p113 mov.u32 %r337, %r325; + @!%p113 mov.u32 %r338, %r325; + @!%p113 mov.u32 %r339, %r325; + @!%p113 mov.u32 %r340, %r325; + mov.u32 %r345, 0x0; + mov.u32 %r346, 0x0; + mov.u32 %r347, 0x0; + mov.u32 %r348, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r345, %r346, %r347, %r348 }, [ %rd118 + 0 ]; + @!%p113 mov.u32 %r345, %r325; + @!%p113 mov.u32 %r346, %r325; + @!%p113 mov.u32 %r347, %r325; + @!%p113 mov.u32 %r348, %r325; + .loc 1 75 24 + mov.b32 %r354, %f358; + mov.b32 %r355, 1132462080; + div.full.f32 %r353, %r354, %r355; + mov.b32 %f359, %r353; 
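+	// Annotation: LayerNorm epilogue. var = m2 / 256 (%r355 = 1132462080 =
+	// 256.0, the row length), then rstd = rsqrt.approx.ftz(var + eps) with
+	// eps = 0f3727C5AC ~= 1e-5. Rows are then normalized as (x - mean) * rstd
+	// and scaled elementwise by the param_4 vector (presumably the layernorm
+	// weight) broadcast through shared memory below, before the bf16 store.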
+ mov.b32 %r378, %f350; + div.full.f32 %r377, %r378, %r355; + mov.b32 %f360, %r377; + .loc 1 77 24 + add.f32 %f361, %f359, 0f3727C5AC; + add.f32 %f362, %f360, 0f3727C5AC; + .loc 1 78 30 + rsqrt.approx.ftz.f32 %f363, %f361; + rsqrt.approx.ftz.f32 %f364, %f362; + .loc 1 69 54 + mov.b32 %f365, %r348; + .loc 1 62 51 + mov.b32 %f366, %r245; + .loc 1 70 24 + add.f32 %f367, %f366, %f365; + .loc 1 72 24 + add.f32 %f368, %f58, %f367; +$L__tmp33: + .loc 2 112 17 + fma.rn.f32 %f369, %f346, %f349, %f39; +$L__tmp34: + .loc 1 73 24 + sub.f32 %f370, %f368, %f369; + .loc 1 69 54 + mov.b32 %f371, %r347; + .loc 1 62 51 + mov.b32 %f372, %r244; + .loc 1 70 24 + add.f32 %f373, %f372, %f371; + .loc 1 72 24 + add.f32 %f374, %f57, %f373; + .loc 1 73 24 + sub.f32 %f375, %f374, %f369; + .loc 1 69 54 + mov.b32 %f376, %r346; + .loc 1 62 51 + mov.b32 %f377, %r243; + .loc 1 70 24 + add.f32 %f378, %f377, %f376; + .loc 1 72 24 + add.f32 %f379, %f56, %f378; + .loc 1 73 24 + sub.f32 %f380, %f379, %f369; + .loc 1 69 54 + mov.b32 %f381, %r345; + .loc 1 62 51 + mov.b32 %f382, %r242; + .loc 1 70 24 + add.f32 %f383, %f382, %f381; + .loc 1 72 24 + add.f32 %f384, %f55, %f383; + .loc 1 73 24 + sub.f32 %f385, %f384, %f369; + .loc 1 69 54 + mov.b32 %f386, %r340; + .loc 1 62 51 + mov.b32 %f387, %r237; + .loc 1 70 24 + add.f32 %f388, %f387, %f386; + .loc 1 72 24 + add.f32 %f389, %f54, %f388; + .loc 1 73 24 + sub.f32 %f390, %f389, %f369; + .loc 1 69 54 + mov.b32 %f391, %r339; + .loc 1 62 51 + mov.b32 %f392, %r236; + .loc 1 70 24 + add.f32 %f393, %f392, %f391; + .loc 1 72 24 + add.f32 %f394, %f53, %f393; + .loc 1 73 24 + sub.f32 %f395, %f394, %f369; + .loc 1 69 54 + mov.b32 %f396, %r338; + .loc 1 62 51 + mov.b32 %f397, %r235; + .loc 1 70 24 + add.f32 %f398, %f397, %f396; + .loc 1 72 24 + add.f32 %f399, %f52, %f398; + .loc 1 73 24 + sub.f32 %f400, %f399, %f369; + .loc 1 69 54 + mov.b32 %f401, %r337; + .loc 1 62 51 + mov.b32 %f402, %r234; + .loc 1 70 24 + add.f32 %f403, %f402, %f401; + .loc 1 72 24 + add.f32 %f404, %f51, %f403; + .loc 1 73 24 + sub.f32 %f405, %f404, %f369; + .loc 1 69 54 + mov.b32 %f406, %r332; + .loc 1 62 51 + mov.b32 %f407, %r229; + .loc 1 70 24 + add.f32 %f408, %f407, %f406; + .loc 1 72 24 + add.f32 %f409, %f50, %f408; +$L__tmp35: + .loc 2 112 17 + fma.rn.f32 %f410, %f354, %f357, %f34; +$L__tmp36: + .loc 1 73 24 + sub.f32 %f411, %f409, %f410; + .loc 1 69 54 + mov.b32 %f412, %r331; + .loc 1 62 51 + mov.b32 %f413, %r228; + .loc 1 70 24 + add.f32 %f414, %f413, %f412; + .loc 1 72 24 + add.f32 %f415, %f49, %f414; + .loc 1 73 24 + sub.f32 %f416, %f415, %f410; + .loc 1 69 54 + mov.b32 %f417, %r330; + .loc 1 62 51 + mov.b32 %f418, %r227; + .loc 1 70 24 + add.f32 %f419, %f418, %f417; + .loc 1 72 24 + add.f32 %f420, %f48, %f419; + .loc 1 73 24 + sub.f32 %f421, %f420, %f410; + .loc 1 69 54 + mov.b32 %f422, %r329; + .loc 1 62 51 + mov.b32 %f423, %r226; + .loc 1 70 24 + add.f32 %f424, %f423, %f422; + .loc 1 72 24 + add.f32 %f425, %f47, %f424; + .loc 1 73 24 + sub.f32 %f426, %f425, %f410; + .loc 1 69 54 + mov.b32 %f427, %r324; + .loc 1 62 51 + mov.b32 %f428, %r221; + .loc 1 70 24 + add.f32 %f429, %f428, %f427; + .loc 1 72 24 + add.f32 %f430, %f46, %f429; + .loc 1 73 24 + sub.f32 %f431, %f430, %f410; + .loc 1 69 54 + mov.b32 %f432, %r323; + .loc 1 62 51 + mov.b32 %f433, %r220; + .loc 1 70 24 + add.f32 %f434, %f433, %f432; + .loc 1 72 24 + add.f32 %f435, %f45, %f434; + .loc 1 73 24 + sub.f32 %f436, %f435, %f410; + .loc 1 69 54 + mov.b32 %f437, %r322; + .loc 1 62 51 + mov.b32 %f438, %r219; + .loc 1 70 24 + add.f32 %f439, %f438, %f437; 
+ .loc 1 72 24 + add.f32 %f440, %f44, %f439; + .loc 1 73 24 + sub.f32 %f441, %f440, %f410; + .loc 1 69 54 + mov.b32 %f442, %r321; + .loc 1 62 51 + mov.b32 %f443, %r218; + .loc 1 70 24 + add.f32 %f444, %f443, %f442; + .loc 1 72 24 + add.f32 %f445, %f43, %f444; + .loc 1 73 24 + sub.f32 %f446, %f445, %f410; + .loc 1 79 24 + mul.f32 %f447, %f446, %f363; + mul.f32 %f448, %f441, %f363; + mul.f32 %f449, %f436, %f363; + mul.f32 %f450, %f431, %f363; + mul.f32 %f451, %f426, %f363; + mul.f32 %f452, %f421, %f363; + mul.f32 %f453, %f416, %f363; + mul.f32 %f454, %f411, %f363; + mul.f32 %f455, %f405, %f364; + mul.f32 %f456, %f400, %f364; + mul.f32 %f457, %f395, %f364; + mul.f32 %f458, %f390, %f364; + mul.f32 %f459, %f385, %f364; + mul.f32 %f460, %f380, %f364; + mul.f32 %f461, %f375, %f364; + mul.f32 %f462, %f370, %f364; + .loc 1 80 24 + shl.b32 %r425, %r2, 2; + mov.u32 %r426, global_smem; + add.s32 %r427, %r426, %r425; + st.shared.u32 [%r427], %r282; + bar.sync 0; + shl.b32 %r428, %r1, 2; + add.s32 %r429, %r426, %r428; + ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r429]; + ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r429+16]; + mul.f32 %f471, %f447, %f463; + mul.f32 %f472, %f448, %f464; + mul.f32 %f473, %f449, %f465; + mul.f32 %f474, %f450, %f466; + mul.f32 %f475, %f451, %f467; + mul.f32 %f476, %f452, %f468; + mul.f32 %f477, %f453, %f469; + mul.f32 %f478, %f454, %f470; + mul.f32 %f479, %f455, %f463; + mul.f32 %f480, %f456, %f464; + mul.f32 %f481, %f457, %f465; + mul.f32 %f482, %f458, %f466; + mul.f32 %f483, %f459, %f467; + mul.f32 %f484, %f460, %f468; + mul.f32 %f485, %f461, %f469; + mul.f32 %f486, %f462, %f470; + .loc 1 82 29 + shl.b64 %rd121, %rd7, 1; + add.s64 %rd119, %rd18, %rd121; + shl.b64 %rd122, %rd9, 1; + add.s64 %rd120, %rd18, %rd122; + .loc 1 82 52 + mov.b32 %r401, %f471; + cvt.rn.bf16.f32 %rs33, %r401; + mov.b32 %r402, %f472; + cvt.rn.bf16.f32 %rs34, %r402; + mov.b32 %r403, %f473; + cvt.rn.bf16.f32 %rs35, %r403; + mov.b32 %r404, %f474; + cvt.rn.bf16.f32 %rs36, %r404; + mov.b32 %r405, %f475; + cvt.rn.bf16.f32 %rs37, %r405; + mov.b32 %r406, %f476; + cvt.rn.bf16.f32 %rs38, %r406; + mov.b32 %r407, %f477; + cvt.rn.bf16.f32 %rs39, %r407; + mov.b32 %r408, %f478; + cvt.rn.bf16.f32 %rs40, %r408; + mov.b32 %r409, %f479; + cvt.rn.bf16.f32 %rs41, %r409; + mov.b32 %r410, %f480; + cvt.rn.bf16.f32 %rs42, %r410; + mov.b32 %r411, %f481; + cvt.rn.bf16.f32 %rs43, %r411; + mov.b32 %r412, %f482; + cvt.rn.bf16.f32 %rs44, %r412; + mov.b32 %r413, %f483; + cvt.rn.bf16.f32 %rs45, %r413; + mov.b32 %r414, %f484; + cvt.rn.bf16.f32 %rs46, %r414; + mov.b32 %r415, %f485; + cvt.rn.bf16.f32 %rs47, %r415; + mov.b32 %r416, %f486; + cvt.rn.bf16.f32 %rs48, %r416; + mov.b32 %r430, {%rs33, %rs34}; + mov.b32 %r431, {%rs35, %rs36}; + mov.b32 %r432, {%rs37, %rs38}; + mov.b32 %r433, {%rs39, %rs40}; + @%p113 st.global.v4.b32 [ %rd119 + 0 ], { %r430, %r431, %r432, %r433 }; + mov.b32 %r434, {%rs41, %rs42}; + mov.b32 %r435, {%rs43, %rs44}; + mov.b32 %r436, {%rs45, %rs46}; + mov.b32 %r437, {%rs47, %rs48}; + @%p113 st.global.v4.b32 [ %rd120 + 0 ], { %r434, %r435, %r436, %r437 }; + .loc 1 58 4 + ret; +$L__tmp37: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py" + .file 2 
"/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 99 +.b8 105 +.b8 103 +.b8 54 +.b8 102 +.b8 107 +.b8 105 +.b8 54 +.b8 112 +.b8 52 +.b8 108 +.b8 120 +.b8 114 +.b8 100 +.b8 109 +.b8 103 +.b8 103 +.b8 54 +.b8 101 +.b8 117 +.b8 100 +.b8 97 +.b8 104 +.b8 105 +.b8 101 +.b8 120 +.b8 99 +.b8 118 +.b8 117 +.b8 101 +.b8 101 +.b8 111 +.b8 108 +.b8 50 +.b8 112 +.b8 52 +.b8 113 +.b8 112 +.b8 53 +.b8 51 +.b8 50 +.b8 112 +.b8 118 +.b8 118 +.b8 101 +.b8 50 +.b8 121 +.b8 52 +.b8 54 +.b8 51 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 99 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 47 +.b8 41 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp36 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp36 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp31 +.b8 2 +.b8 53 +.b8 44 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..17c7b6ff0a1e1c4975411464963e443d6507abb0 --- /dev/null +++ b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir @@ -0,0 +1,134 
@@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked> + %cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked> + %cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked> + %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked> + %cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1> + %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1> + %cst_9 = arith.constant 0.000000e+00 : f32 + %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2> + %cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2> + %cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked> + %cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked> + %cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked> + %cst_15 = arith.constant dense<0.000000e+00> : tensor<16x256xbf16, #blocked> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1> + %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> + %12 = tt.expand_dims %10 {axis = 0 : 
i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked> + %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked> + %15 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked1> + %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr, #blocked1>, tensor<16x1xi32, #blocked1> + %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked> + %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1> + %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked> + %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked> + %22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2> + %23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked> + %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked> + %27 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi32, #blocked> + %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked> + %30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %31 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked> + %32 = tt.broadcast %31 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %33 = arith.addi %24, %32 : tensor<16x256xi32, #blocked> + %34 = tt.splat %arg3 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %35 = tt.addptr %34, %33 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi32, #blocked> + %36 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked> + %37 = arith.extf %36 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked> + %38 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked> + %39 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1> + %40 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked> + %41 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1> + %42 = arith.select %40, %38, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked> + %43 = arith.select %41, %39, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1> + %44 = arith.cmpi sge, %43, %cst_7 : tensor<16x1xi64, #blocked1> + %45 = arith.cmpi slt, %43, %cst_8 : tensor<16x1xi64, #blocked1> + %46 = arith.andi %44, %45 : tensor<16x1xi1, #blocked1> + tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1> + %47 = arith.muli %42, %cst_4 : tensor<16x1xi64, #blocked> + %48 = tt.broadcast %47 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked> + %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked> + %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked> + %51 = arith.addi %50, %48 : tensor<16x256xi64, #blocked> + %52 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %53 = tt.addptr %52, %51 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi64, #blocked> + %54 = tt.load %53, %29, %cst_14 {cache = 1 : 
i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %55 = arith.addf %54, %30 : tensor<16x256xf32, #blocked> + %56 = arith.addf %55, %37 : tensor<16x256xf32, #blocked> + %57 = arith.addf %56, %cst_14 : tensor<16x256xf32, #blocked> + %58 = arith.subf %56, %57 : tensor<16x256xf32, #blocked> + %59 = arith.mulf %56, %58 : tensor<16x256xf32, #blocked> + %60 = arith.addf %59, %cst_14 : tensor<16x256xf32, #blocked> + %61 = arith.select %29, %57, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked> + %62 = arith.select %29, %60, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked> + %63 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> + %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %90 = arith.subf %arg11, %arg8 : f32 + %91 = arith.addf %arg10, %arg13 : f32 + %92 = arith.cmpf oeq, %91, %cst_9 : f32 + %93 = arith.divf %arg13, %91 : f32 + %94 = arith.select %92, %cst_9, %93 : f32 + %95 = arith.mulf %90, %94 : f32 + %96 = arith.addf %arg8, %95 : f32 + %97 = arith.addf %arg9, %arg12 : f32 + %98 = arith.mulf %90, %90 : f32 + %99 = arith.mulf %98, %arg10 : f32 + %100 = arith.mulf %99, %94 : f32 + %101 = arith.addf %97, %100 : f32 + tt.reduce.return %96, %101, %91 : f32, f32, f32 + }) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %68 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %69 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked> + %70 = arith.extf %69 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked> + %71 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked2> + %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr, #blocked2>, tensor<1x256xi32, #blocked2> + %73 = tt.load %72, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2> + tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1> + %74 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %75 = arith.addf %74, %68 : tensor<16x256xf32, #blocked> + %76 = arith.addf %75, %70 : tensor<16x256xf32, #blocked> + %77 = tt.broadcast %66 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %78 = arith.subf %76, %77 : tensor<16x256xf32, #blocked> + %79 = arith.divf %67, %cst_13 : tensor<16x1xf32, #blocked> + %80 = arith.addf %79, %cst_12 : tensor<16x1xf32, #blocked> + %81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, 
#blocked> + %82 = tt.broadcast %81 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %83 = arith.mulf %78, %82 : tensor<16x256xf32, #blocked> + %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked> + %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %86 = arith.mulf %83, %85 : tensor<16x256xf32, #blocked> + %87 = tt.splat %arg5 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %88 = tt.addptr %87, %33 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi32, #blocked> + %89 = arith.truncf %86 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked> + tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..da6ce5a2908c1f7b07d13f6e312d6b02d2777059 --- /dev/null +++ b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir @@ -0,0 +1,245 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = and i32 %6, 3, !dbg !8 + %10 = and i32 %8, 3, !dbg !9 + %urem = and i32 %6, 127, !dbg !9 + %11 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %12 = shl i32 %11, 2, !dbg !11 + %13 = or i32 %12, %9, !dbg !12 + %14 = icmp ult i32 %urem, 120, !dbg !13 + %15 = shl nuw nsw i32 %urem, 17, !dbg !14 + %16 = add i32 %12, %15, !dbg !15 + %17 = sext i32 %16 to i64, !dbg !16 + %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !16 + %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14) #3, !dbg !17 + %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !17 + %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !17 + %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !17 + %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !17 + %24 = bitcast i32 %20 to float, !dbg !17 + %25 = bitcast i32 %21 to float, !dbg !17 + %26 = bitcast i32 %22 to float, !dbg !17 + %27 = bitcast i32 %23 to float, !dbg !17 + %28 = fadd float %24, 0.000000e+00, !dbg !18 + %29 = fadd float %25, 0.000000e+00, !dbg !18 + %30 = fadd float %26, 0.000000e+00, !dbg !18 + %31 = fadd float %27, 0.000000e+00, !dbg !18 + %32 = select i1 %14, float %28, float 0.000000e+00, !dbg !19 + %33 = select i1 %14, float %29, float 0.000000e+00, !dbg !19 + %34 = select i1 %14, float %30, float 0.000000e+00, !dbg !19 + %35 = select i1 %14, float %31, float 0.000000e+00, !dbg !19 + %36 = bitcast float %32 to i32, !dbg !20 + %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 16, i32 31), !dbg !20 + %38 = bitcast i32 %37 to float, !dbg !20 + %39 = fadd float %32, %38, !dbg !24 + %40 = bitcast float %39 to i32, !dbg !20 + %41 = 
tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 8, i32 31), !dbg !20 + %42 = bitcast i32 %41 to float, !dbg !20 + %43 = fadd float %39, %42, !dbg !24 + %44 = bitcast float %43 to i32, !dbg !20 + %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !20 + %46 = bitcast i32 %45 to float, !dbg !20 + %47 = fadd float %43, %46, !dbg !24 + %48 = bitcast float %47 to i32, !dbg !20 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 2, i32 31), !dbg !20 + %50 = bitcast i32 %49 to float, !dbg !20 + %51 = fadd float %47, %50, !dbg !24 + %52 = bitcast float %51 to i32, !dbg !20 + %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 1, i32 31), !dbg !20 + %54 = bitcast i32 %53 to float, !dbg !20 + %55 = fadd float %51, %54, !dbg !24 + %56 = bitcast float %33 to i32, !dbg !20 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20 + %58 = bitcast i32 %57 to float, !dbg !20 + %59 = fadd float %33, %58, !dbg !24 + %60 = bitcast float %59 to i32, !dbg !20 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !20 + %62 = bitcast i32 %61 to float, !dbg !20 + %63 = fadd float %59, %62, !dbg !24 + %64 = bitcast float %63 to i32, !dbg !20 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 4, i32 31), !dbg !20 + %66 = bitcast i32 %65 to float, !dbg !20 + %67 = fadd float %63, %66, !dbg !24 + %68 = bitcast float %67 to i32, !dbg !20 + %69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %68, i32 2, i32 31), !dbg !20 + %70 = bitcast i32 %69 to float, !dbg !20 + %71 = fadd float %67, %70, !dbg !24 + %72 = bitcast float %71 to i32, !dbg !20 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !20 + %74 = bitcast i32 %73 to float, !dbg !20 + %75 = fadd float %71, %74, !dbg !24 + %76 = bitcast float %34 to i32, !dbg !20 + %77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 16, i32 31), !dbg !20 + %78 = bitcast i32 %77 to float, !dbg !20 + %79 = fadd float %34, %78, !dbg !24 + %80 = bitcast float %79 to i32, !dbg !20 + %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 8, i32 31), !dbg !20 + %82 = bitcast i32 %81 to float, !dbg !20 + %83 = fadd float %79, %82, !dbg !24 + %84 = bitcast float %83 to i32, !dbg !20 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 4, i32 31), !dbg !20 + %86 = bitcast i32 %85 to float, !dbg !20 + %87 = fadd float %83, %86, !dbg !24 + %88 = bitcast float %87 to i32, !dbg !20 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !20 + %90 = bitcast i32 %89 to float, !dbg !20 + %91 = fadd float %87, %90, !dbg !24 + %92 = bitcast float %91 to i32, !dbg !20 + %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !20 + %94 = bitcast i32 %93 to float, !dbg !20 + %95 = fadd float %91, %94, !dbg !24 + %96 = bitcast float %35 to i32, !dbg !20 + %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !20 + %98 = bitcast i32 %97 to float, !dbg !20 + %99 = fadd float %35, %98, !dbg !24 + %100 = bitcast float %99 to i32, !dbg !20 + %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !20 + %102 = bitcast i32 %101 to float, !dbg !20 + %103 = fadd float %99, %102, !dbg !24 + %104 = bitcast float %103 to i32, !dbg !20 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, 
i32 %104, i32 4, i32 31), !dbg !20 + %106 = bitcast i32 %105 to float, !dbg !20 + %107 = fadd float %103, %106, !dbg !24 + %108 = bitcast float %107 to i32, !dbg !20 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !20 + %110 = bitcast i32 %109 to float, !dbg !20 + %111 = fadd float %107, %110, !dbg !24 + %112 = bitcast float %111 to i32, !dbg !20 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !20 + %114 = bitcast i32 %113 to float, !dbg !20 + %115 = fadd float %111, %114, !dbg !24 + %116 = icmp eq i32 %7, 0, !dbg !20 + %117 = zext nneg i32 %10 to i64, !dbg !20 + %118 = getelementptr float, ptr addrspace(3) @global_smem, i64 %117, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %118, float %55, i1 %116) #3, !dbg !20 + %119 = or i32 %10, 4, !dbg !20 + %120 = zext nneg i32 %119 to i64, !dbg !20 + %121 = getelementptr float, ptr addrspace(3) @global_smem, i64 %120, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %121, float %75, i1 %116) #3, !dbg !20 + %122 = or i32 %10, 8, !dbg !20 + %123 = zext nneg i32 %122 to i64, !dbg !20 + %124 = getelementptr float, ptr addrspace(3) @global_smem, i64 %123, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %124, float %95, i1 %116) #3, !dbg !20 + %125 = or i32 %10, 12, !dbg !20 + %126 = zext nneg i32 %125 to i64, !dbg !20 + %127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %115, i1 %116) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %128 = icmp slt i32 %6, 16, !dbg !20 + %129 = sext i32 %6 to i64, !dbg !20 + %130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !20 + %131 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %130, i1 %128) #3, !dbg !20 + %132 = bitcast float %131 to i32, !dbg !20 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !20 + %134 = bitcast i32 %133 to float, !dbg !20 + %135 = fadd float %131, %134, !dbg !24 + %136 = bitcast float %135 to i32, !dbg !20 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !20 + %138 = bitcast i32 %137 to float, !dbg !20 + %139 = fadd float %135, %138, !dbg !24 + %140 = icmp eq i32 %9, 0, !dbg !20 + %141 = and i1 %128, %140, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %139, i1 %141) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !20 + %143 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), align 4, !dbg !20 + %144 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !20 + %145 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 48), align 4, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %146 = insertelement <1 x float> undef, float %142, i64 0, !dbg !28 + store <1 x float> %146, ptr addrspace(3) @global_smem, align 4, !dbg !28 + %147 = insertelement <1 x float> undef, float %143, i64 0, !dbg !28 + store <1 x float> %147, ptr addrspace(3) 
getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4), align 4, !dbg !28 + %148 = insertelement <1 x float> undef, float %144, i64 0, !dbg !28 + store <1 x float> %148, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !28 + %149 = insertelement <1 x float> undef, float %145, i64 0, !dbg !28 + store <1 x float> %149, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 12), align 4, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %150 = zext nneg i32 %9 to i64, !dbg !28 + %151 = getelementptr float, ptr addrspace(3) @global_smem, i64 %150, !dbg !28 + %152 = load <1 x float>, ptr addrspace(3) %151, align 4, !dbg !28 + %.frozen = freeze i32 %13 + %153 = sdiv i32 %.frozen, 256, !dbg !29 + %154 = mul i32 %153, 256 + %.decomposed = sub i32 %.frozen, %154 + %155 = sext i32 %153 to i64, !dbg !30 + %156 = getelementptr i64, ptr addrspace(1) %1, i64 %155, !dbg !30 + %157 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %156, i1 true) #3, !dbg !31 + %158 = lshr i64 %157, 54, !dbg !32 + %159 = and i64 %158, 512, !dbg !32 + %160 = add i64 %159, %157, !dbg !32 + %161 = shl i64 %160, 8, !dbg !33 + %162 = sext i32 %.decomposed to i64, !dbg !34 + %163 = getelementptr float, ptr addrspace(1) %2, i64 %161, !dbg !35 + %164 = getelementptr float, ptr addrspace(1) %163, i64 %162, !dbg !35 + %165 = lshr i32 %7, 2, !dbg !36 + %166 = shl nuw nsw i32 %10, 3, !dbg !36 + %167 = or i32 %166, %165, !dbg !36 + %168 = icmp eq i32 %167, 0, !dbg !36 + %169 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %164, <1 x float> %152, i1 %168) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = 
!DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 29, column: 25, scope: !5) +!14 = !DILocation(line: 31, column: 47, scope: !5) +!15 = !DILocation(line: 31, column: 40, scope: !5) +!16 = !DILocation(line: 31, column: 34, scope: !5) +!17 = !DILocation(line: 31, column: 53, scope: !5) +!18 = !DILocation(line: 33, column: 23, scope: !5) +!19 = !DILocation(line: 34, column: 38, scope: !5) +!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!23 = !DILocation(line: 35, column: 25, scope: !21) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 35, column: 28, scope: !5) +!29 = !DILocation(line: 36, column: 20, scope: !5) +!30 = !DILocation(line: 38, column: 30, scope: !5) +!31 = !DILocation(line: 38, column: 35, scope: !5) +!32 = !DILocation(line: 41, column: 32, scope: !5) +!33 = !DILocation(line: 45, column: 40, scope: !5) +!34 = !DILocation(line: 45, column: 36, scope: !5) +!35 = !DILocation(line: 45, column: 30, scope: !5) +!36 = !DILocation(line: 45, column: 55, scope: !5) +!37 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..bc71f79fb6dd2aa7ab04d393771ae77f838376a9 --- /dev/null +++ b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir @@ -0,0 +1,858 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !7 { + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %4 = shl i32 %3, 3, !dbg !10 + %5 = and i32 %4, 1016, !dbg !10 + %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11 + %7 = shl i32 %6, 10, !dbg !12 + %8 = or i32 %7, %5, !dbg !13 + %9 = sext i32 %8 to i64, !dbg !14 + %10 = getelementptr i16, ptr addrspace(1) %0, i64 %9, !dbg !14 + %11 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %10, i1 true) #4, !dbg !15 + %12 = extractvalue { i32, i32, i32, i32 } %11, 0, !dbg !15 + %13 = extractvalue { i32, i32, i32, i32 } %11, 1, !dbg !15 + %14 = extractvalue { i32, i32, i32, i32 } %11, 2, !dbg !15 + %15 = extractvalue { i32, i32, i32, i32 } %11, 3, !dbg !15 + %16 = trunc i32 %12 to i16, !dbg !15 + %extelt.offset = lshr i32 %12, 16, !dbg !15 + %17 = trunc i32 %extelt.offset to i16, !dbg !15 + %18 = trunc i32 %13 to i16, !dbg !15 + %extelt.offset1 = lshr i32 %13, 16, !dbg !15 + %19 = trunc i32 %extelt.offset1 to i16, !dbg !15 + %20 = trunc i32 %14 to i16, !dbg !15 + %extelt.offset2 = lshr i32 %14, 16, !dbg !15 + %21 = trunc i32 %extelt.offset2 to i16, !dbg !15 + %22 = trunc i32 %15 to i16, !dbg !15 + %extelt.offset3 = lshr i32 %15, 16, !dbg !15 + %23 = trunc i32 %extelt.offset3 to i16, !dbg !15 + %24 = tail call float asm "cvt.f32.bf16 $0, $1;", 
"=r,h"(i16 %16) #4, !dbg !16 + %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16 + %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16 + %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16 + %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16 + %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16 + %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16 + %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16 + %32 = fmul float %24, 0x3FE6A09E60000000, !dbg !17 + %33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17 + %34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17 + %35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17 + %36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17 + %37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17 + %38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17 + %39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17 + %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i = icmp eq i32 %40, 0, !dbg !18 + %41 = tail call float @llvm.nvvm.fabs.ftz.f(float %32) #4, !dbg !18 + %42 = tail call float @llvm.nvvm.fabs.f(float %32) #4, !dbg !18 + %.0.i = select i1 %.not.i, float %42, float %41, !dbg !18 + %43 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18 + br i1 %43, label %__nv_fabsf.exit1.i, label %45, !dbg !18 + +__nv_fabsf.exit1.i: ; preds = %2 + %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i = icmp eq i32 %44, 0, !dbg !18 + %.01.i = select i1 %.not1.i, float %42, float %41, !dbg !18 + br label %__internal_fmad.exit.i, !dbg !18 + +45: ; preds = %2 + %46 = fmul float %32, %32, !dbg !18 + br label %__internal_fmad.exit.i, !dbg !18 + +__internal_fmad.exit.i: ; preds = %45, %__nv_fabsf.exit1.i + %47 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %45 ], !dbg !18 + %48 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %45 ], !dbg !18 + %49 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %45 ], !dbg !18 + %50 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %45 ], !dbg !18 + %51 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %45 ], !dbg !18 + %52 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %45 ], !dbg !18 + %53 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %45 ], !dbg !18 + %54 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %46, %45 ], !dbg !18 + %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i = icmp eq i32 %55, 0, !dbg !18 + %56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %53, float %54, float %52) #4, !dbg !18 + %57 = tail call float @llvm.nvvm.fma.rn.f(float %53, float %54, float %52) #4, !dbg !18 + %.02.i = select i1 %.not2.i, float %57, float %56, !dbg !18 + %58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i = icmp eq i32 %58, 0, !dbg !18 + %59 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %54, float %51) #4, !dbg !18 + %60 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %54, float %51) #4, !dbg !18 + %.03.i = select i1 %.not3.i, float %60, float %59, !dbg !18 + %61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i = icmp eq i32 %61, 0, !dbg !18 + %62 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %54, 
float %50) #4, !dbg !18 + %63 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %54, float %50) #4, !dbg !18 + %.04.i = select i1 %.not4.i, float %63, float %62, !dbg !18 + %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i = icmp eq i32 %64, 0, !dbg !18 + %65 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %54, float %49) #4, !dbg !18 + %66 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %54, float %49) #4, !dbg !18 + %.05.i = select i1 %.not5.i, float %66, float %65, !dbg !18 + %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i = icmp eq i32 %67, 0, !dbg !18 + %68 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %54, float %48) #4, !dbg !18 + %69 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %54, float %48) #4, !dbg !18 + %.06.i = select i1 %.not6.i, float %69, float %68, !dbg !18 + %70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i = icmp eq i32 %70, 0, !dbg !18 + %71 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %54, float %47) #4, !dbg !18 + %72 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %54, float %47) #4, !dbg !18 + %.07.i = select i1 %.not7.i, float %72, float %71, !dbg !18 + %73 = fneg float %54, !dbg !18 + %74 = select i1 %43, float %73, float %32, !dbg !18 + %75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i = icmp eq i32 %75, 0, !dbg !18 + %76 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %74, float %74) #4, !dbg !18 + %77 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %74, float %74) #4, !dbg !18 + %.08.i = select i1 %.not8.i, float %77, float %76, !dbg !18 + br i1 %43, label %78, label %__nv_erff.exit, !dbg !18 + +78: ; preds = %__internal_fmad.exit.i + %79 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18 + %80 = fsub float 1.000000e+00, %79, !dbg !18 + %81 = bitcast float %80 to i32, !dbg !18 + %82 = bitcast float %32 to i32, !dbg !18 + %83 = and i32 %82, -2147483648, !dbg !18 + %84 = or i32 %83, %81, !dbg !18 + %85 = bitcast i32 %84 to float, !dbg !18 + br label %__nv_erff.exit, !dbg !18 + +__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %78 + %r.0.i = phi float [ %85, %78 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18 + %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i4 = icmp eq i32 %86, 0, !dbg !18 + %87 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18 + %88 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18 + %.0.i5 = select i1 %.not.i4, float %88, float %87, !dbg !18 + %89 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18 + br i1 %89, label %__nv_fabsf.exit1.i22, label %91, !dbg !18 + +__nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit + %90 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i23 = icmp eq i32 %90, 0, !dbg !18 + %.01.i24 = select i1 %.not1.i23, float %88, float %87, !dbg !18 + br label %__internal_fmad.exit.i6, !dbg !18 + +91: ; preds = %__nv_erff.exit + %92 = fmul float %33, %33, !dbg !18 + br label %__internal_fmad.exit.i6, !dbg !18 + +__internal_fmad.exit.i6: ; preds = %91, %__nv_fabsf.exit1.i22 + %93 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %91 ], !dbg !18 + %94 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %91 ], !dbg !18 + %95 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %91 ], !dbg !18 + %96 = phi float [ 
0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %91 ], !dbg !18 + %97 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %91 ], !dbg !18 + %98 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %91 ], !dbg !18 + %99 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %91 ], !dbg !18 + %100 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %92, %91 ], !dbg !18 + %101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i7 = icmp eq i32 %101, 0, !dbg !18 + %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %99, float %100, float %98) #4, !dbg !18 + %103 = tail call float @llvm.nvvm.fma.rn.f(float %99, float %100, float %98) #4, !dbg !18 + %.02.i8 = select i1 %.not2.i7, float %103, float %102, !dbg !18 + %104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i9 = icmp eq i32 %104, 0, !dbg !18 + %105 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %100, float %97) #4, !dbg !18 + %106 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %100, float %97) #4, !dbg !18 + %.03.i10 = select i1 %.not3.i9, float %106, float %105, !dbg !18 + %107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i11 = icmp eq i32 %107, 0, !dbg !18 + %108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %100, float %96) #4, !dbg !18 + %109 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %100, float %96) #4, !dbg !18 + %.04.i12 = select i1 %.not4.i11, float %109, float %108, !dbg !18 + %110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i13 = icmp eq i32 %110, 0, !dbg !18 + %111 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %100, float %95) #4, !dbg !18 + %112 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %100, float %95) #4, !dbg !18 + %.05.i14 = select i1 %.not5.i13, float %112, float %111, !dbg !18 + %113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i15 = icmp eq i32 %113, 0, !dbg !18 + %114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %100, float %94) #4, !dbg !18 + %115 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %100, float %94) #4, !dbg !18 + %.06.i16 = select i1 %.not6.i15, float %115, float %114, !dbg !18 + %116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i17 = icmp eq i32 %116, 0, !dbg !18 + %117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %100, float %93) #4, !dbg !18 + %118 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %100, float %93) #4, !dbg !18 + %.07.i18 = select i1 %.not7.i17, float %118, float %117, !dbg !18 + %119 = fneg float %100, !dbg !18 + %120 = select i1 %89, float %119, float %33, !dbg !18 + %121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i19 = icmp eq i32 %121, 0, !dbg !18 + %122 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %120, float %120) #4, !dbg !18 + %123 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %120, float %120) #4, !dbg !18 + %.08.i20 = select i1 %.not8.i19, float %123, float %122, !dbg !18 + br i1 %89, label %124, label %__nv_erff.exit25, !dbg !18 + +124: ; preds = %__internal_fmad.exit.i6 + %125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18 + %126 = fsub float 1.000000e+00, %125, !dbg !18 + %127 = bitcast float %126 to i32, !dbg !18 + %128 = bitcast float %33 to i32, !dbg !18 + %129 = 
and i32 %128, -2147483648, !dbg !18 + %130 = or i32 %129, %127, !dbg !18 + %131 = bitcast i32 %130 to float, !dbg !18 + br label %__nv_erff.exit25, !dbg !18 + +__nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %124 + %r.0.i21 = phi float [ %131, %124 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18 + %132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i26 = icmp eq i32 %132, 0, !dbg !18 + %133 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18 + %134 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18 + %.0.i27 = select i1 %.not.i26, float %134, float %133, !dbg !18 + %135 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18 + br i1 %135, label %__nv_fabsf.exit1.i44, label %137, !dbg !18 + +__nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25 + %136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i45 = icmp eq i32 %136, 0, !dbg !18 + %.01.i46 = select i1 %.not1.i45, float %134, float %133, !dbg !18 + br label %__internal_fmad.exit.i28, !dbg !18 + +137: ; preds = %__nv_erff.exit25 + %138 = fmul float %34, %34, !dbg !18 + br label %__internal_fmad.exit.i28, !dbg !18 + +__internal_fmad.exit.i28: ; preds = %137, %__nv_fabsf.exit1.i44 + %139 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %137 ], !dbg !18 + %140 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %137 ], !dbg !18 + %141 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %137 ], !dbg !18 + %142 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %137 ], !dbg !18 + %143 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %137 ], !dbg !18 + %144 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %137 ], !dbg !18 + %145 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %137 ], !dbg !18 + %146 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %138, %137 ], !dbg !18 + %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i29 = icmp eq i32 %147, 0, !dbg !18 + %148 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %145, float %146, float %144) #4, !dbg !18 + %149 = tail call float @llvm.nvvm.fma.rn.f(float %145, float %146, float %144) #4, !dbg !18 + %.02.i30 = select i1 %.not2.i29, float %149, float %148, !dbg !18 + %150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i31 = icmp eq i32 %150, 0, !dbg !18 + %151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %146, float %143) #4, !dbg !18 + %152 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %146, float %143) #4, !dbg !18 + %.03.i32 = select i1 %.not3.i31, float %152, float %151, !dbg !18 + %153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i33 = icmp eq i32 %153, 0, !dbg !18 + %154 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %146, float %142) #4, !dbg !18 + %155 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %146, float %142) #4, !dbg !18 + %.04.i34 = select i1 %.not4.i33, float %155, float %154, !dbg !18 + %156 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i35 = icmp eq i32 %156, 0, !dbg !18 + %157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %146, float %141) #4, !dbg !18 + %158 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %146, float %141) #4, !dbg !18 + %.05.i36 = select i1 %.not5.i35, float %158, float 
%157, !dbg !18 + %159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i37 = icmp eq i32 %159, 0, !dbg !18 + %160 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %146, float %140) #4, !dbg !18 + %161 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %146, float %140) #4, !dbg !18 + %.06.i38 = select i1 %.not6.i37, float %161, float %160, !dbg !18 + %162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i39 = icmp eq i32 %162, 0, !dbg !18 + %163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %146, float %139) #4, !dbg !18 + %164 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %146, float %139) #4, !dbg !18 + %.07.i40 = select i1 %.not7.i39, float %164, float %163, !dbg !18 + %165 = fneg float %146, !dbg !18 + %166 = select i1 %135, float %165, float %34, !dbg !18 + %167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i41 = icmp eq i32 %167, 0, !dbg !18 + %168 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %166, float %166) #4, !dbg !18 + %169 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %166, float %166) #4, !dbg !18 + %.08.i42 = select i1 %.not8.i41, float %169, float %168, !dbg !18 + br i1 %135, label %170, label %__nv_erff.exit47, !dbg !18 + +170: ; preds = %__internal_fmad.exit.i28 + %171 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18 + %172 = fsub float 1.000000e+00, %171, !dbg !18 + %173 = bitcast float %172 to i32, !dbg !18 + %174 = bitcast float %34 to i32, !dbg !18 + %175 = and i32 %174, -2147483648, !dbg !18 + %176 = or i32 %175, %173, !dbg !18 + %177 = bitcast i32 %176 to float, !dbg !18 + br label %__nv_erff.exit47, !dbg !18 + +__nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %170 + %r.0.i43 = phi float [ %177, %170 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18 + %178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i48 = icmp eq i32 %178, 0, !dbg !18 + %179 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18 + %180 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18 + %.0.i49 = select i1 %.not.i48, float %180, float %179, !dbg !18 + %181 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18 + br i1 %181, label %__nv_fabsf.exit1.i66, label %183, !dbg !18 + +__nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47 + %182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i67 = icmp eq i32 %182, 0, !dbg !18 + %.01.i68 = select i1 %.not1.i67, float %180, float %179, !dbg !18 + br label %__internal_fmad.exit.i50, !dbg !18 + +183: ; preds = %__nv_erff.exit47 + %184 = fmul float %35, %35, !dbg !18 + br label %__internal_fmad.exit.i50, !dbg !18 + +__internal_fmad.exit.i50: ; preds = %183, %__nv_fabsf.exit1.i66 + %185 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %183 ], !dbg !18 + %186 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %183 ], !dbg !18 + %187 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %183 ], !dbg !18 + %188 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %183 ], !dbg !18 + %189 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %183 ], !dbg !18 + %190 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %183 ], !dbg !18 + %191 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %183 ], !dbg !18 
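+; NOTE: the phi nodes in this block appear to merge two coefficient sets for a
+; degree-6 polynomial approximation of erf (inlined from __nv_erff below): one
+; set is taken on the |x| >= 0x3FF00C1FC0000000 (~1.0030) path, where the final
+; phi carries |x| itself, and the other on the small-|x| path, where it carries
+; x*x. The paired llvm.nvvm.fma.rn.ftz.f / llvm.nvvm.fma.rn.f chains that
+; follow evaluate the polynomial in Horner form, with __nvvm_reflect on
+; "__CUDA_FTZ" selecting the flush-to-zero variant of each step.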
+ %192 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %184, %183 ], !dbg !18 + %193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i51 = icmp eq i32 %193, 0, !dbg !18 + %194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %191, float %192, float %190) #4, !dbg !18 + %195 = tail call float @llvm.nvvm.fma.rn.f(float %191, float %192, float %190) #4, !dbg !18 + %.02.i52 = select i1 %.not2.i51, float %195, float %194, !dbg !18 + %196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i53 = icmp eq i32 %196, 0, !dbg !18 + %197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %192, float %189) #4, !dbg !18 + %198 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %192, float %189) #4, !dbg !18 + %.03.i54 = select i1 %.not3.i53, float %198, float %197, !dbg !18 + %199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i55 = icmp eq i32 %199, 0, !dbg !18 + %200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %192, float %188) #4, !dbg !18 + %201 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %192, float %188) #4, !dbg !18 + %.04.i56 = select i1 %.not4.i55, float %201, float %200, !dbg !18 + %202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i57 = icmp eq i32 %202, 0, !dbg !18 + %203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %192, float %187) #4, !dbg !18 + %204 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %192, float %187) #4, !dbg !18 + %.05.i58 = select i1 %.not5.i57, float %204, float %203, !dbg !18 + %205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i59 = icmp eq i32 %205, 0, !dbg !18 + %206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %192, float %186) #4, !dbg !18 + %207 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %192, float %186) #4, !dbg !18 + %.06.i60 = select i1 %.not6.i59, float %207, float %206, !dbg !18 + %208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i61 = icmp eq i32 %208, 0, !dbg !18 + %209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %192, float %185) #4, !dbg !18 + %210 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %192, float %185) #4, !dbg !18 + %.07.i62 = select i1 %.not7.i61, float %210, float %209, !dbg !18 + %211 = fneg float %192, !dbg !18 + %212 = select i1 %181, float %211, float %35, !dbg !18 + %213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i63 = icmp eq i32 %213, 0, !dbg !18 + %214 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %212, float %212) #4, !dbg !18 + %215 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %212, float %212) #4, !dbg !18 + %.08.i64 = select i1 %.not8.i63, float %215, float %214, !dbg !18 + br i1 %181, label %216, label %__nv_erff.exit69, !dbg !18 + +216: ; preds = %__internal_fmad.exit.i50 + %217 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18 + %218 = fsub float 1.000000e+00, %217, !dbg !18 + %219 = bitcast float %218 to i32, !dbg !18 + %220 = bitcast float %35 to i32, !dbg !18 + %221 = and i32 %220, -2147483648, !dbg !18 + %222 = or i32 %221, %219, !dbg !18 + %223 = bitcast i32 %222 to float, !dbg !18 + br label %__nv_erff.exit69, !dbg !18 + +__nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %216 + %r.0.i65 = phi float [ %223, %216 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18 + %224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) 
#4, !dbg !18 + %.not.i70 = icmp eq i32 %224, 0, !dbg !18 + %225 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18 + %226 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18 + %.0.i71 = select i1 %.not.i70, float %226, float %225, !dbg !18 + %227 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18 + br i1 %227, label %__nv_fabsf.exit1.i88, label %229, !dbg !18 + +__nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69 + %228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i89 = icmp eq i32 %228, 0, !dbg !18 + %.01.i90 = select i1 %.not1.i89, float %226, float %225, !dbg !18 + br label %__internal_fmad.exit.i72, !dbg !18 + +229: ; preds = %__nv_erff.exit69 + %230 = fmul float %36, %36, !dbg !18 + br label %__internal_fmad.exit.i72, !dbg !18 + +__internal_fmad.exit.i72: ; preds = %229, %__nv_fabsf.exit1.i88 + %231 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %229 ], !dbg !18 + %232 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %229 ], !dbg !18 + %233 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %229 ], !dbg !18 + %234 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %229 ], !dbg !18 + %235 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %229 ], !dbg !18 + %236 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %229 ], !dbg !18 + %237 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %229 ], !dbg !18 + %238 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %230, %229 ], !dbg !18 + %239 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i73 = icmp eq i32 %239, 0, !dbg !18 + %240 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %237, float %238, float %236) #4, !dbg !18 + %241 = tail call float @llvm.nvvm.fma.rn.f(float %237, float %238, float %236) #4, !dbg !18 + %.02.i74 = select i1 %.not2.i73, float %241, float %240, !dbg !18 + %242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i75 = icmp eq i32 %242, 0, !dbg !18 + %243 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %238, float %235) #4, !dbg !18 + %244 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %238, float %235) #4, !dbg !18 + %.03.i76 = select i1 %.not3.i75, float %244, float %243, !dbg !18 + %245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i77 = icmp eq i32 %245, 0, !dbg !18 + %246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %238, float %234) #4, !dbg !18 + %247 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %238, float %234) #4, !dbg !18 + %.04.i78 = select i1 %.not4.i77, float %247, float %246, !dbg !18 + %248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i79 = icmp eq i32 %248, 0, !dbg !18 + %249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %238, float %233) #4, !dbg !18 + %250 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %238, float %233) #4, !dbg !18 + %.05.i80 = select i1 %.not5.i79, float %250, float %249, !dbg !18 + %251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i81 = icmp eq i32 %251, 0, !dbg !18 + %252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %238, float %232) #4, !dbg !18 + %253 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %238, float %232) #4, !dbg !18 + %.06.i82 = select i1 %.not6.i81, 
float %253, float %252, !dbg !18 + %254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i83 = icmp eq i32 %254, 0, !dbg !18 + %255 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %238, float %231) #4, !dbg !18 + %256 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %238, float %231) #4, !dbg !18 + %.07.i84 = select i1 %.not7.i83, float %256, float %255, !dbg !18 + %257 = fneg float %238, !dbg !18 + %258 = select i1 %227, float %257, float %36, !dbg !18 + %259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i85 = icmp eq i32 %259, 0, !dbg !18 + %260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %258, float %258) #4, !dbg !18 + %261 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %258, float %258) #4, !dbg !18 + %.08.i86 = select i1 %.not8.i85, float %261, float %260, !dbg !18 + br i1 %227, label %262, label %__nv_erff.exit91, !dbg !18 + +262: ; preds = %__internal_fmad.exit.i72 + %263 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18 + %264 = fsub float 1.000000e+00, %263, !dbg !18 + %265 = bitcast float %264 to i32, !dbg !18 + %266 = bitcast float %36 to i32, !dbg !18 + %267 = and i32 %266, -2147483648, !dbg !18 + %268 = or i32 %267, %265, !dbg !18 + %269 = bitcast i32 %268 to float, !dbg !18 + br label %__nv_erff.exit91, !dbg !18 + +__nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %262 + %r.0.i87 = phi float [ %269, %262 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18 + %270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i92 = icmp eq i32 %270, 0, !dbg !18 + %271 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18 + %272 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18 + %.0.i93 = select i1 %.not.i92, float %272, float %271, !dbg !18 + %273 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18 + br i1 %273, label %__nv_fabsf.exit1.i110, label %275, !dbg !18 + +__nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91 + %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i111 = icmp eq i32 %274, 0, !dbg !18 + %.01.i112 = select i1 %.not1.i111, float %272, float %271, !dbg !18 + br label %__internal_fmad.exit.i94, !dbg !18 + +275: ; preds = %__nv_erff.exit91 + %276 = fmul float %37, %37, !dbg !18 + br label %__internal_fmad.exit.i94, !dbg !18 + +__internal_fmad.exit.i94: ; preds = %275, %__nv_fabsf.exit1.i110 + %277 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %275 ], !dbg !18 + %278 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %275 ], !dbg !18 + %279 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %275 ], !dbg !18 + %280 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %275 ], !dbg !18 + %281 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %275 ], !dbg !18 + %282 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %275 ], !dbg !18 + %283 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %275 ], !dbg !18 + %284 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %276, %275 ], !dbg !18 + %285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i95 = icmp eq i32 %285, 0, !dbg !18 + %286 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %283, float %284, float %282) #4, !dbg !18 + %287 = tail call float @llvm.nvvm.fma.rn.f(float %283, 
float %284, float %282) #4, !dbg !18 + %.02.i96 = select i1 %.not2.i95, float %287, float %286, !dbg !18 + %288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i97 = icmp eq i32 %288, 0, !dbg !18 + %289 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %284, float %281) #4, !dbg !18 + %290 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %284, float %281) #4, !dbg !18 + %.03.i98 = select i1 %.not3.i97, float %290, float %289, !dbg !18 + %291 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i99 = icmp eq i32 %291, 0, !dbg !18 + %292 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %284, float %280) #4, !dbg !18 + %293 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %284, float %280) #4, !dbg !18 + %.04.i100 = select i1 %.not4.i99, float %293, float %292, !dbg !18 + %294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i101 = icmp eq i32 %294, 0, !dbg !18 + %295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %284, float %279) #4, !dbg !18 + %296 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %284, float %279) #4, !dbg !18 + %.05.i102 = select i1 %.not5.i101, float %296, float %295, !dbg !18 + %297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i103 = icmp eq i32 %297, 0, !dbg !18 + %298 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %284, float %278) #4, !dbg !18 + %299 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %284, float %278) #4, !dbg !18 + %.06.i104 = select i1 %.not6.i103, float %299, float %298, !dbg !18 + %300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i105 = icmp eq i32 %300, 0, !dbg !18 + %301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %284, float %277) #4, !dbg !18 + %302 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %284, float %277) #4, !dbg !18 + %.07.i106 = select i1 %.not7.i105, float %302, float %301, !dbg !18 + %303 = fneg float %284, !dbg !18 + %304 = select i1 %273, float %303, float %37, !dbg !18 + %305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i107 = icmp eq i32 %305, 0, !dbg !18 + %306 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %304, float %304) #4, !dbg !18 + %307 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %304, float %304) #4, !dbg !18 + %.08.i108 = select i1 %.not8.i107, float %307, float %306, !dbg !18 + br i1 %273, label %308, label %__nv_erff.exit113, !dbg !18 + +308: ; preds = %__internal_fmad.exit.i94 + %309 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18 + %310 = fsub float 1.000000e+00, %309, !dbg !18 + %311 = bitcast float %310 to i32, !dbg !18 + %312 = bitcast float %37 to i32, !dbg !18 + %313 = and i32 %312, -2147483648, !dbg !18 + %314 = or i32 %313, %311, !dbg !18 + %315 = bitcast i32 %314 to float, !dbg !18 + br label %__nv_erff.exit113, !dbg !18 + +__nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %308 + %r.0.i109 = phi float [ %315, %308 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18 + %316 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i114 = icmp eq i32 %316, 0, !dbg !18 + %317 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18 + %318 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18 + %.0.i115 = select i1 %.not.i114, float %318, float %317, !dbg !18 + %319 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18 
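+; NOTE: %319 repeats the |x| >= ~1.0030 test for another element of this
+; unrolled 8-wide bf16 tile (inputs are pre-scaled by 0x3FE6A09E60000000,
+; i.e. ~1/sqrt(2), consistent with a GELU of the form 0.5*x*(1+erf(x/sqrt(2)))).
+; On the large-|x| path the inlined __nv_erff finishes as sign(x)*(1 - 2^p):
+; llvm.nvvm.ex2.approx.ftz.f computes 2^p, and the and/or against
+; -2147483648 splices the sign bit of the original x back into the result.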
+ br i1 %319, label %__nv_fabsf.exit1.i132, label %321, !dbg !18 + +__nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113 + %320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i133 = icmp eq i32 %320, 0, !dbg !18 + %.01.i134 = select i1 %.not1.i133, float %318, float %317, !dbg !18 + br label %__internal_fmad.exit.i116, !dbg !18 + +321: ; preds = %__nv_erff.exit113 + %322 = fmul float %38, %38, !dbg !18 + br label %__internal_fmad.exit.i116, !dbg !18 + +__internal_fmad.exit.i116: ; preds = %321, %__nv_fabsf.exit1.i132 + %323 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %321 ], !dbg !18 + %324 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %321 ], !dbg !18 + %325 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %321 ], !dbg !18 + %326 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %321 ], !dbg !18 + %327 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %321 ], !dbg !18 + %328 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %321 ], !dbg !18 + %329 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %321 ], !dbg !18 + %330 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %322, %321 ], !dbg !18 + %331 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i117 = icmp eq i32 %331, 0, !dbg !18 + %332 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %329, float %330, float %328) #4, !dbg !18 + %333 = tail call float @llvm.nvvm.fma.rn.f(float %329, float %330, float %328) #4, !dbg !18 + %.02.i118 = select i1 %.not2.i117, float %333, float %332, !dbg !18 + %334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i119 = icmp eq i32 %334, 0, !dbg !18 + %335 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %330, float %327) #4, !dbg !18 + %336 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %330, float %327) #4, !dbg !18 + %.03.i120 = select i1 %.not3.i119, float %336, float %335, !dbg !18 + %337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i121 = icmp eq i32 %337, 0, !dbg !18 + %338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %330, float %326) #4, !dbg !18 + %339 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %330, float %326) #4, !dbg !18 + %.04.i122 = select i1 %.not4.i121, float %339, float %338, !dbg !18 + %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i123 = icmp eq i32 %340, 0, !dbg !18 + %341 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %330, float %325) #4, !dbg !18 + %342 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %330, float %325) #4, !dbg !18 + %.05.i124 = select i1 %.not5.i123, float %342, float %341, !dbg !18 + %343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i125 = icmp eq i32 %343, 0, !dbg !18 + %344 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %330, float %324) #4, !dbg !18 + %345 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %330, float %324) #4, !dbg !18 + %.06.i126 = select i1 %.not6.i125, float %345, float %344, !dbg !18 + %346 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i127 = icmp eq i32 %346, 0, !dbg !18 + %347 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %330, float %323) #4, !dbg !18 + %348 = tail call float 
@llvm.nvvm.fma.rn.f(float %.06.i126, float %330, float %323) #4, !dbg !18 + %.07.i128 = select i1 %.not7.i127, float %348, float %347, !dbg !18 + %349 = fneg float %330, !dbg !18 + %350 = select i1 %319, float %349, float %38, !dbg !18 + %351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i129 = icmp eq i32 %351, 0, !dbg !18 + %352 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %350, float %350) #4, !dbg !18 + %353 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %350, float %350) #4, !dbg !18 + %.08.i130 = select i1 %.not8.i129, float %353, float %352, !dbg !18 + br i1 %319, label %354, label %__nv_erff.exit135, !dbg !18 + +354: ; preds = %__internal_fmad.exit.i116 + %355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18 + %356 = fsub float 1.000000e+00, %355, !dbg !18 + %357 = bitcast float %356 to i32, !dbg !18 + %358 = bitcast float %38 to i32, !dbg !18 + %359 = and i32 %358, -2147483648, !dbg !18 + %360 = or i32 %359, %357, !dbg !18 + %361 = bitcast i32 %360 to float, !dbg !18 + br label %__nv_erff.exit135, !dbg !18 + +__nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %354 + %r.0.i131 = phi float [ %361, %354 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18 + %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i136 = icmp eq i32 %362, 0, !dbg !18 + %363 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18 + %364 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18 + %.0.i137 = select i1 %.not.i136, float %364, float %363, !dbg !18 + %365 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18 + br i1 %365, label %__nv_fabsf.exit1.i154, label %367, !dbg !18 + +__nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135 + %366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i155 = icmp eq i32 %366, 0, !dbg !18 + %.01.i156 = select i1 %.not1.i155, float %364, float %363, !dbg !18 + br label %__internal_fmad.exit.i138, !dbg !18 + +367: ; preds = %__nv_erff.exit135 + %368 = fmul float %39, %39, !dbg !18 + br label %__internal_fmad.exit.i138, !dbg !18 + +__internal_fmad.exit.i138: ; preds = %367, %__nv_fabsf.exit1.i154 + %369 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %367 ], !dbg !18 + %370 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %367 ], !dbg !18 + %371 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %367 ], !dbg !18 + %372 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %367 ], !dbg !18 + %373 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %367 ], !dbg !18 + %374 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %367 ], !dbg !18 + %375 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %367 ], !dbg !18 + %376 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %368, %367 ], !dbg !18 + %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i139 = icmp eq i32 %377, 0, !dbg !18 + %378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float %376, float %374) #4, !dbg !18 + %379 = tail call float @llvm.nvvm.fma.rn.f(float %375, float %376, float %374) #4, !dbg !18 + %.02.i140 = select i1 %.not2.i139, float %379, float %378, !dbg !18 + %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i141 = icmp eq i32 %380, 0, !dbg !18 + %381 = tail call float 
@llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %376, float %373) #4, !dbg !18 + %382 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %376, float %373) #4, !dbg !18 + %.03.i142 = select i1 %.not3.i141, float %382, float %381, !dbg !18 + %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i143 = icmp eq i32 %383, 0, !dbg !18 + %384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %376, float %372) #4, !dbg !18 + %385 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %376, float %372) #4, !dbg !18 + %.04.i144 = select i1 %.not4.i143, float %385, float %384, !dbg !18 + %386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i145 = icmp eq i32 %386, 0, !dbg !18 + %387 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %376, float %371) #4, !dbg !18 + %388 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %376, float %371) #4, !dbg !18 + %.05.i146 = select i1 %.not5.i145, float %388, float %387, !dbg !18 + %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i147 = icmp eq i32 %389, 0, !dbg !18 + %390 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %376, float %370) #4, !dbg !18 + %391 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %376, float %370) #4, !dbg !18 + %.06.i148 = select i1 %.not6.i147, float %391, float %390, !dbg !18 + %392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i149 = icmp eq i32 %392, 0, !dbg !18 + %393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %376, float %369) #4, !dbg !18 + %394 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %376, float %369) #4, !dbg !18 + %.07.i150 = select i1 %.not7.i149, float %394, float %393, !dbg !18 + %395 = fneg float %376, !dbg !18 + %396 = select i1 %365, float %395, float %39, !dbg !18 + %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i151 = icmp eq i32 %397, 0, !dbg !18 + %398 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %396, float %396) #4, !dbg !18 + %399 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %396, float %396) #4, !dbg !18 + %.08.i152 = select i1 %.not8.i151, float %399, float %398, !dbg !18 + br i1 %365, label %400, label %__nv_erff.exit157, !dbg !18 + +400: ; preds = %__internal_fmad.exit.i138 + %401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18 + %402 = fsub float 1.000000e+00, %401, !dbg !18 + %403 = bitcast float %402 to i32, !dbg !18 + %404 = bitcast float %39 to i32, !dbg !18 + %405 = and i32 %404, -2147483648, !dbg !18 + %406 = or i32 %405, %403, !dbg !18 + %407 = bitcast i32 %406 to float, !dbg !18 + br label %__nv_erff.exit157, !dbg !18 + +__nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %400 + %r.0.i153 = phi float [ %407, %400 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18 + %408 = fmul float %31, 5.000000e-01, !dbg !19 + %409 = fmul float %30, 5.000000e-01, !dbg !19 + %410 = fmul float %29, 5.000000e-01, !dbg !19 + %411 = fmul float %28, 5.000000e-01, !dbg !19 + %412 = fmul float %27, 5.000000e-01, !dbg !19 + %413 = fmul float %26, 5.000000e-01, !dbg !19 + %414 = fmul float %25, 5.000000e-01, !dbg !19 + %415 = fmul float %24, 5.000000e-01, !dbg !19 + %416 = fadd float %r.0.i, 1.000000e+00, !dbg !20 + %417 = fadd float %r.0.i21, 1.000000e+00, !dbg !20 + %418 = fadd float %r.0.i43, 1.000000e+00, !dbg !20 + %419 = fadd float %r.0.i65, 1.000000e+00, !dbg !20 + %420 = fadd float 
%r.0.i87, 1.000000e+00, !dbg !20 + %421 = fadd float %r.0.i109, 1.000000e+00, !dbg !20 + %422 = fadd float %r.0.i131, 1.000000e+00, !dbg !20 + %423 = fadd float %r.0.i153, 1.000000e+00, !dbg !20 + %424 = fmul float %415, %416, !dbg !21 + %425 = fmul float %414, %417, !dbg !21 + %426 = fmul float %413, %418, !dbg !21 + %427 = fmul float %412, %419, !dbg !21 + %428 = fmul float %411, %420, !dbg !21 + %429 = fmul float %410, %421, !dbg !21 + %430 = fmul float %409, %422, !dbg !21 + %431 = fmul float %408, %423, !dbg !21 + %432 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %424) #4, !dbg !22 + %433 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !22 + %434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !22 + %435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !22 + %436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !22 + %437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !22 + %438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !22 + %439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !22 + %440 = insertelement <2 x i16> undef, i16 %432, i64 0, !dbg !22 + %441 = insertelement <2 x i16> %440, i16 %433, i64 1, !dbg !22 + %442 = bitcast <2 x i16> %441 to i32, !dbg !22 + %443 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !22 + %444 = insertelement <2 x i16> %443, i16 %435, i64 1, !dbg !22 + %445 = bitcast <2 x i16> %444 to i32, !dbg !22 + %446 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !22 + %447 = insertelement <2 x i16> %446, i16 %437, i64 1, !dbg !22 + %448 = bitcast <2 x i16> %447 to i32, !dbg !22 + %449 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !22 + %450 = insertelement <2 x i16> %449, i16 %439, i64 1, !dbg !22 + %451 = bitcast <2 x i16> %450 to i32, !dbg !22 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %442, i32 %445, i32 %448, i32 %451, ptr addrspace(1) %10, i1 true) #4, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: alwaysinline nounwind +define float @__nv_erff(float %a) local_unnamed_addr #1 { +__nv_fabsf.exit: + %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not = icmp eq i32 %0, 0 + %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4 + %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4 + %.0 = select i1 %.not, float %2, float %1 + %3 = fcmp oge float %.0, 0x3FF00C1FC0000000 + br i1 %3, label %__nv_fabsf.exit1, label %5 + +__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit + %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not1 = icmp eq i32 %4, 0 + %.01 = select i1 %.not1, float %2, float %1 + br label %__internal_fmad.exit + +5: ; preds = %__nv_fabsf.exit + %6 = fmul float %a, %a + br label %__internal_fmad.exit + +__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1 + %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ] + %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ] + %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ] + %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ] + %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, 
%5 ] + %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ] + %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ] + %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ] + %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not2 = icmp eq i32 %15, 0 + %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4 + %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4 + %.02 = select i1 %.not2, float %17, float %16 + %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not3 = icmp eq i32 %18, 0 + %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4 + %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4 + %.03 = select i1 %.not3, float %20, float %19 + %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not4 = icmp eq i32 %21, 0 + %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4 + %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4 + %.04 = select i1 %.not4, float %23, float %22 + %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not5 = icmp eq i32 %24, 0 + %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4 + %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4 + %.05 = select i1 %.not5, float %26, float %25 + %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not6 = icmp eq i32 %27, 0 + %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4 + %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4 + %.06 = select i1 %.not6, float %29, float %28 + %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not7 = icmp eq i32 %30, 0 + %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4 + %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4 + %.07 = select i1 %.not7, float %32, float %31 + %33 = fneg float %14 + %34 = select i1 %3, float %33, float %a + %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not8 = icmp eq i32 %35, 0 + %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4 + %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4 + %.08 = select i1 %.not8, float %37, float %36 + br i1 %3, label %38, label %46 + +38: ; preds = %__internal_fmad.exit + %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4 + %40 = fsub float 1.000000e+00, %39 + %41 = bitcast float %40 to i32 + %42 = bitcast float %a to i32 + %43 = and i32 %42, -2147483648 + %44 = or i32 %43, %41 + %45 = bitcast i32 %44 to float + br label %46 + +46: ; preds = %38, %__internal_fmad.exit + %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ] + ret float %r.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fabs.ftz.f(float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fabs.f(float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float 
@llvm.nvvm.fma.rn.f(float, float, float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cafucwnmq4o436kwzkmrinerrnocxll7q6wsadcl726g6cradipo.py", directory: "/tmp/torchinductor_root/af") +!4 = !{ptr @triton__0d1de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1de, !"maxntidx", i32 128} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 21, column: 36, scope: !7) +!11 = !DILocation(line: 20, column: 28, scope: !7) +!12 = !DILocation(line: 20, column: 33, scope: !7) +!13 = !DILocation(line: 21, column: 23, scope: !7) +!14 = !DILocation(line: 24, column: 34, scope: !7) +!15 = !DILocation(line: 24, column: 39, scope: !7) +!16 = !DILocation(line: 24, column: 48, scope: !7) +!17 = !DILocation(line: 29, column: 18, scope: !7) +!18 = !DILocation(line: 30, column: 23, scope: !7) +!19 = !DILocation(line: 27, column: 18, scope: !7) +!20 = !DILocation(line: 32, column: 18, scope: !7) +!21 = !DILocation(line: 33, column: 18, scope: !7) +!22 = !DILocation(line: 35, column: 40, scope: !7) +!23 = !DILocation(line: 35, column: 4, scope: !7) diff --git a/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d2a9a5cb9370160003226fec8c61be7abfe7c35e --- /dev/null +++ b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr<bf16>) -> tensor<1024x!tt.ptr<bf16>> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1
: i32, isVolatile = false} : tensor<1024xbf16> + %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32> + %9 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<1024x!tt.ptr<f32>> + %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32> + tt.return + } +} diff --git a/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.llir b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..1b0de64a6a0b90e4dfdc051bb5551e0c485d158b --- /dev/null +++ b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.llir @@ -0,0 +1,330 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %9 = lshr i32 %8, 2, !dbg !10 + %10 = and i32 %9, 63, !dbg !10 + %11 = and i32 %8, 63, !dbg !10 + %12 = and i32 %8, 3, !dbg !11 + %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !12 + %14 = shl i32 %13, 6, !dbg !13 + %15 = or i32 %14, %10, !dbg !14 + %16 = or i32 %14, %11, !dbg !14 + %17 = sext i32 %15 to i64, !dbg !15 + %18 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !15 + %19 = sext i32 %16 to i64, !dbg !15 + %20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %18, i1 true) #5, !dbg !16 + %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #5, !dbg !16 + %23 = srem i32 %15, 512, !dbg !17 + %24 = shl nsw i32 %23, 8, !dbg !18 + %25 = add i64 %22, 50257, !dbg !19 + %26 = icmp slt i64 %21, 0, !dbg !20 + %27 = icmp slt i64 %22, 0, !dbg !20 + %28 = select i1 %27, i64 %25, i64 %22, !dbg !21 + %.fr8 = freeze i64 %28, !dbg !22 + %29 = icmp ugt i64 %.fr8, 50256, !dbg !22 + %30 = shl i64 %21, 8, !dbg !23 + %31 = add i64 %30, 12865792, !dbg !23 + %32 = select i1 %26, i64 %31, i64 %30, !dbg !23 + %33 = getelementptr float, ptr addrspace(1) %1, i64 %32 + br i1 %29, label %.split.us, label %.split, !dbg !24 + +.split.us: ; preds = %7, %.split.us + %34 = phi float [ %50, %.split.us ], [ 0.000000e+00, %7 ] + %35 = phi float [ %55, %.split.us ], [ 0.000000e+00, %7 ] + %36 = phi float [ %52, %.split.us ], [ 0.000000e+00, %7 ] + %37 = phi i32 [ %56, %.split.us ], [ 0, %7 ] + %38 = or i32 %37, %12, !dbg !25 + %39 = add i32 %38, %24, !dbg !26 + %40 = sext i32 %39 to i64, !dbg !27 + %41 = getelementptr float, ptr addrspace(1) %2, i64 %40, !dbg !27 + %42 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;",
"=r,l,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true) #5, !dbg !28 + %43 = bitcast i32 %42 to float, !dbg !28 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !24 + %44 = zext nneg i32 %38 to i64, !dbg !29 + %45 = getelementptr float, ptr addrspace(1) %33, i64 %44, !dbg !30 + %46 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %45, i1 true, i32 0, i1 true) #5, !dbg !31 + %47 = bitcast i32 %46 to float, !dbg !31 + %48 = fadd float %43, %47, !dbg !32 + %49 = fsub float %48, %36, !dbg !33 + %50 = fadd float %34, 1.000000e+00, !dbg !37 + %51 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %49, float %50) #5, !dbg !38 + %52 = fadd float %36, %51, !dbg !39 + %53 = fsub float %48, %52, !dbg !40 + %54 = fmul float %49, %53, !dbg !41 + %55 = fadd float %35, %54, !dbg !42 + %56 = add nuw nsw i32 %37, 4, !dbg !43 + %57 = icmp ult i32 %37, 252, !dbg !43 + br i1 %57, label %.split.us, label %.split5.us, !dbg !43 + +.split: ; preds = %7, %.split + %58 = phi float [ %74, %.split ], [ 0.000000e+00, %7 ] + %59 = phi float [ %79, %.split ], [ 0.000000e+00, %7 ] + %60 = phi float [ %76, %.split ], [ 0.000000e+00, %7 ] + %61 = phi i32 [ %80, %.split ], [ 0, %7 ] + %62 = or i32 %61, %12, !dbg !25 + %63 = add i32 %62, %24, !dbg !26 + %64 = sext i32 %63 to i64, !dbg !27 + %65 = getelementptr float, ptr addrspace(1) %2, i64 %64, !dbg !27 + %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true) #5, !dbg !28 + %67 = bitcast i32 %66 to float, !dbg !28 + %68 = zext nneg i32 %62 to i64, !dbg !29 + %69 = getelementptr float, ptr addrspace(1) %33, i64 %68, !dbg !30 + %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %69, i1 true, i32 0, i1 true) #5, !dbg !31 + %71 = bitcast i32 %70 to float, !dbg !31 + %72 = fadd float %67, %71, !dbg !32 + %73 = fsub float %72, %60, !dbg !33 + %74 = fadd float %58, 1.000000e+00, !dbg !37 + %75 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %73, float %74) #5, !dbg !38 + %76 = fadd float %60, %75, !dbg !39 + %77 = fsub float %72, %76, !dbg !40 + %78 = fmul float %73, %77, !dbg !41 + %79 = fadd float %59, %78, !dbg !42 + %80 = add nuw nsw i32 %61, 4, !dbg !43 + %81 = icmp ult i32 %61, 252, !dbg !43 + br i1 %81, label %.split, label %.split5.us, !dbg !43 + +.split5.us: ; preds = %.split, %.split.us + %.us-phi = phi float [ %52, %.split.us ], [ %76, %.split ] + %.us-phi6 = phi float [ %55, %.split.us ], [ %79, %.split ] + %.us-phi7 = phi float [ %50, %.split.us ], [ %74, %.split ] + %82 = bitcast float %.us-phi to i32, !dbg !44 + %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !44 + %84 = bitcast i32 %83 to float, !dbg !44 + %85 = bitcast float %.us-phi6 to i32, !dbg !44 + %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 2, i32 31), !dbg !44 + %87 = bitcast i32 %86 to float, !dbg !44 + %88 = bitcast float %.us-phi7 to i32, !dbg !44 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !44 + %90 = bitcast i32 %89 to float, !dbg !44 + %91 = fsub float %84, %.us-phi, !dbg !46 + %92 = fadd float 
%.us-phi7, %90, !dbg !50 + %93 = fcmp oeq float %92, 0.000000e+00, !dbg !51 + %94 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %90, float %92) #5, !dbg !52 + %95 = select i1 %93, float 0.000000e+00, float %94, !dbg !53 + %96 = fmul float %91, %95, !dbg !54 + %97 = fadd float %.us-phi, %96, !dbg !55 + %98 = fadd float %.us-phi6, %87, !dbg !56 + %99 = fmul float %91, %91, !dbg !57 + %100 = fmul float %.us-phi7, %99, !dbg !58 + %101 = fmul float %100, %95, !dbg !59 + %102 = fadd float %98, %101, !dbg !60 + %103 = bitcast float %97 to i32, !dbg !44 + %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 1, i32 31), !dbg !44 + %105 = bitcast i32 %104 to float, !dbg !44 + %106 = bitcast float %102 to i32, !dbg !44 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !44 + %108 = bitcast i32 %107 to float, !dbg !44 + %109 = bitcast float %92 to i32, !dbg !44 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 1, i32 31), !dbg !44 + %111 = bitcast i32 %110 to float, !dbg !44 + %112 = fsub float %105, %97, !dbg !46 + %113 = fadd float %92, %111, !dbg !50 + %114 = fcmp oeq float %113, 0.000000e+00, !dbg !51 + %115 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %111, float %113) #5, !dbg !52 + %116 = select i1 %114, float 0.000000e+00, float %115, !dbg !53 + %117 = fmul float %112, %116, !dbg !54 + %118 = fadd float %97, %117, !dbg !55 + %119 = fadd float %102, %108, !dbg !56 + %120 = fmul float %112, %112, !dbg !57 + %121 = fmul float %92, %120, !dbg !58 + %122 = fmul float %116, %121, !dbg !59 + %123 = fadd float %119, %122, !dbg !60 + %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %123, float 2.560000e+02) #5, !dbg !61 + %125 = fadd float %124, 0x3EE4F8B580000000, !dbg !62 + %126 = shl i32 %15, 8, !dbg !63 + br label %127, !dbg !64 + +127: ; preds = %.split5.us, %__nv_rsqrtf.exit + %128 = phi i32 [ 0, %.split5.us ], [ %157, %__nv_rsqrtf.exit ] + %129 = or i32 %128, %12, !dbg !65 + %130 = add i32 %129, %24, !dbg !66 + %131 = sext i32 %130 to i64, !dbg !67 + %132 = getelementptr float, ptr addrspace(1) %2, i64 %131, !dbg !67 + %133 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %132, i1 true, i32 0, i1 true) #5, !dbg !68 + %134 = bitcast i32 %133 to float, !dbg !68 + %135 = zext nneg i32 %129 to i64, !dbg !69 + %136 = getelementptr float, ptr addrspace(1) %3, i64 %135, !dbg !69 + %137 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %136, i1 true, i32 0, i1 true) #5, !dbg !70 + %138 = bitcast i32 %137 to float, !dbg !70 + br i1 %29, label %139, label %140, !dbg !71 + +139: ; preds = %127 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71 + br label %140, !dbg !71 + +140: ; preds = %139, %127 + %141 = getelementptr float, ptr addrspace(1) %33, i64 %135, !dbg !72 + %142 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true) #5, !dbg !73 + %143 = bitcast i32 %142 to float, !dbg !73 + %144 = fadd float %134, %143, !dbg !74 + %145 = fsub float %144, %118, !dbg !75 + %146 = tail call i32 @__nvvm_reflect(ptr 
nonnull @.str) #5, !dbg !76 + %.not.i = icmp eq i32 %146, 0, !dbg !76 + br i1 %.not.i, label %149, label %147, !dbg !76 + +147: ; preds = %140 + %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %125), !dbg !76 + br label %__nv_rsqrtf.exit, !dbg !76 + +149: ; preds = %140 + %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %125), !dbg !76 + br label %__nv_rsqrtf.exit, !dbg !76 + +__nv_rsqrtf.exit: ; preds = %147, %149 + %.0.i = phi float [ %148, %147 ], [ %150, %149 ], !dbg !76 + %151 = fmul float %145, %.0.i, !dbg !77 + %152 = fmul float %151, %138, !dbg !78 + %153 = add i32 %129, %126, !dbg !79 + %154 = sext i32 %153 to i64, !dbg !80 + %155 = getelementptr i16, ptr addrspace(1) %4, i64 %154, !dbg !80 + %156 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %152) #5, !dbg !81 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %156, ptr addrspace(1) %155, i1 true) #5, !dbg !81 + %157 = add nuw nsw i32 %128, 4, !dbg !64 + %158 = icmp ult i32 %128, 252, !dbg !64 + br i1 %158, label %127, label %159, !dbg !64 + +159: ; preds = %__nv_rsqrtf.exit + ret void, !dbg !82 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #2 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #4 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: 
"/tmp/torchinductor_root/lh") +!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 21, column: 28, scope: !7) +!13 = !DILocation(line: 21, column: 33, scope: !7) +!14 = !DILocation(line: 22, column: 23, scope: !7) +!15 = !DILocation(line: 26, column: 30, scope: !7) +!16 = !DILocation(line: 26, column: 35, scope: !7) +!17 = !DILocation(line: 27, column: 18, scope: !7) +!18 = !DILocation(line: 35, column: 44, scope: !7) +!19 = !DILocation(line: 36, column: 22, scope: !7) +!20 = !DILocation(line: 37, column: 22, scope: !7) +!21 = !DILocation(line: 38, column: 36, scope: !7) +!22 = !DILocation(line: 39, column: 40, scope: !7) +!23 = !DILocation(line: 40, column: 44, scope: !7) +!24 = !DILocation(line: 39, column: 55, scope: !7) +!25 = !DILocation(line: 32, column: 27, scope: !7) +!26 = !DILocation(line: 35, column: 40, scope: !7) +!27 = !DILocation(line: 35, column: 34, scope: !7) +!28 = !DILocation(line: 35, column: 50, scope: !7) +!29 = !DILocation(line: 40, column: 40, scope: !7) +!30 = !DILocation(line: 40, column: 34, scope: !7) +!31 = !DILocation(line: 40, column: 52, scope: !7) +!32 = !DILocation(line: 41, column: 22, scope: !7) +!33 = !DILocation(line: 96, column: 20, scope: !34, inlinedAt: !36) +!34 = distinct !DILexicalBlockFile(scope: !7, file: !35, discriminator: 0) +!35 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!36 = !DILocation(line: 44, column: 38, scope: !34) +!37 = !DILocation(line: 97, column: 26, scope: !34, inlinedAt: !36) +!38 = !DILocation(line: 98, column: 30, scope: !34, inlinedAt: !36) +!39 = !DILocation(line: 98, column: 22, scope: !34, inlinedAt: !36) +!40 = !DILocation(line: 101, column: 30, scope: !34, inlinedAt: !36) +!41 = !DILocation(line: 101, column: 22, scope: !34, inlinedAt: !36) +!42 = !DILocation(line: 47, column: 48, scope: !7) +!43 = !DILocation(line: 31, column: 36, scope: !7) +!44 = !DILocation(line: 120, column: 46, scope: !34, inlinedAt: !45) +!45 = !DILocation(line: 50, column: 41, scope: !34) +!46 = !DILocation(line: 108, column: 21, scope: !47, inlinedAt: !48) +!47 = distinct !DILexicalBlockFile(scope: !34, file: !35, discriminator: 0) +!48 = !DILocation(line: 120, column: 46, scope: !47, inlinedAt: !49) +!49 = !DILocation(line: 50, column: 41, scope: !47) +!50 = !DILocation(line: 109, column: 28, scope: !47, inlinedAt: !48) +!51 = !DILocation(line: 110, column: 39, scope: !47, inlinedAt: !48) +!52 = !DILocation(line: 110, column: 60, scope: !47, inlinedAt: !48) +!53 = !DILocation(line: 110, column: 49, scope: !47, inlinedAt: !48) +!54 = !DILocation(line: 112, column: 25, scope: !47, inlinedAt: !48) +!55 = !DILocation(line: 112, column: 17, scope: !47, inlinedAt: !48) +!56 = !DILocation(line: 113, column: 15, scope: !47, inlinedAt: !48) +!57 = !DILocation(line: 113, column: 30, scope: !47, inlinedAt: !48) +!58 = !DILocation(line: 113, column: 38, scope: !47, inlinedAt: !48) +!59 = !DILocation(line: 113, column: 49, scope: !47, inlinedAt: !48) +!60 = 
!DILocation(line: 113, column: 22, scope: !47, inlinedAt: !48) +!61 = !DILocation(line: 69, column: 23, scope: !7) +!62 = !DILocation(line: 71, column: 24, scope: !7) +!63 = !DILocation(line: 76, column: 39, scope: !7) +!64 = !DILocation(line: 55, column: 36, scope: !7) +!65 = !DILocation(line: 56, column: 27, scope: !7) +!66 = !DILocation(line: 59, column: 41, scope: !7) +!67 = !DILocation(line: 59, column: 35, scope: !7) +!68 = !DILocation(line: 59, column: 51, scope: !7) +!69 = !DILocation(line: 60, column: 35, scope: !7) +!70 = !DILocation(line: 60, column: 40, scope: !7) +!71 = !DILocation(line: 64, column: 57, scope: !7) +!72 = !DILocation(line: 65, column: 35, scope: !7) +!73 = !DILocation(line: 65, column: 54, scope: !7) +!74 = !DILocation(line: 66, column: 24, scope: !7) +!75 = !DILocation(line: 67, column: 24, scope: !7) +!76 = !DILocation(line: 72, column: 30, scope: !7) +!77 = !DILocation(line: 73, column: 24, scope: !7) +!78 = !DILocation(line: 74, column: 24, scope: !7) +!79 = !DILocation(line: 76, column: 35, scope: !7) +!80 = !DILocation(line: 76, column: 29, scope: !7) +!81 = !DILocation(line: 76, column: 52, scope: !7) +!82 = !DILocation(line: 55, column: 4, scope: !7) diff --git a/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f840730c2d0422b8caf5e7d6bba7045517cc6717 --- /dev/null +++ b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx @@ -0,0 +1,756 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 
triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<27>; + .reg .b16 %rs<3>; + .reg .b32 %r<81>; + .reg .f32 %f<73>; + .reg .b64 %rd<84>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd35, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd34, [triton__0d1d2d3d4d5de6de_param_2]; + ld.param.u64 %rd33, [triton__0d1d2d3d4d5de6de_param_1]; + ld.param.u64 %rd41, [triton__0d1d2d3d4d5de6de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + bfe.u32 %r2, %r1, 2, 6; + and.b32 %r14, %r1, 63; + .loc 1 24 33 + and.b32 %r3, %r1, 3; + .loc 1 21 28 + mov.u32 %r13, %ctaid.x; + .loc 1 21 33 + shl.b32 %r15, %r13, 6; + .loc 1 22 23 + or.b32 %r16, %r15, %r2; + or.b32 %r17, %r15, %r14; + .loc 1 26 30 + mul.wide.s32 %rd42, %r16, 8; + add.s64 %rd38, %rd41, %rd42; + mul.wide.s32 %rd43, %r17, 8; + add.s64 %rd40, %rd41, %rd43; + mov.pred %p11, -1; + .loc 1 26 35 + mov.u64 %rd37, 0x0; + @%p11 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd38 + 0 ]; + mov.u64 %rd39, 0x0; + @%p11 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ]; + .loc 1 27 18 + bfe.s32 %r18, %r13, 25, 1; + shr.u32 %r19, %r18, 23; + add.s32 %r20, %r16, %r19; + and.b32 %r21, %r20, 16776704; + sub.s32 %r22, %r16, %r21; + .loc 1 35 44 + shl.b32 %r5, %r22, 8; + .loc 1 36 22 + add.s64 %rd44, %rd39, 50257; + .loc 1 37 22 + setp.lt.s64 %p3, %rd37, 0; + setp.lt.s64 %p4, %rd39, 0; + .loc 1 38 36 + selp.b64 %rd45, %rd44, %rd39, %p4; + .loc 1 39 40 + setp.gt.u64 %p5, %rd45, 50256; + .loc 1 40 44 + shl.b64 %rd46, %rd37, 8; + add.s64 %rd47, %rd46, 12865792; + selp.b64 %rd2, %rd47, %rd46, %p3; + mov.b32 %r67, 0; + mov.b32 %r77, 883; + mov.u64 %rd73, 1; + .loc 1 39 55 + @%p5 bra $L__BB0_3; + bra.uni $L__BB0_1; +$L__BB0_3: + .loc 1 31 36 + shl.b64 %rd51, %rd2, 2; + mul.wide.u32 %rd80, %r3, 4; + add.s64 %rd79, %rd51, %rd80; + add.s64 %rd75, %rd33, %rd79; + add.s32 %r35, %r5, %r3; + mul.wide.s32 %rd78, %r35, 4; + add.s64 %rd74, %rd34, %rd78; + mov.f32 %f72, 0f00000000; + mov.b32 %r78, -4; + mov.f32 %f71, %f72; + mov.f32 %f70, %f72; +$L__BB0_4: + .loc 1 35 50 + mov.u32 %r36, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r36 }, [ %rd74 + 0 ]; + @!%p11 mov.u32 %r36, %r67; + mov.b32 %f28, %r36; + .loc 1 39 55 + mov.u64 %rd54, assertMessage_0; + cvta.global.u64 %rd55, %rd54; + mov.u64 %rd56, assertFile_0; + cvta.global.u64 %rd57, %rd56; + mov.u64 %rd58, assertFunc_0; + cvta.global.u64 %rd59, %rd58; + { // callseq 10, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd55; + .param .b64 param1; + st.param.b64 [param1+0], %rd57; + .param .b32 param2; + st.param.b32 [param2+0], %r77; + .param .b64 param3; + st.param.b64 [param3+0], %rd59; + .param .b64 param4; + st.param.b64 [param4+0], %rd73; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 10 + .loc 1 40 52 + mov.u32 %r38, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r38 }, [ %rd75 + 0 ]; + @!%p11 mov.u32 %r38, %r67; + mov.b32 %f29, %r38; + .loc 1 41 22 + add.f32 %f30, %f28, %f29; +$L__tmp1: + .loc 2 96 20 + sub.f32 %f31, %f30, %f70; + .loc 2 97 26 + add.f32 %f72, %f72, 0f3F800000; + .loc 2 98 30 + mov.b32 %r41, %f31; + mov.b32 %r42, %f72; + div.full.f32 %r40, %r41, %r42; + mov.b32 %f32, %r40; + .loc 2 98 22 + add.f32 %f70, %f70, %f32; + .loc 2 101 30 + sub.f32 %f33, %f30, %f70; +$L__tmp2: + .loc 1 47 48 + fma.rn.f32 %f71, %f31, %f33, %f71; + .loc 1 31 36 + add.s32 %r78, %r78, 4; + add.s64 %rd75, %rd75, 16; + add.s64 %rd74, %rd74, 16; + setp.lt.u32 %p15, %r78, 252; + @%p15 bra $L__BB0_4; + 
bra.uni $L__BB0_5; +$L__BB0_1: + .loc 1 0 36 + mov.b32 %r79, -4; + .loc 1 31 36 + shl.b64 %rd48, %rd2, 2; + mul.wide.u32 %rd80, %r3, 4; + add.s64 %rd79, %rd48, %rd80; + add.s64 %rd77, %rd33, %rd79; + add.s32 %r25, %r5, %r3; + mul.wide.s32 %rd78, %r25, 4; + add.s64 %rd76, %rd34, %rd78; + mov.f32 %f72, 0f00000000; + mov.f32 %f71, %f72; + mov.f32 %f70, %f72; +$L__BB0_2: + .loc 1 35 50 + mov.u32 %r26, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r26 }, [ %rd76 + 0 ]; + @!%p11 mov.u32 %r26, %r67; + mov.b32 %f21, %r26; + .loc 1 40 52 + mov.u32 %r28, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r28 }, [ %rd77 + 0 ]; + @!%p11 mov.u32 %r28, %r67; + mov.b32 %f22, %r28; + .loc 1 41 22 + add.f32 %f23, %f21, %f22; +$L__tmp3: + .loc 2 96 20 + sub.f32 %f24, %f23, %f70; + .loc 2 97 26 + add.f32 %f72, %f72, 0f3F800000; + .loc 2 98 30 + mov.b32 %r31, %f24; + mov.b32 %r32, %f72; + div.full.f32 %r30, %r31, %r32; + mov.b32 %f25, %r30; + .loc 2 98 22 + add.f32 %f70, %f70, %f25; + .loc 2 101 30 + sub.f32 %f26, %f23, %f70; +$L__tmp4: + .loc 1 47 48 + fma.rn.f32 %f71, %f24, %f26, %f71; + .loc 1 31 36 + add.s32 %r79, %r79, 4; + add.s64 %rd77, %rd77, 16; + add.s64 %rd76, %rd76, 16; + setp.lt.u32 %p10, %r79, 252; + @%p10 bra $L__BB0_2; +$L__BB0_5: + .loc 1 0 36 + ld.param.u64 %rd36, [triton__0d1d2d3d4d5de6de_param_4]; +$L__tmp5: + .loc 2 120 46 + mov.b32 %r54, %f70; + shfl.sync.bfly.b32 %r55, %r54, 2, 31, -1; + mov.b32 %f34, %r55; + mov.b32 %r56, %f71; + shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1; + mov.b32 %f35, %r57; + mov.b32 %r58, %f72; + shfl.sync.bfly.b32 %r45, %r58, 2, 31, -1; + mov.b32 %f36, %r45; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f37, %f34, %f70; + .loc 2 109 28 + add.f32 %f38, %f72, %f36; + .loc 2 110 39 + setp.eq.f32 %p16, %f38, 0f00000000; + .loc 2 110 60 + mov.b32 %r46, %f38; + div.full.f32 %r44, %r45, %r46; + mov.b32 %f39, %r44; + .loc 2 110 49 + selp.f32 %f40, 0f00000000, %f39, %p16; + .loc 2 112 17 + fma.rn.f32 %f41, %f37, %f40, %f70; + .loc 2 113 15 + add.f32 %f42, %f71, %f35; + .loc 2 113 30 + mul.f32 %f43, %f37, %f37; + .loc 2 113 38 + mul.f32 %f44, %f72, %f43; + .loc 2 113 22 + fma.rn.f32 %f45, %f44, %f40, %f42; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r59, %f41; + shfl.sync.bfly.b32 %r60, %r59, 1, 31, -1; + mov.b32 %f46, %r60; + mov.b32 %r61, %f45; + shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1; + mov.b32 %f47, %r62; + shfl.sync.bfly.b32 %r48, %r46, 1, 31, -1; + mov.b32 %f48, %r48; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f49, %f46, %f41; + .loc 2 109 28 + add.f32 %f50, %f38, %f48; + .loc 2 110 39 + setp.eq.f32 %p17, %f50, 0f00000000; + .loc 2 110 60 + mov.b32 %r49, %f50; + div.full.f32 %r47, %r48, %r49; + mov.b32 %f51, %r47; + .loc 2 110 49 + selp.f32 %f52, 0f00000000, %f51, %p17; + .loc 2 112 17 + fma.rn.f32 %f16, %f49, %f52, %f41; + .loc 2 113 15 + add.f32 %f53, %f45, %f47; + .loc 2 113 30 + mul.f32 %f54, %f49, %f49; + .loc 2 113 38 + mul.f32 %f55, %f38, %f54; + .loc 2 113 22 + fma.rn.f32 %f56, %f52, %f55, %f53; +$L__tmp9: + .loc 1 69 23 + mov.b32 %r51, %f56; + mov.b32 %r52, 1132462080; + div.full.f32 %r50, %r51, %r52; + mov.b32 %f57, %r50; + .loc 1 71 24 + add.f32 %f17, %f57, 0f3727C5AC; + .loc 1 55 36 + shl.b32 %r63, %r13, 14; + shl.b32 %r64, %r2, 8; + or.b32 %r65, %r63, %r64; + or.b32 %r10, %r65, %r3; + add.s64 %rd83, %rd33, %rd79; + add.s64 %rd82, %rd35, %rd80; + add.s64 %rd81, %rd34, %rd78; + mov.b32 %r80, -4; + setp.lt.u64 %p22, %rd45, 50257; + rsqrt.approx.ftz.f32 %f61, %f17; + bra.uni $L__BB0_6; +$L__BB0_8: + .loc 1 0 0 + mov.b32 %f18, %r66; + mov.b32 %f19, %r68; + .loc 1 65 54 + mov.u32 %r71, 
0x0; + @%p11 ld.global.L1::evict_first.b32 { %r71 }, [ %rd83 + 0 ]; + @!%p11 mov.u32 %r71, %r67; + mov.b32 %f58, %r71; + .loc 1 66 24 + add.f32 %f59, %f18, %f58; + .loc 1 67 24 + sub.f32 %f60, %f59, %f16; + .loc 1 73 24 + mul.f32 %f62, %f60, %f61; + .loc 1 74 24 + mul.f32 %f63, %f62, %f19; + .loc 1 55 36 + add.s32 %r80, %r80, 4; + .loc 1 76 29 + add.s32 %r74, %r80, %r10; + mul.wide.s32 %rd72, %r74, 2; + add.s64 %rd71, %rd36, %rd72; + .loc 1 76 52 + mov.b32 %r73, %f63; + cvt.rn.bf16.f32 %rs1, %r73; + @%p11 st.global.b16 [ %rd71 + 0 ], { %rs1 }; + .loc 1 55 36 + add.s64 %rd83, %rd83, 16; + add.s64 %rd82, %rd82, 16; + add.s64 %rd81, %rd81, 16; + setp.lt.u32 %p26, %r80, 252; + @%p26 bra $L__BB0_6; + bra.uni $L__BB0_9; +$L__BB0_6: + .loc 1 59 51 + mov.u32 %r66, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r66 }, [ %rd81 + 0 ]; + @!%p11 mov.u32 %r66, %r67; + .loc 1 60 40 + mov.u32 %r68, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r68 }, [ %rd82 + 0 ]; + @!%p11 mov.u32 %r68, %r67; + .loc 1 64 57 + @%p22 bra $L__BB0_8; + mov.u64 %rd63, assertMessage_1; + cvta.global.u64 %rd64, %rd63; + mov.u64 %rd65, assertFile_1; + cvta.global.u64 %rd66, %rd65; + mov.u64 %rd67, assertFunc_1; + cvta.global.u64 %rd68, %rd67; + { // callseq 11, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd64; + .param .b64 param1; + st.param.b64 [param1+0], %rd66; + .param .b32 param2; + st.param.b32 [param2+0], %r77; + .param .b64 param3; + st.param.b64 [param3+0], %rd68; + .param .b64 param4; + st.param.b64 [param4+0], %rd73; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 11 + bra.uni $L__BB0_8; +$L__BB0_9: + .loc 1 55 4 + ret; +$L__tmp10: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/lh/clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 298 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 108 +.b8 104 +.b8 101 +.b8 52 +.b8 97 +.b8 51 +.b8 115 +.b8 116 +.b8 118 +.b8 117 +.b8 102 +.b8 120 +.b8 97 +.b8 102 +.b8 109 +.b8 113 +.b8 51 +.b8 107 +.b8 107 +.b8 53 +.b8 104 +.b8 111 +.b8 100 +.b8 97 +.b8 122 +.b8 122 +.b8 50 +.b8 101 +.b8 102 +.b8 99 +.b8 116 +.b8 102 +.b8 102 +.b8 116 +.b8 101 +.b8 54 +.b8 52 +.b8 54 +.b8 122 +.b8 110 +.b8 106 +.b8 100 +.b8 110 +.b8 118 +.b8 51 +.b8 108 +.b8 113 +.b8 105 +.b8 53 +.b8 111 +.b8 97 +.b8 46 +.b8 112 +.b8 
121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 108 +.b8 104 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp4 +.b8 2 +.b8 44 +.b8 38 +.b8 4 +.b32 125 +.b64 $L__tmp5 +.b64 $L__tmp8 +.b8 2 +.b8 50 +.b8 41 +.b8 5 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp9 +.b8 2 +.b8 50 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp9 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2bfb2534267e2d16953e769c87c8995d28982d84 --- /dev/null +++ b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir @@ -0,0 +1,141 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<64x1xi64, #blocked> + %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked> + %cst_3 = arith.constant dense<50257> : tensor<64x1xi64, #blocked> + %cst_4 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1> + %cst_5 = arith.constant dense<0> : tensor<64x1xi64, #blocked1> + %c0_i32
= arith.constant 0 : i32 + %c4_i32 = arith.constant 4 : i32 + %c256_i32 = arith.constant 256 : i32 + %cst_6 = arith.constant dense<1.000000e+00> : tensor<64x4xf32, #blocked> + %cst_7 = arith.constant 0.000000e+00 : f32 + %cst_8 = arith.constant dense<0.000000e+00> : tensor<1x4xf32, #blocked> + %cst_9 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> + %cst_10 = arith.constant dense<256> : tensor<1x4xi32, #blocked> + %cst_11 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked> + %cst_12 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1> + %10 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked> + %12 = tt.splat %arg0 : (!tt.ptr<i64>) -> tensor<64x1x!tt.ptr<i64>, #blocked> + %13 = tt.splat %arg0 : (!tt.ptr<i64>) -> tensor<64x1x!tt.ptr<i64>, #blocked1> + %14 = tt.addptr %12, %8 : tensor<64x1x!tt.ptr<i64>, #blocked>, tensor<64x1xi32, #blocked> + %15 = tt.addptr %13, %9 : tensor<64x1x!tt.ptr<i64>, #blocked1>, tensor<64x1xi32, #blocked1> + %16 = tt.load %14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked> + %17 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1> + %18 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked> + %19 = arith.muli %18, %cst_0 : tensor<64x1xi32, #blocked> + %20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %21 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<64x4x!tt.ptr<f32>, #blocked> + %22 = arith.addi %16, %cst_3 : tensor<64x1xi64, #blocked> + %23 = arith.addi %17, %cst_4 : tensor<64x1xi64, #blocked1> + %24 = arith.cmpi slt, %16, %cst_2 : tensor<64x1xi64, #blocked> + %25 = arith.cmpi slt, %17, %cst_5 : tensor<64x1xi64, #blocked1> + %26 = arith.select %24, %22, %16 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> + %27 = arith.select %25, %23, %17 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1> + %28 = arith.cmpi sge, %27, %cst_5 : tensor<64x1xi64, #blocked1> + %29 = arith.cmpi slt, %27, %cst_4 : tensor<64x1xi64, #blocked1> + %30 = arith.andi %28, %29 : tensor<64x1xi1, #blocked1> + %31 = arith.muli %26, %cst_1 : tensor<64x1xi64, #blocked> + %32 = tt.broadcast %31 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked> + %33 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<64x4x!tt.ptr<f32>, #blocked> + %34:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_9, %arg9 = %cst_9, %arg10 = %cst_9) -> (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) : i32 {
+ %45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked> + %46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked> + %47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked> + %48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked> + %50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr<f32>, #blocked>, tensor<64x4xi32, #blocked> + %51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked> + %52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + tt.assert %30, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1> + %53 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> + %54 = tt.broadcast %53 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked> + %55 = arith.addi %54, %32 : tensor<64x4xi64, #blocked> + %56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr<f32>, #blocked>, tensor<64x4xi64, #blocked> + %57 = tt.load %56, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + %58 = arith.addf %57, %52 : tensor<64x4xf32, #blocked> + %59 = arith.subf %58, %arg8 : tensor<64x4xf32, #blocked> + %60 = arith.addf %arg10, %cst_6 : tensor<64x4xf32, #blocked> + %61 = arith.divf %59, %60 : tensor<64x4xf32, #blocked> + %62 = arith.addf %arg8, %61 : tensor<64x4xf32, #blocked> + %63 = arith.subf %58, %62 : tensor<64x4xf32, #blocked> + %64 = arith.mulf %59, %63 : tensor<64x4xf32, #blocked> + %65 = arith.addf %arg9, %64 : tensor<64x4xf32, #blocked> + %66 = arith.select %51, %62, %arg8 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> + %67 = arith.select %51, %65, %arg9 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> + %68 = arith.select %51, %60, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> + scf.yield %66, %67, %68 : tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked> + } + %35:3 = "tt.reduce"(%34#0, %34#1, %34#2) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %45 = arith.subf %arg10, %arg7 : f32 + %46 = arith.addf %arg9, %arg12 : f32 + %47 = arith.cmpf oeq, %46, %cst_7 : f32 + %48 = arith.divf %arg12, %46 : f32 + %49 = arith.select %47, %cst_7, %48 : f32 + %50 = arith.mulf %45, %49 : f32 + %51 = arith.addf %arg7, %50 : f32 + %52 = arith.addf %arg8, %arg11 : f32 + %53 = arith.mulf %45, %45 : f32 + %54 = arith.mulf %53, %arg9 : f32 + %55 = arith.mulf %54, %49 : f32 + %56 = arith.addf %52, %55 : f32 + tt.reduce.return %51, %56, %46 : f32, f32, f32 + }) : (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %36 = tt.expand_dims %35#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %37 = tt.expand_dims %35#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %38 = tt.splat %arg3 : (!tt.ptr<f32>) -> tensor<1x4x!tt.ptr<f32>, #blocked> + %39 = tt.broadcast %36 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked> + %40 = arith.divf %37, %cst_12 : tensor<64x1xf32, #blocked> + %41 = arith.addf %40, %cst_11 : tensor<64x1xf32, #blocked> + %42 = arith.muli %8,
%cst_0 : tensor<64x1xi32, #blocked> + %43 = tt.broadcast %42 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %44 = tt.splat %arg4 : (!tt.ptr<bf16>) -> tensor<64x4x!tt.ptr<bf16>, #blocked> + scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 { + %45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked> + %46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked> + %47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked> + %48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked> + %50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr<f32>, #blocked>, tensor<64x4xi32, #blocked> + %51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked> + %52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + %53 = tt.addptr %38, %46 : tensor<1x4x!tt.ptr<f32>, #blocked>, tensor<1x4xi32, #blocked> + %54 = tt.load %53, %47, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32, #blocked> + tt.assert %30, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1> + %55 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> + %56 = tt.broadcast %55 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked> + %57 = arith.addi %56, %32 : tensor<64x4xi64, #blocked> + %58 = tt.addptr %33, %57 : tensor<64x4x!tt.ptr<f32>, #blocked>, tensor<64x4xi64, #blocked> + %59 = tt.load %58, %51, %cst_9 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + %60 = arith.addf %59, %52 : tensor<64x4xf32, #blocked> + %61 = arith.subf %60, %39 : tensor<64x4xf32, #blocked> + %62 = tt.extern_elementwise %41 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked> + %63 = tt.broadcast %62 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked> + %64 = arith.mulf %61, %63 : tensor<64x4xf32, #blocked> + %65 = tt.broadcast %54 : (tensor<1x4xf32, #blocked>) -> tensor<64x4xf32, #blocked> + %66 = arith.mulf %64, %65 : tensor<64x4xf32, #blocked> + %67 = arith.addi %48, %43 : tensor<64x4xi32, #blocked> + %68 = tt.addptr %44, %67 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi32, #blocked> + %69 = arith.truncf %66 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked> + tt.store %68, %69, %51 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked> + } + tt.return + } +} diff --git a/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..24396c81e1300c5ffeb599c07262ada7e3ea0a4e --- /dev/null +++ b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir @@ -0,0 +1,139 @@ +module { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x4xf32> + %c256_i32 =
arith.constant 256 : i32 + %c4_i32 = arith.constant 4 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<256> : tensor<64x1xi64> + %cst_2 = arith.constant dense<0> : tensor<64x1xi64> + %cst_3 = arith.constant dense<50257> : tensor<64x1xi64> + %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32> + %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x4xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> + %cst_8 = arith.constant dense<256> : tensor<64x1xi32> + %cst_9 = arith.constant dense<256> : tensor<1x4xi32> + %cst_10 = arith.constant dense<512> : tensor<64x1xi32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %11 = arith.remsi %5, %cst_10 : tensor<64x1xi32> + %12 = arith.muli %11, %cst_8 : tensor<64x1xi32> + %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %15 = arith.addi %10, %cst_3 : tensor<64x1xi64> + %16 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64> + %17 = arith.select %16, %15, %10 : tensor<64x1xi1>, tensor<64x1xi64> + %18 = arith.cmpi sge, %17, %cst_2 : tensor<64x1xi64> + %19 = arith.cmpi slt, %17, %cst_3 : tensor<64x1xi64> + %20 = arith.andi %18, %19 : tensor<64x1xi1> + %21 = arith.muli %17, %cst_1 : tensor<64x1xi64> + %22 = tt.broadcast %21 : (tensor<64x1xi64>) -> tensor<64x4xi64> + %23 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %24:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 { + %47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32> + %48 = arith.addi %47, %7 : tensor<1x4xi32> + %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32> + %50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32> + %51 = arith.addi %50, %13 : tensor<64x4xi32> + %52 = tt.addptr %14, %51 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1> + %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + tt.assert %20, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %55 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64> + %56 = tt.broadcast %55 : (tensor<1x4xi64>) -> tensor<64x4xi64> + %57 = arith.addi %56, %22 : tensor<64x4xi64> + %58 = tt.addptr %23, %57 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> + %59 = tt.load %58, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + %60 = arith.addf %59, %54 : tensor<64x4xf32> + %61 = arith.subf %60, %arg8 : tensor<64x4xf32> + %62 = arith.addf %arg10, %cst_0 : tensor<64x4xf32> + %63 = arith.divf %61, %62 : tensor<64x4xf32> + %64 = arith.addf %arg8, %63 : tensor<64x4xf32> + %65 = arith.subf %60, 
%64 : tensor<64x4xf32> + %66 = arith.mulf %61, %65 : tensor<64x4xf32> + %67 = arith.addf %arg9, %66 : tensor<64x4xf32> + %68 = arith.select %53, %64, %arg8 : tensor<64x4xi1>, tensor<64x4xf32> + %69 = arith.select %53, %67, %arg9 : tensor<64x4xi1>, tensor<64x4xf32> + %70 = arith.select %53, %62, %arg10 : tensor<64x4xi1>, tensor<64x4xf32> + scf.yield %68, %69, %70 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32> + } + %25:3 = "tt.reduce"(%24#0, %24#1, %24#2) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %47 = arith.subf %arg10, %arg7 : f32 + %48 = arith.addf %arg9, %arg12 : f32 + %49 = arith.cmpf oeq, %48, %cst : f32 + %50 = arith.divf %arg12, %48 : f32 + %51 = arith.select %49, %cst, %50 : f32 + %52 = arith.mulf %47, %51 : f32 + %53 = arith.addf %arg7, %52 : f32 + %54 = arith.addf %arg8, %arg11 : f32 + %55 = arith.mulf %47, %47 : f32 + %56 = arith.mulf %55, %arg9 : f32 + %57 = arith.mulf %56, %51 : f32 + %58 = arith.addf %54, %57 : f32 + tt.reduce.return %53, %58, %48 : f32, f32, f32 + }) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + %26 = tt.expand_dims %25#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %27 = tt.expand_dims %25#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %28 = arith.muli %11, %cst_8 : tensor<64x1xi32> + %29 = tt.broadcast %28 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %30 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %31 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x4x!tt.ptr> + %32 = arith.addi %10, %cst_3 : tensor<64x1xi64> + %33 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64> + %34 = arith.select %33, %32, %10 : tensor<64x1xi1>, tensor<64x1xi64> + %35 = arith.cmpi sge, %34, %cst_2 : tensor<64x1xi64> + %36 = arith.cmpi slt, %34, %cst_3 : tensor<64x1xi64> + %37 = arith.andi %35, %36 : tensor<64x1xi1> + %38 = arith.muli %34, %cst_1 : tensor<64x1xi64> + %39 = tt.broadcast %38 : (tensor<64x1xi64>) -> tensor<64x4xi64> + %40 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %41 = tt.broadcast %26 : (tensor<64x1xf32>) -> tensor<64x4xf32> + %42 = arith.divf %27, %cst_5 : tensor<64x1xf32> + %43 = arith.addf %42, %cst_4 : tensor<64x1xf32> + %44 = arith.muli %5, %cst_8 : tensor<64x1xi32> + %45 = tt.broadcast %44 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %46 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 { + %47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32> + %48 = arith.addi %47, %7 : tensor<1x4xi32> + %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32> + %50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32> + %51 = arith.addi %50, %29 : tensor<64x4xi32> + %52 = tt.addptr %30, %51 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1> + %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + %55 = tt.addptr %31, %48 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> + %56 = tt.load %55, %49, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32> + tt.assert %37, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %57 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64> + %58 = tt.broadcast %57 : (tensor<1x4xi64>) -> tensor<64x4xi64> + %59 = arith.addi %58, %39 : tensor<64x4xi64> + %60 = tt.addptr %40, %59 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> + %61 = tt.load %60, %53, %cst_7 
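// Second pass: the same rows are re-gathered and normalized with the reduced
// statistics, x_hat = (x - mean) * rsqrt(var / 256 + 1e-5), scaled by the weight
// row from %arg3 and stored as bf16 through %arg4. The asserts bound the gather
// index by 50257, the GPT-2 vocabulary size, so %arg0 is likely a token-index
// tensor and %arg1 the embedding table.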
{cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32> + %62 = arith.addf %61, %54 : tensor<64x4xf32> + %63 = arith.subf %62, %41 : tensor<64x4xf32> + %64 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> + %65 = tt.broadcast %64 : (tensor<64x1xf32>) -> tensor<64x4xf32> + %66 = arith.mulf %63, %65 : tensor<64x4xf32> + %67 = tt.broadcast %56 : (tensor<1x4xf32>) -> tensor<64x4xf32> + %68 = arith.mulf %66, %67 : tensor<64x4xf32> + %69 = arith.addi %50, %45 : tensor<64x4xi32> + %70 = tt.addptr %46, %69 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %71 = arith.truncf %68 : tensor<64x4xf32> to tensor<64x4xbf16> + tt.store %70, %71, %53 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16> + } + tt.return + } +} diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..ef91adf029023f7e93531c46cfeb0259ee37a45e --- /dev/null +++ b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir @@ -0,0 +1,235 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = shl i32 %6, 2, !dbg !8 + %10 = and i32 %9, 60, !dbg !8 + %11 = and i32 %8, 3, !dbg !9 + %12 = lshr i32 %7, 4, !dbg !9 + %13 = shl nuw nsw i32 %11, 1, !dbg !9 + %14 = or i32 %13, %12, !dbg !9 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %16 = shl i32 %15, 6, !dbg !11 + %17 = or i32 %16, %10, !dbg !12 + %.frozen = freeze i32 %17 + %18 = sdiv i32 %.frozen, 256, !dbg !13 + %19 = mul i32 %18, 256 + %.decomposed = sub i32 %.frozen, %19 + %20 = shl i32 %18, 15, !dbg !14 + %21 = add i32 %20, %.decomposed + br label %22, !dbg !15 + +22: ; preds = %5, %22 + %23 = phi i32 [ 0, %5 ], [ %53, %22 ] + %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %52, %22 ] + %25 = or i32 %23, %14, !dbg !16 + %26 = shl i32 %25, 8, !dbg !17 + %27 = add i32 %21, %26, !dbg !18 + %28 = sext i32 %27 to i64, !dbg !19 + %29 = getelementptr float, ptr addrspace(1) %0, i64 %28, !dbg !19 + %30 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20 + %31 = extractvalue { i32, i32, i32, i32 } %30, 0, !dbg !20 + %32 = extractvalue { i32, i32, i32, i32 } %30, 1, !dbg !20 + %33 = extractvalue { i32, i32, i32, i32 } %30, 2, !dbg !20 + %34 = extractvalue { i32, i32, i32, i32 } %30, 3, !dbg !20 + %35 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !21 + %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 
];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !22 + %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !22 + %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !22 + %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !22 + %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !22 + %41 = insertelement <4 x i32> poison, i32 %31, i64 0, !dbg !20 + %42 = insertelement <4 x i32> %41, i32 %32, i64 1, !dbg !20 + %43 = insertelement <4 x i32> %42, i32 %33, i64 2, !dbg !20 + %44 = insertelement <4 x i32> %43, i32 %34, i64 3, !dbg !20 + %45 = bitcast <4 x i32> %44 to <4 x float>, !dbg !20 + %46 = insertelement <4 x i32> poison, i32 %37, i64 0, !dbg !22 + %47 = insertelement <4 x i32> %46, i32 %38, i64 1, !dbg !22 + %48 = insertelement <4 x i32> %47, i32 %39, i64 2, !dbg !22 + %49 = insertelement <4 x i32> %48, i32 %40, i64 3, !dbg !22 + %50 = bitcast <4 x i32> %49 to <4 x float>, !dbg !22 + %51 = fmul <4 x float> %45, %50, !dbg !23 + %52 = fadd <4 x float> %24, %51, !dbg !24 + %53 = add nuw nsw i32 %23, 8, !dbg !15 + %54 = icmp ult i32 %23, 120, !dbg !15 + br i1 %54, label %22, label %55, !dbg !15 + +55: ; preds = %22 + %56 = and i32 %6, 63, !dbg !8 + %57 = or i32 %16, %56, !dbg !12 + %58 = or i32 %10, 3, !dbg !25 + %59 = or i32 %10, 2, !dbg !25 + %60 = or i32 %10, 1, !dbg !25 + %61 = extractelement <4 x float> %52, i64 0, !dbg !25 + %62 = bitcast float %61 to i32, !dbg !25 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 16, i32 31), !dbg !25 + %64 = bitcast i32 %63 to float, !dbg !25 + %65 = fadd float %61, %64, !dbg !29 + %66 = extractelement <4 x float> %52, i64 1, !dbg !25 + %67 = bitcast float %66 to i32, !dbg !25 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !25 + %69 = bitcast i32 %68 to float, !dbg !25 + %70 = fadd float %66, %69, !dbg !29 + %71 = extractelement <4 x float> %52, i64 2, !dbg !25 + %72 = bitcast float %71 to i32, !dbg !25 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !25 + %74 = bitcast i32 %73 to float, !dbg !25 + %75 = fadd float %71, %74, !dbg !29 + %76 = extractelement <4 x float> %52, i64 3, !dbg !25 + %77 = bitcast float %76 to i32, !dbg !25 + %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !25 + %79 = bitcast i32 %78 to float, !dbg !25 + %80 = fadd float %76, %79, !dbg !29 + %81 = icmp ult i32 %7, 16, !dbg !25 + %82 = shl nuw nsw i32 %10, 2, !dbg !25 + %83 = or i32 %82, %11, !dbg !25 + %84 = zext nneg i32 %83 to i64, !dbg !25 + %85 = getelementptr float, ptr addrspace(3) @global_smem, i64 %84, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, float %65, i1 %81) #3, !dbg !25 + %86 = shl nuw nsw i32 %60, 2, !dbg !25 + %87 = or i32 %86, %11, !dbg !25 + %88 = zext nneg i32 %87 to i64, !dbg !25 + %89 = getelementptr float, ptr addrspace(3) @global_smem, i64 %88, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %89, float %70, i1 %81) #3, !dbg !25 + %90 = shl nuw nsw i32 %59, 2, !dbg !25 + %91 = or i32 %90, %11, !dbg !25 + %92 = zext nneg i32 %91 to i64, !dbg !25 + %93 = getelementptr float, ptr addrspace(3) @global_smem, i64 %92, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 
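; Each thread has accumulated a <4 x float> running sum of a[i] * b[i] products in
; the k-loop above (16 iterations, stride 8), i.e. a fused multiply-reduce of the
; two input tensors. The code that follows is the usual Triton reduction ladder:
; shfl.sync.bfly at offset 16 folds lanes within a warp, per-warp partials are
; staged in shared memory, and shuffles at offsets 2 and 1 finish the sum, which
; is written to %2 as one f32 per row by the threads with (tid & 64) == 0.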
], $1;", "r,r,b"(ptr addrspace(3) %93, float %75, i1 %81) #3, !dbg !25 + %94 = shl nuw nsw i32 %58, 2, !dbg !25 + %95 = or i32 %94, %11, !dbg !25 + %96 = zext nneg i32 %95 to i64, !dbg !25 + %97 = getelementptr float, ptr addrspace(3) @global_smem, i64 %96, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %80, i1 %81) #3, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + %98 = icmp slt i32 %6, 256, !dbg !25 + %99 = sext i32 %6 to i64, !dbg !25 + %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !25 + %101 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %100, i1 %98) #3, !dbg !25 + %102 = bitcast float %101 to i32, !dbg !25 + %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !25 + %104 = bitcast i32 %103 to float, !dbg !25 + %105 = fadd float %101, %104, !dbg !29 + %106 = bitcast float %105 to i32, !dbg !25 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !25 + %108 = bitcast i32 %107 to float, !dbg !25 + %109 = fadd float %105, %108, !dbg !29 + %110 = and i32 %6, 3, !dbg !25 + %111 = icmp eq i32 %110, 0, !dbg !25 + %112 = and i1 %98, %111, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %109, i1 %112) #3, !dbg !25 + %113 = add i32 %6, 128, !dbg !25 + %114 = sext i32 %113 to i64, !dbg !25 + %115 = getelementptr float, ptr addrspace(3) @global_smem, i64 %114, !dbg !25 + %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %115, i1 %98) #3, !dbg !25 + %117 = bitcast float %116 to i32, !dbg !25 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 2, i32 31), !dbg !25 + %119 = bitcast i32 %118 to float, !dbg !25 + %120 = fadd float %116, %119, !dbg !29 + %121 = bitcast float %120 to i32, !dbg !25 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 1, i32 31), !dbg !25 + %123 = bitcast i32 %122 to float, !dbg !25 + %124 = fadd float %120, %123, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %115, float %124, i1 %112) #3, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + %125 = zext nneg i32 %82 to i64, !dbg !25 + %126 = getelementptr float, ptr addrspace(3) @global_smem, i64 %125, !dbg !25 + %127 = load float, ptr addrspace(3) %126, align 4, !dbg !25 + %128 = zext nneg i32 %86 to i64, !dbg !25 + %129 = getelementptr float, ptr addrspace(3) @global_smem, i64 %128, !dbg !25 + %130 = load float, ptr addrspace(3) %129, align 4, !dbg !25 + %131 = zext nneg i32 %90 to i64, !dbg !25 + %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !25 + %133 = load float, ptr addrspace(3) %132, align 4, !dbg !25 + %134 = zext nneg i32 %94 to i64, !dbg !25 + %135 = getelementptr float, ptr addrspace(3) @global_smem, i64 %134, !dbg !25 + %136 = load float, ptr addrspace(3) %135, align 4, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !33 + %137 = zext nneg i32 %10 to i64, !dbg !33 + %138 = getelementptr float, ptr addrspace(3) @global_smem, i64 %137, !dbg !33 + %139 = insertelement <1 x float> undef, float %127, i64 0, !dbg !33 + store <1 x float> %139, ptr addrspace(3) %138, align 4, !dbg !33 + %140 = zext nneg i32 %60 to i64, !dbg !33 + %141 = getelementptr float, ptr addrspace(3) @global_smem, i64 %140, !dbg !33 + %142 = insertelement <1 
x float> undef, float %130, i64 0, !dbg !33 + store <1 x float> %142, ptr addrspace(3) %141, align 4, !dbg !33 + %143 = zext nneg i32 %59 to i64, !dbg !33 + %144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !33 + %145 = insertelement <1 x float> undef, float %133, i64 0, !dbg !33 + store <1 x float> %145, ptr addrspace(3) %144, align 4, !dbg !33 + %146 = zext nneg i32 %58 to i64, !dbg !33 + %147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !33 + %148 = insertelement <1 x float> undef, float %136, i64 0, !dbg !33 + store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !33 + tail call void @llvm.nvvm.barrier0(), !dbg !33 + %149 = zext nneg i32 %56 to i64, !dbg !33 + %150 = getelementptr float, ptr addrspace(3) @global_smem, i64 %149, !dbg !33 + %151 = load i32, ptr addrspace(3) %150, align 4, !dbg !33 + %152 = sext i32 %57 to i64, !dbg !34 + %153 = getelementptr float, ptr addrspace(1) %2, i64 %152, !dbg !34 + %154 = and i32 %6, 64, !dbg !35 + %155 = icmp eq i32 %154, 0, !dbg !35 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %151, ptr addrspace(1) %153, i1 %155) #3, !dbg !35 + ret void, !dbg !36 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py", directory: "/tmp/torchinductor_root/qd") +!3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 26, column: 20, scope: !5) +!14 = !DILocation(line: 33, column: 57, scope: !5) +!15 = !DILocation(line: 29, column: 36, scope: !5) +!16 = !DILocation(line: 30, column: 27, scope: !5) +!17 = !DILocation(line: 33, column: 44, scope: !5) +!18 = !DILocation(line: 33, column: 51, scope: !5) +!19 = !DILocation(line: 33, column: 34, scope: !5) +!20 = !DILocation(line: 33, column: 63, scope: !5) +!21 = !DILocation(line: 34, column: 34, scope: !5) +!22 = !DILocation(line: 34, column: 63, scope: !5) +!23 = !DILocation(line: 35, column: 22, scope: !5) +!24 = !DILocation(line: 38, column: 38, 
scope: !5) +!25 = !DILocation(line: 243, column: 36, scope: !26, inlinedAt: !28) +!26 = distinct !DILexicalBlockFile(scope: !5, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!28 = !DILocation(line: 39, column: 25, scope: !26) +!29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !31) +!30 = distinct !DILexicalBlockFile(scope: !26, file: !27, discriminator: 0) +!31 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32) +!32 = !DILocation(line: 39, column: 25, scope: !30) +!33 = !DILocation(line: 39, column: 28, scope: !5) +!34 = !DILocation(line: 40, column: 25, scope: !5) +!35 = !DILocation(line: 40, column: 36, scope: !5) +!36 = !DILocation(line: 40, column: 4, scope: !5) diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..962967057117e882cdf06f7455d5b47cc5232325 --- /dev/null +++ b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir @@ -0,0 +1,56 @@ +module { + tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c8_i32 = arith.constant 8 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst = arith.constant dense<32768> : tensor<64x1xi32> + %cst_0 = arith.constant dense<256> : tensor<1x8xi32> + %cst_1 = arith.constant dense<128> : tensor<1x8xi32> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> + %cst_3 = arith.constant dense<256> : tensor<64x1xi32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32> + %8 = arith.remsi %5, %cst_3 : tensor<64x1xi32> + %9 = arith.divsi %5, %cst_3 : tensor<64x1xi32> + %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %11 = arith.muli %9, %cst : tensor<64x1xi32> + %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %14 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_2) -> (tensor<64x8xf32>) : i32 { + %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32> + %21 = arith.addi %20, %7 : tensor<1x8xi32> + %22 = arith.cmpi slt, %21, %cst_1 : tensor<1x8xi32> + %23 = arith.muli %21, %cst_0 : tensor<1x8xi32> + %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32> + %25 = arith.addi %10, %24 : tensor<64x8xi32> + %26 = arith.addi %25, %12 : tensor<64x8xi32> + %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1> + %29 = tt.load %27, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32> + %30 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr>, 
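// TTIR for the reduction kernel above: the scf.for accumulates a masked sum of
// in0 * in1 over 128 columns in 8-wide slices, and the tt.reduce that follows
// collapses axis 1 with a plain addf, storing one f32 per row through %arg2.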
tensor<64x8xi32> + %31 = tt.load %30, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32> + %32 = arith.mulf %29, %31 : tensor<64x8xf32> + %33 = arith.addf %arg6, %32 : tensor<64x8xf32> + %34 = arith.select %28, %33, %arg6 : tensor<64x8xi1>, tensor<64x8xf32> + scf.yield %34 : tensor<64x8xf32> + } + %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %20 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %20 : f32 + }) : (tensor<64x8xf32>) -> tensor<64xf32> + %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %18 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..548ea109c31bc48da1e784c81d4c3a3aae77fa1e Binary files /dev/null and b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin differ diff --git a/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..aeb8dd323d696ad5ddd0317a7f24fe57c146d7e4 --- /dev/null +++ b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.llir @@ -0,0 +1,503 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %9 = and i32 %8, 31, !dbg !10 + %10 = lshr i32 %8, 5, !dbg !10 + %11 = lshr i32 %8, 6, !dbg !10 + %12 = and i32 %11, 1, !dbg !10 + %13 = and i32 %8, 1, !dbg !10 + %14 = and i32 %10, 1, !dbg !11 + %urem = shl i32 %8, 2, !dbg !11 + %15 = and i32 %urem, 252, !dbg !11 + %16 = shl i32 %8, 1, !dbg !11 + %17 = and i32 %16, 254, !dbg !11 + %18 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12 + %19 = shl i32 %18, 1, !dbg !13 + %20 = or i32 %19, %12, !dbg !14 + %21 = or i32 %19, %13, !dbg !14 + %22 = sext i32 %20 to i64, !dbg !15 + %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15 + %24 = sext i32 %21 to i64, !dbg !15 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16 + %28 = tail call i64 asm sideeffect 
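; This dump fuses the embedding gather with the layer-norm statistics in a single
; pass: each program covers two 256-wide rows, loads the i64 gather index with an
; L1 evict_last hint (the repeated ld.global.b64 asm), wraps negative indices by
; adding 50257, and traps through __assertfail when the index falls outside the
; [0, 50257) vocabulary range.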
"mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16 + %31 = srem i32 %20, 512, !dbg !17 + %32 = shl nsw i32 %31, 8, !dbg !18 + %33 = or i32 %32, %15, !dbg !19 + %34 = sext i32 %33 to i64, !dbg !20 + %35 = getelementptr float, ptr addrspace(1) %2, i64 %34, !dbg !20 + %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21 + %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !21 + %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !21 + %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !21 + %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !21 + %41 = bitcast i32 %37 to float, !dbg !21 + %42 = bitcast i32 %38 to float, !dbg !21 + %43 = bitcast i32 %39 to float, !dbg !21 + %44 = bitcast i32 %40 to float, !dbg !21 + %45 = add i64 %30, 50257, !dbg !22 + %46 = icmp slt i64 %26, 0, !dbg !23 + %47 = icmp slt i64 %30, 0, !dbg !23 + %48 = select i1 %47, i64 %45, i64 %30, !dbg !24 + %49 = icmp ugt i64 %48, 50256, !dbg !25 + br i1 %49, label %50, label %51, !dbg !26 + +50: ; preds = %7 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26 + br label %51, !dbg !26 + +51: ; preds = %50, %7 + %52 = shl i64 %26, 8, !dbg !27 + %53 = add i64 %52, 12865792, !dbg !27 + %54 = select i1 %46, i64 %53, i64 %52, !dbg !27 + %55 = zext nneg i32 %15 to i64 + %56 = or i64 %54, %55, !dbg !28 + %57 = getelementptr float, ptr addrspace(1) %1, i64 %56, !dbg !29 + %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30 + %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !30 + %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !30 + %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !30 + %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !30 + %63 = bitcast i32 %59 to float, !dbg !30 + %64 = bitcast i32 %60 to float, !dbg !30 + %65 = bitcast i32 %61 to float, !dbg !30 + %66 = bitcast i32 %62 to float, !dbg !30 + %67 = fadd float %41, %63, !dbg !31 + %68 = fadd float %42, %64, !dbg !31 + %69 = fadd float %43, %65, !dbg !31 + %70 = fadd float %44, %66, !dbg !31 + %71 = fadd float %67, 0.000000e+00, !dbg !32 + %72 = fadd float %68, 0.000000e+00, !dbg !32 + %73 = fadd float %69, 0.000000e+00, !dbg !32 + %74 = fadd float %70, 0.000000e+00, !dbg !32 + %75 = fsub float %67, %71, !dbg !36 + %76 = fsub 
float %68, %72, !dbg !36 + %77 = fsub float %69, %73, !dbg !36 + %78 = fsub float %70, %74, !dbg !36 + %79 = fmul float %67, %75, !dbg !37 + %80 = fmul float %68, %76, !dbg !37 + %81 = fmul float %69, %77, !dbg !37 + %82 = fmul float %70, %78, !dbg !37 + %83 = fadd float %79, 0.000000e+00, !dbg !38 + %84 = fadd float %80, 0.000000e+00, !dbg !38 + %85 = fadd float %81, 0.000000e+00, !dbg !38 + %86 = fadd float %82, 0.000000e+00, !dbg !38 + %87 = fsub float %72, %71, !dbg !39 + %88 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43 + %89 = fmul float %88, %87, !dbg !44 + %90 = fadd float %71, %89, !dbg !45 + %91 = fadd float %83, %84, !dbg !46 + %92 = fmul float %87, %87, !dbg !47 + %93 = fmul float %88, %92, !dbg !48 + %94 = fadd float %93, %91, !dbg !49 + %95 = fsub float %73, %90, !dbg !39 + %96 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43 + %97 = fmul float %96, %95, !dbg !44 + %98 = fadd float %90, %97, !dbg !45 + %99 = fadd float %85, %94, !dbg !46 + %100 = fmul float %95, %95, !dbg !47 + %101 = fmul float %100, 2.000000e+00, !dbg !50 + %102 = fmul float %96, %101, !dbg !48 + %103 = fadd float %99, %102, !dbg !49 + %104 = fsub float %74, %98, !dbg !39 + %105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43 + %106 = fmul float %105, %104, !dbg !44 + %107 = fadd float %98, %106, !dbg !45 + %108 = fadd float %86, %103, !dbg !46 + %109 = fmul float %104, %104, !dbg !47 + %110 = fmul float %109, 3.000000e+00, !dbg !50 + %111 = fmul float %105, %110, !dbg !48 + %112 = fadd float %108, %111, !dbg !49 + %113 = bitcast float %107 to i32, !dbg !51 + %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 16, i32 31), !dbg !51 + %115 = bitcast i32 %114 to float, !dbg !51 + %116 = bitcast float %112 to i32, !dbg !51 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !51 + %118 = bitcast i32 %117 to float, !dbg !51 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !51 + %120 = bitcast i32 %119 to float, !dbg !51 + %121 = fsub float %115, %107, !dbg !39 + %122 = fadd float %120, 4.000000e+00, !dbg !53 + %123 = fcmp oeq float %122, 0.000000e+00, !dbg !54 + %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %120, float %122) #6, !dbg !43 + %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !55 + %126 = fmul float %125, %121, !dbg !44 + %127 = fadd float %107, %126, !dbg !45 + %128 = fadd float %112, %118, !dbg !46 + %129 = fmul float %121, %121, !dbg !47 + %130 = fmul float %129, 4.000000e+00, !dbg !50 + %131 = fmul float %125, %130, !dbg !48 + %132 = fadd float %128, %131, !dbg !49 + %133 = bitcast float %127 to i32, !dbg !51 + %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 8, i32 31), !dbg !51 + %135 = bitcast i32 %134 to float, !dbg !51 + %136 = bitcast float %132 to i32, !dbg !51 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !51 + %138 = bitcast i32 %137 to float, !dbg !51 + %139 = bitcast float %122 to i32, !dbg !51 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 8, i32 31), !dbg !51 + %141 = bitcast i32 %140 to float, !dbg !51 + %142 = fsub float %135, %127, !dbg !39 + %143 = fadd float %122, %141, !dbg !53 + %144 = fcmp oeq float %143, 0.000000e+00, !dbg !54 + %145 = tail call 
float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %143) #6, !dbg !43 + %146 = select i1 %144, float 0.000000e+00, float %145, !dbg !55 + %147 = fmul float %146, %142, !dbg !44 + %148 = fadd float %127, %147, !dbg !45 + %149 = fadd float %132, %138, !dbg !46 + %150 = fmul float %142, %142, !dbg !47 + %151 = fmul float %122, %150, !dbg !50 + %152 = fmul float %146, %151, !dbg !48 + %153 = fadd float %149, %152, !dbg !49 + %154 = bitcast float %148 to i32, !dbg !51 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 4, i32 31), !dbg !51 + %156 = bitcast i32 %155 to float, !dbg !51 + %157 = bitcast float %153 to i32, !dbg !51 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 4, i32 31), !dbg !51 + %159 = bitcast i32 %158 to float, !dbg !51 + %160 = bitcast float %143 to i32, !dbg !51 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 4, i32 31), !dbg !51 + %162 = bitcast i32 %161 to float, !dbg !51 + %163 = fsub float %156, %148, !dbg !39 + %164 = fadd float %143, %162, !dbg !53 + %165 = fcmp oeq float %164, 0.000000e+00, !dbg !54 + %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float %164) #6, !dbg !43 + %167 = select i1 %165, float 0.000000e+00, float %166, !dbg !55 + %168 = fmul float %167, %163, !dbg !44 + %169 = fadd float %148, %168, !dbg !45 + %170 = fadd float %153, %159, !dbg !46 + %171 = fmul float %163, %163, !dbg !47 + %172 = fmul float %143, %171, !dbg !50 + %173 = fmul float %167, %172, !dbg !48 + %174 = fadd float %170, %173, !dbg !49 + %175 = bitcast float %169 to i32, !dbg !51 + %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 2, i32 31), !dbg !51 + %177 = bitcast i32 %176 to float, !dbg !51 + %178 = bitcast float %174 to i32, !dbg !51 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 2, i32 31), !dbg !51 + %180 = bitcast i32 %179 to float, !dbg !51 + %181 = bitcast float %164 to i32, !dbg !51 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 2, i32 31), !dbg !51 + %183 = bitcast i32 %182 to float, !dbg !51 + %184 = fsub float %177, %169, !dbg !39 + %185 = fadd float %164, %183, !dbg !53 + %186 = fcmp oeq float %185, 0.000000e+00, !dbg !54 + %187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float %185) #6, !dbg !43 + %188 = select i1 %186, float 0.000000e+00, float %187, !dbg !55 + %189 = fmul float %188, %184, !dbg !44 + %190 = fadd float %169, %189, !dbg !45 + %191 = fadd float %174, %180, !dbg !46 + %192 = fmul float %184, %184, !dbg !47 + %193 = fmul float %164, %192, !dbg !50 + %194 = fmul float %188, %193, !dbg !48 + %195 = fadd float %191, %194, !dbg !49 + %196 = bitcast float %190 to i32, !dbg !51 + %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !51 + %198 = bitcast i32 %197 to float, !dbg !51 + %199 = bitcast float %195 to i32, !dbg !51 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !51 + %201 = bitcast i32 %200 to float, !dbg !51 + %202 = bitcast float %185 to i32, !dbg !51 + %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !51 + %204 = bitcast i32 %203 to float, !dbg !51 + %205 = fsub float %198, %190, !dbg !39 + %206 = fadd float %185, %204, !dbg !53 + %207 = fcmp oeq float %206, 0.000000e+00, !dbg !54 + %208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !43 + %209 = select i1 
%207, float 0.000000e+00, float %208, !dbg !55 + %210 = fmul float %205, %209, !dbg !44 + %211 = fadd float %190, %210, !dbg !45 + %212 = fadd float %195, %201, !dbg !46 + %213 = fmul float %205, %205, !dbg !47 + %214 = fmul float %185, %213, !dbg !50 + %215 = fmul float %209, %214, !dbg !48 + %216 = fadd float %212, %215, !dbg !49 + %217 = icmp eq i32 %9, 0, !dbg !51 + %218 = shl nuw nsw i32 %12, 1, !dbg !51 + %219 = or i32 %218, %14, !dbg !51 + %220 = zext nneg i32 %219 to i64, !dbg !51 + %221 = getelementptr float, ptr addrspace(3) @global_smem, i64 %220, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %221, float %211, i1 %217) #6, !dbg !51 + %222 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %220, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %222, float %216, i1 %217) #6, !dbg !51 + %223 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %220, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %223, float %206, i1 %217) #6, !dbg !51 + tail call void @llvm.nvvm.barrier0(), !dbg !51 + %224 = icmp slt i32 %8, 4, !dbg !51 + %225 = sext i32 %8 to i64, !dbg !51 + %226 = getelementptr float, ptr addrspace(3) @global_smem, i64 %225, !dbg !51 + %227 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %226, i1 %224) #6, !dbg !51 + %228 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %225, !dbg !51 + %229 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %228, i1 %224) #6, !dbg !51 + %230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %225, !dbg !51 + %231 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %230, i1 %224) #6, !dbg !51 + %232 = bitcast float %227 to i32, !dbg !51 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !51 + %234 = bitcast i32 %233 to float, !dbg !51 + %235 = bitcast float %229 to i32, !dbg !51 + %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !51 + %237 = bitcast i32 %236 to float, !dbg !51 + %238 = bitcast float %231 to i32, !dbg !51 + %239 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !51 + %240 = bitcast i32 %239 to float, !dbg !51 + %241 = fsub float %234, %227, !dbg !39 + %242 = fadd float %231, %240, !dbg !53 + %243 = fcmp oeq float %242, 0.000000e+00, !dbg !54 + %244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %242) #6, !dbg !43 + %245 = select i1 %243, float 0.000000e+00, float %244, !dbg !55 + %246 = fmul float %241, %245, !dbg !44 + %247 = fadd float %227, %246, !dbg !45 + %248 = fadd float %229, %237, !dbg !46 + %249 = fmul float %241, %241, !dbg !47 + %250 = fmul float %231, %249, !dbg !50 + %251 = fmul float %250, %245, !dbg !48 + %252 = fadd float %248, %251, !dbg !49 + %253 = icmp eq i32 %13, 0, !dbg !51 + %254 = and i1 %224, %253, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %226, float %247, i1 %254) #6, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", 
"r,r,b"(ptr addrspace(3) %228, float %252, i1 %254) #6, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %230, float %242, i1 %254) #6, !dbg !51 + tail call void @llvm.nvvm.barrier0(), !dbg !51 + %255 = zext nneg i32 %218 to i64, !dbg !51 + %256 = getelementptr float, ptr addrspace(3) @global_smem, i64 %255, !dbg !51 + %257 = load float, ptr addrspace(3) %256, align 4, !dbg !51 + %258 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %255, !dbg !51 + %259 = load float, ptr addrspace(3) %258, align 4, !dbg !51 + %260 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56 + %261 = zext nneg i32 %17 to i64, !dbg !57 + %262 = getelementptr float, ptr addrspace(1) %3, i64 %261, !dbg !57 + %263 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %262, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !58 + br i1 %49, label %264, label %265, !dbg !59 + +264: ; preds = %51 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59 + br label %265, !dbg !59 + +265: ; preds = %264, %51 + %266 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60 + %267 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61 + %268 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61 + %269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61 + %270 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61 + %271 = fadd float %267, 0x3EE4F8B580000000, !dbg !62 + %272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63 + %.not.i = icmp eq i32 %272, 0, !dbg !63 + br i1 %.not.i, label %275, label %273, !dbg !63 + +273: ; preds = %265 + %274 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %271), !dbg !63 + br label %__nv_rsqrtf.exit, !dbg !63 + +275: ; preds = %265 + %276 = tail call float @llvm.nvvm.rsqrt.approx.f(float %271), !dbg !63 + br label %__nv_rsqrtf.exit, !dbg !63 + +__nv_rsqrtf.exit: ; preds = %273, %275 + %.0.i = phi float [ %274, %273 ], [ %276, %275 ], !dbg !63 + %277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63 + %278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63 + %279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63 + %280 = extractvalue { i32, i32, i32, i32 } %266, 3, 
!dbg !60 + %281 = bitcast i32 %280 to float, !dbg !60 + %282 = extractvalue { i32, i32, i32, i32 } %260, 3, !dbg !56 + %283 = bitcast i32 %282 to float, !dbg !56 + %284 = fadd float %283, %281, !dbg !64 + %285 = fsub float %284, %257, !dbg !65 + %286 = extractvalue { i32, i32, i32, i32 } %266, 2, !dbg !60 + %287 = bitcast i32 %286 to float, !dbg !60 + %288 = extractvalue { i32, i32, i32, i32 } %260, 2, !dbg !56 + %289 = bitcast i32 %288 to float, !dbg !56 + %290 = fadd float %289, %287, !dbg !64 + %291 = fsub float %290, %257, !dbg !65 + %292 = extractvalue { i32, i32, i32, i32 } %266, 1, !dbg !60 + %293 = bitcast i32 %292 to float, !dbg !60 + %294 = extractvalue { i32, i32, i32, i32 } %260, 1, !dbg !56 + %295 = bitcast i32 %294 to float, !dbg !56 + %296 = fadd float %295, %293, !dbg !64 + %297 = fsub float %296, %257, !dbg !65 + %298 = extractvalue { i32, i32, i32, i32 } %266, 0, !dbg !60 + %299 = bitcast i32 %298 to float, !dbg !60 + %300 = extractvalue { i32, i32, i32, i32 } %260, 0, !dbg !56 + %301 = bitcast i32 %300 to float, !dbg !56 + %302 = fadd float %301, %299, !dbg !64 + %303 = fsub float %302, %257, !dbg !65 + %304 = extractvalue { i32, i32 } %263, 0, !dbg !58 + %305 = extractvalue { i32, i32 } %263, 1, !dbg !58 + %306 = fmul float %303, %.0.i, !dbg !66 + %307 = fmul float %297, %.0.i, !dbg !66 + %308 = fmul float %291, %.0.i, !dbg !66 + %309 = fmul float %285, %.0.i, !dbg !66 + tail call void @llvm.nvvm.barrier0(), !dbg !67 + %310 = getelementptr float, ptr addrspace(3) @global_smem, i64 %261, !dbg !67 + %311 = insertelement <2 x i32> undef, i32 %304, i64 0, !dbg !67 + %312 = insertelement <2 x i32> %311, i32 %305, i64 1, !dbg !67 + store <2 x i32> %312, ptr addrspace(3) %310, align 8, !dbg !67 + tail call void @llvm.nvvm.barrier0(), !dbg !67 + %313 = getelementptr float, ptr addrspace(3) @global_smem, i64 %55, !dbg !67 + %314 = load float, ptr addrspace(3) %313, align 16, !dbg !67 + %315 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 1, !dbg !67 + %316 = load float, ptr addrspace(3) %315, align 4, !dbg !67 + %317 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 2, !dbg !67 + %318 = load float, ptr addrspace(3) %317, align 8, !dbg !67 + %319 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 3, !dbg !67 + %320 = load float, ptr addrspace(3) %319, align 4, !dbg !67 + %321 = fmul float %306, %314, !dbg !67 + %322 = fmul float %307, %316, !dbg !67 + %323 = fmul float %308, %318, !dbg !67 + %324 = fmul float %309, %320, !dbg !67 + %325 = shl i32 %20, 8, !dbg !68 + %326 = or i32 %325, %15, !dbg !69 + %327 = sext i32 %326 to i64, !dbg !70 + %328 = getelementptr i16, ptr addrspace(1) %4, i64 %327, !dbg !70 + %329 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !71 + %330 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %322) #6, !dbg !71 + %331 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %323) #6, !dbg !71 + %332 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %324) #6, !dbg !71 + %333 = insertelement <2 x i16> undef, i16 %329, i64 0, !dbg !71 + %334 = insertelement <2 x i16> %333, i16 %330, i64 1, !dbg !71 + %335 = bitcast <2 x i16> %334 to i32, !dbg !71 + %336 = insertelement <2 x i16> undef, i16 %331, i64 0, !dbg !71 + %337 = insertelement <2 x i16> %336, i16 %332, i64 1, !dbg !71 + %338 = bitcast <2 x i16> %337 to i32, !dbg !71 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %335, i32 %338, ptr 
addrspace(1) %328, i1 true) #6, !dbg !71 + ret void, !dbg !72 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh") +!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 21, column: 28, scope: !7) +!13 = !DILocation(line: 21, column: 33, scope: !7) +!14 = !DILocation(line: 22, column: 23, scope: !7) +!15 = !DILocation(line: 26, column: 30, scope: !7) +!16 = !DILocation(line: 26, column: 35, scope: !7) +!17 = !DILocation(line: 27, column: 18, scope: !7) +!18 = !DILocation(line: 35, 
column: 44, scope: !7) +!19 = !DILocation(line: 35, column: 40, scope: !7) +!20 = !DILocation(line: 35, column: 34, scope: !7) +!21 = !DILocation(line: 35, column: 50, scope: !7) +!22 = !DILocation(line: 36, column: 22, scope: !7) +!23 = !DILocation(line: 37, column: 22, scope: !7) +!24 = !DILocation(line: 38, column: 36, scope: !7) +!25 = !DILocation(line: 39, column: 40, scope: !7) +!26 = !DILocation(line: 39, column: 55, scope: !7) +!27 = !DILocation(line: 40, column: 44, scope: !7) +!28 = !DILocation(line: 40, column: 40, scope: !7) +!29 = !DILocation(line: 40, column: 34, scope: !7) +!30 = !DILocation(line: 40, column: 52, scope: !7) +!31 = !DILocation(line: 41, column: 22, scope: !7) +!32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35) +!33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0) +!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!35 = !DILocation(line: 44, column: 38, scope: !33) +!36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35) +!37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35) +!38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35) +!39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41) +!40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0) +!41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42) +!42 = !DILocation(line: 50, column: 41, scope: !40) +!43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41) +!44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41) +!45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41) +!46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41) +!47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41) +!48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41) +!49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41) +!50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41) +!51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52) +!52 = !DILocation(line: 50, column: 41, scope: !33) +!53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41) +!54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41) +!55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41) +!56 = !DILocation(line: 59, column: 51, scope: !7) +!57 = !DILocation(line: 60, column: 35, scope: !7) +!58 = !DILocation(line: 60, column: 40, scope: !7) +!59 = !DILocation(line: 64, column: 57, scope: !7) +!60 = !DILocation(line: 65, column: 54, scope: !7) +!61 = !DILocation(line: 69, column: 23, scope: !7) +!62 = !DILocation(line: 71, column: 24, scope: !7) +!63 = !DILocation(line: 72, column: 30, scope: !7) +!64 = !DILocation(line: 66, column: 24, scope: !7) +!65 = !DILocation(line: 67, column: 24, scope: !7) +!66 = !DILocation(line: 73, column: 24, scope: !7) +!67 = !DILocation(line: 74, column: 24, scope: !7) +!68 = !DILocation(line: 76, column: 39, scope: !7) +!69 = !DILocation(line: 76, column: 35, scope: !7) +!70 = !DILocation(line: 76, column: 29, scope: !7) +!71 = !DILocation(line: 76, column: 52, scope: !7) +!72 = !DILocation(line: 55, column: 4, scope: !7) diff --git a/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..0b5f40b5e7eb9129cbd52fb5bfd25419609a39a6 --- /dev/null +++ 
b/.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir @@ -0,0 +1,125 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1> + %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked> + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked> + %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked> + %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked> + %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked> + %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2> + %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2> + %cst_10 = arith.constant 0.000000e+00 : f32 + %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked> + %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked> + %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked> + %cst_14 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1> + %c2_i32 = arith.constant 2 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c2_i32 : i32 + %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2> + %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2> + %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2> + %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> 
tensor<1x256xi32, #blocked> + %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<2x1x!tt.ptr, #blocked> + %15 = tt.splat %arg0 : (!tt.ptr) -> tensor<2x1x!tt.ptr, #blocked2> + %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr, #blocked>, tensor<2x1xi32, #blocked> + %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr, #blocked2>, tensor<2x1xi32, #blocked2> + %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked> + %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2> + %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked> + %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked> + %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1> + %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked> + %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked> + %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked> + %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked> + %27 = tt.splat %arg2 : (!tt.ptr) -> tensor<2x256x!tt.ptr, #blocked> + %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr, #blocked>, tensor<2x256xi32, #blocked> + %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked> + %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked> + %31 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked> + %32 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2> + %33 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked> + %34 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2> + %35 = arith.select %33, %31, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked> + %36 = arith.select %34, %32, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2> + %37 = arith.cmpi sge, %36, %cst_8 : tensor<2x1xi64, #blocked2> + %38 = arith.cmpi slt, %36, %cst_9 : tensor<2x1xi64, #blocked2> + %39 = arith.andi %37, %38 : tensor<2x1xi1, #blocked2> + tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2> + %40 = arith.muli %35, %cst_5 : tensor<2x1xi64, #blocked> + %41 = tt.broadcast %40 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked> + %42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked> + %43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked> + %44 = arith.addi %43, %41 : tensor<2x256xi64, #blocked> + %45 = tt.splat %arg1 : (!tt.ptr) -> tensor<2x256x!tt.ptr, #blocked> + %46 = tt.addptr %45, %44 : tensor<2x256x!tt.ptr, #blocked>, tensor<2x256xi64, #blocked> + %47 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked> + %48 = arith.addf %47, %30 : tensor<2x256xf32, #blocked> + %49 = arith.addf %48, %cst_13 : tensor<2x256xf32, #blocked> + %50 = arith.subf %48, %49 : tensor<2x256xf32, #blocked> + %51 = arith.mulf %48, %50 : tensor<2x256xf32, #blocked> + %52 = arith.addf %51, %cst_13 : tensor<2x256xf32, #blocked> + %53 = arith.select %29, %49, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked> + %54 = arith.select %29, %52, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked> + %55 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> + %56 = tt.broadcast %55 : 
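// The three-input tt.reduce below performs the same Welford/Chan merge as the
// LLVM lowering above; %56 broadcasts the per-column count (1.0 inside the
// 256-wide row mask, 0.0 outside) so masked-off lanes contribute nothing to the
// mean or m2.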
(tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked> + %57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %82 = arith.subf %arg10, %arg7 : f32 + %83 = arith.addf %arg9, %arg12 : f32 + %84 = arith.cmpf oeq, %83, %cst_10 : f32 + %85 = arith.divf %arg12, %83 : f32 + %86 = arith.select %84, %cst_10, %85 : f32 + %87 = arith.mulf %82, %86 : f32 + %88 = arith.addf %arg7, %87 : f32 + %89 = arith.addf %arg8, %arg11 : f32 + %90 = arith.mulf %82, %82 : f32 + %91 = arith.mulf %90, %arg9 : f32 + %92 = arith.mulf %91, %86 : f32 + %93 = arith.addf %89, %92 : f32 + tt.reduce.return %88, %93, %83 : f32, f32, f32 + }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked> + %59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked> + %60 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked> + %61 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked1> + %62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr, #blocked1>, tensor<1x256xi32, #blocked1> + %63 = tt.load %62, %22, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1> + tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2> + %64 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked> + %65 = arith.addf %64, %60 : tensor<2x256xf32, #blocked> + %66 = tt.broadcast %58 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked> + %67 = arith.subf %65, %66 : tensor<2x256xf32, #blocked> + %68 = arith.divf %59, %cst_12 : tensor<2x1xf32, #blocked> + %69 = arith.addf %68, %cst_11 : tensor<2x1xf32, #blocked> + %70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked> + %71 = tt.broadcast %70 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked> + %72 = arith.mulf %67, %71 : tensor<2x256xf32, #blocked> + %73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked> + %74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked> + %75 = arith.mulf %72, %74 : tensor<2x256xf32, #blocked> + %76 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked> + %77 = tt.broadcast %76 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked> + %78 = arith.addi %24, %77 : tensor<2x256xi32, #blocked> + %79 = tt.splat %arg4 : (!tt.ptr) -> tensor<2x256x!tt.ptr, #blocked> + %80 = tt.addptr %79, %78 : tensor<2x256x!tt.ptr, #blocked>, tensor<2x256xi32, #blocked> + %81 = arith.truncf %75 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked> + tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttgir 
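The ttgir module above is the tiled GPU form of an Inductor-fused embedding-gather + LayerNorm forward: each of the 2 rows per program gathers a 256-wide embedding row through an i64 index (negative indices are wrapped by adding 50257 in %31..%36, then bounds-checked by the tt.assert), adds a second 256-wide input, computes mean and variance in one pass via the three-way Welford tt.reduce, and stores (x - mean) * rsqrt(var + 1e-5) * weight as bf16. A minimal NumPy sketch of the per-row math follows; the function and argument names are assumptions for illustration, not anything present in the dump:

```python
import numpy as np

VOCAB, N, EPS = 50257, 256, 1e-5   # dense<50257>, dense<256>, 9.99999974E-6 in the IR

def fused_embed_layernorm_row(idx, table, residual, weight):
    """idx: int; table: (VOCAB, N) f32; residual, weight: (N,) f32. Hypothetical names."""
    if idx < 0:                # negative-index wraparound select (%31..%36)
        idx += VOCAB
    assert 0 <= idx < VOCAB    # tt.assert "index out of bounds: 0 <= tmp3 < 50257"
    x = table[idx] + residual            # gather (%47) plus elementwise add (%48)
    mean = x.mean()                      # first tt.reduce result (%57#0)
    var = ((x - mean) ** 2).mean()       # m2 (%57#1) divided by 256 (%cst_12)
    y = (x - mean) / np.sqrt(var + EPS)  # the __nv_rsqrtf path (%68..%72)
    return y * weight                    # scaled by the %arg3 load; stored as bf16 (%81)
```

The 16x256 module that follows is, as far as the IR shows, the same kernel at a different tile shape: 16 rows per program and 8 warps instead of 2 rows and 4 warps.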
b/.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..536c96af2e3168e2444dcf62e8eab15fea9b346c --- /dev/null +++ b/.triton/dump/345a87a492fd703c73ab83265a21fcb6/triton_.ttgir @@ -0,0 +1,125 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked> + %cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked> + %cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked> + %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked> + %cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1> + %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1> + %cst_9 = arith.constant 0.000000e+00 : f32 + %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2> + %cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2> + %cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked> + %cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked> + %cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1> + %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.make_range 
{end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> + %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked> + %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked> + %15 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked1> + %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr, #blocked1>, tensor<16x1xi32, #blocked1> + %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked> + %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1> + %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked> + %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked> + %22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2> + %23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked> + %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked> + %27 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi32, #blocked> + %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked> + %30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %31 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked> + %32 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1> + %33 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked> + %34 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1> + %35 = arith.select %33, %31, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked> + %36 = arith.select %34, %32, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1> + %37 = arith.cmpi sge, %36, %cst_7 : tensor<16x1xi64, #blocked1> + %38 = arith.cmpi slt, %36, %cst_8 : tensor<16x1xi64, #blocked1> + %39 = arith.andi %37, %38 : tensor<16x1xi1, #blocked1> + tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1> + %40 = arith.muli %35, %cst_4 : tensor<16x1xi64, #blocked> + %41 = tt.broadcast %40 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked> + %42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked> + %43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked> + %44 = arith.addi %43, %41 : tensor<16x256xi64, #blocked> + %45 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %46 = tt.addptr %45, %44 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi64, #blocked> + %47 = tt.load %46, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %48 = arith.addf %47, %30 : tensor<16x256xf32, #blocked> + %49 = arith.addf %48, %cst_14 : tensor<16x256xf32, #blocked> + %50 = arith.subf %48, %49 : tensor<16x256xf32, #blocked> + %51 = arith.mulf %48, %50 : tensor<16x256xf32, #blocked> + %52 = arith.addf %51, %cst_14 : tensor<16x256xf32, #blocked> + %53 = arith.select %29, %49, %cst_14 : tensor<16x256xi1, 
#blocked>, tensor<16x256xf32, #blocked> + %54 = arith.select %29, %52, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked> + %55 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> + %56 = tt.broadcast %55 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %82 = arith.subf %arg10, %arg7 : f32 + %83 = arith.addf %arg9, %arg12 : f32 + %84 = arith.cmpf oeq, %83, %cst_9 : f32 + %85 = arith.divf %arg12, %83 : f32 + %86 = arith.select %84, %cst_9, %85 : f32 + %87 = arith.mulf %82, %86 : f32 + %88 = arith.addf %arg7, %87 : f32 + %89 = arith.addf %arg8, %arg11 : f32 + %90 = arith.mulf %82, %82 : f32 + %91 = arith.mulf %90, %arg9 : f32 + %92 = arith.mulf %91, %86 : f32 + %93 = arith.addf %89, %92 : f32 + tt.reduce.return %88, %93, %83 : f32, f32, f32 + }) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %60 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %61 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked2> + %62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr, #blocked2>, tensor<1x256xi32, #blocked2> + %63 = tt.load %62, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2> + tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1> + %64 = tt.load %46, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %65 = arith.addf %64, %60 : tensor<16x256xf32, #blocked> + %66 = tt.broadcast %58 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %67 = arith.subf %65, %66 : tensor<16x256xf32, #blocked> + %68 = arith.divf %59, %cst_13 : tensor<16x1xf32, #blocked> + %69 = arith.addf %68, %cst_12 : tensor<16x1xf32, #blocked> + %70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked> + %71 = tt.broadcast %70 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %72 = arith.mulf %67, %71 : tensor<16x256xf32, #blocked> + %73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked> + %74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %75 = arith.mulf %72, %74 : tensor<16x256xf32, #blocked> + %76 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked> + %77 = tt.broadcast %76 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %78 = arith.addi %24, %77 : tensor<16x256xi32, #blocked> + %79 = tt.splat %arg4 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %80 = tt.addptr %79, %78 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi32, 
#blocked> + %81 = arith.truncf %75 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked> + tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c08911a056519cbfce8a8991f889ab5659121a49 --- /dev/null +++ b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx @@ -0,0 +1,456 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<10>; + .reg .b32 %r<44>; + .reg .f32 %f<11>; + .reg .b64 %rd<16>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2]; + ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1]; + ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 63; + .loc 1 24 33 + bfe.u32 %r3, %r1, 6, 2; + .loc 1 21 28 + mov.u32 %r10, %ctaid.x; + .loc 1 21 33 + shl.b32 %r12, %r10, 6; + .loc 1 22 23 + or.b32 %r4, %r12, %r2; + .loc 1 27 36 + shl.b32 %r13, %r3, 17; + add.s32 %r14, %r13, %r12; + or.b32 %r42, %r14, %r2; + mov.f32 %f10, 0f00000000; + mov.b32 %r43, -4; + mov.pred %p4, -1; +$L__BB0_1: + .loc 1 31 34 + mul.wide.s32 %rd5, %r42, 4; + add.s64 %rd4, %rd1, %rd5; + mov.b32 %r16, 0; + .loc 1 31 53 + mov.u32 %r15, 0x0; + @%p4 ld.global.L1::evict_first.b32 { %r15 }, [ %rd4 + 0 ]; + @!%p4 mov.u32 %r15, %r16; + mov.b32 %f4, %r15; + .loc 1 34 38 + add.f32 %f10, %f10, %f4; + .loc 1 27 36 + add.s32 %r43, %r43, 4; + add.s32 %r42, %r42, 524288; + setp.lt.u32 %p3, %r43, 116; + @%p3 bra $L__BB0_1; +$L__tmp1: + .loc 2 243 36 + shl.b32 %r25, %r3, 2; + shl.b32 %r26, %r2, 4; + or.b32 %r27, %r26, %r25; + mov.u32 %r28, global_smem; + add.s32 %r17, %r28, %r27; + mov.b32 %r18, %f10; + @%p4 st.shared.b32 [ %r17 + 0 ], %r18; + bar.sync 0; + setp.lt.s32 %p5, %r1, 256; + shl.b32 %r29, %r1, 2; + add.s32 %r20, %r28, %r29; + @%p5 ld.shared.b32 %r19, [ %r20 + 0 ]; + mov.b32 %f5, %r19; + shfl.sync.bfly.b32 %r30, %r19, 2, 31, -1; + mov.b32 %f6, %r30; +$L__tmp2: + .loc 2 233 15 + add.f32 %f7, %f5, %f6; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r31, %f7; + shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1; + mov.b32 %f8, %r32; +$L__tmp4: + .loc 2 233 15 + add.f32 %f9, %f7, %f8; +$L__tmp5: + .loc 2 243 36 + and.b32 %r33, %r1, 3; + setp.eq.s32 %p9, %r33, 0; + and.pred %p6, %p5, %p9; + mov.b32 %r22, %f9; + @%p6 st.shared.b32 [ %r20 + 0 ], %r22; + bar.sync 0; + add.s32 %r34, %r28, %r26; +$L__tmp6: + .loc 1 36 20 + shr.s32 %r36, %r4, 31; + shr.u32 %r37, %r36, 24; + add.s32 %r38, %r4, %r37; + shr.s32 %r39, %r38, 8; + and.b32 %r40, %r38, -256; + sub.s32 %r41, %r4, %r40; + .loc 1 38 30 + mul.wide.s32 %rd9, %r39, 8; + add.s64 %rd7, %rd2, %rd9; + .loc 1 45 55 + ld.shared.u32 %r24, [%r34]; + .loc 1 38 35 + mov.u64 %rd6, 0x0; + @%p4 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ]; + .loc 1 41 32 + shr.u64 %rd10, %rd6, 54; + and.b64 %rd11, %rd10, 512; + add.s64 %rd12, %rd11, %rd6; + .loc 1 45 30 + shl.b64 %rd13, %rd12, 10; + add.s64 %rd14, %rd3, %rd13; + mul.wide.s32 
%rd15, %r41, 4; + add.s64 %rd8, %rd14, %rd15; + .loc 1 45 55 + setp.eq.s32 %p8, %r3, 0; + mov.u32 %r23, 0x0; + @%p8 atom.global.gpu.acq_rel.add.f32 %r23, [ %rd8 + 0 ], %r24; + .loc 1 45 4 + ret; +$L__tmp7: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp6 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp5 +.b8 2 +.b8 35 +.b8 25 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp5 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f76247793b898b0e6e78963e133f5ff530eb0a9b --- /dev/null +++ 
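The 397c6f2f dump (PTX above, ttgir/ttir below) is a different pattern: a strided partial sum over 120 slices of stride 131072, reduced across threads by the shared-memory and shfl.sync.bfly sequence in the PTX, then scatter-added into a 512-row by 256-column table with one atomic float add per element (atom.global.gpu.acq_rel.add.f32 in the PTX, tt.atomic_rmw with op 5 in the IR) — the shape of an embedding-gradient accumulation. A NumPy reference of the same semantics; the array names and the (512, 256) output shape are inferred from the constants, not stated anywhere in the dump:

```python
import numpy as np

PAGES, ROWS, COLS = 120, 512, 256    # c120_i32, dense<512>, dense<256>

def scatter_accumulate(src, indices, out):
    """src: (PAGES, n) f32 slices to sum; indices: (n // COLS,) i64 row ids;
    out: (ROWS, COLS) f32, updated in place (per-element atomics on the GPU)."""
    total = src.sum(axis=0)                    # scf.for running sum + tt.reduce addf
    e = np.arange(total.size)
    row = indices[e // COLS].astype(np.int64)  # one row id per COLS elements
    row[row < 0] += ROWS                       # negative-index wraparound (%18..%20)
    np.add.at(out, (row, e % COLS), total)     # tt.atomic_rmw fadd (atomic_rmw_op = 5)
```

np.add.at is the serial stand-in for the per-element atomics; float addition order differs on the GPU, so results agree only up to rounding.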
b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir @@ -0,0 +1,62 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi64, #blocked> + %cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked> + %cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_3 = arith.constant dense<131072> : tensor<1x4xi32, #blocked> + %cst_4 = arith.constant dense<120> : tensor<1x4xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c120_i32 = arith.constant 120 : i32 + %c4_i32 = arith.constant 4 : i32 + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> + %cst_6 = arith.constant dense<true> : tensor<64x1xi1, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %5 = arith.addi %4, %3 : tensor<64x1xi32, #blocked> + %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked> + %8 = tt.broadcast %5 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %9 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<64x4x!tt.ptr<f32>, #blocked> + %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x4xf32, #blocked>) : i32 { + %27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32, #blocked> + %28 = arith.addi %27, %7 : tensor<1x4xi32, #blocked> + %29 = arith.cmpi slt, %28, %cst_4 : tensor<1x4xi32, #blocked> + %30 = arith.muli %28, %cst_3 : tensor<1x4xi32, #blocked> + %31 = tt.broadcast %30 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %32 = arith.addi %8, %31 : tensor<64x4xi32, #blocked> + %33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32>, #blocked>, tensor<64x4xi32, #blocked> + %34 = tt.broadcast %29 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked> + %35 = tt.load %33, %34, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + %36 = arith.addf %arg6, %35 : tensor<64x4xf32, #blocked> + %37 = arith.select %34, %36, %arg6 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> + scf.yield %37 : tensor<64x4xf32, #blocked> + } + %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %27 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %27 : f32 + }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %12 = tt.expand_dims %11 
{axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %13 = arith.divsi %5, %cst_2 : tensor<64x1xi32, #blocked> + %14 = arith.remsi %5, %cst_2 : tensor<64x1xi32, #blocked> + %15 = tt.splat %arg1 : (!tt.ptr<i64>) -> tensor<64x1x!tt.ptr<i64>, #blocked> + %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64>, #blocked>, tensor<64x1xi32, #blocked> + %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked> + %18 = arith.addi %17, %cst_1 : tensor<64x1xi64, #blocked> + %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64, #blocked> + %20 = arith.select %19, %18, %17 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> + %21 = arith.muli %20, %cst : tensor<64x1xi64, #blocked> + %22 = arith.extsi %14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> + %23 = arith.addi %22, %21 : tensor<64x1xi64, #blocked> + %24 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<64x1x!tt.ptr<f32>, #blocked> + %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xi64, #blocked> + %26 = "tt.atomic_rmw"(%25, %12, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..42ac242162058bd1f96006963526019a8211b9b3 --- /dev/null +++ b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir @@ -0,0 +1,61 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi64> + %cst_0 = arith.constant dense<0> : tensor<64x1xi64> + %cst_1 = arith.constant dense<512> : tensor<64x1xi64> + %c4_i32 = arith.constant 4 : i32 + %c120_i32 = arith.constant 120 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_2 = arith.constant dense<true> : tensor<64x1xi1> + %cst_3 = arith.constant dense<256> : tensor<64x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x4xi32> + %cst_5 = arith.constant dense<120> : tensor<1x4xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32> + %8 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %9 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<64x4x!tt.ptr<f32>> + %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x4xf32>) : i32 { + %27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32> + %28 = arith.addi %27, %7 : tensor<1x4xi32> + %29 = arith.cmpi slt, %28, %cst_5 : tensor<1x4xi32> + %30 = arith.muli %28, %cst_4 : tensor<1x4xi32> + %31 = tt.broadcast %30 : (tensor<1x4xi32>) -> tensor<64x4xi32> + %32 = arith.addi 
%8, %31 : tensor<64x4xi32> + %33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi32> + %34 = tt.broadcast %29 : (tensor<1x4xi1>) -> tensor<64x4xi1> + %35 = tt.load %33, %34, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32> + %36 = arith.addf %arg6, %35 : tensor<64x4xf32> + %37 = arith.select %34, %36, %arg6 : tensor<64x4xi1>, tensor<64x4xf32> + scf.yield %37 : tensor<64x4xf32> + } + %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %27 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %27 : f32 + }) : (tensor<64x4xf32>) -> tensor<64xf32> + %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %13 = arith.divsi %5, %cst_3 : tensor<64x1xi32> + %14 = arith.remsi %5, %cst_3 : tensor<64x1xi32> + %15 = tt.splat %arg1 : (!tt.ptr<i64>) -> tensor<64x1x!tt.ptr<i64>> + %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64>>, tensor<64x1xi32> + %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %18 = arith.addi %17, %cst_1 : tensor<64x1xi64> + %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64> + %20 = arith.select %19, %18, %17 : tensor<64x1xi1>, tensor<64x1xi64> + %21 = arith.muli %20, %cst : tensor<64x1xi64> + %22 = arith.extsi %14 : tensor<64x1xi32> to tensor<64x1xi64> + %23 = arith.addi %22, %21 : tensor<64x1xi64> + %24 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<64x1x!tt.ptr<f32>> + %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi64> + %26 = "tt.atomic_rmw"(%25, %12, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32>>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..99876af7f5c7e389af6c7c0b535c4a74117764aa Binary files /dev/null and b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin differ diff --git a/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.llir b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..0c4f89187c137d04c4b917750d3a2bafc357b218 --- /dev/null +++ b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.llir @@ -0,0 +1,355 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %10 = lshr i32 %9, 2, !dbg !10 + %11 = and i32 %10, 63, !dbg !10 + %12 = and i32 %9, 63, !dbg !10 + %13 = and i32 %9, 3, !dbg !11 + %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !12 + %15 = shl i32 %14, 6, !dbg !13 + %16 = or i32 %15, %11, 
!dbg !14 + %17 = or i32 %15, %12, !dbg !14 + %18 = sext i32 %16 to i64, !dbg !15 + %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15 + %20 = sext i32 %17 to i64, !dbg !15 + %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15 + %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #5, !dbg !16 + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #5, !dbg !16 + %24 = srem i32 %16, 512, !dbg !17 + %25 = shl nsw i32 %24, 8, !dbg !18 + %26 = shl i32 %16, 8, !dbg !19 + %27 = add i64 %23, 50257, !dbg !20 + %28 = icmp slt i64 %22, 0, !dbg !21 + %29 = icmp slt i64 %23, 0, !dbg !21 + %30 = select i1 %29, i64 %27, i64 %23, !dbg !22 + %.fr8 = freeze i64 %30, !dbg !23 + %31 = icmp ugt i64 %.fr8, 50256, !dbg !23 + %32 = shl i64 %22, 8, !dbg !24 + %33 = add i64 %32, 12865792, !dbg !24 + %34 = select i1 %28, i64 %33, i64 %32, !dbg !24 + %35 = getelementptr float, ptr addrspace(1) %1, i64 %34 + br i1 %31, label %.split.us, label %.split, !dbg !25 + +.split.us: ; preds = %8, %.split.us + %36 = phi float [ %58, %.split.us ], [ 0.000000e+00, %8 ] + %37 = phi float [ %63, %.split.us ], [ 0.000000e+00, %8 ] + %38 = phi float [ %60, %.split.us ], [ 0.000000e+00, %8 ] + %39 = phi i32 [ %64, %.split.us ], [ 0, %8 ] + %40 = or i32 %39, %13, !dbg !26 + %41 = add i32 %40, %25, !dbg !27 + %42 = sext i32 %41 to i64, !dbg !28 + %43 = getelementptr float, ptr addrspace(1) %2, i64 %42, !dbg !28 + %44 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %43, i1 true, i32 0, i1 true) #5, !dbg !29 + %45 = bitcast i32 %44 to float, !dbg !29 + %46 = add i32 %40, %26, !dbg !30 + %47 = sext i32 %46 to i64, !dbg !31 + %48 = getelementptr i16, ptr addrspace(1) %3, i64 %47, !dbg !31 + %49 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %48, i1 true, i16 0, i1 true) #5, !dbg !32 + %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #5, !dbg !33 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !25 + %51 = zext nneg i32 %40 to i64, !dbg !34 + %52 = getelementptr float, ptr addrspace(1) %35, i64 %51, !dbg !35 + %53 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true) #5, !dbg !36 + %54 = bitcast i32 %53 to float, !dbg !36 + %55 = fadd float %45, %54, !dbg !37 + %56 = fadd float %50, %55, !dbg !38 + %57 = fsub float %56, %38, !dbg !39 + %58 = fadd float %36, 1.000000e+00, !dbg !43 + %59 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %57, float %58) #5, !dbg !44 + %60 = fadd float %38, %59, !dbg !45 + %61 = fsub float %56, %60, !dbg !46 + %62 = fmul float %57, %61, !dbg !47 + %63 = fadd float %37, %62, !dbg !48 + %64 = add nuw nsw i32 %39, 4, !dbg !49 + %65 = icmp ult i32 %39, 252, !dbg !49 + br i1 %65, label %.split.us, label %.split5.us, !dbg !49 + +.split: ; preds = %8, %.split + %66 = phi float [ %88, %.split ], [ 0.000000e+00, %8 ] + %67 = phi float [ %93, %.split ], [ 0.000000e+00, %8 ] + %68 = phi float [ %90, %.split ], [ 
0.000000e+00, %8 ] + %69 = phi i32 [ %94, %.split ], [ 0, %8 ] + %70 = or i32 %69, %13, !dbg !26 + %71 = add i32 %70, %25, !dbg !27 + %72 = sext i32 %71 to i64, !dbg !28 + %73 = getelementptr float, ptr addrspace(1) %2, i64 %72, !dbg !28 + %74 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %73, i1 true, i32 0, i1 true) #5, !dbg !29 + %75 = bitcast i32 %74 to float, !dbg !29 + %76 = add i32 %70, %26, !dbg !30 + %77 = sext i32 %76 to i64, !dbg !31 + %78 = getelementptr i16, ptr addrspace(1) %3, i64 %77, !dbg !31 + %79 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %78, i1 true, i16 0, i1 true) #5, !dbg !32 + %80 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %79) #5, !dbg !33 + %81 = zext nneg i32 %70 to i64, !dbg !34 + %82 = getelementptr float, ptr addrspace(1) %35, i64 %81, !dbg !35 + %83 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true) #5, !dbg !36 + %84 = bitcast i32 %83 to float, !dbg !36 + %85 = fadd float %75, %84, !dbg !37 + %86 = fadd float %80, %85, !dbg !38 + %87 = fsub float %86, %68, !dbg !39 + %88 = fadd float %66, 1.000000e+00, !dbg !43 + %89 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %87, float %88) #5, !dbg !44 + %90 = fadd float %68, %89, !dbg !45 + %91 = fsub float %86, %90, !dbg !46 + %92 = fmul float %87, %91, !dbg !47 + %93 = fadd float %67, %92, !dbg !48 + %94 = add nuw nsw i32 %69, 4, !dbg !49 + %95 = icmp ult i32 %69, 252, !dbg !49 + br i1 %95, label %.split, label %.split5.us, !dbg !49 + +.split5.us: ; preds = %.split, %.split.us + %.us-phi = phi float [ %60, %.split.us ], [ %90, %.split ] + %.us-phi6 = phi float [ %63, %.split.us ], [ %93, %.split ] + %.us-phi7 = phi float [ %58, %.split.us ], [ %88, %.split ] + %96 = bitcast float %.us-phi to i32, !dbg !50 + %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 2, i32 31), !dbg !50 + %98 = bitcast i32 %97 to float, !dbg !50 + %99 = bitcast float %.us-phi6 to i32, !dbg !50 + %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 2, i32 31), !dbg !50 + %101 = bitcast i32 %100 to float, !dbg !50 + %102 = bitcast float %.us-phi7 to i32, !dbg !50 + %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !50 + %104 = bitcast i32 %103 to float, !dbg !50 + %105 = fsub float %98, %.us-phi, !dbg !52 + %106 = fadd float %.us-phi7, %104, !dbg !56 + %107 = fcmp oeq float %106, 0.000000e+00, !dbg !57 + %108 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %104, float %106) #5, !dbg !58 + %109 = select i1 %107, float 0.000000e+00, float %108, !dbg !59 + %110 = fmul float %105, %109, !dbg !60 + %111 = fadd float %.us-phi, %110, !dbg !61 + %112 = fadd float %.us-phi6, %101, !dbg !62 + %113 = fmul float %105, %105, !dbg !63 + %114 = fmul float %.us-phi7, %113, !dbg !64 + %115 = fmul float %114, %109, !dbg !65 + %116 = fadd float %112, %115, !dbg !66 + %117 = bitcast float %111 to i32, !dbg !50 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !50 + %119 = bitcast i32 %118 to float, !dbg !50 + %120 = bitcast float %116 to i32, !dbg !50 + %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 
%120, i32 1, i32 31), !dbg !50 + %122 = bitcast i32 %121 to float, !dbg !50 + %123 = bitcast float %106 to i32, !dbg !50 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 1, i32 31), !dbg !50 + %125 = bitcast i32 %124 to float, !dbg !50 + %126 = fsub float %119, %111, !dbg !52 + %127 = fadd float %106, %125, !dbg !56 + %128 = fcmp oeq float %127, 0.000000e+00, !dbg !57 + %129 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %125, float %127) #5, !dbg !58 + %130 = select i1 %128, float 0.000000e+00, float %129, !dbg !59 + %131 = fmul float %126, %130, !dbg !60 + %132 = fadd float %111, %131, !dbg !61 + %133 = fadd float %116, %122, !dbg !62 + %134 = fmul float %126, %126, !dbg !63 + %135 = fmul float %106, %134, !dbg !64 + %136 = fmul float %130, %135, !dbg !65 + %137 = fadd float %133, %136, !dbg !66 + %138 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %137, float 2.560000e+02) #5, !dbg !67 + %139 = fadd float %138, 0x3EE4F8B580000000, !dbg !68 + br label %140, !dbg !69 + +140: ; preds = %.split5.us, %__nv_rsqrtf.exit + %141 = phi i32 [ 0, %.split5.us ], [ %174, %__nv_rsqrtf.exit ] + %142 = or i32 %141, %13, !dbg !70 + %143 = add i32 %142, %25, !dbg !71 + %144 = sext i32 %143 to i64, !dbg !72 + %145 = getelementptr float, ptr addrspace(1) %2, i64 %144, !dbg !72 + %146 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %145, i1 true, i32 0, i1 true) #5, !dbg !73 + %147 = bitcast i32 %146 to float, !dbg !73 + %148 = add i32 %142, %26, !dbg !74 + %149 = sext i32 %148 to i64, !dbg !75 + %150 = getelementptr i16, ptr addrspace(1) %3, i64 %149, !dbg !75 + %151 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %150, i1 true, i16 0, i1 true) #5, !dbg !76 + %152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %151) #5, !dbg !77 + %153 = zext nneg i32 %142 to i64, !dbg !78 + %154 = getelementptr float, ptr addrspace(1) %4, i64 %153, !dbg !78 + %155 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %154, i1 true, i32 0, i1 true) #5, !dbg !79 + %156 = bitcast i32 %155 to float, !dbg !79 + br i1 %31, label %157, label %158, !dbg !80 + +157: ; preds = %140 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80 + br label %158, !dbg !80 + +158: ; preds = %157, %140 + %159 = getelementptr float, ptr addrspace(1) %35, i64 %153, !dbg !81 + %160 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true) #5, !dbg !82 + %161 = bitcast i32 %160 to float, !dbg !82 + %162 = fadd float %147, %161, !dbg !83 + %163 = fadd float %152, %162, !dbg !84 + %164 = fsub float %163, %132, !dbg !85 + %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !86 + %.not.i = icmp eq i32 %165, 0, !dbg !86 + br i1 %.not.i, label %168, label %166, !dbg !86 + +166: ; preds = %158 + %167 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %139), !dbg !86 + br label %__nv_rsqrtf.exit, !dbg !86 + +168: ; preds = %158 + %169 = tail call float @llvm.nvvm.rsqrt.approx.f(float %139), !dbg !86 + br label 
%__nv_rsqrtf.exit, !dbg !86 + +__nv_rsqrtf.exit: ; preds = %166, %168 + %.0.i = phi float [ %167, %166 ], [ %169, %168 ], !dbg !86 + %170 = fmul float %164, %.0.i, !dbg !87 + %171 = fmul float %170, %156, !dbg !88 + %172 = getelementptr i16, ptr addrspace(1) %5, i64 %149, !dbg !89 + %173 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %171) #5, !dbg !90 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %173, ptr addrspace(1) %172, i1 true) #5, !dbg !90 + %174 = add nuw nsw i32 %141, 4, !dbg !69 + %175 = icmp ult i32 %141, 252, !dbg !69 + br i1 %175, label %140, label %176, !dbg !69 + +176: ; preds = %__nv_rsqrtf.exit + ret void, !dbg !91 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #2 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #4 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py", directory: "/tmp/torchinductor_root/ci") +!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = 
!DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 21, column: 28, scope: !7) +!13 = !DILocation(line: 21, column: 33, scope: !7) +!14 = !DILocation(line: 22, column: 23, scope: !7) +!15 = !DILocation(line: 26, column: 30, scope: !7) +!16 = !DILocation(line: 26, column: 35, scope: !7) +!17 = !DILocation(line: 27, column: 18, scope: !7) +!18 = !DILocation(line: 35, column: 44, scope: !7) +!19 = !DILocation(line: 36, column: 44, scope: !7) +!20 = !DILocation(line: 37, column: 22, scope: !7) +!21 = !DILocation(line: 38, column: 22, scope: !7) +!22 = !DILocation(line: 39, column: 36, scope: !7) +!23 = !DILocation(line: 40, column: 40, scope: !7) +!24 = !DILocation(line: 41, column: 44, scope: !7) +!25 = !DILocation(line: 40, column: 55, scope: !7) +!26 = !DILocation(line: 32, column: 27, scope: !7) +!27 = !DILocation(line: 35, column: 40, scope: !7) +!28 = !DILocation(line: 35, column: 34, scope: !7) +!29 = !DILocation(line: 35, column: 50, scope: !7) +!30 = !DILocation(line: 36, column: 40, scope: !7) +!31 = !DILocation(line: 36, column: 34, scope: !7) +!32 = !DILocation(line: 36, column: 50, scope: !7) +!33 = !DILocation(line: 36, column: 101, scope: !7) +!34 = !DILocation(line: 41, column: 40, scope: !7) +!35 = !DILocation(line: 41, column: 34, scope: !7) +!36 = !DILocation(line: 41, column: 52, scope: !7) +!37 = !DILocation(line: 42, column: 22, scope: !7) +!38 = !DILocation(line: 44, column: 22, scope: !7) +!39 = !DILocation(line: 96, column: 20, scope: !40, inlinedAt: !42) +!40 = distinct !DILexicalBlockFile(scope: !7, file: !41, discriminator: 0) +!41 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!42 = !DILocation(line: 47, column: 41, scope: !40) +!43 = !DILocation(line: 97, column: 26, scope: !40, inlinedAt: !42) +!44 = !DILocation(line: 98, column: 30, scope: !40, inlinedAt: !42) +!45 = !DILocation(line: 98, column: 22, scope: !40, inlinedAt: !42) +!46 = !DILocation(line: 101, column: 30, scope: !40, inlinedAt: !42) +!47 = !DILocation(line: 101, column: 22, scope: !40, inlinedAt: !42) +!48 = !DILocation(line: 50, column: 50, scope: !7) +!49 = !DILocation(line: 31, column: 36, scope: !7) +!50 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !51) +!51 = !DILocation(line: 53, column: 44, scope: !40) +!52 = !DILocation(line: 108, column: 21, scope: !53, inlinedAt: !54) +!53 = distinct !DILexicalBlockFile(scope: !40, file: !41, discriminator: 0) +!54 = !DILocation(line: 120, column: 46, scope: !53, inlinedAt: !55) +!55 = !DILocation(line: 53, column: 44, scope: !53) +!56 = !DILocation(line: 109, column: 28, scope: !53, inlinedAt: !54) +!57 = !DILocation(line: 110, column: 39, scope: !53, inlinedAt: !54) +!58 = !DILocation(line: 110, column: 60, scope: !53, inlinedAt: !54) +!59 = !DILocation(line: 110, column: 49, scope: !53, inlinedAt: !54) +!60 = !DILocation(line: 112, column: 25, scope: !53, inlinedAt: !54) +!61 = !DILocation(line: 112, column: 17, scope: !53, inlinedAt: !54) +!62 = !DILocation(line: 113, column: 15, scope: !53, inlinedAt: !54) +!63 = !DILocation(line: 113, column: 30, scope: !53, inlinedAt: !54) +!64 = !DILocation(line: 113, column: 38, scope: !53, inlinedAt: !54) +!65 = !DILocation(line: 113, column: 49, scope: !53, inlinedAt: !54) +!66 = !DILocation(line: 113, column: 22, scope: !53, inlinedAt: !54) +!67 = !DILocation(line: 75, column: 24, scope: !7) +!68 = 
!DILocation(line: 77, column: 24, scope: !7) +!69 = !DILocation(line: 58, column: 36, scope: !7) +!70 = !DILocation(line: 59, column: 27, scope: !7) +!71 = !DILocation(line: 62, column: 41, scope: !7) +!72 = !DILocation(line: 62, column: 35, scope: !7) +!73 = !DILocation(line: 62, column: 51, scope: !7) +!74 = !DILocation(line: 63, column: 41, scope: !7) +!75 = !DILocation(line: 63, column: 35, scope: !7) +!76 = !DILocation(line: 63, column: 51, scope: !7) +!77 = !DILocation(line: 63, column: 103, scope: !7) +!78 = !DILocation(line: 64, column: 35, scope: !7) +!79 = !DILocation(line: 64, column: 40, scope: !7) +!80 = !DILocation(line: 68, column: 57, scope: !7) +!81 = !DILocation(line: 69, column: 35, scope: !7) +!82 = !DILocation(line: 69, column: 54, scope: !7) +!83 = !DILocation(line: 70, column: 24, scope: !7) +!84 = !DILocation(line: 72, column: 24, scope: !7) +!85 = !DILocation(line: 73, column: 24, scope: !7) +!86 = !DILocation(line: 78, column: 30, scope: !7) +!87 = !DILocation(line: 79, column: 24, scope: !7) +!88 = !DILocation(line: 80, column: 24, scope: !7) +!89 = !DILocation(line: 82, column: 29, scope: !7) +!90 = !DILocation(line: 82, column: 52, scope: !7) +!91 = !DILocation(line: 58, column: 4, scope: !7) diff --git a/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..898aba1d1b099a49da74fbe60b9e306105ff158c --- /dev/null +++ b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir @@ -0,0 +1,153 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x4xf32> + %c256_i32 = arith.constant 256 : i32 + %c4_i32 = arith.constant 4 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_2 = arith.constant dense<256> : tensor<64x1xi64> + %cst_3 = arith.constant dense<0> : tensor<64x1xi64> + %cst_4 = arith.constant dense<50257> : tensor<64x1xi64> + %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32> + %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x4xf32> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> + %cst_9 = arith.constant dense<256> : tensor<64x1xi32> + %cst_10 = arith.constant dense<256> : tensor<1x4xi32> + %cst_11 = arith.constant dense<512> : tensor<64x1xi32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %9 = tt.addptr %8, %5 : 
tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %11 = arith.remsi %5, %cst_11 : tensor<64x1xi32> + %12 = arith.muli %11, %cst_9 : tensor<64x1xi32> + %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %15 = arith.muli %5, %cst_9 : tensor<64x1xi32> + %16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %18 = arith.addi %10, %cst_4 : tensor<64x1xi64> + %19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64> + %20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64> + %21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64> + %22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64> + %23 = arith.andi %21, %22 : tensor<64x1xi1> + %24 = arith.muli %20, %cst_2 : tensor<64x1xi64> + %25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x4xi64> + %26 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 { + %51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32> + %52 = arith.addi %51, %7 : tensor<1x4xi32> + %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32> + %54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32> + %55 = arith.addi %54, %13 : tensor<64x4xi32> + %56 = tt.addptr %14, %55 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1> + %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + %59 = arith.addi %54, %16 : tensor<64x4xi32> + %60 = tt.addptr %17, %59 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xbf16> + %62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32> + tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %63 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64> + %64 = tt.broadcast %63 : (tensor<1x4xi64>) -> tensor<64x4xi64> + %65 = arith.addi %64, %25 : tensor<64x4xi64> + %66 = tt.addptr %26, %65 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> + %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + %68 = arith.addf %67, %58 : tensor<64x4xf32> + %69 = arith.addf %68, %62 : tensor<64x4xf32> + %70 = arith.subf %69, %arg9 : tensor<64x4xf32> + %71 = arith.addf %arg11, %cst_1 : tensor<64x4xf32> + %72 = arith.divf %70, %71 : tensor<64x4xf32> + %73 = arith.addf %arg9, %72 : tensor<64x4xf32> + %74 = arith.subf %69, %73 : tensor<64x4xf32> + %75 = arith.mulf %70, %74 : tensor<64x4xf32> + %76 = arith.addf %arg10, %75 : tensor<64x4xf32> + %77 = arith.select %57, %73, %arg9 : tensor<64x4xi1>, tensor<64x4xf32> + %78 = arith.select %57, %76, %arg10 : tensor<64x4xi1>, tensor<64x4xf32> + %79 = arith.select %57, %71, %arg11 : tensor<64x4xi1>, tensor<64x4xf32> + scf.yield %77, %78, %79 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32> + } + %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %51 = arith.subf %arg11, %arg8 : f32 + %52 = arith.addf %arg10, %arg13 : f32 + %53 = arith.cmpf oeq, %52, %cst_0 : f32 + %54 = arith.divf %arg13, %52 : f32 + %55 = arith.select %53, %cst_0, 
%54 : f32 + %56 = arith.mulf %51, %55 : f32 + %57 = arith.addf %arg8, %56 : f32 + %58 = arith.addf %arg9, %arg12 : f32 + %59 = arith.mulf %51, %51 : f32 + %60 = arith.mulf %59, %arg10 : f32 + %61 = arith.mulf %60, %55 : f32 + %62 = arith.addf %58, %61 : f32 + tt.reduce.return %57, %62, %52 : f32, f32, f32 + }) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %31 = arith.muli %11, %cst_9 : tensor<64x1xi32> + %32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %33 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %34 = arith.muli %5, %cst_9 : tensor<64x1xi32> + %35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %36 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %37 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x4x!tt.ptr> + %38 = arith.addi %10, %cst_4 : tensor<64x1xi64> + %39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64> + %40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64> + %41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64> + %42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64> + %43 = arith.andi %41, %42 : tensor<64x1xi1> + %44 = arith.muli %40, %cst_2 : tensor<64x1xi64> + %45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x4xi64> + %46 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x4xf32> + %48 = arith.divf %30, %cst_6 : tensor<64x1xf32> + %49 = arith.addf %48, %cst_5 : tensor<64x1xf32> + %50 = tt.splat %arg5 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 : i32 { + %51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32> + %52 = arith.addi %51, %7 : tensor<1x4xi32> + %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32> + %54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32> + %55 = arith.addi %54, %32 : tensor<64x4xi32> + %56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1> + %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + %59 = arith.addi %54, %35 : tensor<64x4xi32> + %60 = tt.addptr %36, %59 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16> + %62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32> + %63 = tt.addptr %37, %52 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> + %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32> + tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %65 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64> + %66 = tt.broadcast %65 : (tensor<1x4xi64>) -> tensor<64x4xi64> + %67 = arith.addi %66, %45 : tensor<64x4xi64> + %68 = tt.addptr %46, %67 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> + %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32> + %70 = arith.addf %69, %58 : tensor<64x4xf32> + %71 = arith.addf %70, %62 : tensor<64x4xf32> + %72 = arith.subf %71, %47 : tensor<64x4xf32> + %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} 
: (tensor<64x1xf32>) -> tensor<64x1xf32> + %74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x4xf32> + %75 = arith.mulf %72, %74 : tensor<64x4xf32> + %76 = tt.broadcast %64 : (tensor<1x4xf32>) -> tensor<64x4xf32> + %77 = arith.mulf %75, %76 : tensor<64x4xf32> + %78 = tt.addptr %50, %59 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %79 = arith.truncf %77 : tensor<64x4xf32> to tensor<64x4xbf16> + tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16> + } + tt.return + } +} diff --git a/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f7fec3b790850675dc182d4dff538abcd91e5f45 --- /dev/null +++ b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx @@ -0,0 +1,717 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<26>; + .reg .b16 %rs<9>; + .reg .b32 %r<88>; + .reg .f32 %f<78>; + .reg .b64 %rd<14>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_0]; + ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r56, %tid.x; + and.b32 %r57, %r56, 31; + ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2]; + ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4]; + shl.b32 %r58, %r56, 2; + and.b32 %r59, %r58, 252; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r60, %r1, 8; + .loc 1 30 36 + or.b32 %r61, %r60, %r59; + .loc 1 30 30 + mul.wide.s32 %rd11, %r61, 4; + add.s64 %rd1, %rd6, %rd11; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r4; + mov.b32 %f2, %r5; + .loc 1 31 30 + mul.wide.s32 %rd12, %r61, 2; + add.s64 %rd2, %rd7, %rd12; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f3, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f4, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f5, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f6, %r17; + .loc 1 32 30 + add.s64 %rd3, %rd8, %rd12; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + 
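// annotation (not compiler output): the mov.b32 {tmp, %rs} idiom above splits each packed 32-bit load into its two bf16 halves; the cvt.f32.bf16 run below widens them to f32 before the f32 input and the two bf16 inputs are summed ahead of the mean reduction. +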
.loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f7, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f8, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f9, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f10, %r25; + .loc 1 33 31 + mul.wide.u32 %rd13, %r59, 4; + add.s64 %rd4, %rd9, %rd13; + .loc 1 33 36 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + @!%p1 mov.u32 %r28, %r6; + @!%p1 mov.u32 %r29, %r6; + .loc 1 35 18 + add.f32 %f11, %f5, %f1; + add.f32 %f12, %f6, %f2; + .loc 1 30 46 + mov.b32 %f13, %r3; + mov.b32 %f14, %r2; + .loc 1 35 18 + add.f32 %f15, %f3, %f14; + add.f32 %f16, %f4, %f13; + .loc 1 37 18 + add.f32 %f17, %f16, %f8; + add.f32 %f18, %f15, %f7; + add.f32 %f19, %f11, %f9; + add.f32 %f20, %f12, %f10; +$L__tmp1: + .loc 2 233 15 + add.f32 %f21, %f18, %f17; + add.f32 %f22, %f21, %f19; + add.f32 %f23, %f22, %f20; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r62, %f23; + shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1; + mov.b32 %f24, %r63; +$L__tmp3: + .loc 2 233 15 + add.f32 %f25, %f23, %f24; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r64, %f25; + shfl.sync.bfly.b32 %r65, %r64, 8, 31, -1; + mov.b32 %f26, %r65; +$L__tmp5: + .loc 2 233 15 + add.f32 %f27, %f25, %f26; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r66, %f27; + shfl.sync.bfly.b32 %r67, %r66, 4, 31, -1; + mov.b32 %f28, %r67; +$L__tmp7: + .loc 2 233 15 + add.f32 %f29, %f27, %f28; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r68, %f29; + shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1; + mov.b32 %f30, %r69; +$L__tmp9: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r70, %f31; + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + mov.b32 %f32, %r71; +$L__tmp11: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p17, %r57, 0; + shr.u32 %r72, %r56, 3; + and.b32 %r73, %r72, 4; + mov.u32 %r74, global_smem; + add.s32 %r34, %r74, %r73; + mov.b32 %r35, %f33; + @%p17 st.shared.b32 [ %r34 + 0 ], %r35; + bar.sync 0; + setp.lt.s32 %p18, %r56, 2; + add.s32 %r37, %r74, %r58; + @%p18 ld.shared.b32 %r36, [ %r37 + 0 ]; + mov.b32 %f34, %r36; + shfl.sync.bfly.b32 %r75, %r36, 1, 31, -1; + mov.b32 %f35, %r75; +$L__tmp13: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp14: + .loc 2 243 36 + and.b32 %r76, %r56, 1; + setp.eq.b32 %p24, %r76, 1; + not.pred %p25, %p24; + and.pred %p19, %p18, %p25; + mov.b32 %r39, %f36; + @%p19 st.shared.b32 [ %r37 + 0 ], %r39; + bar.sync 0; + ld.shared.f32 %f37, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f38, %f37, 0f00000000; +$L__tmp16: + .loc 1 45 20 + mov.b32 %r41, %f38; + mov.b32 %r42, 1132462080; + div.full.f32 %r40, %r41, %r42; + mov.b32 %f39, %r40; + .loc 1 46 19 + sub.f32 %f40, %f18, %f39; + sub.f32 %f41, %f17, %f39; + sub.f32 %f42, %f19, %f39; + sub.f32 %f43, %f20, %f39; + .loc 1 47 20 + mul.f32 %f44, %f41, %f41; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f45, %f40, %f40, %f44; + fma.rn.f32 %f46, %f42, %f42, %f45; + fma.rn.f32 %f47, %f43, %f43, %f46; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r77, %f47; + shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1; + mov.b32 %f48, %r78; +$L__tmp20: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r79, %f49; + shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1; + mov.b32 %f50, %r80; +$L__tmp22: + .loc 2 233 15 + add.f32 %f51, %f49, %f50; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r81, %f51; + shfl.sync.bfly.b32 %r82, %r81, 
4, 31, -1; + mov.b32 %f52, %r82; +$L__tmp24: + .loc 2 233 15 + add.f32 %f53, %f51, %f52; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r83, %f53; + shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1; + mov.b32 %f54, %r84; +$L__tmp26: + .loc 2 233 15 + add.f32 %f55, %f53, %f54; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r85, %f55; + shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1; + mov.b32 %f56, %r86; +$L__tmp28: + .loc 2 233 15 + add.f32 %f57, %f55, %f56; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r44, %f57; + @%p17 st.shared.b32 [ %r34 + 0 ], %r44; + bar.sync 0; + @%p18 ld.shared.b32 %r45, [ %r37 + 0 ]; + mov.b32 %f58, %r45; + shfl.sync.bfly.b32 %r87, %r45, 1, 31, -1; + mov.b32 %f59, %r87; +$L__tmp30: + .loc 2 233 15 + add.f32 %f60, %f58, %f59; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r48, %f60; + @%p19 st.shared.b32 [ %r37 + 0 ], %r48; + bar.sync 0; + ld.shared.f32 %f61, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f62, %f61, 0f00000000; +$L__tmp33: + .loc 1 53 20 + mov.b32 %r50, %f62; + div.full.f32 %r49, %r50, %r42; + mov.b32 %f63, %r49; + .loc 1 55 20 + add.f32 %f64, %f63, 0f3727C5AC; + .loc 1 56 26 + rsqrt.approx.ftz.f32 %f65, %f64; + .loc 1 33 36 + mov.b32 %f66, %r29; + mov.b32 %f67, %r28; + mov.b32 %f68, %r27; + mov.b32 %f69, %r26; + .loc 1 57 20 + mul.f32 %f70, %f40, %f65; + mul.f32 %f71, %f41, %f65; + mul.f32 %f72, %f42, %f65; + mul.f32 %f73, %f43, %f65; + .loc 1 58 20 + mul.f32 %f74, %f70, %f69; + mul.f32 %f75, %f71, %f68; + mul.f32 %f76, %f72, %f67; + mul.f32 %f77, %f73, %f66; + .loc 1 59 25 + add.s64 %rd5, %rd10, %rd11; + .loc 1 59 48 + mov.b32 %r52, %f74; + mov.b32 %r53, %f75; + mov.b32 %r54, %f76; + mov.b32 %r55, %f77; + @%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r52, %r53, %r54, %r55 }; + .loc 1 59 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/pe/cpedrbcgvftrmo3x6vfpo6dhkxbweq3ucfj5jibyyvr3hf67gsvx.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 395 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 112 +.b8 101 +.b8 100 +.b8 114 +.b8 98 +.b8 99 +.b8 103 +.b8 118 +.b8 102 +.b8 116 +.b8 114 +.b8 109 +.b8 111 +.b8 51 +.b8 120 +.b8 54 +.b8 118 +.b8 102 +.b8 112 +.b8 111 +.b8 54 +.b8 100 +.b8 104 +.b8 107 +.b8 120 +.b8 98 +.b8 119 +.b8 101 +.b8 113 +.b8 51 +.b8 117 +.b8 99 +.b8 102 +.b8 106 +.b8 53 +.b8 106 +.b8 105 +.b8 98 +.b8 121 +.b8 121 +.b8 118 +.b8 114 
+.b8 51 +.b8 104 +.b8 102 +.b8 54 +.b8 55 +.b8 103 +.b8 115 +.b8 118 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 112 +.b8 101 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 42 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 42 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 42 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 50 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 50 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 50 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 399 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 399 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir b/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..99650564700076119036147fd33db6a793273933 --- /dev/null +++ b/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir @@ -0,0 +1,109 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1d2d34e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 7, !dbg !8 + %8 = zext nneg i32 %7 to i64, !dbg !9 + %9 = getelementptr float, ptr addrspace(1) %1, i64 %8, !dbg !9 + %10 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %9, i1 true, i32 0, i1 true) #3, !dbg !10 + %11 = bitcast i32 %10 to float, !dbg !10 + %12 = getelementptr i64, ptr addrspace(1) %2, i64 %8, !dbg !11 + %13 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];\0A\09@!$3 mov.u64 $0, 0x0;", "=l,l,b,b"(ptr addrspace(1) %12, i1 true, i1 true) #3, !dbg !12 + %14 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %10, i32 4, i32 31), !dbg !13 + %15 = bitcast i32 %14 to float, !dbg !13 + %16 = fadd 
float %11, %15, !dbg !17 + %17 = bitcast float %16 to i32, !dbg !13 + %18 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %17, i32 2, i32 31), !dbg !13 + %19 = bitcast i32 %18 to float, !dbg !13 + %20 = fadd float %16, %19, !dbg !17 + %21 = bitcast float %20 to i32, !dbg !13 + %22 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %21, i32 1, i32 31), !dbg !13 + %23 = bitcast i32 %22 to float, !dbg !13 + %24 = fadd float %20, %23, !dbg !17 + %25 = trunc i64 %13 to i32, !dbg !21 + %26 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %25, i32 4, i32 31), !dbg !21 + %bc = bitcast i64 %13 to <2 x i32>, !dbg !21 + %27 = extractelement <2 x i32> %bc, i64 1, !dbg !21 + %28 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %27, i32 4, i32 31), !dbg !21 + %29 = insertelement <2 x i32> undef, i32 %26, i64 0, !dbg !21 + %30 = insertelement <2 x i32> %29, i32 %28, i64 1, !dbg !21 + %31 = bitcast <2 x i32> %30 to i64, !dbg !21 + %32 = add i64 %13, %31, !dbg !23 + %33 = trunc i64 %32 to i32, !dbg !21 + %34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 2, i32 31), !dbg !21 + %bc1 = bitcast i64 %32 to <2 x i32>, !dbg !21 + %35 = extractelement <2 x i32> %bc1, i64 1, !dbg !21 + %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !21 + %37 = insertelement <2 x i32> undef, i32 %34, i64 0, !dbg !21 + %38 = insertelement <2 x i32> %37, i32 %36, i64 1, !dbg !21 + %39 = bitcast <2 x i32> %38 to i64, !dbg !21 + %40 = add i64 %32, %39, !dbg !23 + %41 = trunc i64 %40 to i32, !dbg !21 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !21 + %bc2 = bitcast i64 %40 to <2 x i32>, !dbg !21 + %43 = extractelement <2 x i32> %bc2, i64 1, !dbg !21 + %44 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %43, i32 1, i32 31), !dbg !21 + %45 = insertelement <2 x i32> undef, i32 %42, i64 0, !dbg !21 + %46 = insertelement <2 x i32> %45, i32 %44, i64 1, !dbg !21 + %47 = bitcast <2 x i32> %46 to i64, !dbg !21 + %48 = add i64 %40, %47, !dbg !23 + %49 = sitofp i64 %48 to float, !dbg !26 + %50 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %24, float %49) #3, !dbg !27 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %51 = and i32 %6, 63, !dbg !29 + %52 = icmp eq i32 %51, 0, !dbg !29 + %53 = bitcast float %50 to i32, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %53, ptr addrspace(1) %0, i1 %52) #3, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: 
"c2qomesxoic3sfzpdzftrhej7z6hhd6pritis2f4ye2ckqoetmyt.py", directory: "/tmp/torchinductor_root/2q") +!3 = !{ptr @triton__0d1d2d34e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d34e, !"maxntidx", i32 64} +!5 = distinct !DISubprogram(name: "triton__0d1d2d34e", linkageName: "triton__0d1d2d34e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 25, column: 34, scope: !5) +!9 = !DILocation(line: 28, column: 30, scope: !5) +!10 = !DILocation(line: 28, column: 35, scope: !5) +!11 = !DILocation(line: 29, column: 30, scope: !5) +!12 = !DILocation(line: 29, column: 35, scope: !5) +!13 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !16) +!14 = distinct !DILexicalBlockFile(scope: !5, file: !15, discriminator: 0) +!15 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!16 = !DILocation(line: 32, column: 24, scope: !14) +!17 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !19) +!18 = distinct !DILexicalBlockFile(scope: !14, file: !15, discriminator: 0) +!19 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !20) +!20 = !DILocation(line: 32, column: 24, scope: !18) +!21 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !22) +!22 = !DILocation(line: 35, column: 24, scope: !14) +!23 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !24) +!24 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !25) +!25 = !DILocation(line: 35, column: 24, scope: !18) +!26 = !DILocation(line: 36, column: 20, scope: !5) +!27 = !DILocation(line: 37, column: 19, scope: !5) +!28 = !DILocation(line: 38, column: 4, scope: !5) +!29 = !DILocation(line: 39, column: 71, scope: !5) +!30 = !DILocation(line: 39, column: 4, scope: !5) diff --git a/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..84eab422344b89e73a95f9b982944d1d0171de24 --- /dev/null +++ b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx @@ -0,0 +1,1054 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 
62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<56>; + .reg .b16 %rs<13>; + .reg .b32 %r<185>; + .reg .f32 %f<169>; + .reg .b64 %rd<59>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4]; + ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1]; + ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2]; + ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3]; + bfe.u32 %r3, %r1, 6, 1; + and.b32 %r4, %r1, 1; + .loc 1 24 33 + bfe.u32 %r5, %r1, 5, 1; + shl.b32 %r31, %r1, 2; + and.b32 %r6, %r31, 252; + shl.b32 %r32, %r1, 1; + and.b32 %r7, %r32, 254; + .loc 1 21 28 + mov.u32 %r14, %ctaid.x; + .loc 1 21 33 + shl.b32 %r33, %r14, 1; + .loc 1 22 23 + or.b32 %r34, %r33, %r3; + or.b32 %r35, %r33, %r4; + .loc 1 26 30 + mul.wide.s32 %rd25, %r34, 8; + add.s64 %rd11, %rd22, %rd25; + mul.wide.s32 %rd26, %r35, 8; + add.s64 %rd19, %rd22, %rd26; + mov.pred %p50, -1; + .loc 1 26 35 + mov.u64 %rd10, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ]; + mov.u64 %rd12, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ]; + mov.u64 %rd14, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ]; + mov.u64 %rd16, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ]; + mov.u64 %rd18, 0x0; + @%p50 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ]; + .loc 1 27 18 + bfe.s32 %r36, %r14, 30, 1; + shr.u32 %r37, %r36, 23; + add.s32 %r38, %r34, %r37; + and.b32 %r39, %r38, 16776704; + sub.s32 %r40, %r34, %r39; + .loc 1 35 44 + shl.b32 %r41, %r40, 8; + .loc 1 35 40 + or.b32 %r42, %r41, %r6; + .loc 1 35 34 + mul.wide.s32 %rd27, %r42, 4; + add.s64 %rd38, %rd23, %rd27; + mov.b32 %r155, 0; + .loc 1 35 50 + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + mov.u32 %r17, 0x0; + mov.u32 %r18, 0x0; + @%p50 ld.global.L1::evict_last.v4.b32 { %r15, %r16, %r17, %r18 }, [ %rd38 + 0 ]; + @!%p50 mov.u32 %r15, %r155; + @!%p50 mov.u32 %r16, %r155; + @!%p50 mov.u32 %r17, %r155; + @!%p50 mov.u32 %r18, %r155; + mov.b32 %f2, %r15; + mov.b32 %f1, %r16; + mov.b32 %f3, %r17; + mov.b32 %f4, %r18; + .loc 1 36 44 + shl.b32 %r43, %r34, 8; + .loc 1 36 40 + or.b32 %r44, %r43, %r6; + .loc 1 36 34 + mul.wide.s32 %rd28, %r44, 2; + add.s64 %rd39, %rd24, %rd28; + .loc 1 36 50 + mov.u32 %r23, 0x0; + mov.u32 %r24, 0x0; + @%p50 ld.global.L1::evict_last.v2.b32 { %r23, %r24 }, [ %rd39 + 0 ]; + @!%p50 mov.u32 %r23, %r155; + @!%p50 mov.u32 %r24, %r155; + cvt.u16.u32 %rs1, %r23; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r23; } + cvt.u16.u32 %rs3, %r24; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r24; } + .loc 1 36 101 + cvt.f32.bf16 %r27, %rs1; + mov.b32 %f5, 
%r27; + cvt.f32.bf16 %r28, %rs2; + mov.b32 %f6, %r28; + cvt.f32.bf16 %r29, %rs3; + mov.b32 %f7, %r29; + cvt.f32.bf16 %r30, %rs4; + mov.b32 %f8, %r30; + .loc 1 37 22 + add.s64 %rd29, %rd18, 50257; + .loc 1 38 22 + setp.lt.s64 %p14, %rd18, 0; + .loc 1 39 36 + selp.b64 %rd5, %rd29, %rd18, %p14; + .loc 1 40 40 + setp.lt.u64 %p15, %rd5, 50257; + mov.b32 %r184, 883; + mov.u64 %rd58, 1; + .loc 1 40 55 + @%p15 bra $L__BB0_2; + mov.u64 %rd30, assertMessage_0; + cvta.global.u64 %rd31, %rd30; + mov.u64 %rd32, assertFile_0; + cvta.global.u64 %rd33, %rd32; + mov.u64 %rd34, assertFunc_0; + cvta.global.u64 %rd35, %rd34; + { // callseq 4, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd31; + .param .b64 param1; + st.param.b64 [param1+0], %rd33; + .param .b32 param2; + st.param.b32 [param2+0], %r184; + .param .b64 param3; + st.param.b64 [param3+0], %rd35; + .param .b64 param4; + st.param.b64 [param4+0], %rd58; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 4 +$L__BB0_2: + .loc 1 0 55 + ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5]; + cvt.s64.s32 %rd3, %r44; + .loc 1 38 22 + setp.lt.s64 %p42, %rd10, 0; + .loc 1 41 44 + shl.b64 %rd41, %rd10, 8; + add.s64 %rd42, %rd41, 12865792; + selp.b64 %rd43, %rd42, %rd41, %p42; + cvt.u64.u32 %rd44, %r6; + .loc 1 41 40 + or.b64 %rd45, %rd43, %rd44; + .loc 1 41 34 + shl.b64 %rd46, %rd45, 2; + add.s64 %rd55, %rd7, %rd46; + .loc 1 41 52 + mov.u32 %r46, 0x0; + mov.u32 %r47, 0x0; + mov.u32 %r48, 0x0; + mov.u32 %r49, 0x0; + @%p50 ld.global.L1::evict_last.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd55 + 0 ]; + @!%p50 mov.u32 %r46, %r155; + @!%p50 mov.u32 %r47, %r155; + @!%p50 mov.u32 %r48, %r155; + @!%p50 mov.u32 %r49, %r155; + mov.b32 %f15, %r48; + mov.b32 %f16, %r49; + .loc 1 42 22 + add.f32 %f17, %f3, %f15; + add.f32 %f18, %f4, %f16; + .loc 1 44 22 + add.f32 %f19, %f7, %f17; + add.f32 %f20, %f8, %f18; + .loc 1 41 52 + mov.b32 %f21, %r46; + mov.b32 %f22, %r47; + .loc 1 42 22 + add.f32 %f23, %f1, %f22; + add.f32 %f24, %f2, %f21; + .loc 1 44 22 + add.f32 %f25, %f5, %f24; + add.f32 %f26, %f6, %f23; +$L__tmp1: + .loc 2 98 22 + add.f32 %f27, %f26, 0f00000000; + add.f32 %f28, %f25, 0f00000000; + add.f32 %f29, %f19, 0f00000000; + add.f32 %f30, %f20, 0f00000000; + .loc 2 101 30 + sub.f32 %f31, %f25, %f28; + sub.f32 %f32, %f26, %f27; + sub.f32 %f33, %f19, %f29; + sub.f32 %f34, %f20, %f30; + .loc 2 101 13 + fma.rn.f32 %f35, %f25, %f31, 0f00000000; + fma.rn.f32 %f36, %f26, %f32, 0f00000000; + fma.rn.f32 %f37, %f19, %f33, 0f00000000; + fma.rn.f32 %f38, %f20, %f34, 0f00000000; +$L__tmp2: + .loc 2 108 21 + sub.f32 %f39, %f27, %f28; + mov.b32 %r55, 1065353216; + mov.b32 %r56, 1073741824; + .loc 2 110 60 + div.full.f32 %r54, %r55, %r56; + mov.b32 %f40, %r54; + .loc 2 112 17 + fma.rn.f32 %f41, %f40, %f39, %f28; + .loc 2 113 15 + add.f32 %f42, %f35, %f36; + .loc 2 113 30 + mul.f32 %f43, %f39, %f39; + .loc 2 113 22 + fma.rn.f32 %f44, %f40, %f43, %f42; + .loc 2 108 21 + sub.f32 %f45, %f29, %f41; + mov.b32 %r59, 1077936128; + .loc 2 110 60 + div.full.f32 %r57, %r55, %r59; + mov.b32 %f46, %r57; + .loc 2 112 17 + fma.rn.f32 %f47, %f46, %f45, %f41; + .loc 2 113 15 + add.f32 %f48, %f37, %f44; + .loc 2 113 30 + mul.f32 %f49, %f45, %f45; + .loc 2 113 38 + fma.rn.f32 %f50, %f45, %f45, %f49; + .loc 2 113 22 + fma.rn.f32 %f51, %f46, %f50, %f48; + .loc 2 108 21 + sub.f32 %f52, %f30, %f47; + mov.b32 %r62, 1082130432; + .loc 2 110 60 + div.full.f32 %r60, %r55, %r62; + mov.b32 %f53, %r60; + .loc 2 112 17 + fma.rn.f32 
%f54, %f53, %f52, %f47; + .loc 2 113 15 + add.f32 %f55, %f38, %f51; + .loc 2 113 30 + mul.f32 %f56, %f52, %f52; + .loc 2 113 38 + mul.f32 %f57, %f56, 0f40400000; + .loc 2 113 22 + fma.rn.f32 %f58, %f53, %f57, %f55; +$L__tmp3: + .loc 2 120 46 + mov.b32 %r119, %f54; + shfl.sync.bfly.b32 %r120, %r119, 16, 31, -1; + mov.b32 %f59, %r120; + mov.b32 %r121, %f58; + shfl.sync.bfly.b32 %r122, %r121, 16, 31, -1; + mov.b32 %f60, %r122; + shfl.sync.bfly.b32 %r64, %r62, 16, 31, -1; + mov.b32 %f61, %r64; +$L__tmp4: + .loc 2 108 21 + sub.f32 %f62, %f59, %f54; + .loc 2 109 28 + add.f32 %f63, %f61, 0f40800000; + .loc 2 110 39 + setp.eq.f32 %p43, %f63, 0f00000000; + .loc 2 110 60 + mov.b32 %r65, %f63; + div.full.f32 %r63, %r64, %r65; + mov.b32 %f64, %r63; + .loc 2 110 49 + selp.f32 %f65, 0f00000000, %f64, %p43; + .loc 2 112 17 + fma.rn.f32 %f66, %f65, %f62, %f54; + .loc 2 113 15 + add.f32 %f67, %f58, %f60; + .loc 2 113 30 + mul.f32 %f68, %f62, %f62; + .loc 2 113 38 + mul.f32 %f69, %f68, 0f40800000; + .loc 2 113 22 + fma.rn.f32 %f70, %f65, %f69, %f67; +$L__tmp5: + .loc 2 120 46 + mov.b32 %r123, %f66; + shfl.sync.bfly.b32 %r124, %r123, 8, 31, -1; + mov.b32 %f71, %r124; + mov.b32 %r125, %f70; + shfl.sync.bfly.b32 %r126, %r125, 8, 31, -1; + mov.b32 %f72, %r126; + shfl.sync.bfly.b32 %r67, %r65, 8, 31, -1; + mov.b32 %f73, %r67; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f74, %f71, %f66; + .loc 2 109 28 + add.f32 %f75, %f63, %f73; + .loc 2 110 39 + setp.eq.f32 %p44, %f75, 0f00000000; + .loc 2 110 60 + mov.b32 %r68, %f75; + div.full.f32 %r66, %r67, %r68; + mov.b32 %f76, %r66; + .loc 2 110 49 + selp.f32 %f77, 0f00000000, %f76, %p44; + .loc 2 112 17 + fma.rn.f32 %f78, %f77, %f74, %f66; + .loc 2 113 15 + add.f32 %f79, %f70, %f72; + .loc 2 113 30 + mul.f32 %f80, %f74, %f74; + .loc 2 113 38 + mul.f32 %f81, %f63, %f80; + .loc 2 113 22 + fma.rn.f32 %f82, %f77, %f81, %f79; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r127, %f78; + shfl.sync.bfly.b32 %r128, %r127, 4, 31, -1; + mov.b32 %f83, %r128; + mov.b32 %r129, %f82; + shfl.sync.bfly.b32 %r130, %r129, 4, 31, -1; + mov.b32 %f84, %r130; + shfl.sync.bfly.b32 %r70, %r68, 4, 31, -1; + mov.b32 %f85, %r70; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f86, %f83, %f78; + .loc 2 109 28 + add.f32 %f87, %f75, %f85; + .loc 2 110 39 + setp.eq.f32 %p45, %f87, 0f00000000; + .loc 2 110 60 + mov.b32 %r71, %f87; + div.full.f32 %r69, %r70, %r71; + mov.b32 %f88, %r69; + .loc 2 110 49 + selp.f32 %f89, 0f00000000, %f88, %p45; + .loc 2 112 17 + fma.rn.f32 %f90, %f89, %f86, %f78; + .loc 2 113 15 + add.f32 %f91, %f82, %f84; + .loc 2 113 30 + mul.f32 %f92, %f86, %f86; + .loc 2 113 38 + mul.f32 %f93, %f75, %f92; + .loc 2 113 22 + fma.rn.f32 %f94, %f89, %f93, %f91; +$L__tmp9: + .loc 2 120 46 + mov.b32 %r131, %f90; + shfl.sync.bfly.b32 %r132, %r131, 2, 31, -1; + mov.b32 %f95, %r132; + mov.b32 %r133, %f94; + shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1; + mov.b32 %f96, %r134; + shfl.sync.bfly.b32 %r73, %r71, 2, 31, -1; + mov.b32 %f97, %r73; +$L__tmp10: + .loc 2 108 21 + sub.f32 %f98, %f95, %f90; + .loc 2 109 28 + add.f32 %f99, %f87, %f97; + .loc 2 110 39 + setp.eq.f32 %p46, %f99, 0f00000000; + .loc 2 110 60 + mov.b32 %r74, %f99; + div.full.f32 %r72, %r73, %r74; + mov.b32 %f100, %r72; + .loc 2 110 49 + selp.f32 %f101, 0f00000000, %f100, %p46; + .loc 2 112 17 + fma.rn.f32 %f102, %f101, %f98, %f90; + .loc 2 113 15 + add.f32 %f103, %f94, %f96; + .loc 2 113 30 + mul.f32 %f104, %f98, %f98; + .loc 2 113 38 + mul.f32 %f105, %f87, %f104; + .loc 2 113 22 + fma.rn.f32 %f106, %f101, %f105, %f103; +$L__tmp11: + .loc 2 120 46 + mov.b32 
%r135, %f102; + shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1; + mov.b32 %f107, %r136; + mov.b32 %r137, %f106; + shfl.sync.bfly.b32 %r138, %r137, 1, 31, -1; + mov.b32 %f108, %r138; + shfl.sync.bfly.b32 %r76, %r74, 1, 31, -1; + mov.b32 %f109, %r76; +$L__tmp12: + .loc 2 108 21 + sub.f32 %f110, %f107, %f102; + .loc 2 109 28 + add.f32 %f111, %f99, %f109; + .loc 2 110 39 + setp.eq.f32 %p47, %f111, 0f00000000; + .loc 2 110 60 + mov.b32 %r77, %f111; + div.full.f32 %r75, %r76, %r77; + mov.b32 %f112, %r75; + .loc 2 110 49 + selp.f32 %f113, 0f00000000, %f112, %p47; + .loc 2 112 17 + fma.rn.f32 %f114, %f113, %f110, %f102; + .loc 2 113 15 + add.f32 %f115, %f106, %f108; + .loc 2 113 30 + mul.f32 %f116, %f110, %f110; + .loc 2 113 38 + mul.f32 %f117, %f99, %f116; + .loc 2 113 22 + fma.rn.f32 %f118, %f113, %f117, %f115; +$L__tmp13: + .loc 2 120 46 + setp.eq.s32 %p21, %r2, 0; + shl.b32 %r139, %r5, 2; + shl.b32 %r140, %r3, 3; + or.b32 %r141, %r140, %r139; + mov.u32 %r142, global_smem; + add.s32 %r78, %r142, %r141; + mov.b32 %r79, %f114; + @%p21 st.shared.b32 [ %r78 + 0 ], %r79; + add.s32 %r143, %r142, 16; + add.s32 %r80, %r143, %r141; + mov.b32 %r81, %f118; + @%p21 st.shared.b32 [ %r80 + 0 ], %r81; + add.s32 %r144, %r142, 32; + add.s32 %r82, %r144, %r141; + @%p21 st.shared.b32 [ %r82 + 0 ], %r77; + bar.sync 0; + setp.lt.s32 %p24, %r1, 4; + add.s32 %r85, %r142, %r31; + @%p24 ld.shared.b32 %r84, [ %r85 + 0 ]; + mov.b32 %f119, %r84; + add.s32 %r87, %r143, %r31; + @%p24 ld.shared.b32 %r86, [ %r87 + 0 ]; + mov.b32 %f120, %r86; + add.s32 %r89, %r144, %r31; + @%p24 ld.shared.b32 %r88, [ %r89 + 0 ]; + mov.b32 %f121, %r88; + shfl.sync.bfly.b32 %r146, %r84, 1, 31, -1; + mov.b32 %f122, %r146; + shfl.sync.bfly.b32 %r147, %r86, 1, 31, -1; + mov.b32 %f123, %r147; + shfl.sync.bfly.b32 %r91, %r88, 1, 31, -1; + mov.b32 %f124, %r91; +$L__tmp14: + .loc 2 108 21 + sub.f32 %f125, %f122, %f119; + .loc 2 109 28 + add.f32 %f126, %f121, %f124; + .loc 2 110 39 + setp.eq.f32 %p48, %f126, 0f00000000; + .loc 2 110 60 + mov.b32 %r92, %f126; + div.full.f32 %r90, %r91, %r92; + mov.b32 %f127, %r90; + .loc 2 110 49 + selp.f32 %f128, 0f00000000, %f127, %p48; + .loc 2 112 17 + fma.rn.f32 %f129, %f125, %f128, %f119; + .loc 2 113 15 + add.f32 %f130, %f120, %f123; + .loc 2 113 30 + mul.f32 %f131, %f125, %f125; + .loc 2 113 38 + mul.f32 %f132, %f121, %f131; + .loc 2 113 22 + fma.rn.f32 %f133, %f132, %f128, %f130; +$L__tmp15: + .loc 2 120 46 + setp.eq.s32 %p49, %r4, 0; + and.pred %p27, %p24, %p49; + mov.b32 %r94, %f129; + @%p27 st.shared.b32 [ %r85 + 0 ], %r94; + mov.b32 %r96, %f133; + @%p27 st.shared.b32 [ %r87 + 0 ], %r96; + @%p27 st.shared.b32 [ %r89 + 0 ], %r92; + bar.sync 0; + add.s32 %r148, %r142, %r140; + ld.shared.f32 %f9, [%r148]; + add.s32 %r149, %r143, %r140; + ld.shared.f32 %f10, [%r149]; +$L__tmp16: + .loc 1 62 51 + mov.u32 %r99, 0x0; + mov.u32 %r100, 0x0; + mov.u32 %r101, 0x0; + mov.u32 %r102, 0x0; + @%p50 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd38 + 0 ]; + @!%p50 mov.u32 %r99, %r155; + @!%p50 mov.u32 %r100, %r155; + @!%p50 mov.u32 %r101, %r155; + @!%p50 mov.u32 %r102, %r155; + .loc 1 63 51 + mov.u32 %r107, 0x0; + mov.u32 %r108, 0x0; + @%p50 ld.global.L1::evict_first.v2.b32 { %r107, %r108 }, [ %rd39 + 0 ]; + @!%p50 mov.u32 %r107, %r155; + @!%p50 mov.u32 %r108, %r155; + cvt.u16.u32 %rs5, %r107; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r107; } + cvt.u16.u32 %rs7, %r108; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r108; } + .loc 1 63 103 + cvt.f32.bf16 %r111, %rs5; + mov.b32 %f11, %r111; + cvt.f32.bf16 %r112, %rs6; 
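+ // annotation (not compiler output): second pass of the fused kernel; the bf16 residual is re-loaded and widened to f32 here, the layernorm weight vector is fetched from param_4 just below, and the embedding row is gathered again once the bounds assert has passed.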
+ mov.b32 %f12, %r112; + cvt.f32.bf16 %r113, %rs7; + mov.b32 %f13, %r113; + cvt.f32.bf16 %r114, %rs8; + mov.b32 %f14, %r114; + .loc 1 64 35 + mul.wide.u32 %rd47, %r7, 4; + add.s64 %rd40, %rd8, %rd47; + .loc 1 64 40 + mov.u32 %r115, 0x0; + mov.u32 %r116, 0x0; + @%p50 ld.global.L1::evict_last.v2.b32 { %r115, %r116 }, [ %rd40 + 0 ]; + @!%p50 mov.u32 %r115, %r155; + @!%p50 mov.u32 %r116, %r155; + .loc 1 68 57 + @%p15 bra $L__BB0_4; + mov.u64 %rd48, assertMessage_1; + cvta.global.u64 %rd49, %rd48; + mov.u64 %rd50, assertFile_1; + cvta.global.u64 %rd51, %rd50; + mov.u64 %rd52, assertFunc_1; + cvta.global.u64 %rd53, %rd52; + { // callseq 5, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd49; + .param .b64 param1; + st.param.b64 [param1+0], %rd51; + .param .b32 param2; + st.param.b32 [param2+0], %r184; + .param .b64 param3; + st.param.b64 [param3+0], %rd53; + .param .b64 param4; + st.param.b64 [param4+0], %rd58; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 5 +$L__BB0_4: + .loc 1 69 54 + mov.u32 %r151, 0x0; + mov.u32 %r152, 0x0; + mov.u32 %r153, 0x0; + mov.u32 %r154, 0x0; + @%p50 ld.global.L1::evict_first.v4.b32 { %r151, %r152, %r153, %r154 }, [ %rd55 + 0 ]; + @!%p50 mov.u32 %r151, %r155; + @!%p50 mov.u32 %r152, %r155; + @!%p50 mov.u32 %r153, %r155; + @!%p50 mov.u32 %r154, %r155; + .loc 1 75 24 + mov.b32 %r160, %f10; + mov.b32 %r161, 1132462080; + div.full.f32 %r159, %r160, %r161; + mov.b32 %f134, %r159; + .loc 1 77 24 + add.f32 %f135, %f134, 0f3727C5AC; + .loc 1 78 30 + rsqrt.approx.ftz.f32 %f136, %f135; + .loc 1 69 54 + mov.b32 %f137, %r154; + .loc 1 62 51 + mov.b32 %f138, %r102; + .loc 1 70 24 + add.f32 %f139, %f138, %f137; + .loc 1 72 24 + add.f32 %f140, %f14, %f139; + .loc 1 73 24 + sub.f32 %f141, %f140, %f9; + .loc 1 69 54 + mov.b32 %f142, %r153; + .loc 1 62 51 + mov.b32 %f143, %r101; + .loc 1 70 24 + add.f32 %f144, %f143, %f142; + .loc 1 72 24 + add.f32 %f145, %f13, %f144; + .loc 1 73 24 + sub.f32 %f146, %f145, %f9; + .loc 1 69 54 + mov.b32 %f147, %r152; + .loc 1 62 51 + mov.b32 %f148, %r100; + .loc 1 70 24 + add.f32 %f149, %f148, %f147; + .loc 1 72 24 + add.f32 %f150, %f12, %f149; + .loc 1 73 24 + sub.f32 %f151, %f150, %f9; + .loc 1 69 54 + mov.b32 %f152, %r151; + .loc 1 62 51 + mov.b32 %f153, %r99; + .loc 1 70 24 + add.f32 %f154, %f153, %f152; + .loc 1 72 24 + add.f32 %f155, %f11, %f154; + .loc 1 73 24 + sub.f32 %f156, %f155, %f9; + .loc 1 79 24 + mul.f32 %f157, %f156, %f136; + mul.f32 %f158, %f151, %f136; + mul.f32 %f159, %f146, %f136; + mul.f32 %f160, %f141, %f136; + .loc 1 80 24 + bar.sync 0; + shl.b32 %r177, %r7, 2; + add.s32 %r179, %r142, %r177; + st.shared.v2.u32 [%r179], {%r115, %r116}; + bar.sync 0; + shl.b32 %r180, %r6, 2; + add.s32 %r181, %r142, %r180; + ld.shared.v4.f32 {%f161, %f162, %f163, %f164}, [%r181]; + mul.f32 %f165, %f157, %f161; + mul.f32 %f166, %f158, %f162; + mul.f32 %f167, %f159, %f163; + mul.f32 %f168, %f160, %f164; + .loc 1 82 29 + shl.b64 %rd57, %rd3, 1; + add.s64 %rd56, %rd9, %rd57; + .loc 1 82 52 + mov.b32 %r171, %f165; + cvt.rn.bf16.f32 %rs9, %r171; + mov.b32 %r172, %f166; + cvt.rn.bf16.f32 %rs10, %r172; + mov.b32 %r173, %f167; + cvt.rn.bf16.f32 %rs11, %r173; + mov.b32 %r174, %f168; + cvt.rn.bf16.f32 %rs12, %r174; + mov.b32 %r182, {%rs9, %rs10}; + mov.b32 %r183, {%rs11, %rs12}; + @%p50 st.global.v2.b32 [ %rd56 + 0 ], { %r182, %r183 }; + .loc 1 58 4 + ret; +$L__tmp17: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + 
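// annotation (not compiler output): this wrapper lowers the libdevice symbol __nv_rsqrtf to a single rsqrt.approx.ftz.f32; the kernel body above inlines the same instruction directly, so the wrapper appears unreferenced in this module. +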
.param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 99 +.b8 105 +.b8 103 +.b8 54 +.b8 102 +.b8 107 +.b8 105 +.b8 54 +.b8 112 +.b8 52 +.b8 108 +.b8 120 +.b8 114 +.b8 100 +.b8 109 +.b8 103 +.b8 103 +.b8 54 +.b8 101 +.b8 117 +.b8 100 +.b8 97 +.b8 104 +.b8 105 +.b8 101 +.b8 120 +.b8 99 +.b8 118 +.b8 117 +.b8 101 +.b8 101 +.b8 111 +.b8 108 +.b8 50 +.b8 112 +.b8 52 +.b8 113 +.b8 112 +.b8 53 +.b8 51 +.b8 50 +.b8 112 +.b8 118 +.b8 118 +.b8 101 +.b8 50 +.b8 121 +.b8 52 +.b8 54 +.b8 51 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 99 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 47 +.b8 41 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp16 +.b8 2 +.b8 53 +.b8 44 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git 
a/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..25abc169d1d1374501fdd159109f68bf547dab9b --- /dev/null +++ b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttgir @@ -0,0 +1,134 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1> + %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked> + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked> + %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked> + %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked> + %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked> + %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2> + %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2> + %cst_10 = arith.constant 0.000000e+00 : f32 + %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked> + %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked> + %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked> + %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x256xbf16, #blocked> + %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1> + %c2_i32 = arith.constant 2 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c2_i32 : i32 + %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2> + %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2> + %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked> + %9 = arith.addi %7, %5 : 
tensor<2x1xi32, #blocked2> + %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked> + %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<2x1x!tt.ptr, #blocked> + %15 = tt.splat %arg0 : (!tt.ptr) -> tensor<2x1x!tt.ptr, #blocked2> + %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr, #blocked>, tensor<2x1xi32, #blocked> + %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr, #blocked2>, tensor<2x1xi32, #blocked2> + %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked> + %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2> + %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked> + %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked> + %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1> + %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked> + %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked> + %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked> + %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked> + %27 = tt.splat %arg2 : (!tt.ptr) -> tensor<2x256x!tt.ptr, #blocked> + %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr, #blocked>, tensor<2x256xi32, #blocked> + %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked> + %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked> + %31 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked> + %32 = tt.broadcast %31 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked> + %33 = arith.addi %24, %32 : tensor<2x256xi32, #blocked> + %34 = tt.splat %arg3 : (!tt.ptr) -> tensor<2x256x!tt.ptr, #blocked> + %35 = tt.addptr %34, %33 : tensor<2x256x!tt.ptr, #blocked>, tensor<2x256xi32, #blocked> + %36 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked> + %37 = arith.extf %36 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked> + %38 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked> + %39 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2> + %40 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked> + %41 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2> + %42 = arith.select %40, %38, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked> + %43 = arith.select %41, %39, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2> + %44 = arith.cmpi sge, %43, %cst_8 : tensor<2x1xi64, #blocked2> + %45 = arith.cmpi slt, %43, %cst_9 : tensor<2x1xi64, #blocked2> + %46 = arith.andi %44, %45 : tensor<2x1xi1, #blocked2> + tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2> + %47 = arith.muli %42, %cst_5 : tensor<2x1xi64, #blocked> + %48 = tt.broadcast %47 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked> + %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked> + %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked> + 
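// annotation (not compiler output): %48 and %50 combine the wrapped, assert-checked token id scaled by the 256-wide hidden dim with the column index into i64 gather offsets for the embedding table at %arg1; the three-way tt.reduce below merges (mean, m2, weight) triples with the parallel Welford update: delta = mean_b - mean_a, w = w_a + w_b, mean = mean_a + delta*(w_b/w), m2 = m2_a + m2_b + delta*delta*w_a*(w_b/w). +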
%51 = arith.addi %50, %48 : tensor<2x256xi64, #blocked> + %52 = tt.splat %arg1 : (!tt.ptr) -> tensor<2x256x!tt.ptr, #blocked> + %53 = tt.addptr %52, %51 : tensor<2x256x!tt.ptr, #blocked>, tensor<2x256xi64, #blocked> + %54 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked> + %55 = arith.addf %54, %30 : tensor<2x256xf32, #blocked> + %56 = arith.addf %55, %37 : tensor<2x256xf32, #blocked> + %57 = arith.addf %56, %cst_13 : tensor<2x256xf32, #blocked> + %58 = arith.subf %56, %57 : tensor<2x256xf32, #blocked> + %59 = arith.mulf %56, %58 : tensor<2x256xf32, #blocked> + %60 = arith.addf %59, %cst_13 : tensor<2x256xf32, #blocked> + %61 = arith.select %29, %57, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked> + %62 = arith.select %29, %60, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked> + %63 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> + %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked> + %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %90 = arith.subf %arg11, %arg8 : f32 + %91 = arith.addf %arg10, %arg13 : f32 + %92 = arith.cmpf oeq, %91, %cst_10 : f32 + %93 = arith.divf %arg13, %91 : f32 + %94 = arith.select %92, %cst_10, %93 : f32 + %95 = arith.mulf %90, %94 : f32 + %96 = arith.addf %arg8, %95 : f32 + %97 = arith.addf %arg9, %arg12 : f32 + %98 = arith.mulf %90, %90 : f32 + %99 = arith.mulf %98, %arg10 : f32 + %100 = arith.mulf %99, %94 : f32 + %101 = arith.addf %97, %100 : f32 + tt.reduce.return %96, %101, %91 : f32, f32, f32 + }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked> + %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked> + %68 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked> + %69 = tt.load %35, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xbf16, #blocked> + %70 = arith.extf %69 : tensor<2x256xbf16, #blocked> to tensor<2x256xf32, #blocked> + %71 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked1> + %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr, #blocked1>, tensor<1x256xi32, #blocked1> + %73 = tt.load %72, %22, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1> + tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2> + %74 = tt.load %53, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked> + %75 = arith.addf %74, %68 : tensor<2x256xf32, #blocked> + %76 = arith.addf %75, %70 : tensor<2x256xf32, #blocked> + %77 = tt.broadcast %66 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked> + %78 = arith.subf %76, %77 : tensor<2x256xf32, #blocked> + %79 = arith.divf %67, %cst_12 : tensor<2x1xf32, #blocked> + %80 = arith.addf %79, %cst_11 : tensor<2x1xf32, #blocked> + %81 = 
tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked> + %82 = tt.broadcast %81 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked> + %83 = arith.mulf %78, %82 : tensor<2x256xf32, #blocked> + %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked> + %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked> + %86 = arith.mulf %83, %85 : tensor<2x256xf32, #blocked> + %87 = tt.splat %arg5 : (!tt.ptr) -> tensor<2x256x!tt.ptr, #blocked> + %88 = tt.addptr %87, %33 : tensor<2x256x!tt.ptr, #blocked>, tensor<2x256xi32, #blocked> + %89 = arith.truncf %86 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked> + tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d16069eaed93a19c35a18c9eaed23aebe1acde8d --- /dev/null +++ b/.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ttir @@ -0,0 +1,113 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<2x256xbf16> + %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x256xf32> + %cst_2 = arith.constant 0.000000e+00 : f32 + %cst_3 = arith.constant dense<256> : tensor<2x1xi64> + %cst_4 = arith.constant dense<50257> : tensor<2x1xi64> + %cst_5 = arith.constant dense<0> : tensor<2x1xi64> + %cst_6 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32> + %cst_7 = arith.constant dense<2.560000e+02> : tensor<2x1xf32> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<2x256xf32> + %cst_9 = arith.constant dense<256> : tensor<2x1xi32> + %cst_10 = arith.constant dense<256> : tensor<1x256xi32> + %cst_11 = arith.constant dense<512> : tensor<2x1xi32> + %c2_i32 = arith.constant 2 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c2_i32 : i32 + %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32>) -> tensor<2x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<2x1xi32> + %5 = arith.addi %4, %3 : tensor<2x1xi32> + %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<2x1x!tt.ptr> + %9 = tt.addptr %8, %5 : tensor<2x1x!tt.ptr>, tensor<2x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64> + %11 = arith.remsi %5, %cst_11 : tensor<2x1xi32> + %12 = arith.cmpi slt, %7, %cst_10 : tensor<1x256xi32> + %13 = arith.muli %11, %cst_9 : tensor<2x1xi32> + %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<2x256xi32> + 
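// annotation (not compiler output): the same kernel as the ttgir above, before layout assignment; %13 through %16 build the flat offsets (row mod 512) * 256 + col used to load the first f32 input from %arg2. +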
%15 = tt.broadcast %13 : (tensor<2x1xi32>) -> tensor<2x256xi32> + %16 = arith.addi %14, %15 : tensor<2x256xi32> + %17 = tt.splat %arg2 : (!tt.ptr) -> tensor<2x256x!tt.ptr> + %18 = tt.addptr %17, %16 : tensor<2x256x!tt.ptr>, tensor<2x256xi32> + %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<2x256xi1> + %20 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32> + %21 = arith.muli %5, %cst_9 : tensor<2x1xi32> + %22 = tt.broadcast %21 : (tensor<2x1xi32>) -> tensor<2x256xi32> + %23 = arith.addi %14, %22 : tensor<2x256xi32> + %24 = tt.splat %arg3 : (!tt.ptr) -> tensor<2x256x!tt.ptr> + %25 = tt.addptr %24, %23 : tensor<2x256x!tt.ptr>, tensor<2x256xi32> + %26 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xbf16> + %27 = arith.extf %26 : tensor<2x256xbf16> to tensor<2x256xf32> + %28 = arith.addi %10, %cst_4 : tensor<2x1xi64> + %29 = arith.cmpi slt, %10, %cst_5 : tensor<2x1xi64> + %30 = arith.select %29, %28, %10 : tensor<2x1xi1>, tensor<2x1xi64> + %31 = arith.cmpi sge, %30, %cst_5 : tensor<2x1xi64> + %32 = arith.cmpi slt, %30, %cst_4 : tensor<2x1xi64> + %33 = arith.andi %31, %32 : tensor<2x1xi1> + tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<2x1xi1> + %34 = arith.muli %30, %cst_3 : tensor<2x1xi64> + %35 = tt.broadcast %34 : (tensor<2x1xi64>) -> tensor<2x256xi64> + %36 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64> + %37 = tt.broadcast %36 : (tensor<1x256xi64>) -> tensor<2x256xi64> + %38 = arith.addi %37, %35 : tensor<2x256xi64> + %39 = tt.splat %arg1 : (!tt.ptr) -> tensor<2x256x!tt.ptr> + %40 = tt.addptr %39, %38 : tensor<2x256x!tt.ptr>, tensor<2x256xi64> + %41 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32> + %42 = arith.addf %41, %20 : tensor<2x256xf32> + %43 = arith.addf %42, %27 : tensor<2x256xf32> + %44 = arith.addf %43, %cst_8 : tensor<2x256xf32> + %45 = arith.subf %43, %44 : tensor<2x256xf32> + %46 = arith.mulf %43, %45 : tensor<2x256xf32> + %47 = arith.addf %46, %cst_8 : tensor<2x256xf32> + %48 = arith.select %19, %44, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32> + %49 = arith.select %19, %47, %cst_8 : tensor<2x256xi1>, tensor<2x256xf32> + %50 = arith.select %12, %cst_0, %cst_1 : tensor<1x256xi1>, tensor<1x256xf32> + %51 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<2x256xf32> + %52:3 = "tt.reduce"(%48, %49, %51) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %76 = arith.subf %arg11, %arg8 : f32 + %77 = arith.addf %arg10, %arg13 : f32 + %78 = arith.cmpf oeq, %77, %cst_2 : f32 + %79 = arith.divf %arg13, %77 : f32 + %80 = arith.select %78, %cst_2, %79 : f32 + %81 = arith.mulf %76, %80 : f32 + %82 = arith.addf %arg8, %81 : f32 + %83 = arith.addf %arg9, %arg12 : f32 + %84 = arith.mulf %76, %76 : f32 + %85 = arith.mulf %84, %arg10 : f32 + %86 = arith.mulf %85, %80 : f32 + %87 = arith.addf %83, %86 : f32 + tt.reduce.return %82, %87, %77 : f32, f32, f32 + }) : (tensor<2x256xf32>, tensor<2x256xf32>, tensor<2x256xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) + %53 = tt.expand_dims %52#0 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32> + %54 = tt.expand_dims %52#1 {axis = 1 : i32} : (tensor<2xf32>) -> tensor<2x1xf32> + %55 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32> + %56 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = 
false} : tensor<2x256xbf16> + %57 = arith.extf %56 : tensor<2x256xbf16> to tensor<2x256xf32> + %58 = tt.splat %arg4 : (!tt.ptr<f32>) -> tensor<1x256x!tt.ptr<f32>> + %59 = tt.addptr %58, %7 : tensor<1x256x!tt.ptr<f32>>, tensor<1x256xi32> + %60 = tt.load %59, %12, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> + tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1> + %61 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32> + %62 = arith.addf %61, %55 : tensor<2x256xf32> + %63 = arith.addf %62, %57 : tensor<2x256xf32> + %64 = tt.broadcast %53 : (tensor<2x1xf32>) -> tensor<2x256xf32> + %65 = arith.subf %63, %64 : tensor<2x256xf32> + %66 = arith.divf %54, %cst_7 : tensor<2x1xf32> + %67 = arith.addf %66, %cst_6 : tensor<2x1xf32> + %68 = tt.extern_elementwise %67 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> + %69 = tt.broadcast %68 : (tensor<2x1xf32>) -> tensor<2x256xf32> + %70 = arith.mulf %65, %69 : tensor<2x256xf32> + %71 = tt.broadcast %60 : (tensor<1x256xf32>) -> tensor<2x256xf32> + %72 = arith.mulf %70, %71 : tensor<2x256xf32> + %73 = tt.splat %arg5 : (!tt.ptr<bf16>) -> tensor<2x256x!tt.ptr<bf16>> + %74 = tt.addptr %73, %23 : tensor<2x256x!tt.ptr<bf16>>, tensor<2x256xi32> + %75 = arith.truncf %72 : tensor<2x256xf32> to tensor<2x256xbf16> + tt.store %74, %75, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16> + tt.return + } +} diff --git a/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.cubin b/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..396fa0107e6635f2a59374a21bbde735a4b385fa Binary files /dev/null and b/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.cubin differ diff --git a/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.cubin b/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..01b1208c87030794a015a44c51f873ce40ec1b04 Binary files /dev/null and b/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.cubin differ diff --git a/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir b/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..cb9125176de501ceec8230e4435035948c5a5ea8 --- /dev/null +++ b/.triton/dump/4d7b96448927b8146af43cb9f39e0544/triton_.ttir @@ -0,0 +1,37 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.398942292> : tensor<512xf32> + %cst_0 = arith.constant dense<-5.000000e-01> : tensor<512xf32> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32> + %cst_3 = arith.constant dense<0.707106769> : tensor<512xf32> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = tt.splat %arg0 : (!tt.ptr<bf16>) -> tensor<512x!tt.ptr<bf16>> + %6 = tt.addptr
%5, %4 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> + %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32> + %9 = tt.splat %arg1 : (!tt.ptr<bf16>) -> tensor<512x!tt.ptr<bf16>> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> + %12 = arith.extf %11 : tensor<512xbf16> to tensor<512xf32> + %13 = arith.mulf %12, %cst_3 : tensor<512xf32> + %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32>) -> tensor<512xf32> + %15 = arith.addf %14, %cst_2 : tensor<512xf32> + %16 = arith.mulf %15, %cst_1 : tensor<512xf32> + %17 = arith.mulf %12, %12 : tensor<512xf32> + %18 = arith.mulf %17, %cst_0 : tensor<512xf32> + %19 = math.exp %18 : tensor<512xf32> + %20 = arith.mulf %19, %cst : tensor<512xf32> + %21 = arith.mulf %12, %20 : tensor<512xf32> + %22 = arith.addf %16, %21 : tensor<512xf32> + %23 = arith.mulf %8, %22 : tensor<512xf32> + %24 = arith.truncf %23 : tensor<512xf32> to tensor<512xbf16> + tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16> + tt.return + } +} diff --git a/.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..8d9937ed7a7272d335d241f652100f7c11e67ad7 --- /dev/null +++ b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir @@ -0,0 +1,1211 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257" +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %9 = lshr i32 %8, 3, !dbg !10 + %10 = and i32 %9, 31, !dbg !10 + %11 = and i32 %8, 63, !dbg !10 + %12 = shl i32 %8, 3, !dbg !11 + %13 = and i32 %12, 56, !dbg !11 + %14 = or i32 %13, 4, !dbg !11 + %15 = lshr i32 %8, 6, !dbg !12 + %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13 + %17 = shl i32 %16, 6, !dbg !14 + %18 = or i32 %17, %10, !dbg !15 + %19 = or i32 %18, 32, !dbg !15 + %20 = or i32 %17, %11, !dbg !15 + %21 = sext i32 %18 to i64, !dbg !16 + %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !16 + %23 = sext i32 %19 to i64, !dbg !16 + %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !16 + %25 = sext i32 %20 to i64, !dbg !16 + %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true)
#6, !dbg !17 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17 + %44 = srem i32 %18, 512, !dbg !18 + %45 = srem i32 %19, 512, !dbg !18 + %46 = shl nsw i32 %44, 8, !dbg !19 + %47 = shl nsw i32 %45, 8, !dbg !19 + %48 = add i64 %43, 50257, !dbg !20 + %49 = icmp slt i64 %27, 0, !dbg !21 + %50 = icmp slt i64 %35, 0, !dbg !21 + %51 = icmp slt i64 %43, 0, !dbg !21 + %52 = select i1 %51, i64 %48, i64 %43, !dbg !22 + %53 = icmp ugt i64 %52, 50256, !dbg !23 + %54 = shl i64 %27, 8, !dbg !24 + %55 = add i64 %54, 12865792, !dbg !24 + %56 = select i1 %49, i64 %55, i64 %54, !dbg !24 + %57 = shl i64 %35, 8, !dbg !24 + %58 = add i64 %57, 12865792, !dbg !24 + %59 = select i1 %50, i64 %58, i64 %57, !dbg !24 + %60 = getelementptr float, ptr addrspace(1) %1, i64 %56 + %61 = getelementptr float, ptr addrspace(1) %1, i64 %59 + br label %62, !dbg !12 + +62: ; preds = %7, %179 + %63 = phi float [ 0.000000e+00, %7 ], [ %254, 
%179 ] + %64 = phi float [ 0.000000e+00, %7 ], [ %255, %179 ] + %65 = phi float [ 0.000000e+00, %7 ], [ %256, %179 ] + %66 = phi float [ 0.000000e+00, %7 ], [ %257, %179 ] + %67 = phi float [ 0.000000e+00, %7 ], [ %258, %179 ] + %68 = phi float [ 0.000000e+00, %7 ], [ %259, %179 ] + %69 = phi float [ 0.000000e+00, %7 ], [ %260, %179 ] + %70 = phi float [ 0.000000e+00, %7 ], [ %261, %179 ] + %71 = phi float [ 0.000000e+00, %7 ], [ %262, %179 ] + %72 = phi float [ 0.000000e+00, %7 ], [ %263, %179 ] + %73 = phi float [ 0.000000e+00, %7 ], [ %264, %179 ] + %74 = phi float [ 0.000000e+00, %7 ], [ %265, %179 ] + %75 = phi float [ 0.000000e+00, %7 ], [ %266, %179 ] + %76 = phi float [ 0.000000e+00, %7 ], [ %267, %179 ] + %77 = phi float [ 0.000000e+00, %7 ], [ %268, %179 ] + %78 = phi float [ 0.000000e+00, %7 ], [ %269, %179 ] + %79 = phi float [ 0.000000e+00, %7 ], [ %270, %179 ] + %80 = phi float [ 0.000000e+00, %7 ], [ %271, %179 ] + %81 = phi float [ 0.000000e+00, %7 ], [ %272, %179 ] + %82 = phi float [ 0.000000e+00, %7 ], [ %273, %179 ] + %83 = phi float [ 0.000000e+00, %7 ], [ %274, %179 ] + %84 = phi float [ 0.000000e+00, %7 ], [ %275, %179 ] + %85 = phi float [ 0.000000e+00, %7 ], [ %276, %179 ] + %86 = phi float [ 0.000000e+00, %7 ], [ %277, %179 ] + %87 = phi float [ 0.000000e+00, %7 ], [ %278, %179 ] + %88 = phi float [ 0.000000e+00, %7 ], [ %279, %179 ] + %89 = phi float [ 0.000000e+00, %7 ], [ %280, %179 ] + %90 = phi float [ 0.000000e+00, %7 ], [ %281, %179 ] + %91 = phi float [ 0.000000e+00, %7 ], [ %282, %179 ] + %92 = phi float [ 0.000000e+00, %7 ], [ %283, %179 ] + %93 = phi float [ 0.000000e+00, %7 ], [ %284, %179 ] + %94 = phi float [ 0.000000e+00, %7 ], [ %285, %179 ] + %95 = phi float [ 0.000000e+00, %7 ], [ %350, %179 ] + %96 = phi float [ 0.000000e+00, %7 ], [ %351, %179 ] + %97 = phi float [ 0.000000e+00, %7 ], [ %352, %179 ] + %98 = phi float [ 0.000000e+00, %7 ], [ %353, %179 ] + %99 = phi float [ 0.000000e+00, %7 ], [ %354, %179 ] + %100 = phi float [ 0.000000e+00, %7 ], [ %355, %179 ] + %101 = phi float [ 0.000000e+00, %7 ], [ %356, %179 ] + %102 = phi float [ 0.000000e+00, %7 ], [ %357, %179 ] + %103 = phi float [ 0.000000e+00, %7 ], [ %358, %179 ] + %104 = phi float [ 0.000000e+00, %7 ], [ %359, %179 ] + %105 = phi float [ 0.000000e+00, %7 ], [ %360, %179 ] + %106 = phi float [ 0.000000e+00, %7 ], [ %361, %179 ] + %107 = phi float [ 0.000000e+00, %7 ], [ %362, %179 ] + %108 = phi float [ 0.000000e+00, %7 ], [ %363, %179 ] + %109 = phi float [ 0.000000e+00, %7 ], [ %364, %179 ] + %110 = phi float [ 0.000000e+00, %7 ], [ %365, %179 ] + %111 = phi float [ 0.000000e+00, %7 ], [ %302, %179 ] + %112 = phi float [ 0.000000e+00, %7 ], [ %303, %179 ] + %113 = phi float [ 0.000000e+00, %7 ], [ %304, %179 ] + %114 = phi float [ 0.000000e+00, %7 ], [ %305, %179 ] + %115 = phi float [ 0.000000e+00, %7 ], [ %306, %179 ] + %116 = phi float [ 0.000000e+00, %7 ], [ %307, %179 ] + %117 = phi float [ 0.000000e+00, %7 ], [ %308, %179 ] + %118 = phi float [ 0.000000e+00, %7 ], [ %309, %179 ] + %119 = phi float [ 0.000000e+00, %7 ], [ %310, %179 ] + %120 = phi float [ 0.000000e+00, %7 ], [ %311, %179 ] + %121 = phi float [ 0.000000e+00, %7 ], [ %312, %179 ] + %122 = phi float [ 0.000000e+00, %7 ], [ %313, %179 ] + %123 = phi float [ 0.000000e+00, %7 ], [ %314, %179 ] + %124 = phi float [ 0.000000e+00, %7 ], [ %315, %179 ] + %125 = phi float [ 0.000000e+00, %7 ], [ %316, %179 ] + %126 = phi float [ 0.000000e+00, %7 ], [ %317, %179 ] + %127 = phi i32 [ 0, %7 ], [ %366, %179 ] + %128 = or 
i32 %127, %13, !dbg !25 + %129 = or i32 %127, %14, !dbg !25 + %130 = add i32 %128, %46, !dbg !26 + %131 = add i32 %129, %46, !dbg !26 + %132 = add i32 %128, %47, !dbg !26 + %133 = add i32 %129, %47, !dbg !26 + %134 = sext i32 %130 to i64, !dbg !27 + %135 = getelementptr float, ptr addrspace(1) %2, i64 %134, !dbg !27 + %136 = sext i32 %131 to i64, !dbg !27 + %137 = getelementptr float, ptr addrspace(1) %2, i64 %136, !dbg !27 + %138 = sext i32 %132 to i64, !dbg !27 + %139 = getelementptr float, ptr addrspace(1) %2, i64 %138, !dbg !27 + %140 = sext i32 %133 to i64, !dbg !27 + %141 = getelementptr float, ptr addrspace(1) %2, i64 %140, !dbg !27 + %142 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %135, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %143 = extractvalue { i32, i32, i32, i32 } %142, 0, !dbg !28 + %144 = extractvalue { i32, i32, i32, i32 } %142, 1, !dbg !28 + %145 = extractvalue { i32, i32, i32, i32 } %142, 2, !dbg !28 + %146 = extractvalue { i32, i32, i32, i32 } %142, 3, !dbg !28 + %147 = bitcast i32 %143 to float, !dbg !28 + %148 = bitcast i32 %144 to float, !dbg !28 + %149 = bitcast i32 %145 to float, !dbg !28 + %150 = bitcast i32 %146 to float, !dbg !28 + %151 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %137, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %152 = extractvalue { i32, i32, i32, i32 } %151, 0, !dbg !28 + %153 = extractvalue { i32, i32, i32, i32 } %151, 1, !dbg !28 + %154 = extractvalue { i32, i32, i32, i32 } %151, 2, !dbg !28 + %155 = extractvalue { i32, i32, i32, i32 } %151, 3, !dbg !28 + %156 = bitcast i32 %152 to float, !dbg !28 + %157 = bitcast i32 %153 to float, !dbg !28 + %158 = bitcast i32 %154 to float, !dbg !28 + %159 = bitcast i32 %155 to float, !dbg !28 + %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %139, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !28 + %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !28 + %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !28 + %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !28 + %165 = bitcast i32 %161 to float, !dbg !28 + %166 = bitcast i32 %162 to float, !dbg !28 + %167 = bitcast i32 %163 to float, !dbg !28 + %168 = bitcast i32 %164 to float, !dbg !28 + %169 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 
0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %170 = extractvalue { i32, i32, i32, i32 } %169, 0, !dbg !28 + %171 = extractvalue { i32, i32, i32, i32 } %169, 1, !dbg !28 + %172 = extractvalue { i32, i32, i32, i32 } %169, 2, !dbg !28 + %173 = extractvalue { i32, i32, i32, i32 } %169, 3, !dbg !28 + %174 = bitcast i32 %170 to float, !dbg !28 + %175 = bitcast i32 %171 to float, !dbg !28 + %176 = bitcast i32 %172 to float, !dbg !28 + %177 = bitcast i32 %173 to float, !dbg !28 + br i1 %53, label %178, label %179, !dbg !29 + +178: ; preds = %62 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29 + br label %179, !dbg !29 + +179: ; preds = %178, %62 + %180 = zext nneg i32 %128 to i64, !dbg !30 + %181 = zext nneg i32 %129 to i64, !dbg !30 + %182 = getelementptr float, ptr addrspace(1) %60, i64 %180, !dbg !31 + %183 = getelementptr float, ptr addrspace(1) %60, i64 %181, !dbg !31 + %184 = getelementptr float, ptr addrspace(1) %61, i64 %180, !dbg !31 + %185 = getelementptr float, ptr addrspace(1) %61, i64 %181, !dbg !31 + %186 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %182, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %187 = extractvalue { i32, i32, i32, i32 } %186, 0, !dbg !32 + %188 = extractvalue { i32, i32, i32, i32 } %186, 1, !dbg !32 + %189 = extractvalue { i32, i32, i32, i32 } %186, 2, !dbg !32 + %190 = extractvalue { i32, i32, i32, i32 } %186, 3, !dbg !32 + %191 = bitcast i32 %187 to float, !dbg !32 + %192 = bitcast i32 %188 to float, !dbg !32 + %193 = bitcast i32 %189 to float, !dbg !32 + %194 = bitcast i32 %190 to float, !dbg !32 + %195 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %183, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %196 = extractvalue { i32, i32, i32, i32 } %195, 0, !dbg !32 + %197 = extractvalue { i32, i32, i32, i32 } %195, 1, !dbg !32 + %198 = extractvalue { i32, i32, i32, i32 } %195, 2, !dbg !32 + %199 = extractvalue { i32, i32, i32, i32 } %195, 3, !dbg !32 + %200 = bitcast i32 %196 to float, !dbg !32 + %201 = bitcast i32 %197 to float, !dbg !32 + %202 = bitcast i32 %198 to float, !dbg !32 + %203 = bitcast i32 %199 to float, !dbg !32 + %204 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, 
i1 true, i32 0, i1 true) #6, !dbg !32 + %205 = extractvalue { i32, i32, i32, i32 } %204, 0, !dbg !32 + %206 = extractvalue { i32, i32, i32, i32 } %204, 1, !dbg !32 + %207 = extractvalue { i32, i32, i32, i32 } %204, 2, !dbg !32 + %208 = extractvalue { i32, i32, i32, i32 } %204, 3, !dbg !32 + %209 = bitcast i32 %205 to float, !dbg !32 + %210 = bitcast i32 %206 to float, !dbg !32 + %211 = bitcast i32 %207 to float, !dbg !32 + %212 = bitcast i32 %208 to float, !dbg !32 + %213 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %185, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %214 = extractvalue { i32, i32, i32, i32 } %213, 0, !dbg !32 + %215 = extractvalue { i32, i32, i32, i32 } %213, 1, !dbg !32 + %216 = extractvalue { i32, i32, i32, i32 } %213, 2, !dbg !32 + %217 = extractvalue { i32, i32, i32, i32 } %213, 3, !dbg !32 + %218 = bitcast i32 %214 to float, !dbg !32 + %219 = bitcast i32 %215 to float, !dbg !32 + %220 = bitcast i32 %216 to float, !dbg !32 + %221 = bitcast i32 %217 to float, !dbg !32 + %222 = fadd float %147, %191, !dbg !33 + %223 = fadd float %148, %192, !dbg !33 + %224 = fadd float %149, %193, !dbg !33 + %225 = fadd float %150, %194, !dbg !33 + %226 = fadd float %156, %200, !dbg !33 + %227 = fadd float %157, %201, !dbg !33 + %228 = fadd float %158, %202, !dbg !33 + %229 = fadd float %159, %203, !dbg !33 + %230 = fadd float %165, %209, !dbg !33 + %231 = fadd float %166, %210, !dbg !33 + %232 = fadd float %167, %211, !dbg !33 + %233 = fadd float %168, %212, !dbg !33 + %234 = fadd float %174, %218, !dbg !33 + %235 = fadd float %175, %219, !dbg !33 + %236 = fadd float %176, %220, !dbg !33 + %237 = fadd float %177, %221, !dbg !33 + %238 = fsub float %222, %111, !dbg !34 + %239 = fsub float %223, %112, !dbg !34 + %240 = fsub float %224, %113, !dbg !34 + %241 = fsub float %225, %114, !dbg !34 + %242 = fsub float %226, %115, !dbg !34 + %243 = fsub float %227, %116, !dbg !34 + %244 = fsub float %228, %117, !dbg !34 + %245 = fsub float %229, %118, !dbg !34 + %246 = fsub float %230, %119, !dbg !34 + %247 = fsub float %231, %120, !dbg !34 + %248 = fsub float %232, %121, !dbg !34 + %249 = fsub float %233, %122, !dbg !34 + %250 = fsub float %234, %123, !dbg !34 + %251 = fsub float %235, %124, !dbg !34 + %252 = fsub float %236, %125, !dbg !34 + %253 = fsub float %237, %126, !dbg !34 + %254 = fadd float %63, 1.000000e+00, !dbg !38 + %255 = fadd float %64, 1.000000e+00, !dbg !38 + %256 = fadd float %65, 1.000000e+00, !dbg !38 + %257 = fadd float %66, 1.000000e+00, !dbg !38 + %258 = fadd float %67, 1.000000e+00, !dbg !38 + %259 = fadd float %68, 1.000000e+00, !dbg !38 + %260 = fadd float %69, 1.000000e+00, !dbg !38 + %261 = fadd float %70, 1.000000e+00, !dbg !38 + %262 = fadd float %71, 1.000000e+00, !dbg !38 + %263 = fadd float %72, 1.000000e+00, !dbg !38 + %264 = fadd float %73, 1.000000e+00, !dbg !38 + %265 = fadd float %74, 1.000000e+00, !dbg !38 + %266 = fadd float %75, 1.000000e+00, !dbg !38 + %267 = fadd float %76, 1.000000e+00, !dbg !38 + %268 = fadd float %77, 1.000000e+00, !dbg !38 + %269 = fadd float %78, 1.000000e+00, !dbg !38 + %270 = fadd float %79, 1.000000e+00, !dbg !38 + %271 = fadd float %80, 1.000000e+00, !dbg !38 + %272 = 
fadd float %81, 1.000000e+00, !dbg !38 + %273 = fadd float %82, 1.000000e+00, !dbg !38 + %274 = fadd float %83, 1.000000e+00, !dbg !38 + %275 = fadd float %84, 1.000000e+00, !dbg !38 + %276 = fadd float %85, 1.000000e+00, !dbg !38 + %277 = fadd float %86, 1.000000e+00, !dbg !38 + %278 = fadd float %87, 1.000000e+00, !dbg !38 + %279 = fadd float %88, 1.000000e+00, !dbg !38 + %280 = fadd float %89, 1.000000e+00, !dbg !38 + %281 = fadd float %90, 1.000000e+00, !dbg !38 + %282 = fadd float %91, 1.000000e+00, !dbg !38 + %283 = fadd float %92, 1.000000e+00, !dbg !38 + %284 = fadd float %93, 1.000000e+00, !dbg !38 + %285 = fadd float %94, 1.000000e+00, !dbg !38 + %286 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %238, float %254) #6, !dbg !39 + %287 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %239, float %255) #6, !dbg !39 + %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %256) #6, !dbg !39 + %289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %241, float %257) #6, !dbg !39 + %290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %242, float %258) #6, !dbg !39 + %291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %243, float %259) #6, !dbg !39 + %292 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %244, float %260) #6, !dbg !39 + %293 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float %261) #6, !dbg !39 + %294 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %246, float %262) #6, !dbg !39 + %295 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %247, float %263) #6, !dbg !39 + %296 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %264) #6, !dbg !39 + %297 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %249, float %265) #6, !dbg !39 + %298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %266) #6, !dbg !39 + %299 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %251, float %267) #6, !dbg !39 + %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %268) #6, !dbg !39 + %301 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %269) #6, !dbg !39 + %302 = fadd float %111, %286, !dbg !40 + %303 = fadd float %112, %287, !dbg !40 + %304 = fadd float %113, %288, !dbg !40 + %305 = fadd float %114, %289, !dbg !40 + %306 = fadd float %115, %290, !dbg !40 + %307 = fadd float %116, %291, !dbg !40 + %308 = fadd float %117, %292, !dbg !40 + %309 = fadd float %118, %293, !dbg !40 + %310 = fadd float %119, %294, !dbg !40 + %311 = fadd float %120, %295, !dbg !40 + %312 = fadd float %121, %296, !dbg !40 + %313 = fadd float %122, %297, !dbg !40 + %314 = fadd float %123, %298, !dbg !40 + %315 = fadd float %124, %299, !dbg !40 + %316 = fadd float %125, %300, !dbg !40 + %317 = fadd float %126, %301, !dbg !40 + %318 = fsub float %222, %302, !dbg !41 + %319 = fsub float %223, %303, !dbg !41 + %320 = fsub float %224, %304, !dbg !41 + %321 = fsub float %225, %305, !dbg !41 + %322 = fsub float %226, %306, !dbg !41 + %323 = fsub float %227, %307, !dbg !41 + %324 = fsub float %228, %308, !dbg !41 + %325 = fsub float %229, %309, !dbg !41 + %326 = fsub float %230, %310, !dbg !41 + %327 = fsub float %231, %311, !dbg !41 + %328 = fsub float %232, %312, !dbg !41 + %329 = fsub float %233, %313, !dbg !41 + %330 = fsub float %234, %314, !dbg !41 + %331 = fsub float %235, %315, !dbg !41 + %332 = fsub 
float %236, %316, !dbg !41 + %333 = fsub float %237, %317, !dbg !41 + %334 = fmul float %238, %318, !dbg !42 + %335 = fmul float %239, %319, !dbg !42 + %336 = fmul float %240, %320, !dbg !42 + %337 = fmul float %241, %321, !dbg !42 + %338 = fmul float %242, %322, !dbg !42 + %339 = fmul float %243, %323, !dbg !42 + %340 = fmul float %244, %324, !dbg !42 + %341 = fmul float %245, %325, !dbg !42 + %342 = fmul float %246, %326, !dbg !42 + %343 = fmul float %247, %327, !dbg !42 + %344 = fmul float %248, %328, !dbg !42 + %345 = fmul float %249, %329, !dbg !42 + %346 = fmul float %250, %330, !dbg !42 + %347 = fmul float %251, %331, !dbg !42 + %348 = fmul float %252, %332, !dbg !42 + %349 = fmul float %253, %333, !dbg !42 + %350 = fadd float %95, %334, !dbg !43 + %351 = fadd float %96, %335, !dbg !43 + %352 = fadd float %97, %336, !dbg !43 + %353 = fadd float %98, %337, !dbg !43 + %354 = fadd float %99, %338, !dbg !43 + %355 = fadd float %100, %339, !dbg !43 + %356 = fadd float %101, %340, !dbg !43 + %357 = fadd float %102, %341, !dbg !43 + %358 = fadd float %103, %342, !dbg !43 + %359 = fadd float %104, %343, !dbg !43 + %360 = fadd float %105, %344, !dbg !43 + %361 = fadd float %106, %345, !dbg !43 + %362 = fadd float %107, %346, !dbg !43 + %363 = fadd float %108, %347, !dbg !43 + %364 = fadd float %109, %348, !dbg !43 + %365 = fadd float %110, %349, !dbg !43 + %366 = add nuw nsw i32 %127, 64, !dbg !12 + %367 = icmp ult i32 %127, 192, !dbg !12 + br i1 %367, label %62, label %368, !dbg !12 + +368: ; preds = %179 + %369 = and i32 %15, 3, !dbg !12 + %370 = mul nuw nsw i32 %369, 72, !dbg !12 + %371 = add nuw nsw i32 %370, %11, !dbg !12 + %372 = zext nneg i32 %371 to i64, !dbg !12 + %373 = getelementptr float, ptr addrspace(3) @global_smem, i64 %372, !dbg !12 + %374 = insertelement <1 x float> undef, float %270, i64 0, !dbg !12 + store <1 x float> %374, ptr addrspace(3) %373, align 4, !dbg !12 + %375 = add nuw nsw i32 %11, 288, !dbg !12 + %376 = add nuw nsw i32 %375, %370, !dbg !12 + %377 = zext nneg i32 %376 to i64, !dbg !12 + %378 = getelementptr float, ptr addrspace(3) @global_smem, i64 %377, !dbg !12 + %379 = insertelement <1 x float> undef, float %271, i64 0, !dbg !12 + store <1 x float> %379, ptr addrspace(3) %378, align 4, !dbg !12 + %380 = or i32 %11, 576, !dbg !12 + %381 = add nuw nsw i32 %380, %370, !dbg !12 + %382 = zext nneg i32 %381 to i64, !dbg !12 + %383 = getelementptr float, ptr addrspace(3) @global_smem, i64 %382, !dbg !12 + %384 = insertelement <1 x float> undef, float %272, i64 0, !dbg !12 + store <1 x float> %384, ptr addrspace(3) %383, align 4, !dbg !12 + %385 = add nuw nsw i32 %11, 864, !dbg !12 + %386 = add nuw nsw i32 %385, %370, !dbg !12 + %387 = zext nneg i32 %386 to i64, !dbg !12 + %388 = getelementptr float, ptr addrspace(3) @global_smem, i64 %387, !dbg !12 + %389 = insertelement <1 x float> undef, float %273, i64 0, !dbg !12 + store <1 x float> %389, ptr addrspace(3) %388, align 4, !dbg !12 + %390 = or i32 %11, 1152, !dbg !12 + %391 = add nuw nsw i32 %390, %370, !dbg !12 + %392 = zext nneg i32 %391 to i64, !dbg !12 + %393 = getelementptr float, ptr addrspace(3) @global_smem, i64 %392, !dbg !12 + %394 = insertelement <1 x float> undef, float %274, i64 0, !dbg !12 + store <1 x float> %394, ptr addrspace(3) %393, align 4, !dbg !12 + %395 = add nuw nsw i32 %11, 1440, !dbg !12 + %396 = add nuw nsw i32 %395, %370, !dbg !12 + %397 = zext nneg i32 %396 to i64, !dbg !12 + %398 = getelementptr float, ptr addrspace(3) @global_smem, i64 %397, !dbg !12 + %399 = insertelement <1 x 
float> undef, float %275, i64 0, !dbg !12 + store <1 x float> %399, ptr addrspace(3) %398, align 4, !dbg !12 + %400 = or i32 %11, 1728, !dbg !12 + %401 = add nuw nsw i32 %400, %370, !dbg !12 + %402 = zext nneg i32 %401 to i64, !dbg !12 + %403 = getelementptr float, ptr addrspace(3) @global_smem, i64 %402, !dbg !12 + %404 = insertelement <1 x float> undef, float %276, i64 0, !dbg !12 + store <1 x float> %404, ptr addrspace(3) %403, align 4, !dbg !12 + %405 = add nuw nsw i32 %11, 2016, !dbg !12 + %406 = add nuw nsw i32 %405, %370, !dbg !12 + %407 = zext nneg i32 %406 to i64, !dbg !12 + %408 = getelementptr float, ptr addrspace(3) @global_smem, i64 %407, !dbg !12 + %409 = insertelement <1 x float> undef, float %277, i64 0, !dbg !12 + store <1 x float> %409, ptr addrspace(3) %408, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %410 = mul nuw nsw i32 %10, 72, !dbg !12 + %411 = add nuw nsw i32 %410, %13, !dbg !12 + %412 = zext nneg i32 %411 to i64, !dbg !12 + %413 = getelementptr float, ptr addrspace(3) @global_smem, i64 %412, !dbg !12 + %414 = load float, ptr addrspace(3) %413, align 32, !dbg !12 + %415 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 1, !dbg !12 + %416 = load float, ptr addrspace(3) %415, align 4, !dbg !12 + %417 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 2, !dbg !12 + %418 = load float, ptr addrspace(3) %417, align 8, !dbg !12 + %419 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 3, !dbg !12 + %420 = load float, ptr addrspace(3) %419, align 4, !dbg !12 + %421 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 4, !dbg !12 + %422 = load float, ptr addrspace(3) %421, align 16, !dbg !12 + %423 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 5, !dbg !12 + %424 = load float, ptr addrspace(3) %423, align 4, !dbg !12 + %425 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 6, !dbg !12 + %426 = load float, ptr addrspace(3) %425, align 8, !dbg !12 + %427 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 7, !dbg !12 + %428 = load float, ptr addrspace(3) %427, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %429 = insertelement <1 x float> undef, float %278, i64 0, !dbg !12 + store <1 x float> %429, ptr addrspace(3) %373, align 4, !dbg !12 + %430 = insertelement <1 x float> undef, float %279, i64 0, !dbg !12 + store <1 x float> %430, ptr addrspace(3) %378, align 4, !dbg !12 + %431 = insertelement <1 x float> undef, float %280, i64 0, !dbg !12 + store <1 x float> %431, ptr addrspace(3) %383, align 4, !dbg !12 + %432 = insertelement <1 x float> undef, float %281, i64 0, !dbg !12 + store <1 x float> %432, ptr addrspace(3) %388, align 4, !dbg !12 + %433 = insertelement <1 x float> undef, float %282, i64 0, !dbg !12 + store <1 x float> %433, ptr addrspace(3) %393, align 4, !dbg !12 + %434 = insertelement <1 x float> undef, float %283, i64 0, !dbg !12 + store <1 x float> %434, ptr addrspace(3) %398, align 4, !dbg !12 + %435 = insertelement <1 x float> undef, float %284, i64 0, !dbg !12 + store <1 x float> %435, ptr addrspace(3) %403, align 4, !dbg !12 + %436 = insertelement <1 x float> undef, float %285, i64 0, !dbg !12 + store <1 x float> %436, ptr addrspace(3) %408, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %437 = load float, ptr addrspace(3) %413, align 32, !dbg !12 + %438 = load float, ptr addrspace(3) %415, align 4, !dbg !12 + %439 = load float, ptr 
addrspace(3) %417, align 8, !dbg !12 + %440 = load float, ptr addrspace(3) %419, align 4, !dbg !12 + %441 = load float, ptr addrspace(3) %421, align 16, !dbg !12 + %442 = load float, ptr addrspace(3) %423, align 4, !dbg !12 + %443 = load float, ptr addrspace(3) %425, align 8, !dbg !12 + %444 = load float, ptr addrspace(3) %427, align 4, !dbg !12 + %445 = fsub float %303, %302, !dbg !44 + %446 = fadd float %414, %416, !dbg !48 + %447 = fcmp oeq float %446, 0.000000e+00, !dbg !49 + %448 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %446) #6, !dbg !50 + %449 = select i1 %447, float 0.000000e+00, float %448, !dbg !51 + %450 = fmul float %445, %449, !dbg !52 + %451 = fadd float %302, %450, !dbg !53 + %452 = fadd float %350, %351, !dbg !54 + %453 = fmul float %445, %445, !dbg !55 + %454 = fmul float %453, %414, !dbg !56 + %455 = fmul float %454, %449, !dbg !57 + %456 = fadd float %452, %455, !dbg !58 + %457 = fsub float %304, %451, !dbg !44 + %458 = fadd float %418, %446, !dbg !48 + %459 = fcmp oeq float %458, 0.000000e+00, !dbg !49 + %460 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %418, float %458) #6, !dbg !50 + %461 = select i1 %459, float 0.000000e+00, float %460, !dbg !51 + %462 = fmul float %461, %457, !dbg !52 + %463 = fadd float %451, %462, !dbg !53 + %464 = fadd float %352, %456, !dbg !54 + %465 = fmul float %457, %457, !dbg !55 + %466 = fmul float %446, %465, !dbg !56 + %467 = fmul float %461, %466, !dbg !57 + %468 = fadd float %464, %467, !dbg !58 + %469 = fsub float %305, %463, !dbg !44 + %470 = fadd float %420, %458, !dbg !48 + %471 = fcmp oeq float %470, 0.000000e+00, !dbg !49 + %472 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %420, float %470) #6, !dbg !50 + %473 = select i1 %471, float 0.000000e+00, float %472, !dbg !51 + %474 = fmul float %473, %469, !dbg !52 + %475 = fadd float %463, %474, !dbg !53 + %476 = fadd float %353, %468, !dbg !54 + %477 = fmul float %469, %469, !dbg !55 + %478 = fmul float %458, %477, !dbg !56 + %479 = fmul float %473, %478, !dbg !57 + %480 = fadd float %476, %479, !dbg !58 + %481 = fsub float %306, %475, !dbg !44 + %482 = fadd float %422, %470, !dbg !48 + %483 = fcmp oeq float %482, 0.000000e+00, !dbg !49 + %484 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %422, float %482) #6, !dbg !50 + %485 = select i1 %483, float 0.000000e+00, float %484, !dbg !51 + %486 = fmul float %485, %481, !dbg !52 + %487 = fadd float %475, %486, !dbg !53 + %488 = fadd float %354, %480, !dbg !54 + %489 = fmul float %481, %481, !dbg !55 + %490 = fmul float %470, %489, !dbg !56 + %491 = fmul float %485, %490, !dbg !57 + %492 = fadd float %488, %491, !dbg !58 + %493 = fsub float %307, %487, !dbg !44 + %494 = fadd float %424, %482, !dbg !48 + %495 = fcmp oeq float %494, 0.000000e+00, !dbg !49 + %496 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %424, float %494) #6, !dbg !50 + %497 = select i1 %495, float 0.000000e+00, float %496, !dbg !51 + %498 = fmul float %497, %493, !dbg !52 + %499 = fadd float %487, %498, !dbg !53 + %500 = fadd float %355, %492, !dbg !54 + %501 = fmul float %493, %493, !dbg !55 + %502 = fmul float %482, %501, !dbg !56 + %503 = fmul float %497, %502, !dbg !57 + %504 = fadd float %500, %503, !dbg !58 + %505 = fsub float %308, %499, !dbg !44 + %506 = fadd float %426, %494, !dbg !48 + %507 = fcmp oeq float %506, 0.000000e+00, !dbg !49 + %508 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %426, float %506) #6, !dbg !50 + %509 = 
select i1 %507, float 0.000000e+00, float %508, !dbg !51 + %510 = fmul float %509, %505, !dbg !52 + %511 = fadd float %499, %510, !dbg !53 + %512 = fadd float %356, %504, !dbg !54 + %513 = fmul float %505, %505, !dbg !55 + %514 = fmul float %494, %513, !dbg !56 + %515 = fmul float %509, %514, !dbg !57 + %516 = fadd float %512, %515, !dbg !58 + %517 = fsub float %309, %511, !dbg !44 + %518 = fadd float %428, %506, !dbg !48 + %519 = fcmp oeq float %518, 0.000000e+00, !dbg !49 + %520 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float %518) #6, !dbg !50 + %521 = select i1 %519, float 0.000000e+00, float %520, !dbg !51 + %522 = fmul float %521, %517, !dbg !52 + %523 = fadd float %511, %522, !dbg !53 + %524 = fadd float %357, %516, !dbg !54 + %525 = fmul float %517, %517, !dbg !55 + %526 = fmul float %506, %525, !dbg !56 + %527 = fmul float %521, %526, !dbg !57 + %528 = fadd float %524, %527, !dbg !58 + %529 = fsub float %311, %310, !dbg !44 + %530 = fadd float %437, %438, !dbg !48 + %531 = fcmp oeq float %530, 0.000000e+00, !dbg !49 + %532 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %438, float %530) #6, !dbg !50 + %533 = select i1 %531, float 0.000000e+00, float %532, !dbg !51 + %534 = fmul float %529, %533, !dbg !52 + %535 = fadd float %310, %534, !dbg !53 + %536 = fadd float %358, %359, !dbg !54 + %537 = fmul float %529, %529, !dbg !55 + %538 = fmul float %537, %437, !dbg !56 + %539 = fmul float %538, %533, !dbg !57 + %540 = fadd float %536, %539, !dbg !58 + %541 = fsub float %312, %535, !dbg !44 + %542 = fadd float %439, %530, !dbg !48 + %543 = fcmp oeq float %542, 0.000000e+00, !dbg !49 + %544 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %439, float %542) #6, !dbg !50 + %545 = select i1 %543, float 0.000000e+00, float %544, !dbg !51 + %546 = fmul float %545, %541, !dbg !52 + %547 = fadd float %535, %546, !dbg !53 + %548 = fadd float %360, %540, !dbg !54 + %549 = fmul float %541, %541, !dbg !55 + %550 = fmul float %530, %549, !dbg !56 + %551 = fmul float %545, %550, !dbg !57 + %552 = fadd float %548, %551, !dbg !58 + %553 = fsub float %313, %547, !dbg !44 + %554 = fadd float %440, %542, !dbg !48 + %555 = fcmp oeq float %554, 0.000000e+00, !dbg !49 + %556 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %440, float %554) #6, !dbg !50 + %557 = select i1 %555, float 0.000000e+00, float %556, !dbg !51 + %558 = fmul float %557, %553, !dbg !52 + %559 = fadd float %547, %558, !dbg !53 + %560 = fadd float %361, %552, !dbg !54 + %561 = fmul float %553, %553, !dbg !55 + %562 = fmul float %542, %561, !dbg !56 + %563 = fmul float %557, %562, !dbg !57 + %564 = fadd float %560, %563, !dbg !58 + %565 = fsub float %314, %559, !dbg !44 + %566 = fadd float %441, %554, !dbg !48 + %567 = fcmp oeq float %566, 0.000000e+00, !dbg !49 + %568 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %441, float %566) #6, !dbg !50 + %569 = select i1 %567, float 0.000000e+00, float %568, !dbg !51 + %570 = fmul float %569, %565, !dbg !52 + %571 = fadd float %559, %570, !dbg !53 + %572 = fadd float %362, %564, !dbg !54 + %573 = fmul float %565, %565, !dbg !55 + %574 = fmul float %554, %573, !dbg !56 + %575 = fmul float %569, %574, !dbg !57 + %576 = fadd float %572, %575, !dbg !58 + %577 = fsub float %315, %571, !dbg !44 + %578 = fadd float %442, %566, !dbg !48 + %579 = fcmp oeq float %578, 0.000000e+00, !dbg !49 + %580 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %442, float %578) #6, !dbg !50 + %581 = select 
i1 %579, float 0.000000e+00, float %580, !dbg !51 + %582 = fmul float %581, %577, !dbg !52 + %583 = fadd float %571, %582, !dbg !53 + %584 = fadd float %363, %576, !dbg !54 + %585 = fmul float %577, %577, !dbg !55 + %586 = fmul float %566, %585, !dbg !56 + %587 = fmul float %581, %586, !dbg !57 + %588 = fadd float %584, %587, !dbg !58 + %589 = fsub float %316, %583, !dbg !44 + %590 = fadd float %443, %578, !dbg !48 + %591 = fcmp oeq float %590, 0.000000e+00, !dbg !49 + %592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %443, float %590) #6, !dbg !50 + %593 = select i1 %591, float 0.000000e+00, float %592, !dbg !51 + %594 = fmul float %593, %589, !dbg !52 + %595 = fadd float %583, %594, !dbg !53 + %596 = fadd float %364, %588, !dbg !54 + %597 = fmul float %589, %589, !dbg !55 + %598 = fmul float %578, %597, !dbg !56 + %599 = fmul float %593, %598, !dbg !57 + %600 = fadd float %596, %599, !dbg !58 + %601 = fsub float %317, %595, !dbg !44 + %602 = fadd float %444, %590, !dbg !48 + %603 = fcmp oeq float %602, 0.000000e+00, !dbg !49 + %604 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %444, float %602) #6, !dbg !50 + %605 = select i1 %603, float 0.000000e+00, float %604, !dbg !51 + %606 = fmul float %605, %601, !dbg !52 + %607 = fadd float %595, %606, !dbg !53 + %608 = fadd float %365, %600, !dbg !54 + %609 = fmul float %601, %601, !dbg !55 + %610 = fmul float %590, %609, !dbg !56 + %611 = fmul float %605, %610, !dbg !57 + %612 = fadd float %608, %611, !dbg !58 + %613 = bitcast float %523 to i32, !dbg !59 + %614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %613, i32 4, i32 31), !dbg !59 + %615 = bitcast i32 %614 to float, !dbg !59 + %616 = bitcast float %528 to i32, !dbg !59 + %617 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %616, i32 4, i32 31), !dbg !59 + %618 = bitcast i32 %617 to float, !dbg !59 + %619 = bitcast float %518 to i32, !dbg !59 + %620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %619, i32 4, i32 31), !dbg !59 + %621 = bitcast i32 %620 to float, !dbg !59 + %622 = fsub float %615, %523, !dbg !44 + %623 = fadd float %518, %621, !dbg !48 + %624 = fcmp oeq float %623, 0.000000e+00, !dbg !49 + %625 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %621, float %623) #6, !dbg !50 + %626 = select i1 %624, float 0.000000e+00, float %625, !dbg !51 + %627 = fmul float %626, %622, !dbg !52 + %628 = fadd float %523, %627, !dbg !53 + %629 = fadd float %528, %618, !dbg !54 + %630 = fmul float %622, %622, !dbg !55 + %631 = fmul float %518, %630, !dbg !56 + %632 = fmul float %626, %631, !dbg !57 + %633 = fadd float %629, %632, !dbg !58 + %634 = bitcast float %628 to i32, !dbg !59 + %635 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %634, i32 2, i32 31), !dbg !59 + %636 = bitcast i32 %635 to float, !dbg !59 + %637 = bitcast float %633 to i32, !dbg !59 + %638 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %637, i32 2, i32 31), !dbg !59 + %639 = bitcast i32 %638 to float, !dbg !59 + %640 = bitcast float %623 to i32, !dbg !59 + %641 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 2, i32 31), !dbg !59 + %642 = bitcast i32 %641 to float, !dbg !59 + %643 = fsub float %636, %628, !dbg !44 + %644 = fadd float %623, %642, !dbg !48 + %645 = fcmp oeq float %644, 0.000000e+00, !dbg !49 + %646 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %642, float %644) #6, !dbg !50 + %647 = select i1 %645, float 0.000000e+00, float %646, !dbg !51 + %648 = fmul float 
%647, %643, !dbg !52 + %649 = fadd float %628, %648, !dbg !53 + %650 = fadd float %633, %639, !dbg !54 + %651 = fmul float %643, %643, !dbg !55 + %652 = fmul float %623, %651, !dbg !56 + %653 = fmul float %647, %652, !dbg !57 + %654 = fadd float %650, %653, !dbg !58 + %655 = bitcast float %649 to i32, !dbg !59 + %656 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %655, i32 1, i32 31), !dbg !59 + %657 = bitcast i32 %656 to float, !dbg !59 + %658 = bitcast float %654 to i32, !dbg !59 + %659 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %658, i32 1, i32 31), !dbg !59 + %660 = bitcast i32 %659 to float, !dbg !59 + %661 = bitcast float %644 to i32, !dbg !59 + %662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %661, i32 1, i32 31), !dbg !59 + %663 = bitcast i32 %662 to float, !dbg !59 + %664 = fsub float %657, %649, !dbg !44 + %665 = fadd float %644, %663, !dbg !48 + %666 = fcmp oeq float %665, 0.000000e+00, !dbg !49 + %667 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %663, float %665) #6, !dbg !50 + %668 = select i1 %666, float 0.000000e+00, float %667, !dbg !51 + %669 = fmul float %664, %668, !dbg !52 + %670 = fadd float %649, %669, !dbg !53 + %671 = fadd float %654, %660, !dbg !54 + %672 = fmul float %664, %664, !dbg !55 + %673 = fmul float %644, %672, !dbg !56 + %674 = fmul float %668, %673, !dbg !57 + %675 = fadd float %671, %674, !dbg !58 + %676 = bitcast float %607 to i32, !dbg !59 + %677 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 4, i32 31), !dbg !59 + %678 = bitcast i32 %677 to float, !dbg !59 + %679 = bitcast float %612 to i32, !dbg !59 + %680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %679, i32 4, i32 31), !dbg !59 + %681 = bitcast i32 %680 to float, !dbg !59 + %682 = bitcast float %602 to i32, !dbg !59 + %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %682, i32 4, i32 31), !dbg !59 + %684 = bitcast i32 %683 to float, !dbg !59 + %685 = fsub float %678, %607, !dbg !44 + %686 = fadd float %602, %684, !dbg !48 + %687 = fcmp oeq float %686, 0.000000e+00, !dbg !49 + %688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %684, float %686) #6, !dbg !50 + %689 = select i1 %687, float 0.000000e+00, float %688, !dbg !51 + %690 = fmul float %685, %689, !dbg !52 + %691 = fadd float %607, %690, !dbg !53 + %692 = fadd float %612, %681, !dbg !54 + %693 = fmul float %685, %685, !dbg !55 + %694 = fmul float %602, %693, !dbg !56 + %695 = fmul float %694, %689, !dbg !57 + %696 = fadd float %692, %695, !dbg !58 + %697 = bitcast float %691 to i32, !dbg !59 + %698 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %697, i32 2, i32 31), !dbg !59 + %699 = bitcast i32 %698 to float, !dbg !59 + %700 = bitcast float %696 to i32, !dbg !59 + %701 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %700, i32 2, i32 31), !dbg !59 + %702 = bitcast i32 %701 to float, !dbg !59 + %703 = bitcast float %686 to i32, !dbg !59 + %704 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %703, i32 2, i32 31), !dbg !59 + %705 = bitcast i32 %704 to float, !dbg !59 + %706 = fsub float %699, %691, !dbg !44 + %707 = fadd float %686, %705, !dbg !48 + %708 = fcmp oeq float %707, 0.000000e+00, !dbg !49 + %709 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %705, float %707) #6, !dbg !50 + %710 = select i1 %708, float 0.000000e+00, float %709, !dbg !51 + %711 = fmul float %706, %710, !dbg !52 + %712 = fadd float %691, %711, !dbg !53 + %713 = fadd float %696, %702, !dbg !54 + 
%714 = fmul float %706, %706, !dbg !55 + %715 = fmul float %686, %714, !dbg !56 + %716 = fmul float %710, %715, !dbg !57 + %717 = fadd float %713, %716, !dbg !58 + %718 = bitcast float %712 to i32, !dbg !59 + %719 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %718, i32 1, i32 31), !dbg !59 + %720 = bitcast i32 %719 to float, !dbg !59 + %721 = bitcast float %717 to i32, !dbg !59 + %722 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %721, i32 1, i32 31), !dbg !59 + %723 = bitcast i32 %722 to float, !dbg !59 + %724 = bitcast float %707 to i32, !dbg !59 + %725 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %724, i32 1, i32 31), !dbg !59 + %726 = bitcast i32 %725 to float, !dbg !59 + %727 = fsub float %720, %712, !dbg !44 + %728 = fadd float %707, %726, !dbg !48 + %729 = fcmp oeq float %728, 0.000000e+00, !dbg !49 + %730 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %726, float %728) #6, !dbg !50 + %731 = select i1 %729, float 0.000000e+00, float %730, !dbg !51 + %732 = fmul float %727, %731, !dbg !52 + %733 = fadd float %712, %732, !dbg !53 + %734 = fadd float %717, %723, !dbg !54 + %735 = fmul float %727, %727, !dbg !55 + %736 = fmul float %707, %735, !dbg !56 + %737 = fmul float %731, %736, !dbg !57 + %738 = fadd float %734, %737, !dbg !58 + %739 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %740 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %741 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %742 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %743 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %744 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %745 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %746 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61 + %747 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %748 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %749 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %750 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %751 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %752 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %753 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %754 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61 + %755 = fadd float %739, 0x3EE4F8B580000000, !dbg !62 + %756 = fadd float %747, 0x3EE4F8B580000000, !dbg !62 + %757 = shl i32 %18, 8, !dbg !63 + %758 = shl i32 %19, 8, !dbg !63 + br label %759, !dbg !64 + +759: ; preds = %368, %__nv_rsqrtf.exit25 + %760 = phi i32 [ 0, %368 ], [ %1009, %__nv_rsqrtf.exit25 ] + %761 = or i32 %760, %13, !dbg !65 + %762 = or i32 %760, %14, !dbg !65 + %763 = add i32 %761, %46, !dbg !66 + %764 = add i32 %762, %46, !dbg !66 + %765 = add 
i32 %761, %47, !dbg !66
+  %766 = add i32 %762, %47, !dbg !66
+  %767 = sext i32 %763 to i64, !dbg !67
+  %768 = getelementptr float, ptr addrspace(1) %2, i64 %767, !dbg !67
+  %769 = sext i32 %764 to i64, !dbg !67
+  %770 = getelementptr float, ptr addrspace(1) %2, i64 %769, !dbg !67
+  %771 = sext i32 %765 to i64, !dbg !67
+  %772 = getelementptr float, ptr addrspace(1) %2, i64 %771, !dbg !67
+  %773 = sext i32 %766 to i64, !dbg !67
+  %774 = getelementptr float, ptr addrspace(1) %2, i64 %773, !dbg !67
+  %775 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %768, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %776 = extractvalue { i32, i32, i32, i32 } %775, 0, !dbg !68
+  %777 = extractvalue { i32, i32, i32, i32 } %775, 1, !dbg !68
+  %778 = extractvalue { i32, i32, i32, i32 } %775, 2, !dbg !68
+  %779 = extractvalue { i32, i32, i32, i32 } %775, 3, !dbg !68
+  %780 = bitcast i32 %776 to float, !dbg !68
+  %781 = bitcast i32 %777 to float, !dbg !68
+  %782 = bitcast i32 %778 to float, !dbg !68
+  %783 = bitcast i32 %779 to float, !dbg !68
+  %784 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %770, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %785 = extractvalue { i32, i32, i32, i32 } %784, 0, !dbg !68
+  %786 = extractvalue { i32, i32, i32, i32 } %784, 1, !dbg !68
+  %787 = extractvalue { i32, i32, i32, i32 } %784, 2, !dbg !68
+  %788 = extractvalue { i32, i32, i32, i32 } %784, 3, !dbg !68
+  %789 = bitcast i32 %785 to float, !dbg !68
+  %790 = bitcast i32 %786 to float, !dbg !68
+  %791 = bitcast i32 %787 to float, !dbg !68
+  %792 = bitcast i32 %788 to float, !dbg !68
+  %793 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %772, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %794 = extractvalue { i32, i32, i32, i32 } %793, 0, !dbg !68
+  %795 = extractvalue { i32, i32, i32, i32 } %793, 1, !dbg !68
+  %796 = extractvalue { i32, i32, i32, i32 } %793, 2, !dbg !68
+  %797 = extractvalue { i32, i32, i32, i32 } %793, 3, !dbg !68
+  %798 = bitcast i32 %794 to float, !dbg !68
+  %799 = bitcast i32 %795 to float, !dbg !68
+  %800 = bitcast i32 %796 to float, !dbg !68
+  %801 = bitcast i32 %797 to float, !dbg !68
+  %802 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %774, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %803 = extractvalue { i32, i32, i32, i32 } %802, 0, !dbg !68
+  %804 = extractvalue { i32, i32, i32, i32 } %802, 1, !dbg !68
+  %805 = extractvalue { i32, i32, i32, i32 } %802, 2, !dbg !68
+  %806 = extractvalue { i32, i32, i32, i32 } %802, 3, !dbg !68
+  %807 = bitcast i32 %803 to float, !dbg !68
+  %808 = bitcast i32 %804 to float, !dbg !68
+  %809 = bitcast i32 %805 to float, !dbg !68
+  %810 = bitcast i32 %806 to float, !dbg !68
+  %811 = zext nneg i32 %761 to i64, !dbg !69
+  %812 = getelementptr float, ptr addrspace(1) %3, i64 %811, !dbg !69
+  %813 = zext nneg i32 %762 to i64, !dbg !69
+  %814 = getelementptr float, ptr addrspace(1) %3, i64 %813, !dbg !69
+  %815 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %812, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
+  %816 = extractvalue { i32, i32, i32, i32 } %815, 0, !dbg !70
+  %817 = extractvalue { i32, i32, i32, i32 } %815, 1, !dbg !70
+  %818 = extractvalue { i32, i32, i32, i32 } %815, 2, !dbg !70
+  %819 = extractvalue { i32, i32, i32, i32 } %815, 3, !dbg !70
+  %820 = bitcast i32 %816 to float, !dbg !70
+  %821 = bitcast i32 %817 to float, !dbg !70
+  %822 = bitcast i32 %818 to float, !dbg !70
+  %823 = bitcast i32 %819 to float, !dbg !70
+  %824 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %814, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
+  %825 = extractvalue { i32, i32, i32, i32 } %824, 0, !dbg !70
+  %826 = extractvalue { i32, i32, i32, i32 } %824, 1, !dbg !70
+  %827 = extractvalue { i32, i32, i32, i32 } %824, 2, !dbg !70
+  %828 = extractvalue { i32, i32, i32, i32 } %824, 3, !dbg !70
+  %829 = bitcast i32 %825 to float, !dbg !70
+  %830 = bitcast i32 %826 to float, !dbg !70
+  %831 = bitcast i32 %827 to float, !dbg !70
+  %832 = bitcast i32 %828 to float, !dbg !70
+  br i1 %53, label %833, label %834, !dbg !71
+
+833: ; preds = %759
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
+  br label %834, !dbg !71
+
+834: ; preds = %833, %759
+  %835 = getelementptr float, ptr addrspace(1) %60, i64 %811, !dbg !72
+  %836 = getelementptr float, ptr addrspace(1) %60, i64 %813, !dbg !72
+  %837 = getelementptr float, ptr addrspace(1) %61, i64 %811, !dbg !72
+  %838 = getelementptr float, ptr addrspace(1) %61, i64 %813, !dbg !72
+  %839 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %840 = extractvalue { i32, i32, i32, i32 } %839, 0, !dbg !73
+  %841 = extractvalue { i32, i32, i32, i32 } %839, 1, !dbg !73
+  %842 = extractvalue { i32, i32, i32, i32 } %839, 2, !dbg !73
+  %843 = extractvalue { i32, i32, i32, i32 } %839, 3, !dbg !73
+  %844 = bitcast i32 %840 to float, !dbg !73
+  %845 = bitcast i32 %841 to float, !dbg !73
+  %846 = bitcast i32 %842 to float, !dbg !73
+  %847 = bitcast i32 %843 to float, !dbg !73
+  %848 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %836, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %849 = extractvalue { i32, i32, i32, i32 } %848, 0, !dbg !73
+  %850 = extractvalue { i32, i32, i32, i32 } %848, 1, !dbg !73
+  %851 = extractvalue { i32, i32, i32, i32 } %848, 2, !dbg !73
+  %852 = extractvalue { i32, i32, i32, i32 } %848, 3, !dbg !73
+  %853 = bitcast i32 %849 to float, !dbg !73
+  %854 = bitcast i32 %850 to float, !dbg !73
+  %855 = bitcast i32 %851 to float, !dbg !73
+  %856 = bitcast i32 %852 to float, !dbg !73
+  %857 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %858 = extractvalue { i32, i32, i32, i32 } %857, 0, !dbg !73
+  %859 = extractvalue { i32, i32, i32, i32 } %857, 1, !dbg !73
+  %860 = extractvalue { i32, i32, i32, i32 } %857, 2, !dbg !73
+  %861 = extractvalue { i32, i32, i32, i32 } %857, 3, !dbg !73
+  %862 = bitcast i32 %858 to float, !dbg !73
+  %863 = bitcast i32 %859 to float, !dbg !73
+  %864 = bitcast i32 %860 to float, !dbg !73
+  %865 = bitcast i32 %861 to float, !dbg !73
+  %866 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %838, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %867 = extractvalue { i32, i32, i32, i32 } %866, 0, !dbg !73
+  %868 = extractvalue { i32, i32, i32, i32 } %866, 1, !dbg !73
+  %869 = extractvalue { i32, i32, i32, i32 } %866, 2, !dbg !73
+  %870 = extractvalue { i32, i32, i32, i32 } %866, 3, !dbg !73
+  %871 = bitcast i32 %867 to float, !dbg !73
+  %872 = bitcast i32 %868 to float, !dbg !73
+  %873 = bitcast i32 %869 to float, !dbg !73
+  %874 = bitcast i32 %870 to float, !dbg !73
+  %875 = fadd float %780, %844, !dbg !74
+  %876 = fadd float %781, %845, !dbg !74
+  %877 = fadd float %782, %846, !dbg !74
+  %878 = fadd float %783, %847, !dbg !74
+  %879 = fadd float %789, %853, !dbg !74
+  %880 = fadd float %790, %854, !dbg !74
+  %881 = fadd float %791, %855, !dbg !74
+  %882 = fadd float %792, %856, !dbg !74
+  %883 = fadd float %798, %862, !dbg !74
+  %884 = fadd float %799, %863, !dbg !74
+  %885 = fadd float %800, %864, !dbg !74
+  %886 = fadd float %801, %865, !dbg !74
+  %887 = fadd float %807, %871, !dbg !74
+  %888 = fadd float %808, %872, !dbg !74
+  %889 = fadd float %809, %873, !dbg !74
+  %890 = fadd float %810, %874, !dbg !74
+  %891 = fsub float %875, %670, !dbg !75
+  %892 = fsub float %876, %670, !dbg !75
+  %893 = fsub float %877, %670, !dbg !75
+  %894 = fsub float %878, %670, !dbg !75
+  %895 = fsub float %879, %670, !dbg !75
+  %896 = fsub float %880, %670, !dbg !75
+  %897 = fsub float %881, %670, !dbg !75
+  %898 = fsub float %882, %670, !dbg !75
+  %899 = fsub float %883, %733, !dbg !75
+  %900 = fsub float %884, %733, !dbg !75
+  %901 = fsub float %885, %733, !dbg !75
+  %902 = fsub float %886, %733, !dbg !75
+  %903 = fsub float %887, %733, !dbg !75
+  %904 = fsub float %888, %733, !dbg !75
+  %905 = fsub float %889, %733, !dbg !75
+  %906 = fsub float %890, %733, !dbg !75
+  %907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %.not.i = icmp eq i32 %907, 0, !dbg !76
+  br i1 %.not.i, label %910, label %908, !dbg !76
+
+908: ; preds = %834
+  %909 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %755), !dbg !76
+  br label %__nv_rsqrtf.exit, !dbg !76
+
+910: ; preds = %834
+  %911 = tail call float @llvm.nvvm.rsqrt.approx.f(float %755), !dbg !76
+  br label %__nv_rsqrtf.exit, !dbg !76
+
+__nv_rsqrtf.exit: ; preds = %908, %910
+  %.0.i = phi float [ %909, %908 ], [ %911, %910 ], !dbg !76
+  %912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %913 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %914 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %916 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %918 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %.not.i23 = icmp eq i32 %919, 0, !dbg !76
+  br i1 %.not.i23, label %922, label %920, !dbg !76
+
+920: ; preds = %__nv_rsqrtf.exit
+  %921 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %756), !dbg !76
+  br label %__nv_rsqrtf.exit25, !dbg !76
+
+922: ; preds = %__nv_rsqrtf.exit
+  %923 = tail call float @llvm.nvvm.rsqrt.approx.f(float %756), !dbg !76
+  br label %__nv_rsqrtf.exit25, !dbg !76
+
+__nv_rsqrtf.exit25: ; preds = %920, %922
+  %.0.i24 = phi float [ %921, %920 ], [ %923, %922 ], !dbg !76
+  %924 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %926 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %928 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %929 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %931 = fmul float %891, %.0.i, !dbg !77
+  %932 = fmul float %892, %.0.i, !dbg !77
+  %933 = fmul float %893, %.0.i, !dbg !77
+  %934 = fmul float %894, %.0.i, !dbg !77
+  %935 = fmul float %895, %.0.i, !dbg !77
+  %936 = fmul float %896, %.0.i, !dbg !77
+  %937 = fmul float %897, %.0.i, !dbg !77
+  %938 = fmul float %898, %.0.i, !dbg !77
+  %939 = fmul float %899, %.0.i24, !dbg !77
+  %940 = fmul float %900, %.0.i24, !dbg !77
+  %941 = fmul float %901, %.0.i24, !dbg !77
+  %942 = fmul float %902, %.0.i24, !dbg !77
+  %943 = fmul float %903, %.0.i24, !dbg !77
+  %944 = fmul float %904, %.0.i24, !dbg !77
+  %945 = fmul float %905, %.0.i24, !dbg !77
+  %946 = fmul float %906, %.0.i24, !dbg !77
+  %947 = fmul float %931, %820, !dbg !78
+  %948 = fmul float %932, %821, !dbg !78
+  %949 = fmul float %933, %822, !dbg !78
+  %950 = fmul float %934, %823, !dbg !78
+  %951 = fmul float %935, %829, !dbg !78
+  %952 = fmul float %936, %830, !dbg !78
+  %953 = fmul float %937, %831, !dbg !78
+  %954 = fmul float %938, %832, !dbg !78
+  %955 = fmul float %939, %820, !dbg !78
+  %956 = fmul float %940, %821, !dbg !78
+  %957 = fmul float %941, %822, !dbg !78
+  %958 = fmul float %942, %823, !dbg !78
+  %959 = fmul float %943, %829, !dbg !78
+  %960 = fmul float %944, %830, !dbg !78
+  %961 = fmul float %945, %831, !dbg !78
+  %962 = fmul float %946, %832, !dbg !78
+  %963 = add i32 %761, %757, !dbg !79
+  %964 = add i32 %761, %758, !dbg !79
+  %965 = sext i32 %963 to i64, !dbg !80
+  %966 = getelementptr i16, ptr addrspace(1) %4, i64 %965, !dbg !80
+  %967 = sext i32 %964 to i64, !dbg !80
+  %968 = getelementptr i16, ptr addrspace(1) %4, i64 %967, !dbg !80
+  %969 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %947) #6, !dbg !81
+  %970 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %948) #6, !dbg !81
+  %971 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %949) #6, !dbg !81
+  %972 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %950) #6, !dbg !81
+  %973 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %951) #6, !dbg !81
+  %974 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %952) #6, !dbg !81
+  %975 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %953) #6, !dbg !81
+  %976 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %954) #6, !dbg !81
+  %977 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %955) #6, !dbg !81
+  %978 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %956) #6, !dbg !81
+  %979 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %957) #6, !dbg !81
+  %980 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %958) #6, !dbg !81
+  %981 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %959) #6, !dbg !81
+  %982 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %960) #6, !dbg !81
+  %983 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %961) #6, !dbg !81
+  %984 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %962) #6, !dbg !81
+  %985 = insertelement <2 x i16> undef, i16 %969, i64 0, !dbg !81
+  %986 = insertelement <2 x i16> %985, i16 %970, i64 1, !dbg !81
+  %987 = bitcast <2 x i16> %986 to i32, !dbg !81
+  %988 = insertelement <2 x i16> undef, i16 %971, i64 0, !dbg !81
+  %989 = insertelement <2 x i16> %988, i16 %972, i64 1, !dbg !81
+  %990 = bitcast <2 x i16> %989 to i32, !dbg !81
+  %991 = insertelement <2 x i16> undef, i16 %973, i64 0, !dbg !81
+  %992 = insertelement <2 x i16> %991, i16 %974, i64 1, !dbg !81
+  %993 = bitcast <2 x i16> %992 to i32, !dbg !81
+  %994 = insertelement <2 x i16> undef, i16 %975, i64 0, !dbg !81
+  %995 = insertelement <2 x i16> %994, i16 %976, i64 1, !dbg !81
+  %996 = bitcast <2 x i16> %995 to i32, !dbg !81
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %987, i32 %990, i32 %993, i32 %996, ptr addrspace(1) %966, i1 true) #6, !dbg !81
+  %997 = insertelement <2 x i16> undef, i16 %977, i64 0, !dbg !81
+  %998 = insertelement <2 x i16> %997, i16 %978, i64 1, !dbg !81
+  %999 = bitcast <2 x i16> %998 to i32, !dbg !81
+  %1000 = insertelement <2 x i16> undef, i16 %979, i64 0, !dbg !81
+  %1001 = insertelement <2 x i16> %1000, i16 %980, i64 1, !dbg !81
+  %1002 = bitcast <2 x i16> %1001 to i32, !dbg !81
+  %1003 = insertelement <2 x i16> undef, i16 %981, i64 0, !dbg !81
+  %1004 = insertelement <2 x i16> %1003, i16 %982, i64 1, !dbg !81
+  %1005 = bitcast <2 x i16> %1004 to i32, !dbg !81
+  %1006 = insertelement <2 x i16> undef, i16 %983, i64 0, !dbg !81
+  %1007 = insertelement <2 x i16> %1006, i16 %984, i64 1, !dbg !81
+  %1008 = bitcast <2 x i16> %1007 to i32, !dbg !81
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %999, i32 %1002, i32 %1005, i32 %1008, ptr addrspace(1) %968, i1 true) #6, !dbg !81
+  %1009 = add nuw nsw i32 %760, 64, !dbg !64
+  %1010 = icmp ult i32 %760, 192, !dbg !64
+  br i1 %1010, label %759, label %1011, !dbg !64
+
+1011: ; preds = %__nv_rsqrtf.exit25
+  ret void, !dbg !82
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2
+
+2: ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+
+4: ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+
+6: ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh")
+!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 22, column: 44, scope: !7)
+!11 = !DILocation(line: 24, column: 33, scope: !7)
+!12 = !DILocation(line: 31, column: 36, scope: !7)
+!13 = !DILocation(line: 21, column: 28, scope: !7)
+!14 = !DILocation(line: 21, column: 33, scope: !7)
+!15 = !DILocation(line: 22, column: 23, scope: !7)
+!16 = !DILocation(line: 26, column: 30, scope: !7)
+!17 = !DILocation(line: 26, column: 35, scope: !7)
+!18 = !DILocation(line: 27, column: 18, scope: !7)
+!19 = !DILocation(line: 35, column: 44, scope: !7)
+!20 = !DILocation(line: 36, column: 22, scope: !7)
+!21 = !DILocation(line: 37, column: 22, scope: !7)
+!22 = !DILocation(line: 38, column: 36, scope: !7)
+!23 = !DILocation(line: 39, column: 40, scope: !7)
+!24 = !DILocation(line: 40, column: 44, scope: !7)
+!25 = !DILocation(line: 32, column: 27, scope: !7)
+!26 = !DILocation(line: 35, column: 40, scope: !7)
+!27 = !DILocation(line: 35, column: 34, scope: !7)
+!28 = !DILocation(line: 35, column: 50, scope: !7)
+!29 = !DILocation(line: 39, column: 55, scope: !7)
+!30 = !DILocation(line: 40, column: 40, scope: !7)
+!31 = !DILocation(line: 40, column: 34, scope: !7)
+!32 = !DILocation(line: 40, column: 52, scope: !7)
+!33 = !DILocation(line: 41, column: 22, scope: !7)
+!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
+!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
+!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!37 = !DILocation(line: 44, column: 38, scope: !35)
+!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
+!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
+!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
+!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
+!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
+!43 = !DILocation(line: 47, column: 48, scope: !7)
+!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
+!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
+!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
+!47 = !DILocation(line: 50, column: 41, scope: !45)
+!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
+!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
+!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
+!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
+!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
+!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
+!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
+!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
+!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
+!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
+!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
+!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
+!60 = !DILocation(line: 50, column: 41, scope: !35)
+!61 = !DILocation(line: 69, column: 23, scope: !7)
+!62 = !DILocation(line: 71, column: 24, scope: !7)
+!63 = !DILocation(line: 76, column: 39, scope: !7)
+!64 = !DILocation(line: 55, column: 36, scope: !7)
+!65 = !DILocation(line: 56, column: 27, scope: !7)
+!66 = !DILocation(line: 59, column: 41, scope: !7)
+!67 = !DILocation(line: 59, column: 35, scope: !7)
+!68 = !DILocation(line: 59, column: 51, scope: !7)
+!69 = !DILocation(line: 60, column: 35, scope: !7)
+!70 = !DILocation(line: 60, column: 40, scope: !7)
+!71 = !DILocation(line: 64, column: 57, scope: !7)
+!72 = !DILocation(line: 65, column: 35, scope: !7)
+!73 = !DILocation(line: 65, column: 54, scope: !7)
+!74 = !DILocation(line: 66, column: 24, scope: !7)
+!75 = !DILocation(line: 67, column: 24, scope: !7)
+!76 = !DILocation(line: 72, column: 30, scope: !7)
+!77 = !DILocation(line: 73, column: 24, scope: !7)
+!78 = !DILocation(line: 74, column: 24, scope: !7)
+!79 = !DILocation(line: 76, column: 35, scope: !7)
+!80 = !DILocation(line: 76, column: 29, scope: !7)
+!81 = !DILocation(line: 76, column: 52, scope: !7)
+!82 = !DILocation(line: 55, column: 4, scope: !7)
diff --git a/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..fade03c9b7b8213cece19d92fb4462dff4c94d95
--- /dev/null
+++ b/.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir
@@ -0,0 +1,137 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf32>
+    %c256_i32 = arith.constant 256 : i32
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_1 = arith.constant dense<256> : tensor<64x1xi64>
+    %cst_2 = arith.constant dense<0> : tensor<64x1xi64>
+    %cst_3 = arith.constant dense<50257> : tensor<64x1xi64>
+    %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
+    %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
+    %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x64xf32>
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
+    %cst_8 = arith.constant dense<256> : tensor<64x1xi32>
+    %cst_9 = arith.constant dense<256> : tensor<1x64xi32>
+    %cst_10 = arith.constant dense<512> : tensor<64x1xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
+    %5 = arith.addi %4, %3 : tensor<64x1xi32>
+    %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
+    %7 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr>
+    %8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32>
+    %9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
+    %10 = arith.remsi %5, %cst_10 : tensor<64x1xi32>
+    %11 = arith.muli %10, %cst_8 : tensor<64x1xi32>
+    %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32>
+    %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x64x!tt.ptr>
+    %14 = arith.addi %9, %cst_3 : tensor<64x1xi64>
+    %15 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64>
+    %16 = arith.select %15, %14, %9 : tensor<64x1xi1>, tensor<64x1xi64>
+    %17 = arith.cmpi sge, %16, %cst_2 : tensor<64x1xi64>
+    %18 = arith.cmpi slt, %16, %cst_3 : tensor<64x1xi64>
+    %19 = arith.andi %17, %18 : tensor<64x1xi1>
+    %20 = arith.muli %16, %cst_1 : tensor<64x1xi64>
+    %21 = tt.broadcast %20 : (tensor<64x1xi64>) -> tensor<64x64xi64>
+    %22 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr>
+    %23:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 {
+      %46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32>
+      %47 = arith.addi %46, %6 : tensor<1x64xi32>
+      %48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32>
+      %49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32>
+      %50 = arith.addi %49, %12 : tensor<64x64xi32>
+      %51 = tt.addptr %13, %50 : tensor<64x64x!tt.ptr>, tensor<64x64xi32>
+      %52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1>
+      %53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
+      tt.assert %19, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1>
+      %54 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64>
+      %55 = tt.broadcast %54 : (tensor<1x64xi64>) -> tensor<64x64xi64>
+      %56 = arith.addi %55, %21 : tensor<64x64xi64>
+      %57 = tt.addptr %22, %56 : tensor<64x64x!tt.ptr>, tensor<64x64xi64>
+      %58 = tt.load %57, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
+      %59 = arith.addf %58, %53 : tensor<64x64xf32>
+      %60 = arith.subf %59, %arg8 : tensor<64x64xf32>
+      %61 = arith.addf %arg10, %cst_0 : tensor<64x64xf32>
+      %62 = arith.divf %60, %61 : tensor<64x64xf32>
+      %63 = arith.addf %arg8, %62 : tensor<64x64xf32>
+      %64 = arith.subf %59, %63 : tensor<64x64xf32>
+      %65 = arith.mulf %60, %64 : tensor<64x64xf32>
+      %66 = arith.addf %arg9, %65 : tensor<64x64xf32>
+      %67 = arith.select %52, %63, %arg8 : tensor<64x64xi1>, tensor<64x64xf32>
+      %68 = arith.select %52, %66, %arg9 : tensor<64x64xi1>, tensor<64x64xf32>
+      %69 = arith.select %52, %61, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
+      scf.yield %67, %68, %69 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>
+    }
+    %24:3 = "tt.reduce"(%23#0, %23#1, %23#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
+      %46 = arith.subf %arg10, %arg7 : f32
+      %47 = arith.addf %arg9, %arg12 : f32
+      %48 = arith.cmpf oeq, %47, %cst : f32
+      %49 = arith.divf %arg12, %47 : f32
+      %50 = arith.select %48, %cst, %49 : f32
+      %51 = arith.mulf %46, %50 : f32
+      %52 = arith.addf %arg7, %51 : f32
+      %53 = arith.addf %arg8, %arg11 : f32
+      %54 = arith.mulf %46, %46 : f32
+      %55 = arith.mulf %54, %arg9 : f32
+      %56 = arith.mulf %55, %50 : f32
+      %57 = arith.addf %53, %56 : f32
+      tt.reduce.return %52, %57, %47 : f32, f32, f32
+    }) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
+    %25 = tt.expand_dims %24#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    %26 = tt.expand_dims %24#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    %27 = arith.muli %10, %cst_8 : tensor<64x1xi32>
+    %28 = tt.broadcast %27 : (tensor<64x1xi32>) -> tensor<64x64xi32>
+    %29 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x64x!tt.ptr>
+    %30 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x64x!tt.ptr>
+    %31 = arith.addi %9, %cst_3 : tensor<64x1xi64>
+    %32 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64>
+    %33 = arith.select %32, %31, %9 : tensor<64x1xi1>, tensor<64x1xi64>
+    %34 = arith.cmpi sge, %33, %cst_2 : tensor<64x1xi64>
+    %35 = arith.cmpi slt, %33, %cst_3 : tensor<64x1xi64>
+    %36 = arith.andi %34, %35 : tensor<64x1xi1>
+    %37 = arith.muli %33, %cst_1 : tensor<64x1xi64>
+    %38 = tt.broadcast %37 : (tensor<64x1xi64>) -> tensor<64x64xi64>
+    %39 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr>
+    %40 = tt.broadcast %25 : (tensor<64x1xf32>) -> tensor<64x64xf32>
+    %41 = arith.divf %26, %cst_5 : tensor<64x1xf32>
+    %42 = arith.addf %41, %cst_4 : tensor<64x1xf32>
+    %43 = arith.muli %5, %cst_8 : tensor<64x1xi32>
+    %44 = tt.broadcast %43 : (tensor<64x1xi32>) -> tensor<64x64xi32>
+    %45 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x64x!tt.ptr>
+    scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
+      %46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32>
+      %47 = arith.addi %46, %6 : tensor<1x64xi32>
+      %48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32>
+      %49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32>
+      %50 = arith.addi %49, %28 : tensor<64x64xi32>
+      %51 = tt.addptr %29, %50 : tensor<64x64x!tt.ptr>, tensor<64x64xi32>
+      %52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1>
+      %53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
+      %54 = tt.addptr %30, %47 : tensor<1x64x!tt.ptr>, tensor<1x64xi32>
+      %55 = tt.load %54, %48, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32>
+      tt.assert %36, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1>
+      %56 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64>
+      %57 = tt.broadcast %56 : (tensor<1x64xi64>) -> tensor<64x64xi64>
+      %58 = arith.addi %57, %38 : tensor<64x64xi64>
+      %59 = tt.addptr %39, %58 : tensor<64x64x!tt.ptr>, tensor<64x64xi64>
+      %60 = tt.load %59, %52, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
+      %61 = arith.addf %60, %53 : tensor<64x64xf32>
+      %62 = arith.subf %61, %40 : tensor<64x64xf32>
+      %63 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
+      %64 = tt.broadcast %63 : (tensor<64x1xf32>) -> tensor<64x64xf32>
+      %65 = arith.mulf %62, %64 : tensor<64x64xf32>
+      %66 = tt.broadcast %55 : (tensor<1x64xf32>) -> tensor<64x64xf32>
+      %67 = arith.mulf %65, %66 : tensor<64x64xf32>
+      %68 = arith.addi %49, %44 : tensor<64x64xi32>
+      %69 = tt.addptr %45, %68 : tensor<64x64x!tt.ptr>, tensor<64x64xi32>
+      %70 = arith.truncf %67 : tensor<64x64xf32> to tensor<64x64xbf16>
+      tt.store %69, %70, %52 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
+    }
+    tt.return
+  }
+}
diff --git a/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin b/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..91bba8de63eb5aee8f4c3dbcb6ba7dbd5bf32aea
Binary files /dev/null and b/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin differ
diff --git a/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx
b/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ed8d72577e3161f892d79f5c6dc31ff9e87623e0 --- /dev/null +++ b/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx @@ -0,0 +1,1154 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_1[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_0[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<65>; + .reg .b16 %rs<13>; + .reg .b32 %r<188>; + .reg .f32 %f<166>; + .reg .b64 %rd<99>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_3]; + ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_2]; + ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6de7de_param_1]; + bfe.u32 %r3, %r1, 6, 1; + and.b32 %r4, %r1, 1; + .loc 1 24 33 + shl.b32 %r23, %r1, 1; + and.b32 %r5, %r23, 126; + .loc 1 21 28 + mov.u32 %r14, %ctaid.x; + .loc 1 21 33 + shl.b32 %r24, %r14, 1; + .loc 1 22 23 + or.b32 %r25, %r24, %r3; + or.b32 %r26, %r24, %r4; + .loc 1 26 30 + mul.wide.s32 %rd26, %r25, 8; + add.s64 %rd17, %rd24, %rd26; + mul.wide.s32 %rd27, %r26, 8; + add.s64 %rd21, %rd24, %rd27; + mov.pred %p61, -1; + .loc 1 26 35 + mov.u64 %rd16, 0x0; + @%p61 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ]; + mov.u64 %rd18, 0x0; + @%p61 ld.global.L1::evict_last.b64 { %rd18 
}, [ %rd17 + 0 ]; + mov.u64 %rd20, 0x0; + @%p61 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ]; + .loc 1 27 18 + bfe.s32 %r27, %r14, 30, 1; + shr.u32 %r28, %r27, 23; + add.s32 %r29, %r25, %r28; + and.b32 %r30, %r29, 16776704; + sub.s32 %r31, %r25, %r30; + .loc 1 35 44 + shl.b32 %r6, %r31, 8; + .loc 1 36 44 + shl.b32 %r7, %r25, 8; + .loc 1 37 22 + add.s64 %rd28, %rd20, 50257; + .loc 1 38 22 + setp.lt.s64 %p9, %rd16, 0; + setp.lt.s64 %p10, %rd20, 0; + .loc 1 39 36 + selp.b64 %rd1, %rd28, %rd20, %p10; + .loc 1 40 40 + setp.lt.u64 %p11, %rd1, 50257; + .loc 1 41 44 + shl.b64 %rd29, %rd16, 8; + add.s64 %rd30, %rd29, 12865792; + selp.b64 %rd31, %rd30, %rd29, %p9; + shl.b64 %rd32, %rd31, 2; + add.s64 %rd2, %rd25, %rd32; + .loc 1 35 40 + or.b32 %r32, %r5, %r6; + .loc 1 35 34 + mul.wide.s32 %rd33, %r32, 4; + add.s64 %rd62, %rd12, %rd33; + mov.b32 %r179, 0; + .loc 1 35 50 + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r15, %r16 }, [ %rd62 + 0 ]; + @!%p61 mov.u32 %r15, %r179; + @!%p61 mov.u32 %r16, %r179; + mov.b32 %f2, %r16; + mov.b32 %f1, %r15; + .loc 1 36 40 + or.b32 %r33, %r5, %r7; + .loc 1 36 34 + mul.wide.s32 %rd34, %r33, 2; + add.s64 %rd63, %rd13, %rd34; + .loc 1 36 50 + mov.u32 %r19, 0x0; + @%p61 ld.global.L1::evict_last.b32 { %r19 }, [ %rd63 + 0 ]; + @!%p61 mov.u32 %r19, %r179; + cvt.u16.u32 %rs1, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r19; } + .loc 1 36 101 + cvt.f32.bf16 %r21, %rs1; + mov.b32 %f3, %r21; + cvt.f32.bf16 %r22, %rs2; + mov.b32 %f4, %r22; + mov.u64 %rd95, assertMessage_0; + mov.u64 %rd96, assertFile_0; + mov.u64 %rd97, assertFunc_0; + mov.b32 %r187, 1892; + mov.u64 %rd98, 1; + .loc 1 40 55 + @%p11 bra $L__BB0_2; + cvta.global.u64 %rd36, %rd95; + cvta.global.u64 %rd38, %rd96; + cvta.global.u64 %rd40, %rd97; + { // callseq 2, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd36; + .param .b64 param1; + st.param.b64 [param1+0], %rd38; + .param .b32 param2; + st.param.b32 [param2+0], %r187; + .param .b64 param3; + st.param.b64 [param3+0], %rd40; + .param .b64 param4; + st.param.b64 [param4+0], %rd98; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 2 +$L__BB0_2: + .loc 1 0 55 + ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_4]; + and.b32 %r2, %r1, 31; + .loc 1 41 40 + cvt.u64.u32 %rd45, %r5; + .loc 1 41 34 + mul.wide.u32 %rd46, %r5, 4; + add.s64 %rd73, %rd2, %rd46; + .loc 1 41 52 + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r35, %r36 }, [ %rd73 + 0 ]; + @!%p61 mov.u32 %r35, %r179; + @!%p61 mov.u32 %r36, %r179; + mov.b32 %f21, %r36; + mov.b32 %f22, %r35; + .loc 1 42 22 + add.f32 %f23, %f1, %f22; + add.f32 %f24, %f2, %f21; + .loc 1 44 22 + add.f32 %f25, %f4, %f24; + mov.b32 %r43, %f25; + add.f32 %f26, %f3, %f23; + mov.b32 %r40, %f26; + mov.b32 %r41, 1065353216; +$L__tmp1: + .loc 2 98 30 + div.full.f32 %r39, %r40, %r41; + mov.b32 %f27, %r39; + div.full.f32 %r42, %r43, %r41; + mov.b32 %f28, %r42; + .loc 2 98 22 + add.f32 %f6, %f28, 0f00000000; + add.f32 %f5, %f27, 0f00000000; + .loc 2 101 30 + sub.f32 %f29, %f26, %f5; + sub.f32 %f30, %f25, %f6; +$L__tmp2: + .loc 1 50 50 + fma.rn.f32 %f8, %f25, %f30, 0f00000000; + fma.rn.f32 %f7, %f26, %f29, 0f00000000; + .loc 1 35 34 + cvt.s64.s32 %rd47, %r6; + add.s64 %rd48, %rd45, %rd47; + shl.b64 %rd49, %rd48, 2; + add.s64 %rd50, %rd12, %rd49; + add.s64 %rd75, %rd50, 512; + .loc 1 35 50 + mov.u32 %r45, 0x0; + mov.u32 %r46, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { 
%r45, %r46 }, [ %rd75 + 0 ]; + @!%p61 mov.u32 %r45, %r179; + @!%p61 mov.u32 %r46, %r179; + mov.b32 %f10, %r46; + mov.b32 %f9, %r45; + .loc 1 36 34 + cvt.s64.s32 %rd51, %r7; + add.s64 %rd8, %rd45, %rd51; + shl.b64 %rd52, %rd8, 1; + add.s64 %rd53, %rd13, %rd52; + add.s64 %rd76, %rd53, 256; + .loc 1 36 50 + mov.u32 %r49, 0x0; + @%p61 ld.global.L1::evict_last.b32 { %r49 }, [ %rd76 + 0 ]; + @!%p61 mov.u32 %r49, %r179; + cvt.u16.u32 %rs3, %r49; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r49; } + .loc 1 36 101 + cvt.f32.bf16 %r51, %rs3; + mov.b32 %f11, %r51; + cvt.f32.bf16 %r52, %rs4; + mov.b32 %f12, %r52; + .loc 1 40 55 + @%p11 bra $L__BB0_4; + cvta.global.u64 %rd55, %rd95; + cvta.global.u64 %rd57, %rd96; + cvta.global.u64 %rd59, %rd97; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd55; + .param .b64 param1; + st.param.b64 [param1+0], %rd57; + .param .b32 param2; + st.param.b32 [param2+0], %r187; + .param .b64 param3; + st.param.b64 [param3+0], %rd59; + .param .b64 param4; + st.param.b64 [param4+0], %rd98; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 3 +$L__BB0_4: + .loc 1 0 55 + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_5]; + cvt.s64.s32 %rd4, %r33; + .loc 1 41 34 + add.s64 %rd86, %rd73, 512; + .loc 1 41 52 + mov.u32 %r54, 0x0; + mov.u32 %r55, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r54, %r55 }, [ %rd86 + 0 ]; + @!%p61 mov.u32 %r54, %r179; + @!%p61 mov.u32 %r55, %r179; + mov.b32 %f31, %r54; + mov.b32 %f32, %r55; + .loc 1 42 22 + add.f32 %f33, %f10, %f32; + add.f32 %f34, %f9, %f31; + .loc 1 44 22 + add.f32 %f35, %f11, %f34; + add.f32 %f36, %f12, %f33; +$L__tmp3: + .loc 2 96 20 + sub.f32 %f37, %f36, %f6; + mov.b32 %r62, %f37; + sub.f32 %f38, %f35, %f5; + mov.b32 %r59, %f38; + mov.b32 %r60, 1073741824; + .loc 2 98 30 + div.full.f32 %r58, %r59, %r60; + mov.b32 %f39, %r58; + div.full.f32 %r61, %r62, %r60; + mov.b32 %f40, %r61; + .loc 2 98 22 + add.f32 %f41, %f6, %f40; + add.f32 %f42, %f5, %f39; + .loc 2 101 30 + sub.f32 %f43, %f35, %f42; + sub.f32 %f44, %f36, %f41; +$L__tmp4: + .loc 1 50 50 + fma.rn.f32 %f45, %f37, %f44, %f8; + fma.rn.f32 %f46, %f38, %f43, %f7; + .loc 1 24 33 + and.b32 %r119, %r1, 127; + .loc 1 31 36 + shl.b32 %r120, %r119, 2; + mov.u32 %r121, global_smem; + add.s32 %r8, %r121, %r120; + st.shared.u32 [%r8], %r60; + st.shared.u32 [%r8+520], %r60; + bar.sync 0; + mad.lo.s32 %r122, %r3, 130, %r5; + shl.b32 %r123, %r122, 2; + add.s32 %r124, %r121, %r123; + ld.shared.v2.f32 {%f47, %f48}, [%r124]; +$L__tmp5: + .loc 2 120 46 + bar.sync 0; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f49, %f41, %f42; + .loc 2 109 28 + add.f32 %f50, %f47, %f48; + .loc 2 110 39 + setp.eq.f32 %p41, %f50, 0f00000000; + .loc 2 110 60 + mov.b32 %r65, %f48; + mov.b32 %r66, %f50; + div.full.f32 %r64, %r65, %r66; + mov.b32 %f51, %r64; + .loc 2 110 49 + selp.f32 %f52, 0f00000000, %f51, %p41; + .loc 2 112 17 + fma.rn.f32 %f53, %f49, %f52, %f42; + .loc 2 113 15 + add.f32 %f54, %f46, %f45; + .loc 2 113 30 + mul.f32 %f55, %f49, %f49; + .loc 2 113 38 + mul.f32 %f56, %f55, %f47; + .loc 2 113 22 + fma.rn.f32 %f57, %f56, %f52, %f54; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r125, %f53; + shfl.sync.bfly.b32 %r126, %r125, 16, 31, -1; + mov.b32 %f58, %r126; + mov.b32 %r127, %f57; + shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1; + mov.b32 %f59, %r128; + shfl.sync.bfly.b32 %r68, %r66, 16, 31, -1; + mov.b32 %f60, %r68; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f61, %f58, %f53; + .loc 2 109 28 + add.f32 %f62, %f50, %f60; + 
.loc 2 110 39 + setp.eq.f32 %p42, %f62, 0f00000000; + .loc 2 110 60 + mov.b32 %r69, %f62; + div.full.f32 %r67, %r68, %r69; + mov.b32 %f63, %r67; + .loc 2 110 49 + selp.f32 %f64, 0f00000000, %f63, %p42; + .loc 2 112 17 + fma.rn.f32 %f65, %f61, %f64, %f53; + .loc 2 113 15 + add.f32 %f66, %f57, %f59; + .loc 2 113 30 + mul.f32 %f67, %f61, %f61; + .loc 2 113 38 + mul.f32 %f68, %f50, %f67; + .loc 2 113 22 + fma.rn.f32 %f69, %f68, %f64, %f66; +$L__tmp9: + .loc 2 120 46 + mov.b32 %r129, %f65; + shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1; + mov.b32 %f70, %r130; + mov.b32 %r131, %f69; + shfl.sync.bfly.b32 %r132, %r131, 8, 31, -1; + mov.b32 %f71, %r132; + shfl.sync.bfly.b32 %r71, %r69, 8, 31, -1; + mov.b32 %f72, %r71; +$L__tmp10: + .loc 2 108 21 + sub.f32 %f73, %f70, %f65; + .loc 2 109 28 + add.f32 %f74, %f62, %f72; + .loc 2 110 39 + setp.eq.f32 %p43, %f74, 0f00000000; + .loc 2 110 60 + mov.b32 %r72, %f74; + div.full.f32 %r70, %r71, %r72; + mov.b32 %f75, %r70; + .loc 2 110 49 + selp.f32 %f76, 0f00000000, %f75, %p43; + .loc 2 112 17 + fma.rn.f32 %f77, %f73, %f76, %f65; + .loc 2 113 15 + add.f32 %f78, %f69, %f71; + .loc 2 113 30 + mul.f32 %f79, %f73, %f73; + .loc 2 113 38 + mul.f32 %f80, %f62, %f79; + .loc 2 113 22 + fma.rn.f32 %f81, %f76, %f80, %f78; +$L__tmp11: + .loc 2 120 46 + mov.b32 %r133, %f77; + shfl.sync.bfly.b32 %r134, %r133, 4, 31, -1; + mov.b32 %f82, %r134; + mov.b32 %r135, %f81; + shfl.sync.bfly.b32 %r136, %r135, 4, 31, -1; + mov.b32 %f83, %r136; + shfl.sync.bfly.b32 %r74, %r72, 4, 31, -1; + mov.b32 %f84, %r74; +$L__tmp12: + .loc 2 108 21 + sub.f32 %f85, %f82, %f77; + .loc 2 109 28 + add.f32 %f86, %f74, %f84; + .loc 2 110 39 + setp.eq.f32 %p44, %f86, 0f00000000; + .loc 2 110 60 + mov.b32 %r75, %f86; + div.full.f32 %r73, %r74, %r75; + mov.b32 %f87, %r73; + .loc 2 110 49 + selp.f32 %f88, 0f00000000, %f87, %p44; + .loc 2 112 17 + fma.rn.f32 %f89, %f85, %f88, %f77; + .loc 2 113 15 + add.f32 %f90, %f81, %f83; + .loc 2 113 30 + mul.f32 %f91, %f85, %f85; + .loc 2 113 38 + mul.f32 %f92, %f74, %f91; + .loc 2 113 22 + fma.rn.f32 %f93, %f88, %f92, %f90; +$L__tmp13: + .loc 2 120 46 + mov.b32 %r137, %f89; + shfl.sync.bfly.b32 %r138, %r137, 2, 31, -1; + mov.b32 %f94, %r138; + mov.b32 %r139, %f93; + shfl.sync.bfly.b32 %r140, %r139, 2, 31, -1; + mov.b32 %f95, %r140; + shfl.sync.bfly.b32 %r77, %r75, 2, 31, -1; + mov.b32 %f96, %r77; +$L__tmp14: + .loc 2 108 21 + sub.f32 %f97, %f94, %f89; + .loc 2 109 28 + add.f32 %f98, %f86, %f96; + .loc 2 110 39 + setp.eq.f32 %p45, %f98, 0f00000000; + .loc 2 110 60 + mov.b32 %r78, %f98; + div.full.f32 %r76, %r77, %r78; + mov.b32 %f99, %r76; + .loc 2 110 49 + selp.f32 %f100, 0f00000000, %f99, %p45; + .loc 2 112 17 + fma.rn.f32 %f101, %f97, %f100, %f89; + .loc 2 113 15 + add.f32 %f102, %f93, %f95; + .loc 2 113 30 + mul.f32 %f103, %f97, %f97; + .loc 2 113 38 + mul.f32 %f104, %f86, %f103; + .loc 2 113 22 + fma.rn.f32 %f105, %f100, %f104, %f102; +$L__tmp15: + .loc 2 120 46 + mov.b32 %r141, %f101; + shfl.sync.bfly.b32 %r142, %r141, 1, 31, -1; + mov.b32 %f106, %r142; + mov.b32 %r143, %f105; + shfl.sync.bfly.b32 %r144, %r143, 1, 31, -1; + mov.b32 %f107, %r144; + shfl.sync.bfly.b32 %r80, %r78, 1, 31, -1; + mov.b32 %f108, %r80; +$L__tmp16: + .loc 2 108 21 + sub.f32 %f109, %f106, %f101; + .loc 2 109 28 + add.f32 %f110, %f98, %f108; + .loc 2 110 39 + setp.eq.f32 %p46, %f110, 0f00000000; + .loc 2 110 60 + mov.b32 %r81, %f110; + div.full.f32 %r79, %r80, %r81; + mov.b32 %f111, %r79; + .loc 2 110 49 + selp.f32 %f112, 0f00000000, %f111, %p46; + .loc 2 112 17 + fma.rn.f32 %f113, %f109, 
%f112, %f101; + .loc 2 113 15 + add.f32 %f114, %f105, %f107; + .loc 2 113 30 + mul.f32 %f115, %f109, %f109; + .loc 2 113 38 + mul.f32 %f116, %f98, %f115; + .loc 2 113 22 + fma.rn.f32 %f117, %f112, %f116, %f114; +$L__tmp17: + .loc 2 120 46 + setp.eq.s32 %p24, %r2, 0; + shr.u32 %r145, %r1, 3; + and.b32 %r146, %r145, 4; + shl.b32 %r147, %r3, 3; + or.b32 %r148, %r147, %r146; + add.s32 %r82, %r121, %r148; + mov.b32 %r83, %f113; + @%p24 st.shared.b32 [ %r82 + 0 ], %r83; + add.s32 %r149, %r121, 16; + add.s32 %r84, %r149, %r148; + mov.b32 %r85, %f117; + @%p24 st.shared.b32 [ %r84 + 0 ], %r85; + add.s32 %r150, %r121, 32; + add.s32 %r86, %r150, %r148; + @%p24 st.shared.b32 [ %r86 + 0 ], %r81; + bar.sync 0; + setp.lt.s32 %p27, %r1, 4; + shl.b32 %r151, %r1, 2; + add.s32 %r89, %r121, %r151; + @%p27 ld.shared.b32 %r88, [ %r89 + 0 ]; + mov.b32 %f118, %r88; + add.s32 %r91, %r149, %r151; + @%p27 ld.shared.b32 %r90, [ %r91 + 0 ]; + mov.b32 %f119, %r90; + add.s32 %r93, %r150, %r151; + @%p27 ld.shared.b32 %r92, [ %r93 + 0 ]; + mov.b32 %f120, %r92; + shfl.sync.bfly.b32 %r152, %r88, 1, 31, -1; + mov.b32 %f121, %r152; + shfl.sync.bfly.b32 %r153, %r90, 1, 31, -1; + mov.b32 %f122, %r153; + shfl.sync.bfly.b32 %r95, %r92, 1, 31, -1; + mov.b32 %f123, %r95; +$L__tmp18: + .loc 2 108 21 + sub.f32 %f124, %f121, %f118; + .loc 2 109 28 + add.f32 %f125, %f120, %f123; + .loc 2 110 39 + setp.eq.f32 %p47, %f125, 0f00000000; + .loc 2 110 60 + mov.b32 %r96, %f125; + div.full.f32 %r94, %r95, %r96; + mov.b32 %f126, %r94; + .loc 2 110 49 + selp.f32 %f127, 0f00000000, %f126, %p47; + .loc 2 112 17 + fma.rn.f32 %f128, %f124, %f127, %f118; + .loc 2 113 15 + add.f32 %f129, %f119, %f122; + .loc 2 113 30 + mul.f32 %f130, %f124, %f124; + .loc 2 113 38 + mul.f32 %f131, %f120, %f130; + .loc 2 113 22 + fma.rn.f32 %f132, %f131, %f127, %f129; +$L__tmp19: + .loc 2 120 46 + setp.eq.s32 %p48, %r4, 0; + and.pred %p30, %p27, %p48; + mov.b32 %r98, %f128; + @%p30 st.shared.b32 [ %r89 + 0 ], %r98; + mov.b32 %r100, %f132; + @%p30 st.shared.b32 [ %r91 + 0 ], %r100; + @%p30 st.shared.b32 [ %r93 + 0 ], %r96; + bar.sync 0; + add.s32 %r154, %r121, %r147; + ld.shared.f32 %f13, [%r154]; + add.s32 %r155, %r149, %r147; +$L__tmp20: + .loc 1 75 24 + ld.shared.u32 %r104, [%r155]; + mov.b32 %r105, 1132462080; + div.full.f32 %r103, %r104, %r105; + mov.b32 %f133, %r103; + .loc 1 77 24 + add.f32 %f14, %f133, 0f3727C5AC; + shl.b32 %r156, %r5, 2; + add.s32 %r9, %r121, %r156; + .loc 1 62 51 + mov.u32 %r109, 0x0; + mov.u32 %r110, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r109, %r110 }, [ %rd62 + 0 ]; + @!%p61 mov.u32 %r109, %r179; + @!%p61 mov.u32 %r110, %r179; + mov.b32 %f15, %r109; + mov.b32 %f16, %r110; + .loc 1 63 51 + mov.u32 %r113, 0x0; + @%p61 ld.global.L1::evict_first.b32 { %r113 }, [ %rd63 + 0 ]; + @!%p61 mov.u32 %r113, %r179; + cvt.u16.u32 %rs5, %r113; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r113; } + .loc 1 63 103 + cvt.f32.bf16 %r115, %rs5; + mov.b32 %f17, %r115; + cvt.f32.bf16 %r116, %rs6; + mov.b32 %f18, %r116; + .loc 1 64 35 + mul.wide.u32 %rd65, %r119, 4; + add.s64 %rd64, %rd14, %rd65; + .loc 1 64 40 + mov.u32 %r117, 0x0; + @%p61 ld.global.L1::evict_last.b32 { %r117 }, [ %rd64 + 0 ]; + @!%p61 mov.u32 %r117, %r179; + mov.u64 %rd90, assertMessage_1; + mov.u64 %rd91, assertFile_1; + mov.u64 %rd92, assertFunc_1; + .loc 1 68 57 + @%p11 bra $L__BB0_6; + cvta.global.u64 %rd67, %rd90; + cvta.global.u64 %rd69, %rd91; + cvta.global.u64 %rd71, %rd92; + { // callseq 4, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd67; + .param 
.b64 param1; + st.param.b64 [param1+0], %rd69; + .param .b32 param2; + st.param.b32 [param2+0], %r187; + .param .b64 param3; + st.param.b64 [param3+0], %rd71; + .param .b64 param4; + st.param.b64 [param4+0], %rd98; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 4 +$L__BB0_6: + .loc 1 69 54 + mov.u32 %r158, 0x0; + mov.u32 %r159, 0x0; + @%p61 ld.global.L1::evict_first.v2.b32 { %r158, %r159 }, [ %rd73 + 0 ]; + @!%p61 mov.u32 %r158, %r179; + @!%p61 mov.u32 %r159, %r179; + mov.b32 %f134, %r158; + mov.b32 %f135, %r159; + .loc 1 70 24 + add.f32 %f136, %f15, %f134; + add.f32 %f137, %f16, %f135; + .loc 1 72 24 + add.f32 %f138, %f17, %f136; + add.f32 %f139, %f18, %f137; + .loc 1 73 24 + sub.f32 %f140, %f138, %f13; + sub.f32 %f141, %f139, %f13; + .loc 1 78 30 + rsqrt.approx.ftz.f32 %f142, %f14; + .loc 1 79 24 + mul.f32 %f143, %f140, %f142; + mul.f32 %f144, %f141, %f142; + .loc 1 80 24 + bar.sync 0; + st.shared.u32 [%r8], %r117; + bar.sync 0; + ld.shared.v2.f32 {%f145, %f146}, [%r9]; + mul.f32 %f147, %f143, %f145; + mul.f32 %f148, %f144, %f146; + .loc 1 82 29 + shl.b64 %rd78, %rd4, 1; + add.s64 %rd74, %rd15, %rd78; + .loc 1 82 52 + mov.b32 %r162, %f147; + cvt.rn.bf16.f32 %rs7, %r162; + mov.b32 %r163, %f148; + cvt.rn.bf16.f32 %rs8, %r163; + mov.b32 %r175, {%rs7, %rs8}; + @%p61 st.global.b32 [ %rd74 + 0 ], { %r175 }; + .loc 1 62 51 + mov.u32 %r165, 0x0; + mov.u32 %r166, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r165, %r166 }, [ %rd75 + 0 ]; + @!%p61 mov.u32 %r165, %r179; + @!%p61 mov.u32 %r166, %r179; + .loc 1 63 51 + mov.u32 %r169, 0x0; + @%p61 ld.global.L1::evict_first.b32 { %r169 }, [ %rd76 + 0 ]; + @!%p61 mov.u32 %r169, %r179; + cvt.u16.u32 %rs9, %r169; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r169; } + .loc 1 63 103 + cvt.f32.bf16 %r171, %rs9; + mov.b32 %f19, %r171; + cvt.f32.bf16 %r172, %rs10; + mov.b32 %f20, %r172; + .loc 1 64 35 + add.s64 %rd77, %rd64, 512; + .loc 1 64 40 + mov.u32 %r173, 0x0; + @%p61 ld.global.L1::evict_last.b32 { %r173 }, [ %rd77 + 0 ]; + @!%p61 mov.u32 %r173, %r179; + .loc 1 68 57 + @%p11 bra $L__BB0_8; + cvta.global.u64 %rd80, %rd90; + cvta.global.u64 %rd82, %rd91; + cvta.global.u64 %rd84, %rd92; + { // callseq 5, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd80; + .param .b64 param1; + st.param.b64 [param1+0], %rd82; + .param .b32 param2; + st.param.b32 [param2+0], %r187; + .param .b64 param3; + st.param.b64 [param3+0], %rd84; + .param .b64 param4; + st.param.b64 [param4+0], %rd98; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 5 +$L__BB0_8: + .loc 1 69 54 + mov.u32 %r177, 0x0; + mov.u32 %r178, 0x0; + @%p61 ld.global.L1::evict_first.v2.b32 { %r177, %r178 }, [ %rd86 + 0 ]; + @!%p61 mov.u32 %r177, %r179; + @!%p61 mov.u32 %r178, %r179; + .loc 1 62 51 + mov.b32 %f150, %r166; + .loc 1 69 54 + mov.b32 %f151, %r178; + .loc 1 70 24 + add.f32 %f152, %f150, %f151; + .loc 1 72 24 + add.f32 %f153, %f20, %f152; + .loc 1 73 24 + sub.f32 %f154, %f153, %f13; + .loc 1 62 51 + mov.b32 %f155, %r165; + .loc 1 69 54 + mov.b32 %f156, %r177; + .loc 1 70 24 + add.f32 %f157, %f155, %f156; + .loc 1 72 24 + add.f32 %f158, %f19, %f157; + .loc 1 73 24 + sub.f32 %f159, %f158, %f13; + .loc 1 79 24 + mul.f32 %f160, %f159, %f142; + mul.f32 %f161, %f154, %f142; + .loc 1 80 24 + bar.sync 0; + st.shared.u32 [%r8], %r173; + bar.sync 0; + ld.shared.v2.f32 {%f162, %f163}, [%r9]; + mul.f32 %f164, %f160, %f162; + mul.f32 %f165, %f161, %f163; + .loc 1 82 29 + add.s64 
%rd89, %rd15, %rd52; + add.s64 %rd87, %rd89, 256; + .loc 1 82 52 + mov.b32 %r181, %f164; + cvt.rn.bf16.f32 %rs11, %r181; + mov.b32 %r182, %f165; + cvt.rn.bf16.f32 %rs12, %r182; + mov.b32 %r184, {%rs11, %rs12}; + @%p61 st.global.b32 [ %rd87 + 0 ], { %r184 }; + .loc 1 58 4 + ret; +$L__tmp21: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/ci/ccig6fki6p4lxrdmgg6eudahiexcvueeol2p4qp532pvve2y463y.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 99 +.b8 105 +.b8 103 +.b8 54 +.b8 102 +.b8 107 +.b8 105 +.b8 54 +.b8 112 +.b8 52 +.b8 108 +.b8 120 +.b8 114 +.b8 100 +.b8 109 +.b8 103 +.b8 103 +.b8 54 +.b8 101 +.b8 117 +.b8 100 +.b8 97 +.b8 104 +.b8 105 +.b8 101 +.b8 120 +.b8 99 +.b8 118 +.b8 117 +.b8 101 +.b8 101 +.b8 111 +.b8 108 +.b8 50 +.b8 112 +.b8 52 +.b8 113 +.b8 112 +.b8 53 +.b8 51 +.b8 50 +.b8 112 +.b8 118 +.b8 118 +.b8 101 +.b8 50 +.b8 121 +.b8 52 +.b8 54 +.b8 51 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 99 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp4 +.b8 2 +.b8 47 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp5 +.b64 $L__tmp20 +.b8 2 +.b8 53 +.b8 44 +.b8 5 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp19 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp19 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 
100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7a0ebe0ccc02860fec85dba78a5fef1462dd3be6 Binary files /dev/null and b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.cubin differ diff --git a/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..2795f42ce778902bab8010435bbf590e9fadf8c4 --- /dev/null +++ b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ptx @@ -0,0 +1,758 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<29>; + .reg .b16 %rs<17>; + .reg .b32 %r<100>; + .reg .f32 %f<86>; + .reg .b64 %rd<16>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_0]; + ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r66, %tid.x; + and.b32 %r67, %r66, 31; + ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_2]; + ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6de7de_param_3]; + ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_4]; + shl.b32 %r68, %r66, 2; + ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_5]; + and.b32 %r69, %r68, 252; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r70, %r1, 8; + .loc 1 30 36 + or.b32 %r71, %r70, %r69; + .loc 1 30 30 + mul.wide.s32 %rd13, %r71, 4; + add.s64 %rd1, %rd7, %rd13; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r4; + mov.b32 %f2, %r5; + .loc 1 31 30 + mul.wide.s32 %rd14, %r71, 2; + add.s64 %rd2, %rd8, %rd14; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f3, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f4, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f5, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f6, %r17; + 
.loc 1 32 30 + add.s64 %rd3, %rd9, %rd14; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + .loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f7, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f8, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f9, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f10, %r25; + .loc 1 33 30 + add.s64 %rd4, %rd10, %rd14; + .loc 1 33 46 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + cvt.u16.u32 %rs9, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; } + cvt.u16.u32 %rs11, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; } + .loc 1 33 67 + cvt.f32.bf16 %r30, %rs9; + mov.b32 %f11, %r30; + cvt.f32.bf16 %r31, %rs10; + mov.b32 %f12, %r31; + cvt.f32.bf16 %r32, %rs11; + mov.b32 %f13, %r32; + cvt.f32.bf16 %r33, %rs12; + mov.b32 %f14, %r33; + .loc 1 34 31 + mul.wide.u32 %rd15, %r69, 4; + add.s64 %rd5, %rd11, %rd15; + .loc 1 34 36 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r34, %r6; + @!%p1 mov.u32 %r35, %r6; + @!%p1 mov.u32 %r36, %r6; + @!%p1 mov.u32 %r37, %r6; + .loc 1 36 18 + add.f32 %f15, %f5, %f1; + add.f32 %f16, %f6, %f2; + .loc 1 38 18 + add.f32 %f17, %f15, %f9; + add.f32 %f18, %f16, %f10; + .loc 1 30 46 + mov.b32 %f19, %r2; + mov.b32 %f20, %r3; + .loc 1 36 18 + add.f32 %f21, %f4, %f20; + add.f32 %f22, %f3, %f19; + .loc 1 38 18 + add.f32 %f23, %f22, %f7; + add.f32 %f24, %f21, %f8; + .loc 1 40 18 + add.f32 %f25, %f24, %f12; + add.f32 %f26, %f23, %f11; + add.f32 %f27, %f17, %f13; + add.f32 %f28, %f18, %f14; +$L__tmp1: + .loc 2 233 15 + add.f32 %f29, %f26, %f25; + add.f32 %f30, %f29, %f27; + add.f32 %f31, %f30, %f28; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r72, %f31; + shfl.sync.bfly.b32 %r73, %r72, 16, 31, -1; + mov.b32 %f32, %r73; +$L__tmp3: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r74, %f33; + shfl.sync.bfly.b32 %r75, %r74, 8, 31, -1; + mov.b32 %f34, %r75; +$L__tmp5: + .loc 2 233 15 + add.f32 %f35, %f33, %f34; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r76, %f35; + shfl.sync.bfly.b32 %r77, %r76, 4, 31, -1; + mov.b32 %f36, %r77; +$L__tmp7: + .loc 2 233 15 + add.f32 %f37, %f35, %f36; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r78, %f37; + shfl.sync.bfly.b32 %r79, %r78, 2, 31, -1; + mov.b32 %f38, %r79; +$L__tmp9: + .loc 2 233 15 + add.f32 %f39, %f37, %f38; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r80, %f39; + shfl.sync.bfly.b32 %r81, %r80, 1, 31, -1; + mov.b32 %f40, %r81; +$L__tmp11: + .loc 2 233 15 + add.f32 %f41, %f39, %f40; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p20, %r67, 0; + shr.u32 %r82, %r66, 3; + and.b32 %r83, %r82, 4; + mov.u32 %r84, global_smem; + add.s32 %r42, %r84, %r83; + mov.b32 %r43, %f41; + @%p20 st.shared.b32 [ %r42 + 0 ], %r43; + bar.sync 0; + setp.lt.s32 %p21, %r66, 2; + add.s32 %r45, %r84, %r68; + @%p21 ld.shared.b32 %r44, [ %r45 + 0 ]; + mov.b32 %f42, %r44; + shfl.sync.bfly.b32 %r85, %r44, 1, 31, -1; + mov.b32 %f43, %r85; +$L__tmp13: + .loc 2 233 15 + add.f32 %f44, %f42, %f43; +$L__tmp14: + .loc 2 243 36 + and.b32 %r86, %r66, 1; + setp.eq.b32 %p27, %r86, 1; + not.pred %p28, %p27; + and.pred %p22, %p21, %p28; + mov.b32 
%r47, %f44; + @%p22 st.shared.b32 [ %r45 + 0 ], %r47; + bar.sync 0; + ld.shared.f32 %f45, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f46, %f45, 0f00000000; +$L__tmp16: + .loc 1 48 20 + mov.b32 %r49, %f46; + mov.b32 %r50, 1132462080; + div.full.f32 %r48, %r49, %r50; + mov.b32 %f47, %r48; + .loc 1 49 20 + sub.f32 %f48, %f26, %f47; + sub.f32 %f49, %f25, %f47; + sub.f32 %f50, %f27, %f47; + sub.f32 %f51, %f28, %f47; + .loc 1 50 20 + mul.f32 %f52, %f49, %f49; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f53, %f48, %f48, %f52; + fma.rn.f32 %f54, %f50, %f50, %f53; + fma.rn.f32 %f55, %f51, %f51, %f54; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r87, %f55; + shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1; + mov.b32 %f56, %r88; +$L__tmp20: + .loc 2 233 15 + add.f32 %f57, %f55, %f56; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r89, %f57; + shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1; + mov.b32 %f58, %r90; +$L__tmp22: + .loc 2 233 15 + add.f32 %f59, %f57, %f58; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r91, %f59; + shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1; + mov.b32 %f60, %r92; +$L__tmp24: + .loc 2 233 15 + add.f32 %f61, %f59, %f60; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r93, %f61; + shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1; + mov.b32 %f62, %r94; +$L__tmp26: + .loc 2 233 15 + add.f32 %f63, %f61, %f62; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r95, %f63; + shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1; + mov.b32 %f64, %r96; +$L__tmp28: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r52, %f65; + @%p20 st.shared.b32 [ %r42 + 0 ], %r52; + bar.sync 0; + @%p21 ld.shared.b32 %r53, [ %r45 + 0 ]; + mov.b32 %f66, %r53; + shfl.sync.bfly.b32 %r97, %r53, 1, 31, -1; + mov.b32 %f67, %r97; +$L__tmp30: + .loc 2 233 15 + add.f32 %f68, %f66, %f67; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r56, %f68; + @%p22 st.shared.b32 [ %r45 + 0 ], %r56; + bar.sync 0; + ld.shared.f32 %f69, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f70, %f69, 0f00000000; +$L__tmp33: + .loc 1 56 20 + mov.b32 %r58, %f70; + div.full.f32 %r57, %r58, %r50; + mov.b32 %f71, %r57; + .loc 1 58 20 + add.f32 %f72, %f71, 0f3727C5AC; + .loc 1 59 26 + rsqrt.approx.ftz.f32 %f73, %f72; + .loc 1 34 36 + mov.b32 %f74, %r37; + mov.b32 %f75, %r36; + mov.b32 %f76, %r35; + mov.b32 %f77, %r34; + .loc 1 60 20 + mul.f32 %f78, %f48, %f73; + mul.f32 %f79, %f49, %f73; + mul.f32 %f80, %f50, %f73; + mul.f32 %f81, %f51, %f73; + .loc 1 61 20 + mul.f32 %f82, %f78, %f77; + mul.f32 %f83, %f79, %f76; + mul.f32 %f84, %f80, %f75; + mul.f32 %f85, %f81, %f74; + .loc 1 63 25 + add.s64 %rd6, %rd12, %rd14; + .loc 1 63 48 + mov.b32 %r60, %f82; + cvt.rn.bf16.f32 %rs13, %r60; + mov.b32 %r61, %f83; + cvt.rn.bf16.f32 %rs14, %r61; + mov.b32 %r62, %f84; + cvt.rn.bf16.f32 %rs15, %r62; + mov.b32 %r63, %f85; + cvt.rn.bf16.f32 %rs16, %r63; + mov.b32 %r98, {%rs13, %rs14}; + mov.b32 %r99, {%rs15, %rs16}; + @%p1 st.global.v2.b32 [ %rd6 + 0 ], { %r98, %r99 }; + .loc 1 63 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/4q/c4qmi2qsgi5mnuig7w3wx5jmjnmvktjlgcv4c6q7w2vaw3bk6qzb.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 
"/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 399 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 52 +.b8 113 +.b8 109 +.b8 105 +.b8 50 +.b8 113 +.b8 115 +.b8 103 +.b8 105 +.b8 53 +.b8 109 +.b8 110 +.b8 117 +.b8 105 +.b8 103 +.b8 55 +.b8 119 +.b8 51 +.b8 119 +.b8 120 +.b8 53 +.b8 106 +.b8 109 +.b8 106 +.b8 110 +.b8 109 +.b8 118 +.b8 107 +.b8 116 +.b8 106 +.b8 108 +.b8 103 +.b8 99 +.b8 118 +.b8 52 +.b8 99 +.b8 54 +.b8 113 +.b8 55 +.b8 119 +.b8 50 +.b8 118 +.b8 97 +.b8 119 +.b8 51 +.b8 98 +.b8 107 +.b8 54 +.b8 113 +.b8 122 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 52 +.b8 113 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 45 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 45 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 45 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 53 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 53 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 53 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 403 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 403 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git 
a/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..4e392d00deef46b61401f127dd84a42edd0cd2f1
--- /dev/null
+++ b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttgir
@@ -0,0 +1,68 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant 9.99999974E-6 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+    %6 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<256x!tt.ptr<f32>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16>) -> tensor<256x!tt.ptr<bf16>, #blocked>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16>, #blocked>, tensor<256xi32, #blocked>
+    %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %13 = tt.splat %arg2 : (!tt.ptr<bf16>) -> tensor<256x!tt.ptr<bf16>, #blocked>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %17 = tt.splat %arg3 : (!tt.ptr<bf16>) -> tensor<256x!tt.ptr<bf16>, #blocked>
+    %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16>, #blocked>, tensor<256xi32, #blocked>
+    %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %21 = tt.splat %arg4 : (!tt.ptr<f32>) -> tensor<256x!tt.ptr<f32>, #blocked>
+    %22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32>, #blocked>, tensor<256xi32, #blocked>
+    %23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %24 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+    %25 = arith.addf %24, %16 : tensor<256xf32, #blocked>
+    %26 = arith.addf %25, %20 : tensor<256xf32, #blocked>
+    %27 = arith.select %2, %26, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %46 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %46 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %29 = arith.addf %28, %cst_2 : f32
+    %30 = arith.divf %29, %cst_1 : f32
+    %31 = tt.splat %30 : (f32) -> tensor<256xf32, #blocked>
+    %32 = arith.subf %26, %31 : tensor<256xf32, #blocked>
+    %33 = arith.mulf %32, %32 : tensor<256xf32, #blocked>
+    %34 = arith.select %2, %33, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %46 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %46 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %36 = arith.addf %35, %cst_2 : f32
+    %37 = arith.divf %36, %cst_1 : f32
+    %38 = arith.addf %37, %cst_0 : f32
+    %39 = tt.extern_elementwise %38 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %40 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
+    %41 = arith.mulf %32, %40 : tensor<256xf32, #blocked>
+    %42 = arith.mulf %41, %23 : tensor<256xf32, #blocked>
+    %43 = tt.splat %arg5 : (!tt.ptr<bf16>) -> tensor<256x!tt.ptr<bf16>, #blocked>
+    %44 = tt.addptr %43, %5 : tensor<256x!tt.ptr<bf16>, #blocked>, tensor<256xi32, #blocked>
+    %45 = arith.truncf %42 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+    tt.store %44, %45, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttir b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..c855241091cfa3597e5fddbc65457d13ba78b724
--- /dev/null
+++ b/.triton/dump/99f0a4c15ca0aab38ccdae6c765f7333/triton_.ttir
@@ -0,0 +1,67 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c256_i32 = arith.constant 256 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 9.99999974E-6 : f32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+    %cst_4 = arith.constant dense<256> : tensor<256xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32>
+    %5 = arith.addi %1, %4 : tensor<256xi32>
+    %6 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<256x!tt.ptr<f32>>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32>>, tensor<256xi32>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16>) -> tensor<256x!tt.ptr<bf16>>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16>>, tensor<256xi32>
+    %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
+    %13 = tt.splat %arg2 : (!tt.ptr<bf16>) -> tensor<256x!tt.ptr<bf16>>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16>>, tensor<256xi32>
+    %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
+    %17 = tt.splat %arg3 : (!tt.ptr<bf16>) -> tensor<256x!tt.ptr<bf16>>
+    %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16>>, tensor<256xi32>
+    %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
+    %21 = tt.splat %arg4 : (!tt.ptr<f32>) -> tensor<256x!tt.ptr<f32>>
+    %22 = tt.addptr %21, %1 : tensor<256x!tt.ptr<f32>>, tensor<256xi32>
+    %23 = tt.load %22, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %24 = arith.addf %8, %12 : tensor<256xf32>
+    %25 = arith.addf %24, %16 : tensor<256xf32>
+    %26 = arith.addf %25, %20 : tensor<256xf32>
+    %27 = arith.select %2, %26, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %46 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %46 : f32
+    }) : (tensor<256xf32>) -> f32
+    %29 = arith.addf %28, %cst_0 : f32
+    %30 = arith.divf %29, %cst_1 : f32
+    %31 = tt.splat %30 : (f32) -> tensor<256xf32>
+    %32 = arith.subf %26, %31 : tensor<256xf32>
+    %33 = arith.mulf %32, %32 : tensor<256xf32>
+    %34 = arith.select %2, %33, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %46 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %46 : f32
+    }) : (tensor<256xf32>) -> f32
+    %36 = arith.addf %35, %cst_0 : f32
+    %37 = arith.divf %36, %cst_1 : f32
+    %38 = arith.addf %37, %cst_2 : f32
+    %39 = tt.extern_elementwise %38 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %40 = tt.splat %39 : (f32) -> tensor<256xf32>
+    %41 = arith.mulf %32, %40 : tensor<256xf32>
+    %42 = arith.mulf %41, %23 : tensor<256xf32>
+    %43 = tt.splat %arg5 : (!tt.ptr<bf16>) -> tensor<256x!tt.ptr<bf16>>
+    %44 = tt.addptr %43, %5 : tensor<256x!tt.ptr<bf16>>, tensor<256xi32>
+    %45 = arith.truncf %42 : tensor<256xf32> to tensor<256xbf16>
+    tt.store %44, %45, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
+    tt.return
+  }
+}
diff --git a/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..c928b2a14f12b9f81c0a54a80f86598819a61434
Binary files /dev/null and b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin differ
diff --git a/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..bb3d87de0360b1df77539524aba63cc9907bda17
--- /dev/null
+++ b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir
@@ -0,0 +1,476 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+
+define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %9 = and i32 %8, 31, !dbg !10
+  %10 = lshr i32 %8, 5, !dbg !10
+  %11 = and i32 %10, 3, !dbg !10
+  %12 = lshr i32 %9, 1, !dbg !10
+  %13 = shl nuw nsw i32 %11, 4, !dbg !10
+  %14 = or i32 %13, %12, !dbg !10
+  %15 = and i32 %8, 63, !dbg !10
+  %16 = shl i32 %8, 2, !dbg !11
+  %17 = and i32 %16, 4, !dbg !11
+  %18 = and i32 %8, 7, !dbg !11
+  %19 = shl nuw nsw i32 %11, 2, !dbg !12
+  %20 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
+  %21 = shl i32 %20, 6, !dbg !14
+  %22 = or i32 %21, %14, !dbg !15
+  %23 = or i32 %21, %15, !dbg !15
+  %24 = sext i32 %22 to i64, !dbg !16
+  %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !16
+  %26 = sext i32 %23 to i64, !dbg !16
+  %27 = getelementptr i64, ptr addrspace(1) %0, i64 %26, !dbg !16
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+  %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+  %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #6, !dbg !17
+  %33 = srem i32 %22, 512, !dbg !18
+  %34 = shl nsw i32 %33, 8, !dbg !19
+  %35 = add i64 %32, 50257, !dbg !20
+  %36 = icmp slt i64 %28, 0, !dbg !21
+  %37 = icmp slt i64 %32, 0, !dbg !21
+  %38 = select i1 %37, i64 %35, i64 %32, !dbg !22
+  %39 = icmp ugt i64 %38, 50256, !dbg !23
+  %40 = shl i64 %28, 8, !dbg !24
+  %41 = add i64 %40, 12865792, !dbg !24
+  %42 = select i1 %36, i64 %41, i64 %40, !dbg !24
+  %43 = getelementptr float, ptr addrspace(1) %1, i64 %42
+  br label %44, !dbg !12
+
+44:                                               ; preds = %7, %76
+  %45 = phi float [ 0.000000e+00, %7 ], [ %96, %76 ]
+  %46 = phi float [ 0.000000e+00, %7 ], [ %97, %76 ]
+  %47 = phi float [ 0.000000e+00, %7 ], [ %98, %76 ]
+  %48 = phi float [ 0.000000e+00, %7 ], [ %99, %76 ]
+  %49 = phi float [ 0.000000e+00, %7 ], [ %100, %76 ]
+  %50 = phi float [ 0.000000e+00, %7 ], [ %101, %76 ]
+  %51 = phi float [ 0.000000e+00, %7 ], [ %102, %76 ]
+  %52 = phi float [ 0.000000e+00, %7 ], [ %103, %76 ]
+  %53 = phi float [ 0.000000e+00, %7 ], [ %120, %76 ]
+  %54 = phi float [ 0.000000e+00, %7 ], [ %121, %76 ]
+  %55 = phi float [ 0.000000e+00, %7 ], [ %122, %76 ]
+  %56 = phi float [ 0.000000e+00, %7 ], [ %123, %76 ]
+  %57 = phi float [ 0.000000e+00, %7 ], [ %108, %76 ]
+  %58 = phi float [ 0.000000e+00, %7 ], [ %109, %76 ]
+  %59 = phi float [
0.000000e+00, %7 ], [ %110, %76 ] + %60 = phi float [ 0.000000e+00, %7 ], [ %111, %76 ] + %61 = phi i32 [ 0, %7 ], [ %124, %76 ] + %62 = or i32 %61, %17, !dbg !25 + %63 = add i32 %62, %34, !dbg !26 + %64 = sext i32 %63 to i64, !dbg !27 + %65 = getelementptr float, ptr addrspace(1) %2, i64 %64, !dbg !27 + %66 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %67 = extractvalue { i32, i32, i32, i32 } %66, 0, !dbg !28 + %68 = extractvalue { i32, i32, i32, i32 } %66, 1, !dbg !28 + %69 = extractvalue { i32, i32, i32, i32 } %66, 2, !dbg !28 + %70 = extractvalue { i32, i32, i32, i32 } %66, 3, !dbg !28 + %71 = bitcast i32 %67 to float, !dbg !28 + %72 = bitcast i32 %68 to float, !dbg !28 + %73 = bitcast i32 %69 to float, !dbg !28 + %74 = bitcast i32 %70 to float, !dbg !28 + br i1 %39, label %75, label %76, !dbg !29 + +75: ; preds = %44 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29 + br label %76, !dbg !29 + +76: ; preds = %75, %44 + %77 = zext nneg i32 %62 to i64, !dbg !30 + %78 = getelementptr float, ptr addrspace(1) %43, i64 %77, !dbg !31 + %79 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %78, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %80 = extractvalue { i32, i32, i32, i32 } %79, 0, !dbg !32 + %81 = extractvalue { i32, i32, i32, i32 } %79, 1, !dbg !32 + %82 = extractvalue { i32, i32, i32, i32 } %79, 2, !dbg !32 + %83 = extractvalue { i32, i32, i32, i32 } %79, 3, !dbg !32 + %84 = bitcast i32 %80 to float, !dbg !32 + %85 = bitcast i32 %81 to float, !dbg !32 + %86 = bitcast i32 %82 to float, !dbg !32 + %87 = bitcast i32 %83 to float, !dbg !32 + %88 = fadd float %71, %84, !dbg !33 + %89 = fadd float %72, %85, !dbg !33 + %90 = fadd float %73, %86, !dbg !33 + %91 = fadd float %74, %87, !dbg !33 + %92 = fsub float %88, %57, !dbg !34 + %93 = fsub float %89, %58, !dbg !34 + %94 = fsub float %90, %59, !dbg !34 + %95 = fsub float %91, %60, !dbg !34 + %96 = fadd float %45, 1.000000e+00, !dbg !38 + %97 = fadd float %46, 1.000000e+00, !dbg !38 + %98 = fadd float %47, 1.000000e+00, !dbg !38 + %99 = fadd float %48, 1.000000e+00, !dbg !38 + %100 = fadd float %49, 1.000000e+00, !dbg !38 + %101 = fadd float %50, 1.000000e+00, !dbg !38 + %102 = fadd float %51, 1.000000e+00, !dbg !38 + %103 = fadd float %52, 1.000000e+00, !dbg !38 + %104 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %92, float %96) #6, !dbg !39 + %105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %93, float %97) #6, !dbg !39 + %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %94, float %98) #6, !dbg !39 + %107 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %95, float %99) #6, !dbg !39 + %108 = fadd float %57, %104, 
!dbg !40 + %109 = fadd float %58, %105, !dbg !40 + %110 = fadd float %59, %106, !dbg !40 + %111 = fadd float %60, %107, !dbg !40 + %112 = fsub float %88, %108, !dbg !41 + %113 = fsub float %89, %109, !dbg !41 + %114 = fsub float %90, %110, !dbg !41 + %115 = fsub float %91, %111, !dbg !41 + %116 = fmul float %92, %112, !dbg !42 + %117 = fmul float %93, %113, !dbg !42 + %118 = fmul float %94, %114, !dbg !42 + %119 = fmul float %95, %115, !dbg !42 + %120 = fadd float %53, %116, !dbg !43 + %121 = fadd float %54, %117, !dbg !43 + %122 = fadd float %55, %118, !dbg !43 + %123 = fadd float %56, %119, !dbg !43 + %124 = add nuw nsw i32 %61, 8, !dbg !12 + %125 = icmp ult i32 %61, 248, !dbg !12 + br i1 %125, label %44, label %126, !dbg !12 + +126: ; preds = %76 + %127 = lshr i32 %9, 3, !dbg !12 + %128 = or i32 %19, %127, !dbg !12 + %129 = mul nuw nsw i32 %128, 12, !dbg !12 + %130 = add nuw nsw i32 %129, %18, !dbg !12 + %131 = zext nneg i32 %130 to i64, !dbg !12 + %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !12 + %133 = insertelement <1 x float> undef, float %100, i64 0, !dbg !12 + store <1 x float> %133, ptr addrspace(3) %132, align 4, !dbg !12 + %134 = or i32 %18, 192, !dbg !12 + %135 = add nuw nsw i32 %134, %129, !dbg !12 + %136 = zext nneg i32 %135 to i64, !dbg !12 + %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !12 + %138 = insertelement <1 x float> undef, float %101, i64 0, !dbg !12 + store <1 x float> %138, ptr addrspace(3) %137, align 4, !dbg !12 + %139 = or i32 %18, 384, !dbg !12 + %140 = add nuw nsw i32 %139, %129, !dbg !12 + %141 = zext nneg i32 %140 to i64, !dbg !12 + %142 = getelementptr float, ptr addrspace(3) @global_smem, i64 %141, !dbg !12 + %143 = insertelement <1 x float> undef, float %102, i64 0, !dbg !12 + store <1 x float> %143, ptr addrspace(3) %142, align 4, !dbg !12 + %144 = or i32 %18, 576, !dbg !12 + %145 = add nuw nsw i32 %144, %129, !dbg !12 + %146 = zext nneg i32 %145 to i64, !dbg !12 + %147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !12 + %148 = insertelement <1 x float> undef, float %103, i64 0, !dbg !12 + store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !12 + tail call void @llvm.nvvm.barrier0(), !dbg !12 + %149 = mul nuw nsw i32 %14, 12, !dbg !12 + %150 = add nuw nsw i32 %149, %17, !dbg !12 + %151 = zext nneg i32 %150 to i64, !dbg !12 + %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12 + %153 = load float, ptr addrspace(3) %152, align 16, !dbg !12 + %154 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 1, !dbg !12 + %155 = load float, ptr addrspace(3) %154, align 4, !dbg !12 + %156 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 2, !dbg !12 + %157 = load float, ptr addrspace(3) %156, align 8, !dbg !12 + %158 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 3, !dbg !12 + %159 = load float, ptr addrspace(3) %158, align 4, !dbg !12 + %160 = fsub float %109, %108, !dbg !44 + %161 = fadd float %153, %155, !dbg !48 + %162 = fcmp oeq float %161, 0.000000e+00, !dbg !49 + %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %155, float %161) #6, !dbg !50 + %164 = select i1 %162, float 0.000000e+00, float %163, !dbg !51 + %165 = fmul float %160, %164, !dbg !52 + %166 = fadd float %108, %165, !dbg !53 + %167 = fadd float %120, %121, !dbg !54 + %168 = fmul float %160, %160, !dbg !55 + %169 = fmul float %168, %153, !dbg !56 + %170 = fmul float %169, %164, !dbg !57 + %171 = 
fadd float %167, %170, !dbg !58 + %172 = fsub float %110, %166, !dbg !44 + %173 = fadd float %157, %161, !dbg !48 + %174 = fcmp oeq float %173, 0.000000e+00, !dbg !49 + %175 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %157, float %173) #6, !dbg !50 + %176 = select i1 %174, float 0.000000e+00, float %175, !dbg !51 + %177 = fmul float %176, %172, !dbg !52 + %178 = fadd float %166, %177, !dbg !53 + %179 = fadd float %122, %171, !dbg !54 + %180 = fmul float %172, %172, !dbg !55 + %181 = fmul float %161, %180, !dbg !56 + %182 = fmul float %176, %181, !dbg !57 + %183 = fadd float %179, %182, !dbg !58 + %184 = fsub float %111, %178, !dbg !44 + %185 = fadd float %159, %173, !dbg !48 + %186 = fcmp oeq float %185, 0.000000e+00, !dbg !49 + %187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %159, float %185) #6, !dbg !50 + %188 = select i1 %186, float 0.000000e+00, float %187, !dbg !51 + %189 = fmul float %188, %184, !dbg !52 + %190 = fadd float %178, %189, !dbg !53 + %191 = fadd float %123, %183, !dbg !54 + %192 = fmul float %184, %184, !dbg !55 + %193 = fmul float %173, %192, !dbg !56 + %194 = fmul float %188, %193, !dbg !57 + %195 = fadd float %191, %194, !dbg !58 + %196 = bitcast float %190 to i32, !dbg !59 + %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !59 + %198 = bitcast i32 %197 to float, !dbg !59 + %199 = bitcast float %195 to i32, !dbg !59 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !59 + %201 = bitcast i32 %200 to float, !dbg !59 + %202 = bitcast float %185 to i32, !dbg !59 + %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !59 + %204 = bitcast i32 %203 to float, !dbg !59 + %205 = fsub float %198, %190, !dbg !44 + %206 = fadd float %185, %204, !dbg !48 + %207 = fcmp oeq float %206, 0.000000e+00, !dbg !49 + %208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !50 + %209 = select i1 %207, float 0.000000e+00, float %208, !dbg !51 + %210 = fmul float %209, %205, !dbg !52 + %211 = fadd float %190, %210, !dbg !53 + %212 = fadd float %195, %201, !dbg !54 + %213 = fmul float %205, %205, !dbg !55 + %214 = fmul float %185, %213, !dbg !56 + %215 = fmul float %209, %214, !dbg !57 + %216 = fadd float %212, %215, !dbg !58 + %217 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61 + %218 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61 + %219 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61 + %220 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61 + %221 = fadd float %217, 0x3EE4F8B580000000, !dbg !62 + %222 = shl i32 %22, 8, !dbg !63 + br label %223, !dbg !64 + +223: ; preds = %126, %__nv_rsqrtf.exit + %224 = phi i32 [ 0, %126 ], [ %298, %__nv_rsqrtf.exit ] + %225 = or i32 %224, %17, !dbg !65 + %226 = add i32 %225, %34, !dbg !66 + %227 = sext i32 %226 to i64, !dbg !67 + %228 = getelementptr float, ptr addrspace(1) %2, i64 %227, !dbg !67 + %229 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", 
"=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %228, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %230 = extractvalue { i32, i32, i32, i32 } %229, 0, !dbg !68 + %231 = extractvalue { i32, i32, i32, i32 } %229, 1, !dbg !68 + %232 = extractvalue { i32, i32, i32, i32 } %229, 2, !dbg !68 + %233 = extractvalue { i32, i32, i32, i32 } %229, 3, !dbg !68 + %234 = bitcast i32 %230 to float, !dbg !68 + %235 = bitcast i32 %231 to float, !dbg !68 + %236 = bitcast i32 %232 to float, !dbg !68 + %237 = bitcast i32 %233 to float, !dbg !68 + %238 = zext nneg i32 %225 to i64, !dbg !69 + %239 = getelementptr float, ptr addrspace(1) %3, i64 %238, !dbg !69 + %240 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %239, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70 + %241 = extractvalue { i32, i32, i32, i32 } %240, 0, !dbg !70 + %242 = extractvalue { i32, i32, i32, i32 } %240, 1, !dbg !70 + %243 = extractvalue { i32, i32, i32, i32 } %240, 2, !dbg !70 + %244 = extractvalue { i32, i32, i32, i32 } %240, 3, !dbg !70 + %245 = bitcast i32 %241 to float, !dbg !70 + %246 = bitcast i32 %242 to float, !dbg !70 + %247 = bitcast i32 %243 to float, !dbg !70 + %248 = bitcast i32 %244 to float, !dbg !70 + br i1 %39, label %249, label %250, !dbg !71 + +249: ; preds = %223 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71 + br label %250, !dbg !71 + +250: ; preds = %249, %223 + %251 = getelementptr float, ptr addrspace(1) %43, i64 %238, !dbg !72 + %252 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %251, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %253 = extractvalue { i32, i32, i32, i32 } %252, 0, !dbg !73 + %254 = extractvalue { i32, i32, i32, i32 } %252, 1, !dbg !73 + %255 = extractvalue { i32, i32, i32, i32 } %252, 2, !dbg !73 + %256 = extractvalue { i32, i32, i32, i32 } %252, 3, !dbg !73 + %257 = bitcast i32 %253 to float, !dbg !73 + %258 = bitcast i32 %254 to float, !dbg !73 + %259 = bitcast i32 %255 to float, !dbg !73 + %260 = bitcast i32 %256 to float, !dbg !73 + %261 = fadd float %234, %257, !dbg !74 + %262 = fadd float %235, %258, !dbg !74 + %263 = fadd float %236, %259, !dbg !74 + %264 = fadd float %237, %260, !dbg !74 + %265 = fsub float %261, %211, !dbg !75 + %266 = fsub float %262, %211, !dbg !75 + %267 = fsub float %263, %211, !dbg !75 + %268 = fsub float %264, %211, !dbg !75 + %269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %.not.i = icmp eq i32 %269, 0, !dbg !76 + br i1 %.not.i, label %272, label %270, !dbg !76 + +270: ; preds = %250 + %271 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %221), !dbg !76 + br label %__nv_rsqrtf.exit, !dbg !76 + +272: ; preds = %250 + %273 = tail call float @llvm.nvvm.rsqrt.approx.f(float %221), !dbg !76 + 
br label %__nv_rsqrtf.exit, !dbg !76 + +__nv_rsqrtf.exit: ; preds = %270, %272 + %.0.i = phi float [ %271, %270 ], [ %273, %272 ], !dbg !76 + %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %276 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %277 = fmul float %265, %.0.i, !dbg !77 + %278 = fmul float %266, %.0.i, !dbg !77 + %279 = fmul float %267, %.0.i, !dbg !77 + %280 = fmul float %268, %.0.i, !dbg !77 + %281 = fmul float %277, %245, !dbg !78 + %282 = fmul float %278, %246, !dbg !78 + %283 = fmul float %279, %247, !dbg !78 + %284 = fmul float %280, %248, !dbg !78 + %285 = add i32 %225, %222, !dbg !79 + %286 = sext i32 %285 to i64, !dbg !80 + %287 = getelementptr i16, ptr addrspace(1) %4, i64 %286, !dbg !80 + %288 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %281) #6, !dbg !81 + %289 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %282) #6, !dbg !81 + %290 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %283) #6, !dbg !81 + %291 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %284) #6, !dbg !81 + %292 = insertelement <2 x i16> undef, i16 %288, i64 0, !dbg !81 + %293 = insertelement <2 x i16> %292, i16 %289, i64 1, !dbg !81 + %294 = bitcast <2 x i16> %293 to i32, !dbg !81 + %295 = insertelement <2 x i16> undef, i16 %290, i64 0, !dbg !81 + %296 = insertelement <2 x i16> %295, i16 %291, i64 1, !dbg !81 + %297 = bitcast <2 x i16> %296 to i32, !dbg !81 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %294, i32 %297, ptr addrspace(1) %287, i1 true) #6, !dbg !81 + %298 = add nuw nsw i32 %224, 8, !dbg !64 + %299 = icmp ult i32 %224, 248, !dbg !64 + br i1 %299, label %223, label %300, !dbg !64 + +300: ; preds = %__nv_rsqrtf.exit + ret void, !dbg !82 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" 
"stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh") +!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 31, column: 36, scope: !7) +!13 = !DILocation(line: 21, column: 28, scope: !7) +!14 = !DILocation(line: 21, column: 33, scope: !7) +!15 = !DILocation(line: 22, column: 23, scope: !7) +!16 = !DILocation(line: 26, column: 30, scope: !7) +!17 = !DILocation(line: 26, column: 35, scope: !7) +!18 = !DILocation(line: 27, column: 18, scope: !7) +!19 = !DILocation(line: 35, column: 44, scope: !7) +!20 = !DILocation(line: 36, column: 22, scope: !7) +!21 = !DILocation(line: 37, column: 22, scope: !7) +!22 = !DILocation(line: 38, column: 36, scope: !7) +!23 = !DILocation(line: 39, column: 40, scope: !7) +!24 = !DILocation(line: 40, column: 44, scope: !7) +!25 = !DILocation(line: 32, column: 27, scope: !7) +!26 = !DILocation(line: 35, column: 40, scope: !7) +!27 = !DILocation(line: 35, column: 34, scope: !7) +!28 = !DILocation(line: 35, column: 50, scope: !7) +!29 = !DILocation(line: 39, column: 55, scope: !7) +!30 = !DILocation(line: 40, column: 40, scope: !7) +!31 = !DILocation(line: 40, column: 34, scope: !7) +!32 = !DILocation(line: 40, column: 52, scope: !7) +!33 = !DILocation(line: 41, column: 22, scope: !7) +!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37) +!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) +!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!37 = !DILocation(line: 44, column: 38, scope: !35) +!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37) +!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37) +!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37) +!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37) +!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37) +!43 = !DILocation(line: 47, column: 48, scope: !7) +!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46) +!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0) +!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47) 
+!47 = !DILocation(line: 50, column: 41, scope: !45) +!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46) +!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46) +!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46) +!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46) +!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46) +!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46) +!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46) +!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46) +!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46) +!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46) +!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46) +!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60) +!60 = !DILocation(line: 50, column: 41, scope: !35) +!61 = !DILocation(line: 69, column: 23, scope: !7) +!62 = !DILocation(line: 71, column: 24, scope: !7) +!63 = !DILocation(line: 76, column: 39, scope: !7) +!64 = !DILocation(line: 55, column: 36, scope: !7) +!65 = !DILocation(line: 56, column: 27, scope: !7) +!66 = !DILocation(line: 59, column: 41, scope: !7) +!67 = !DILocation(line: 59, column: 35, scope: !7) +!68 = !DILocation(line: 59, column: 51, scope: !7) +!69 = !DILocation(line: 60, column: 35, scope: !7) +!70 = !DILocation(line: 60, column: 40, scope: !7) +!71 = !DILocation(line: 64, column: 57, scope: !7) +!72 = !DILocation(line: 65, column: 35, scope: !7) +!73 = !DILocation(line: 65, column: 54, scope: !7) +!74 = !DILocation(line: 66, column: 24, scope: !7) +!75 = !DILocation(line: 67, column: 24, scope: !7) +!76 = !DILocation(line: 72, column: 30, scope: !7) +!77 = !DILocation(line: 73, column: 24, scope: !7) +!78 = !DILocation(line: 74, column: 24, scope: !7) +!79 = !DILocation(line: 76, column: 35, scope: !7) +!80 = !DILocation(line: 76, column: 29, scope: !7) +!81 = !DILocation(line: 76, column: 52, scope: !7) +!82 = !DILocation(line: 55, column: 4, scope: !7) diff --git a/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.cubin b/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2f8691fcf9c32c95cf4441b9ab8fa1028659024e Binary files /dev/null and b/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.cubin differ diff --git a/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ptx b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..4f8148244d44f2bba386eff895d199dc3ef80b89 --- /dev/null +++ b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ptx @@ -0,0 +1,871 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8de9de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = 
{105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6d7d8de9de( + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<40>; + .reg .b16 %rs<13>; + .reg .b32 %r<121>; + .reg .f32 %f<86>; + .reg .b64 %rd<49>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2d3d4d5d6d7d8de9de_param_7]; + ld.param.u64 %rd5, [triton__0d1d2d3d4d5d6d7d8de9de_param_6]; + ld.param.u64 %rd4, [triton__0d1d2d3d4d5d6d7d8de9de_param_1]; + ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_0]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r1, %tid.x; + shl.b32 %r43, %r1, 2; + ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_2]; + and.b32 %r44, %r43, 252; + ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_3]; + ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7d8de9de_param_4]; + .loc 1 23 28 + mov.u32 %r10, %ctaid.x; + .loc 1 30 18 + shr.s32 %r45, %r10, 31; + shr.u32 %r46, %r45, 23; + add.s32 %r47, %r10, %r46; + and.b32 %r48, %r47, 16776704; + sub.s32 %r49, %r10, %r48; + ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7d8de9de_param_5]; + .loc 1 31 30 + mul.wide.s32 %rd26, %r10, 8; + add.s64 %rd8, %rd21, %rd26; + mov.pred %p24, -1; + .loc 1 31 35 + mov.u64 %rd7, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd7 }, [ %rd8 + 0 ]; + mov.u64 %rd9, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd9 }, [ %rd8 + 0 ]; + mov.u64 %rd11, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd11 }, [ %rd8 + 0 ]; + mov.u64 %rd13, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd8 + 0 ]; + mov.u64 %rd15, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd8 + 0 ]; + .loc 1 32 40 + shl.b32 %r50, %r49, 8; + .loc 1 32 36 + or.b32 %r51, %r50, %r44; + .loc 1 32 30 + mul.wide.s32 %rd27, %r51, 4; + add.s64 %rd17, %rd22, %rd27; + mov.b32 %r59, 0; + .loc 1 32 46 + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + mov.u32 %r14, 0x0; + @%p24 ld.global.L1::evict_last.v4.b32 { %r11, %r12, %r13, %r14 }, [ %rd17 + 0 ]; + @!%p24 mov.u32 %r11, %r59; + @!%p24 mov.u32 %r12, %r59; + @!%p24 mov.u32 %r13, %r59; + @!%p24 mov.u32 %r14, %r59; + .loc 1 33 40 + shl.b32 %r52, %r10, 8; + .loc 1 33 36 + or.b32 %r53, %r52, %r44; + .loc 1 33 30 + cvt.s64.s32 %rd2, %r53; + mul.wide.s32 %rd28, %r53, 2; + add.s64 %rd18, %rd23, %rd28; + .loc 1 33 46 + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + @%p24 ld.global.v2.b32 { %r19, %r20 }, [ %rd18 + 0 ]; + @!%p24 mov.u32 %r19, %r59; + @!%p24 mov.u32 %r20, %r59; + cvt.u16.u32 %rs1, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r19; } + cvt.u16.u32 %rs3, %r20; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r20; } + .loc 1 33 67 + cvt.f32.bf16 %r23, %rs1; + mov.b32 %f1, %r23; + cvt.f32.bf16 %r24, %rs2; + mov.b32 %f2, %r24; + cvt.f32.bf16 %r25, %rs3; + mov.b32 %f3, %r25; + cvt.f32.bf16 
%r26, %rs4; + mov.b32 %f4, %r26; + .loc 1 34 31 + add.s64 %rd19, %rd24, %rd28; + .loc 1 34 47 + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + @%p24 ld.global.v2.b32 { %r27, %r28 }, [ %rd19 + 0 ]; + @!%p24 mov.u32 %r27, %r59; + @!%p24 mov.u32 %r28, %r59; + cvt.u16.u32 %rs5, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r27; } + cvt.u16.u32 %rs7, %r28; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r28; } + .loc 1 34 68 + cvt.f32.bf16 %r31, %rs5; + mov.b32 %f5, %r31; + cvt.f32.bf16 %r32, %rs6; + mov.b32 %f6, %r32; + cvt.f32.bf16 %r33, %rs7; + mov.b32 %f7, %r33; + cvt.f32.bf16 %r34, %rs8; + mov.b32 %f8, %r34; + .loc 1 35 31 + cvt.u64.u32 %rd3, %r44; + mul.wide.u32 %rd29, %r44, 4; + add.s64 %rd20, %rd25, %rd29; + .loc 1 35 36 + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + mov.u32 %r38, 0x0; + @%p24 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd20 + 0 ]; + @!%p24 mov.u32 %r35, %r59; + @!%p24 mov.u32 %r36, %r59; + @!%p24 mov.u32 %r37, %r59; + @!%p24 mov.u32 %r38, %r59; + .loc 1 36 18 + add.s64 %rd30, %rd15, 50257; + .loc 1 37 18 + setp.lt.s64 %p22, %rd15, 0; + .loc 1 38 32 + selp.b64 %rd31, %rd30, %rd15, %p22; + .loc 1 39 36 + setp.lt.u64 %p23, %rd31, 50257; + .loc 1 39 51 + @%p23 bra $L__BB0_2; + mov.u64 %rd32, assertMessage_0; + cvta.global.u64 %rd33, %rd32; + mov.u64 %rd34, assertFile_0; + cvta.global.u64 %rd35, %rd34; + mov.u64 %rd36, assertFunc_0; + cvta.global.u64 %rd37, %rd36; + mov.b32 %r54, 883; + mov.u64 %rd38, 1; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd33; + .param .b64 param1; + st.param.b64 [param1+0], %rd35; + .param .b32 param2; + st.param.b32 [param2+0], %r54; + .param .b64 param3; + st.param.b64 [param3+0], %rd37; + .param .b64 param4; + st.param.b64 [param4+0], %rd38; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 0 +$L__BB0_2: + .loc 1 37 18 + setp.lt.s64 %p37, %rd7, 0; + .loc 1 32 46 + mov.b32 %f9, %r14; + mov.b32 %f10, %r13; + .loc 1 26 26 + and.b32 %r91, %r1, 31; + .loc 1 40 40 + shl.b64 %rd42, %rd7, 8; + add.s64 %rd43, %rd42, 12865792; + selp.b64 %rd44, %rd43, %rd42, %p37; + .loc 1 40 36 + or.b64 %rd45, %rd44, %rd3; + .loc 1 40 30 + shl.b64 %rd46, %rd45, 2; + add.s64 %rd39, %rd4, %rd46; + .loc 1 40 48 + mov.u32 %r55, 0x0; + mov.u32 %r56, 0x0; + mov.u32 %r57, 0x0; + mov.u32 %r58, 0x0; + @%p24 ld.global.v4.b32 { %r55, %r56, %r57, %r58 }, [ %rd39 + 0 ]; + @!%p24 mov.u32 %r55, %r59; + @!%p24 mov.u32 %r56, %r59; + @!%p24 mov.u32 %r57, %r59; + @!%p24 mov.u32 %r58, %r59; + mov.b32 %f11, %r57; + mov.b32 %f12, %r58; + .loc 1 41 18 + add.f32 %f13, %f10, %f11; + add.f32 %f14, %f9, %f12; + .loc 1 43 18 + add.f32 %f15, %f3, %f13; + add.f32 %f16, %f4, %f14; + .loc 1 32 46 + mov.b32 %f17, %r11; + mov.b32 %f18, %r12; + .loc 1 40 48 + mov.b32 %f19, %r55; + mov.b32 %f20, %r56; + .loc 1 41 18 + add.f32 %f21, %f18, %f20; + add.f32 %f22, %f17, %f19; + .loc 1 43 18 + add.f32 %f23, %f1, %f22; + add.f32 %f24, %f2, %f21; + .loc 1 45 19 + add.f32 %f25, %f6, %f24; + mov.b32 %r82, %f25; + add.f32 %f26, %f5, %f23; + add.f32 %f27, %f7, %f15; + add.f32 %f28, %f8, %f16; +$L__tmp1: + .loc 2 233 15 + add.f32 %f29, %f26, %f25; + add.f32 %f30, %f27, %f29; + add.f32 %f31, %f28, %f30; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r92, %f31; + shfl.sync.bfly.b32 %r93, %r92, 16, 31, -1; + mov.b32 %f32, %r93; +$L__tmp3: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r94, %f33; + shfl.sync.bfly.b32 %r95, %r94, 8, 31, -1; + mov.b32 %f34, %r95; 
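// The shfl.sync.bfly.b32 ladder here (lane-mask offsets 16, 8, 4, 2, 1 with
// clamp 31 and full member mask -1) is a warp-level butterfly reduction:
// each round adds a lane's partial sum to that of its XOR partner, so after
// five rounds every lane of the warp holds the 32-lane total, which is then
// combined across warps through global_smem further down.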
+$L__tmp5: + .loc 2 233 15 + add.f32 %f35, %f33, %f34; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r96, %f35; + shfl.sync.bfly.b32 %r97, %r96, 4, 31, -1; + mov.b32 %f36, %r97; +$L__tmp7: + .loc 2 233 15 + add.f32 %f37, %f35, %f36; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r98, %f37; + shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1; + mov.b32 %f38, %r99; +$L__tmp9: + .loc 2 233 15 + add.f32 %f39, %f37, %f38; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r100, %f39; + shfl.sync.bfly.b32 %r101, %r100, 1, 31, -1; + mov.b32 %f40, %r101; +$L__tmp11: + .loc 2 233 15 + add.f32 %f41, %f39, %f40; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p29, %r91, 0; + shr.u32 %r102, %r1, 3; + and.b32 %r103, %r102, 4; + mov.u32 %r104, global_smem; + add.s32 %r63, %r104, %r103; + mov.b32 %r64, %f41; + @%p29 st.shared.b32 [ %r63 + 0 ], %r64; + bar.sync 0; + setp.lt.s32 %p30, %r1, 2; + add.s32 %r66, %r104, %r43; + @%p30 ld.shared.b32 %r65, [ %r66 + 0 ]; + mov.b32 %f42, %r65; + shfl.sync.bfly.b32 %r106, %r65, 1, 31, -1; + mov.b32 %f43, %r106; +$L__tmp13: + .loc 2 233 15 + add.f32 %f44, %f42, %f43; +$L__tmp14: + .loc 2 243 36 + and.b32 %r107, %r1, 1; + setp.eq.b32 %p38, %r107, 1; + not.pred %p39, %p38; + and.pred %p31, %p30, %p39; + mov.b32 %r68, %f44; + @%p31 st.shared.b32 [ %r66 + 0 ], %r68; + bar.sync 0; + ld.shared.f32 %f45, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f46, %f45, 0f00000000; +$L__tmp16: + .loc 1 53 20 + mov.b32 %r70, %f46; + mov.b32 %r71, 1132462080; + div.full.f32 %r69, %r70, %r71; + mov.b32 %f47, %r69; + .loc 1 54 20 + sub.f32 %f48, %f26, %f47; + sub.f32 %f49, %f25, %f47; + sub.f32 %f50, %f27, %f47; + sub.f32 %f51, %f28, %f47; + .loc 1 55 20 + mul.f32 %f52, %f49, %f49; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f53, %f48, %f48, %f52; + fma.rn.f32 %f54, %f50, %f50, %f53; + fma.rn.f32 %f55, %f51, %f51, %f54; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r108, %f55; + shfl.sync.bfly.b32 %r109, %r108, 16, 31, -1; + mov.b32 %f56, %r109; +$L__tmp20: + .loc 2 233 15 + add.f32 %f57, %f55, %f56; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r110, %f57; + shfl.sync.bfly.b32 %r111, %r110, 8, 31, -1; + mov.b32 %f58, %r111; +$L__tmp22: + .loc 2 233 15 + add.f32 %f59, %f57, %f58; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r112, %f59; + shfl.sync.bfly.b32 %r113, %r112, 4, 31, -1; + mov.b32 %f60, %r113; +$L__tmp24: + .loc 2 233 15 + add.f32 %f61, %f59, %f60; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r114, %f61; + shfl.sync.bfly.b32 %r115, %r114, 2, 31, -1; + mov.b32 %f62, %r115; +$L__tmp26: + .loc 2 233 15 + add.f32 %f63, %f61, %f62; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r116, %f63; + shfl.sync.bfly.b32 %r117, %r116, 1, 31, -1; + mov.b32 %f64, %r117; +$L__tmp28: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r73, %f65; + @%p29 st.shared.b32 [ %r63 + 0 ], %r73; + bar.sync 0; + @%p30 ld.shared.b32 %r74, [ %r66 + 0 ]; + mov.b32 %f66, %r74; + shfl.sync.bfly.b32 %r118, %r74, 1, 31, -1; + mov.b32 %f67, %r118; +$L__tmp30: + .loc 2 233 15 + add.f32 %f68, %f66, %f67; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r77, %f68; + @%p31 st.shared.b32 [ %r66 + 0 ], %r77; + bar.sync 0; + ld.shared.f32 %f69, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f70, %f69, 0f00000000; +$L__tmp33: + .loc 1 61 20 + mov.b32 %r79, %f70; + div.full.f32 %r78, %r79, %r71; + mov.b32 %f71, %r78; + .loc 1 63 20 + add.f32 %f72, %f71, 0f3727C5AC; + .loc 1 64 26 + rsqrt.approx.ftz.f32 %f73, %f72; + .loc 1 35 36 + mov.b32 %f74, %r35; + mov.b32 %f75, %r36; + mov.b32 %f76, %r37; + mov.b32 %f77, %r38; + .loc 1 65 20 
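// A plausible reading of this epilogue, inferred from the constants rather
// than the Python source: %r71 = 1132462080 is 0f43800000 = 256.0, the row
// width used above to turn the two reduced sums into mean and variance;
// 0f3727C5AC is roughly 1e-5, an eps term added before rsqrt. The mul.f32
// chain below then forms (x - mean) * rsqrt(var + eps) and scales it by the
// per-column values in %f74-%f77, a LayerNorm-style normalization.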
+ mul.f32 %f78, %f48, %f73; + mul.f32 %f79, %f49, %f73; + mul.f32 %f80, %f50, %f73; + mul.f32 %f81, %f51, %f73; + .loc 1 66 20 + mul.f32 %f82, %f78, %f74; + mul.f32 %f83, %f79, %f75; + mul.f32 %f84, %f80, %f76; + mul.f32 %f85, %f81, %f77; + .loc 1 68 25 + shl.b64 %rd47, %rd2, 2; + add.s64 %rd40, %rd5, %rd47; + .loc 1 45 19 + mov.b32 %r81, %f26; + .loc 1 68 48 + mov.b32 %r83, %f27; + mov.b32 %r84, %f28; + @%p24 st.global.v4.b32 [ %rd40 + 0 ], { %r81, %r82, %r83, %r84 }; + .loc 1 69 25 + shl.b64 %rd48, %rd2, 1; + add.s64 %rd41, %rd6, %rd48; + .loc 1 69 48 + mov.b32 %r85, %f82; + cvt.rn.bf16.f32 %rs9, %r85; + mov.b32 %r86, %f83; + cvt.rn.bf16.f32 %rs10, %r86; + mov.b32 %r87, %f84; + cvt.rn.bf16.f32 %rs11, %r87; + mov.b32 %r88, %f85; + cvt.rn.bf16.f32 %rs12, %r88; + mov.b32 %r119, {%rs9, %rs10}; + mov.b32 %r120, {%rs11, %rs12}; + @%p24 st.global.v2.b32 [ %rd41 + 0 ], { %r119, %r120 }; + .loc 1 69 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/6g/c6grvbmbs6r6xudavzwfrinzcj6we2lrqye32hw4xlcoakfxmlu7.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 407 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 103 +.b8 114 +.b8 118 +.b8 98 +.b8 109 +.b8 98 +.b8 115 +.b8 54 +.b8 114 +.b8 54 +.b8 120 +.b8 117 +.b8 100 +.b8 97 +.b8 118 +.b8 122 +.b8 119 +.b8 102 +.b8 114 +.b8 105 +.b8 110 +.b8 122 +.b8 99 +.b8 106 +.b8 54 +.b8 119 +.b8 101 +.b8 50 +.b8 108 +.b8 114 +.b8 113 +.b8 121 +.b8 101 +.b8 51 +.b8 50 +.b8 104 +.b8 119 +.b8 52 +.b8 120 +.b8 108 +.b8 99 +.b8 111 +.b8 97 +.b8 107 +.b8 102 +.b8 120 +.b8 109 +.b8 108 +.b8 117 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 103 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 
51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 50 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 50 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 50 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 58 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 58 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 58 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.llir b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..0edc7601ba8264f81618c33004bdafd57004157a --- /dev/null +++ b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.llir @@ -0,0 +1,384 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = shl i32 %6, 2, !dbg !8 + %10 = and i32 %9, 60, !dbg !8 + %11 = and i32 %8, 7, !dbg !9 + %12 = lshr i32 %7, 4, !dbg !9 + %13 = shl nuw nsw i32 %11, 1, !dbg !9 + %14 = or i32 %13, %12, !dbg !9 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %16 = shl i32 %15, 6, !dbg !11 + %17 = or i32 %16, %10, !dbg !12 + %18 = shl nuw nsw i32 %14, 17, !dbg !13 + %19 = shl nuw nsw i32 %14, 17, !dbg !13 + %20 = or i32 %19, 2097152, !dbg !13 + %21 = shl nuw nsw i32 %14, 17, !dbg !13 + %22 = or i32 %21, 4194304, !dbg !13 + %23 = shl nuw nsw i32 %14, 17, !dbg !13 + %24 = or i32 %23, 6291456, !dbg !13 + %25 = add i32 %18, %17, !dbg !14 + %26 = add i32 %20, %17, !dbg !14 + %27 = add i32 %22, %17, !dbg !14 + %28 = add i32 %24, %17, !dbg !14 + %29 = sext i32 %25 to i64, !dbg !15 + %30 = getelementptr float, ptr addrspace(1) %0, i64 %29, !dbg !15 + %31 = sext i32 %26 to i64, !dbg !15 + %32 = getelementptr float, ptr addrspace(1) %0, i64 %31, !dbg !15 + %33 = sext i32 %27 to i64, !dbg !15 + %34 = getelementptr float, ptr addrspace(1) %0, i64 %33, !dbg !15 + %35 = sext i32 %28 to i64, !dbg !15 + %36 = getelementptr float, ptr addrspace(1) %0, i64 %35, !dbg !15 + %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 
0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !16 + %39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !16 + %40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !16 + %41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !16 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !16 + %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !16 + %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !16 + %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !16 + %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %34, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !16 + %49 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !16 + %50 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !16 + %51 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !16 + %52 = bitcast i32 %48 to float, !dbg !16 + %53 = bitcast i32 %49 to float, !dbg !16 + %54 = bitcast i32 %50 to float, !dbg !16 + %55 = bitcast i32 %51 to float, !dbg !16 + %56 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %57 = extractvalue { i32, i32, i32, i32 } %56, 0, !dbg !16 + %58 = extractvalue { i32, i32, i32, i32 } %56, 1, !dbg !16 + %59 = extractvalue { i32, i32, i32, i32 } %56, 2, !dbg !16 + %60 = extractvalue { i32, i32, i32, i32 } %56, 3, !dbg !16 + %61 = fadd float %52, 0.000000e+00, !dbg !17 + %62 = fadd float %53, 0.000000e+00, !dbg !17 + %63 = fadd float %54, 0.000000e+00, !dbg !17 + %64 = fadd float %55, 0.000000e+00, !dbg !17 + %65 = or i32 %14, 112, !dbg !18 + %66 = icmp ult i32 %65, 120, !dbg !19 + %67 = shl nuw nsw i32 %14, 17, !dbg !13 + %68 = or i32 %67, 8388608, !dbg !13 + %69 = shl nuw nsw i32 %14, 17, !dbg !13 + %70 = or i32 %69, 10485760, !dbg !13 + %71 = shl nuw nsw i32 %14, 17, !dbg !13 + %72 = or i32 %71, 12582912, !dbg !13 + %73 = shl nuw nsw i32 %65, 17, !dbg !13 + %74 = add i32 %68, %17, !dbg !14 + %75 = add i32 %70, %17, !dbg !14 + %76 = add i32 %72, %17, !dbg !14 + %77 = add i32 %73, %17, 
!dbg !14 + %78 = sext i32 %74 to i64, !dbg !15 + %79 = getelementptr float, ptr addrspace(1) %0, i64 %78, !dbg !15 + %80 = sext i32 %75 to i64, !dbg !15 + %81 = getelementptr float, ptr addrspace(1) %0, i64 %80, !dbg !15 + %82 = sext i32 %76 to i64, !dbg !15 + %83 = getelementptr float, ptr addrspace(1) %0, i64 %82, !dbg !15 + %84 = sext i32 %77 to i64, !dbg !15 + %85 = getelementptr float, ptr addrspace(1) %0, i64 %84, !dbg !15 + %86 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %87 = extractvalue { i32, i32, i32, i32 } %86, 0, !dbg !16 + %88 = extractvalue { i32, i32, i32, i32 } %86, 1, !dbg !16 + %89 = extractvalue { i32, i32, i32, i32 } %86, 2, !dbg !16 + %90 = extractvalue { i32, i32, i32, i32 } %86, 3, !dbg !16 + %91 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %81, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %92 = extractvalue { i32, i32, i32, i32 } %91, 0, !dbg !16 + %93 = extractvalue { i32, i32, i32, i32 } %91, 1, !dbg !16 + %94 = extractvalue { i32, i32, i32, i32 } %91, 2, !dbg !16 + %95 = extractvalue { i32, i32, i32, i32 } %91, 3, !dbg !16 + %96 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %83, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %97 = extractvalue { i32, i32, i32, i32 } %96, 0, !dbg !16 + %98 = extractvalue { i32, i32, i32, i32 } %96, 1, !dbg !16 + %99 = extractvalue { i32, i32, i32, i32 } %96, 2, !dbg !16 + %100 = extractvalue { i32, i32, i32, i32 } %96, 3, !dbg !16 + %101 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %85, i1 %66, i32 0, i1 %66, i32 0, i1 %66, i32 0, i1 %66, i32 0, i1 %66) #3, !dbg !16 + %102 = extractvalue { i32, i32, i32, i32 } %101, 0, !dbg !16 + %103 = extractvalue { i32, i32, i32, i32 } %101, 1, !dbg !16 + %104 = extractvalue { i32, i32, i32, i32 } %101, 2, !dbg !16 + %105 = extractvalue { i32, i32, i32, i32 } %101, 3, !dbg !16 + %106 = bitcast i32 %102 to float, !dbg !16 + %107 = bitcast i32 %103 to float, !dbg !16 + %108 = bitcast i32 %104 to float, !dbg !16 + %109 = bitcast i32 %105 to float, !dbg !16 + %110 = insertelement <2 x i32> poison, i32 %38, i64 0, !dbg !16 + %111 = insertelement <2 x 
i32> %110, i32 %43, i64 1, !dbg !16 + %112 = bitcast <2 x i32> %111 to <2 x float>, !dbg !16 + %113 = fadd <2 x float> %112, zeroinitializer, !dbg !17 + %114 = insertelement <2 x i32> poison, i32 %87, i64 0, !dbg !16 + %115 = insertelement <2 x i32> %114, i32 %92, i64 1, !dbg !16 + %116 = bitcast <2 x i32> %115 to <2 x float>, !dbg !16 + %117 = fadd <2 x float> %113, %116, !dbg !17 + %118 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !16 + %119 = insertelement <2 x i32> %118, i32 %44, i64 1, !dbg !16 + %120 = bitcast <2 x i32> %119 to <2 x float>, !dbg !16 + %121 = fadd <2 x float> %120, zeroinitializer, !dbg !17 + %122 = insertelement <2 x i32> poison, i32 %88, i64 0, !dbg !16 + %123 = insertelement <2 x i32> %122, i32 %93, i64 1, !dbg !16 + %124 = bitcast <2 x i32> %123 to <2 x float>, !dbg !16 + %125 = fadd <2 x float> %121, %124, !dbg !17 + %126 = insertelement <2 x i32> poison, i32 %40, i64 0, !dbg !16 + %127 = insertelement <2 x i32> %126, i32 %45, i64 1, !dbg !16 + %128 = bitcast <2 x i32> %127 to <2 x float>, !dbg !16 + %129 = fadd <2 x float> %128, zeroinitializer, !dbg !17 + %130 = insertelement <2 x i32> poison, i32 %89, i64 0, !dbg !16 + %131 = insertelement <2 x i32> %130, i32 %94, i64 1, !dbg !16 + %132 = bitcast <2 x i32> %131 to <2 x float>, !dbg !16 + %133 = fadd <2 x float> %129, %132, !dbg !17 + %134 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !16 + %135 = insertelement <2 x i32> %134, i32 %46, i64 1, !dbg !16 + %136 = bitcast <2 x i32> %135 to <2 x float>, !dbg !16 + %137 = fadd <2 x float> %136, zeroinitializer, !dbg !17 + %138 = insertelement <2 x i32> poison, i32 %90, i64 0, !dbg !16 + %139 = insertelement <2 x i32> %138, i32 %95, i64 1, !dbg !16 + %140 = bitcast <2 x i32> %139 to <2 x float>, !dbg !16 + %141 = fadd <2 x float> %137, %140, !dbg !17 + %142 = select i1 %66, float %106, float -0.000000e+00, !dbg !17 + %143 = select i1 %66, float %107, float -0.000000e+00, !dbg !17 + %144 = select i1 %66, float %108, float -0.000000e+00, !dbg !17 + %145 = select i1 %66, float %109, float -0.000000e+00, !dbg !17 + %146 = and i32 %6, 63, !dbg !8 + %147 = or i32 %16, %146, !dbg !12 + %148 = or i32 %10, 3, !dbg !20 + %149 = or i32 %10, 2, !dbg !20 + %150 = or i32 %10, 1, !dbg !20 + %shift = shufflevector <2 x float> %117, <2 x float> poison, <2 x i32> <i32 1, i32 0>, !dbg !24 + %151 = fadd <2 x float> %shift, %117, !dbg !24 + %shift16 = shufflevector <2 x float> %125, <2 x float> poison, <2 x i32> <i32 1, i32 0>, !dbg !24 + %152 = fadd <2 x float> %shift16, %125, !dbg !24 + %shift17 = shufflevector <2 x float> %133, <2 x float> poison, <2 x i32> <i32 1, i32 0>, !dbg !24 + %153 = fadd <2 x float> %shift17, %133, !dbg !24 + %shift18 = shufflevector <2 x float> %141, <2 x float> poison, <2 x i32> <i32 1, i32 0>, !dbg !24 + %154 = fadd <2 x float> %shift18, %141, !dbg !24 + %155 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !16 + %156 = insertelement <2 x i32> %155, i32 %97, i64 1, !dbg !16 + %157 = bitcast <2 x i32> %156 to <2 x float>, !dbg !16 + %158 = insertelement <2 x float> <float 0.000000e+00, float poison>, float %61, i64 1, !dbg !17 + %159 = fadd <2 x float> %158, %157, !dbg !17 + %160 = insertelement <2 x float> %151, float %142, i64 0, !dbg !17 + %161 = fadd <2 x float> %159, %160, !dbg !17 + %162 = insertelement <2 x i32> poison, i32 %58, i64 0, !dbg !16 + %163 = insertelement <2 x i32> %162, i32 %98, i64 1, !dbg !16 + %164 = bitcast <2 x i32> %163 to <2 x float>, !dbg !16 + %165 = insertelement <2 x float> <float 0.000000e+00, float poison>, float %62, i64 1, !dbg !17 + %166 = fadd <2 x float> %165, %164, !dbg !17 + %167 = insertelement <2 x float> %152, float %143, i64 0, !dbg !17 + %168 = fadd <2 x float> %166, %167, !dbg !17 + %169 = insertelement <2 x i32> poison, i32 %59, i64 0, !dbg !16 + %170 = insertelement <2 x i32> %169, i32 %99, i64 1, !dbg !16 + %171 = bitcast <2 x i32> %170 to <2 x float>, !dbg !16 + %172 = insertelement <2 x float> <float 0.000000e+00, float poison>, float %63, i64 1, !dbg !17 + %173 = fadd <2 x float> %172, %171, !dbg !17 + %174 = insertelement <2 x float> %153, float %144, i64 0, !dbg !17 + %175 = fadd <2 x float> %173, %174, !dbg !17 + %176 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !16 + %177 = insertelement <2 x i32> %176, i32 %100, i64 1, !dbg !16 + %178 = bitcast <2 x i32> %177 to <2 x float>, !dbg !16 + %179 = insertelement <2 x float> <float 0.000000e+00, float poison>, float %64, i64 1, !dbg !17 + %180 = fadd <2 x float> %179, %178, !dbg !17 + %181 = insertelement <2 x float> %154, float %145, i64 0, !dbg !17 + %182 = fadd <2 x float> %180, %181, !dbg !17 + %shift19 = shufflevector <2 x float> %161, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !24 + %183 = fadd <2 x float> %161, %shift19, !dbg !24 + %184 = extractelement <2 x float> %183, i64 0, !dbg !24 + %shift20 = shufflevector <2 x float> %168, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !24 + %185 = fadd <2 x float> %168, %shift20, !dbg !24 + %186 = extractelement <2 x float> %185, i64 0, !dbg !24 + %shift21 = shufflevector <2 x float> %175, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !24 + %187 = fadd <2 x float> %175, %shift21, !dbg !24 + %188 = extractelement <2 x float> %187, i64 0, !dbg !24 + %shift22 = shufflevector <2 x float> %182, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !24 + %189 = fadd <2 x float> %182, %shift22, !dbg !24 + %190 = extractelement <2 x float> %189, i64 0, !dbg !24 + %191 = bitcast float %184 to i32, !dbg !20 + %192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %191, i32 16, i32 31), !dbg !20 + %193 = bitcast i32 %192 to float, !dbg !20 + %194 = fadd float %184, %193, !dbg !24 + %195 = bitcast float %186 to i32, !dbg !20 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 16, i32 31), !dbg !20 + %197 = bitcast i32 %196 to float, !dbg !20 + %198 = fadd float %186, %197, !dbg !24 + %199 = bitcast float %188 to i32, !dbg !20 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 16, i32 31), !dbg !20 + %201 = bitcast i32 %200 to float, !dbg !20 + %202 = fadd float %188, %201, !dbg !24 + %203 = bitcast float %190 to i32, !dbg !20 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 16, i32 31), !dbg !20 + %205 = bitcast i32 %204 to float, !dbg !20 + %206 = fadd float %190, %205, !dbg !24 + %207 = icmp ult i32 %7, 16, !dbg !20 + %208 = shl nuw nsw i32 %10, 3, !dbg !20 + %209 = or i32 %208, %11, !dbg !20 + %210 = zext nneg i32 %209 to i64, !dbg !20 + %211 = getelementptr float, ptr addrspace(3) @global_smem, i64 %210, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %211, float %194, i1 %207) #3, !dbg !20 + %212 = shl nuw nsw i32 %150, 3, !dbg !20 + %213 = or i32 %212, %11, !dbg !20 + %214 = zext nneg i32 %213 to i64, !dbg !20 + %215 = getelementptr float, ptr addrspace(3) @global_smem, i64 %214, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, float %198, i1 %207) #3, !dbg !20 + %216 = shl nuw nsw i32 %149, 3, !dbg !20 + %217 = or i32 %216, %11, !dbg !20 + %218 = zext nneg i32 %217 to i64, !dbg !20 + %219 = getelementptr float, ptr addrspace(3) @global_smem, i64 %218, !dbg !20 + tail call void asm sideeffect "@$2 
st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, float %202, i1 %207) #3, !dbg !20 + %220 = shl nuw nsw i32 %148, 3, !dbg !20 + %221 = or i32 %220, %11, !dbg !20 + %222 = zext nneg i32 %221 to i64, !dbg !20 + %223 = getelementptr float, ptr addrspace(3) @global_smem, i64 %222, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %223, float %206, i1 %207) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %224 = icmp slt i32 %6, 512, !dbg !20 + %225 = sext i32 %6 to i64, !dbg !20 + %226 = getelementptr float, ptr addrspace(3) @global_smem, i64 %225, !dbg !20 + %227 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %226, i1 %224) #3, !dbg !20 + %228 = bitcast float %227 to i32, !dbg !20 + %229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 4, i32 31), !dbg !20 + %230 = bitcast i32 %229 to float, !dbg !20 + %231 = fadd float %227, %230, !dbg !24 + %232 = bitcast float %231 to i32, !dbg !20 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 2, i32 31), !dbg !20 + %234 = bitcast i32 %233 to float, !dbg !20 + %235 = fadd float %231, %234, !dbg !24 + %236 = bitcast float %235 to i32, !dbg !20 + %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 1, i32 31), !dbg !20 + %238 = bitcast i32 %237 to float, !dbg !20 + %239 = fadd float %235, %238, !dbg !24 + %240 = and i32 %6, 7, !dbg !20 + %241 = icmp eq i32 %240, 0, !dbg !20 + %242 = and i1 %224, %241, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %226, float %239, i1 %242) #3, !dbg !20 + %243 = add i32 %6, 256, !dbg !20 + %244 = sext i32 %243 to i64, !dbg !20 + %245 = getelementptr float, ptr addrspace(3) @global_smem, i64 %244, !dbg !20 + %246 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %224) #3, !dbg !20 + %247 = bitcast float %246 to i32, !dbg !20 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 4, i32 31), !dbg !20 + %249 = bitcast i32 %248 to float, !dbg !20 + %250 = fadd float %246, %249, !dbg !24 + %251 = bitcast float %250 to i32, !dbg !20 + %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 2, i32 31), !dbg !20 + %253 = bitcast i32 %252 to float, !dbg !20 + %254 = fadd float %250, %253, !dbg !24 + %255 = bitcast float %254 to i32, !dbg !20 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %255, i32 1, i32 31), !dbg !20 + %257 = bitcast i32 %256 to float, !dbg !20 + %258 = fadd float %254, %257, !dbg !24 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, float %258, i1 %242) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %259 = zext nneg i32 %208 to i64, !dbg !20 + %260 = getelementptr float, ptr addrspace(3) @global_smem, i64 %259, !dbg !20 + %261 = load float, ptr addrspace(3) %260, align 4, !dbg !20 + %262 = zext nneg i32 %212 to i64, !dbg !20 + %263 = getelementptr float, ptr addrspace(3) @global_smem, i64 %262, !dbg !20 + %264 = load float, ptr addrspace(3) %263, align 4, !dbg !20 + %265 = zext nneg i32 %216 to i64, !dbg !20 + %266 = getelementptr float, ptr addrspace(3) @global_smem, i64 %265, !dbg !20 + %267 = load float, ptr addrspace(3) %266, align 4, !dbg !20 + %268 = zext nneg i32 %220 to i64, !dbg !20 + %269 = getelementptr float, ptr addrspace(3) @global_smem, i64 %268, !dbg !20 + %270 = load float, 
ptr addrspace(3) %269, align 4, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %271 = zext nneg i32 %10 to i64, !dbg !28 + %272 = getelementptr float, ptr addrspace(3) @global_smem, i64 %271, !dbg !28 + %273 = insertelement <1 x float> undef, float %261, i64 0, !dbg !28 + store <1 x float> %273, ptr addrspace(3) %272, align 4, !dbg !28 + %274 = zext nneg i32 %150 to i64, !dbg !28 + %275 = getelementptr float, ptr addrspace(3) @global_smem, i64 %274, !dbg !28 + %276 = insertelement <1 x float> undef, float %264, i64 0, !dbg !28 + store <1 x float> %276, ptr addrspace(3) %275, align 4, !dbg !28 + %277 = zext nneg i32 %149 to i64, !dbg !28 + %278 = getelementptr float, ptr addrspace(3) @global_smem, i64 %277, !dbg !28 + %279 = insertelement <1 x float> undef, float %267, i64 0, !dbg !28 + store <1 x float> %279, ptr addrspace(3) %278, align 4, !dbg !28 + %280 = zext nneg i32 %148 to i64, !dbg !28 + %281 = getelementptr float, ptr addrspace(3) @global_smem, i64 %280, !dbg !28 + %282 = insertelement <1 x float> undef, float %270, i64 0, !dbg !28 + store <1 x float> %282, ptr addrspace(3) %281, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %283 = zext nneg i32 %146 to i64, !dbg !28 + %284 = getelementptr float, ptr addrspace(3) @global_smem, i64 %283, !dbg !28 + %285 = load <1 x float>, ptr addrspace(3) %284, align 4, !dbg !28 + %.frozen = freeze i32 %147 + %286 = sdiv i32 %.frozen, 256, !dbg !29 + %287 = mul i32 %286, 256 + %.decomposed = sub i32 %.frozen, %287 + %288 = sext i32 %286 to i64, !dbg !30 + %289 = getelementptr i64, ptr addrspace(1) %1, i64 %288, !dbg !30 + %290 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %289, i1 true) #3, !dbg !31 + %291 = lshr i64 %290, 54, !dbg !32 + %292 = and i64 %291, 512, !dbg !32 + %293 = add i64 %292, %290, !dbg !32 + %294 = shl i64 %293, 8, !dbg !33 + %295 = sext i32 %.decomposed to i64, !dbg !34 + %296 = getelementptr float, ptr addrspace(1) %2, i64 %294, !dbg !35 + %297 = getelementptr float, ptr addrspace(1) %296, i64 %295, !dbg !35 + %298 = and i32 %6, 192, !dbg !36 + %299 = icmp eq i32 %298, 0, !dbg !36 + %300 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %297, <1 x float> %285, i1 %299) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, 
!"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 31, column: 47, scope: !5) +!14 = !DILocation(line: 31, column: 40, scope: !5) +!15 = !DILocation(line: 31, column: 34, scope: !5) +!16 = !DILocation(line: 31, column: 53, scope: !5) +!17 = !DILocation(line: 34, column: 38, scope: !5) +!18 = !DILocation(line: 28, column: 27, scope: !5) +!19 = !DILocation(line: 29, column: 25, scope: !5) +!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!23 = !DILocation(line: 35, column: 25, scope: !21) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 35, column: 28, scope: !5) +!29 = !DILocation(line: 36, column: 20, scope: !5) +!30 = !DILocation(line: 38, column: 30, scope: !5) +!31 = !DILocation(line: 38, column: 35, scope: !5) +!32 = !DILocation(line: 41, column: 32, scope: !5) +!33 = !DILocation(line: 45, column: 40, scope: !5) +!34 = !DILocation(line: 45, column: 36, scope: !5) +!35 = !DILocation(line: 45, column: 30, scope: !5) +!36 = !DILocation(line: 45, column: 55, scope: !5) +!37 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttgir b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f122a6130de200e26956e978263c837934848e70 --- /dev/null +++ b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttgir @@ -0,0 +1,67 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi64, #blocked> + %cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked> + %cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_3 = 
arith.constant dense<131072> : tensor<1x64xi32, #blocked1> + %cst_4 = arith.constant dense<120> : tensor<1x64xi32, #blocked1> + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c120_i32 = arith.constant 120 : i32 + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> + %cst_6 = arith.constant dense<true> : tensor<64x1xi1, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked1> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked> + %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1> + %12 = tt.broadcast %8 : (tensor<64x1xi32, #blocked1>) -> tensor<64x64xi32, #blocked1> + %13 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<64x64x!tt.ptr<f32>, #blocked1> + %14 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c64_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x64xf32, #blocked1>) : i32 { + %32 = tt.splat %arg5 : (i32) -> tensor<1x64xi32, #blocked1> + %33 = arith.addi %32, %11 : tensor<1x64xi32, #blocked1> + %34 = arith.cmpi slt, %33, %cst_4 : tensor<1x64xi32, #blocked1> + %35 = arith.muli %33, %cst_3 : tensor<1x64xi32, #blocked1> + %36 = tt.broadcast %35 : (tensor<1x64xi32, #blocked1>) -> tensor<64x64xi32, #blocked1> + %37 = arith.addi %12, %36 : tensor<64x64xi32, #blocked1> + %38 = tt.addptr %13, %37 : tensor<64x64x!tt.ptr<f32>, #blocked1>, tensor<64x64xi32, #blocked1> + %39 = tt.broadcast %34 : (tensor<1x64xi1, #blocked1>) -> tensor<64x64xi1, #blocked1> + %40 = tt.load %38, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked1> + %41 = arith.addf %arg6, %40 : tensor<64x64xf32, #blocked1> + %42 = arith.select %39, %41, %arg6 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> + scf.yield %42 : tensor<64x64xf32, #blocked1> + } + %15 = "tt.reduce"(%14) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %32 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %32 : f32 + }) : (tensor<64x64xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %16 = triton_gpu.convert_layout %15 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %18 = arith.divsi %9, %cst_2 : tensor<64x1xi32, #blocked> + %19 = arith.remsi %9, %cst_2 : tensor<64x1xi32, #blocked> + %20 = tt.splat %arg1 : (!tt.ptr<i64>) -> tensor<64x1x!tt.ptr<i64>, #blocked> + %21 = tt.addptr %20, %18 : tensor<64x1x!tt.ptr<i64>, #blocked>, tensor<64x1xi32, #blocked> + %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, 
isVolatile = false} : tensor<64x1xi64, #blocked> + %23 = arith.addi %22, %cst_1 : tensor<64x1xi64, #blocked> + %24 = arith.cmpi slt, %22, %cst_0 : tensor<64x1xi64, #blocked> + %25 = arith.select %24, %23, %22 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> + %26 = arith.muli %25, %cst : tensor<64x1xi64, #blocked> + %27 = arith.extsi %19 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> + %28 = arith.addi %27, %26 : tensor<64x1xi64, #blocked> + %29 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<64x1x!tt.ptr<f32>, #blocked> + %30 = tt.addptr %29, %28 : tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xi64, #blocked> + %31 = "tt.atomic_rmw"(%30, %17, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttir b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..11f62c676ae3b36e8a565af04f5be4d9481e29cf --- /dev/null +++ b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttir @@ -0,0 +1,59 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi64> + %cst_0 = arith.constant dense<0> : tensor<64x1xi64> + %cst_1 = arith.constant dense<512> : tensor<64x1xi64> + %c120_i32 = arith.constant 120 : i32 + %c0_i32 = arith.constant 0 : i32 + %c64_i32 = arith.constant 64 : i32 + %cst_2 = arith.constant dense<true> : tensor<64x1xi1> + %cst_3 = arith.constant dense<256> : tensor<64x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x64xi32> + %cst_5 = arith.constant dense<120> : tensor<1x64xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> + %7 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x64xi32> + %8 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<64x64x!tt.ptr<f32>> + %9 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c64_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x64xf32>) : i32 { + %26 = tt.splat %arg5 : (i32) -> tensor<1x64xi32> + %27 = arith.addi %26, %6 : tensor<1x64xi32> + %28 = arith.cmpi slt, %27, %cst_5 : tensor<1x64xi32> + %29 = arith.muli %27, %cst_4 : tensor<1x64xi32> + %30 = tt.broadcast %29 : (tensor<1x64xi32>) -> tensor<64x64xi32> + %31 = arith.addi %7, %30 : tensor<64x64xi32> + %32 = tt.addptr %8, %31 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi32> + %33 = tt.broadcast %28 : (tensor<1x64xi1>) -> tensor<64x64xi1> + %34 = tt.load %32, %33, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32> + %35 = arith.addf %arg6, %34 : tensor<64x64xf32> + %36 = arith.select %33, %35, %arg6 : tensor<64x64xi1>, tensor<64x64xf32> + scf.yield %36 : tensor<64x64xf32> + } + %10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %26 = arith.addf %arg5, 
%arg6 : f32 + tt.reduce.return %26 : f32 + }) : (tensor<64x64xf32>) -> tensor<64xf32> + %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %12 = arith.divsi %5, %cst_3 : tensor<64x1xi32> + %13 = arith.remsi %5, %cst_3 : tensor<64x1xi32> + %14 = tt.splat %arg1 : (!tt.ptr<i64>) -> tensor<64x1x!tt.ptr<i64>> + %15 = tt.addptr %14, %12 : tensor<64x1x!tt.ptr<i64>>, tensor<64x1xi32> + %16 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %17 = arith.addi %16, %cst_1 : tensor<64x1xi64> + %18 = arith.cmpi slt, %16, %cst_0 : tensor<64x1xi64> + %19 = arith.select %18, %17, %16 : tensor<64x1xi1>, tensor<64x1xi64> + %20 = arith.muli %19, %cst : tensor<64x1xi64> + %21 = arith.extsi %13 : tensor<64x1xi32> to tensor<64x1xi64> + %22 = arith.addi %21, %20 : tensor<64x1xi64> + %23 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<64x1x!tt.ptr<f32>> + %24 = tt.addptr %23, %22 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi64> + %25 = "tt.atomic_rmw"(%24, %11, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32>>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.llir b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..9e7dd146a6eb43325f2e34fbca4799ac611eafbc --- /dev/null +++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.llir @@ -0,0 +1,62 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { + %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %5 = shl i32 %4, 2, !dbg !8 + %6 = and i32 %5, 508, !dbg !8 + %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %8 = shl i32 %7, 9, !dbg !10 + %9 = or i32 %8, %6, !dbg !11 + %10 = sext i32 %9 to i64, !dbg !12 + %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !12 + %12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13 + %13 = extractvalue { i32, i32 } %12, 0, !dbg !13 + %14 = extractvalue { i32, i32 } %12, 1, !dbg !13 + %15 = trunc i32 %13 to i16, !dbg !13 + %extelt.offset = lshr i32 %13, 16, !dbg !13 + %16 = trunc i32 %extelt.offset to i16, !dbg !13 + %17 = trunc i32 %14 to i16, !dbg !13 + %extelt.offset1 = lshr i32 %14, 16, !dbg !13 + %18 = trunc i32 %extelt.offset1 to i16, !dbg !13 + %19 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %15) #1, !dbg !14 + %20 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %16) #1, !dbg !14 + %21 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #1, !dbg !14 + %22 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #1, !dbg !14 + %23 = getelementptr float, ptr addrspace(1) %1, i64 %10, !dbg !15 + %24 = bitcast float %19 to i32, !dbg !16 + %25 = bitcast float %20 to i32, !dbg !16 + %26 = bitcast float %21 to i32, !dbg !16 + %27 = bitcast float %22 to i32, !dbg !16 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %24, i32 %25, i32 %26, i32 %27, ptr addrspace(1) %23, i1 true) #1, !dbg !16 + ret void, !dbg !17 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { 
mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "czl6nmwasl7k4ic55xowihczcooh3mhu5v6ls6w2xzqqocdc2da7.py", directory: "/tmp/torchinductor_root/zl") +!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 24, column: 30, scope: !5) +!13 = !DILocation(line: 24, column: 35, scope: !5) +!14 = !DILocation(line: 24, column: 44, scope: !5) +!15 = !DILocation(line: 26, column: 25, scope: !5) +!16 = !DILocation(line: 26, column: 36, scope: !5) +!17 = !DILocation(line: 26, column: 4, scope: !5) diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..cdc64a19be9d4816eddc0cdbad88d21c48b0d233 --- /dev/null +++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir @@ -0,0 +1,19 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr<bf16>) -> tensor<512x!tt.ptr<bf16>, #blocked> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> + %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<512x!tt.ptr<f32>, #blocked> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32>, #blocked>, tensor<512xi32, #blocked> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.ptx b/.triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..4ef08f48923232519b446ec885f4e01246f9c678 --- /dev/null +++ b/.triton/dump/c0db4dd81e5aac83500e3ccf67d3896d/triton_.ptx @@ -0,0 +1,296 @@ +// +// Generated 
by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<3>; + .reg .b16 %rs<3>; + .reg .b32 %r<12>; + .reg .b64 %rd<7>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2de_param_0]; + ld.param.u64 %rd4, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 1; + and.b32 %r9, %r8, 510; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r10, %r1, 9; + .loc 1 21 23 + or.b32 %r11, %r10, %r9; + .loc 1 24 30 + mul.wide.s32 %rd5, %r11, 2; + add.s64 %rd1, %rd3, %rd5; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + .loc 1 24 44 + cvt.f32.bf16 %r5, %rs1; + cvt.f32.bf16 %r6, %rs2; + .loc 1 26 25 + mul.wide.s32 %rd6, %r11, 4; + add.s64 %rd2, %rd4, %rd6; + .loc 1 26 36 + @%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 }; + .loc 1 26 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/ya/cyamhdbxtmf4rgres6uo7orhfzw3ryhsvm5qzdvyqgggck2hqbyi.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 121 +.b8 97 +.b8 109 +.b8 104 +.b8 100 +.b8 98 +.b8 120 +.b8 116 +.b8 109 +.b8 102 +.b8 52 +.b8 114 +.b8 103 +.b8 114 +.b8 101 +.b8 115 +.b8 54 +.b8 117 +.b8 111 +.b8 55 +.b8 111 +.b8 114 +.b8 104 +.b8 102 +.b8 122 +.b8 119 +.b8 51 +.b8 114 +.b8 121 +.b8 104 +.b8 115 +.b8 118 +.b8 109 +.b8 53 +.b8 113 +.b8 122 +.b8 100 +.b8 118 +.b8 121 +.b8 113 +.b8 103 +.b8 103 +.b8 103 +.b8 99 +.b8 107 +.b8 50 +.b8 104 +.b8 113 +.b8 98 +.b8 121 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 121 +.b8 97 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git 
a/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.llir b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..8b1fe1c0dff02c6fe19cf3e15c9fb44f328ccad6 --- /dev/null +++ b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.llir @@ -0,0 +1,760 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_1 = internal constant [8 x i8] c"<module>" +@assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py" +@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257" +@assertFunc_0 = internal constant [8 x i8] c"<module>" +@assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %9 = lshr i32 %8, 4, !dbg !10 + %10 = and i32 %9, 15, !dbg !10 + %11 = and i32 %8, 15, !dbg !10 + %12 = shl nuw nsw i32 %11, 3, !dbg !11 + %13 = or i32 %12, 4, !dbg !11 + %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12 + %15 = shl i32 %14, 4, !dbg !13 + %16 = or i32 %15, %10, !dbg !14 + %17 = or i32 %15, %11, !dbg !14 + %18 = sext i32 %16 to i64, !dbg !15 + %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15 + %20 = sext i32 %17 to i64, !dbg !15 + %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15 + %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16 + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16 + %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16 + %31 = srem i32 %16, 512, !dbg !17 + %32 = shl nsw i32 %31, 8, !dbg !18 + %33 = add i64 %30, 
50257, !dbg !19 + %34 = icmp slt i64 %22, 0, !dbg !20 + %35 = icmp slt i64 %30, 0, !dbg !20 + %36 = select i1 %35, i64 %33, i64 %30, !dbg !21 + %37 = icmp ugt i64 %36, 50256, !dbg !22 + %38 = shl i64 %22, 8, !dbg !23 + %39 = add i64 %38, 12865792, !dbg !23 + %40 = select i1 %34, i64 %39, i64 %38, !dbg !23 + %41 = getelementptr float, ptr addrspace(1) %1, i64 %40 + br label %42, !dbg !24 + +42: ; preds = %7, %104 + %43 = phi float [ 0.000000e+00, %7 ], [ %143, %104 ] + %44 = phi float [ 0.000000e+00, %7 ], [ %144, %104 ] + %45 = phi float [ 0.000000e+00, %7 ], [ %145, %104 ] + %46 = phi float [ 0.000000e+00, %7 ], [ %146, %104 ] + %47 = phi float [ 0.000000e+00, %7 ], [ %147, %104 ] + %48 = phi float [ 0.000000e+00, %7 ], [ %148, %104 ] + %49 = phi float [ 0.000000e+00, %7 ], [ %149, %104 ] + %50 = phi float [ 0.000000e+00, %7 ], [ %150, %104 ] + %51 = phi float [ 0.000000e+00, %7 ], [ %151, %104 ] + %52 = phi float [ 0.000000e+00, %7 ], [ %152, %104 ] + %53 = phi float [ 0.000000e+00, %7 ], [ %153, %104 ] + %54 = phi float [ 0.000000e+00, %7 ], [ %154, %104 ] + %55 = phi float [ 0.000000e+00, %7 ], [ %155, %104 ] + %56 = phi float [ 0.000000e+00, %7 ], [ %156, %104 ] + %57 = phi float [ 0.000000e+00, %7 ], [ %157, %104 ] + %58 = phi float [ 0.000000e+00, %7 ], [ %158, %104 ] + %59 = phi float [ 0.000000e+00, %7 ], [ %191, %104 ] + %60 = phi float [ 0.000000e+00, %7 ], [ %192, %104 ] + %61 = phi float [ 0.000000e+00, %7 ], [ %193, %104 ] + %62 = phi float [ 0.000000e+00, %7 ], [ %194, %104 ] + %63 = phi float [ 0.000000e+00, %7 ], [ %195, %104 ] + %64 = phi float [ 0.000000e+00, %7 ], [ %196, %104 ] + %65 = phi float [ 0.000000e+00, %7 ], [ %197, %104 ] + %66 = phi float [ 0.000000e+00, %7 ], [ %198, %104 ] + %67 = phi float [ 0.000000e+00, %7 ], [ %167, %104 ] + %68 = phi float [ 0.000000e+00, %7 ], [ %168, %104 ] + %69 = phi float [ 0.000000e+00, %7 ], [ %169, %104 ] + %70 = phi float [ 0.000000e+00, %7 ], [ %170, %104 ] + %71 = phi float [ 0.000000e+00, %7 ], [ %171, %104 ] + %72 = phi float [ 0.000000e+00, %7 ], [ %172, %104 ] + %73 = phi float [ 0.000000e+00, %7 ], [ %173, %104 ] + %74 = phi float [ 0.000000e+00, %7 ], [ %174, %104 ] + %75 = phi i1 [ true, %7 ], [ false, %104 ] + %76 = phi i32 [ 0, %7 ], [ 128, %104 ] + %77 = or i32 %76, %12, !dbg !25 + %78 = or i32 %76, %13, !dbg !25 + %79 = or i32 %77, %32, !dbg !26 + %80 = or i32 %78, %32, !dbg !26 + %81 = sext i32 %79 to i64, !dbg !27 + %82 = getelementptr float, ptr addrspace(1) %2, i64 %81, !dbg !27 + %83 = sext i32 %80 to i64, !dbg !27 + %84 = getelementptr float, ptr addrspace(1) %2, i64 %83, !dbg !27 + %85 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %86 = extractvalue { i32, i32, i32, i32 } %85, 0, !dbg !28 + %87 = extractvalue { i32, i32, i32, i32 } %85, 1, !dbg !28 + %88 = extractvalue { i32, i32, i32, i32 } %85, 2, !dbg !28 + %89 = extractvalue { i32, i32, i32, i32 } %85, 3, !dbg !28 + %90 = bitcast i32 %86 to float, !dbg !28 + %91 = bitcast i32 %87 to float, !dbg !28 + %92 = bitcast i32 %88 to float, !dbg !28 + %93 = bitcast i32 %89 to float, !dbg !28 + %94 = tail call { i32, i32, i32, i32 } asm sideeffect 
"mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %84, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %95 = extractvalue { i32, i32, i32, i32 } %94, 0, !dbg !28 + %96 = extractvalue { i32, i32, i32, i32 } %94, 1, !dbg !28 + %97 = extractvalue { i32, i32, i32, i32 } %94, 2, !dbg !28 + %98 = extractvalue { i32, i32, i32, i32 } %94, 3, !dbg !28 + %99 = bitcast i32 %95 to float, !dbg !28 + %100 = bitcast i32 %96 to float, !dbg !28 + %101 = bitcast i32 %97 to float, !dbg !28 + %102 = bitcast i32 %98 to float, !dbg !28 + br i1 %37, label %103, label %104, !dbg !29 + +103: ; preds = %42 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !29 + br label %104, !dbg !29 + +104: ; preds = %103, %42 + %105 = zext nneg i32 %77 to i64, !dbg !30 + %106 = zext nneg i32 %78 to i64, !dbg !30 + %107 = getelementptr float, ptr addrspace(1) %41, i64 %105, !dbg !31 + %108 = getelementptr float, ptr addrspace(1) %41, i64 %106, !dbg !31 + %109 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %110 = extractvalue { i32, i32, i32, i32 } %109, 0, !dbg !32 + %111 = extractvalue { i32, i32, i32, i32 } %109, 1, !dbg !32 + %112 = extractvalue { i32, i32, i32, i32 } %109, 2, !dbg !32 + %113 = extractvalue { i32, i32, i32, i32 } %109, 3, !dbg !32 + %114 = bitcast i32 %110 to float, !dbg !32 + %115 = bitcast i32 %111 to float, !dbg !32 + %116 = bitcast i32 %112 to float, !dbg !32 + %117 = bitcast i32 %113 to float, !dbg !32 + %118 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %108, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32 + %119 = extractvalue { i32, i32, i32, i32 } %118, 0, !dbg !32 + %120 = extractvalue { i32, i32, i32, i32 } %118, 1, !dbg !32 + %121 = extractvalue { i32, i32, i32, i32 } %118, 2, !dbg !32 + %122 = extractvalue { i32, i32, i32, i32 } %118, 3, !dbg !32 + %123 = bitcast i32 %119 to float, !dbg !32 + %124 = bitcast i32 %120 to float, !dbg !32 + %125 = bitcast i32 %121 to float, !dbg !32 + %126 = bitcast i32 %122 to float, !dbg !32 + %127 = fadd float %90, %114, !dbg !33 + %128 = fadd float %91, %115, !dbg !33 + %129 = fadd float %92, %116, !dbg !33 + %130 = fadd float %93, %117, !dbg !33 + %131 = fadd float %99, %123, !dbg !33 + %132 = fadd float %100, %124, !dbg !33 + %133 = fadd float %101, %125, !dbg !33 + %134 = fadd float %102, %126, !dbg !33 + %135 = fsub float %127, %67, !dbg !34 + %136 = fsub float %128, %68, !dbg !34 + %137 = fsub float %129, %69, 
!dbg !34 + %138 = fsub float %130, %70, !dbg !34 + %139 = fsub float %131, %71, !dbg !34 + %140 = fsub float %132, %72, !dbg !34 + %141 = fsub float %133, %73, !dbg !34 + %142 = fsub float %134, %74, !dbg !34 + %143 = fadd float %43, 1.000000e+00, !dbg !38 + %144 = fadd float %44, 1.000000e+00, !dbg !38 + %145 = fadd float %45, 1.000000e+00, !dbg !38 + %146 = fadd float %46, 1.000000e+00, !dbg !38 + %147 = fadd float %47, 1.000000e+00, !dbg !38 + %148 = fadd float %48, 1.000000e+00, !dbg !38 + %149 = fadd float %49, 1.000000e+00, !dbg !38 + %150 = fadd float %50, 1.000000e+00, !dbg !38 + %151 = fadd float %51, 1.000000e+00, !dbg !38 + %152 = fadd float %52, 1.000000e+00, !dbg !38 + %153 = fadd float %53, 1.000000e+00, !dbg !38 + %154 = fadd float %54, 1.000000e+00, !dbg !38 + %155 = fadd float %55, 1.000000e+00, !dbg !38 + %156 = fadd float %56, 1.000000e+00, !dbg !38 + %157 = fadd float %57, 1.000000e+00, !dbg !38 + %158 = fadd float %58, 1.000000e+00, !dbg !38 + %159 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %135, float %143) #6, !dbg !39 + %160 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %136, float %144) #6, !dbg !39 + %161 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %137, float %145) #6, !dbg !39 + %162 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %138, float %146) #6, !dbg !39 + %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %139, float %147) #6, !dbg !39 + %164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %140, float %148) #6, !dbg !39 + %165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %149) #6, !dbg !39 + %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %142, float %150) #6, !dbg !39 + %167 = fadd float %67, %159, !dbg !40 + %168 = fadd float %68, %160, !dbg !40 + %169 = fadd float %69, %161, !dbg !40 + %170 = fadd float %70, %162, !dbg !40 + %171 = fadd float %71, %163, !dbg !40 + %172 = fadd float %72, %164, !dbg !40 + %173 = fadd float %73, %165, !dbg !40 + %174 = fadd float %74, %166, !dbg !40 + %175 = fsub float %127, %167, !dbg !41 + %176 = fsub float %128, %168, !dbg !41 + %177 = fsub float %129, %169, !dbg !41 + %178 = fsub float %130, %170, !dbg !41 + %179 = fsub float %131, %171, !dbg !41 + %180 = fsub float %132, %172, !dbg !41 + %181 = fsub float %133, %173, !dbg !41 + %182 = fsub float %134, %174, !dbg !41 + %183 = fmul float %135, %175, !dbg !42 + %184 = fmul float %136, %176, !dbg !42 + %185 = fmul float %137, %177, !dbg !42 + %186 = fmul float %138, %178, !dbg !42 + %187 = fmul float %139, %179, !dbg !42 + %188 = fmul float %140, %180, !dbg !42 + %189 = fmul float %141, %181, !dbg !42 + %190 = fmul float %142, %182, !dbg !42 + %191 = fadd float %59, %183, !dbg !43 + %192 = fadd float %60, %184, !dbg !43 + %193 = fadd float %61, %185, !dbg !43 + %194 = fadd float %62, %186, !dbg !43 + %195 = fadd float %63, %187, !dbg !43 + %196 = fadd float %64, %188, !dbg !43 + %197 = fadd float %65, %189, !dbg !43 + %198 = fadd float %66, %190, !dbg !43 + br i1 %75, label %42, label %199, !dbg !24 + +199: ; preds = %104 + %200 = and i32 %8, 127, !dbg !11 + %201 = and i32 %8, 128, !dbg !24 + %.not = icmp eq i32 %201, 0, !dbg !24 + %202 = select i1 %.not, i32 0, i32 136, !dbg !24 + %203 = add nuw nsw i32 %202, %200, !dbg !24 + %204 = zext nneg i32 %203 to i64, !dbg !24 + %205 = getelementptr float, ptr addrspace(3) @global_smem, i64 %204, !dbg !24 + %206 = insertelement <1 x float> undef, float %151, i64 
0, !dbg !24 + store <1 x float> %206, ptr addrspace(3) %205, align 4, !dbg !24 + %207 = add nuw nsw i32 %200, 272, !dbg !24 + %208 = add nuw nsw i32 %207, %202, !dbg !24 + %209 = zext nneg i32 %208 to i64, !dbg !24 + %210 = getelementptr float, ptr addrspace(3) @global_smem, i64 %209, !dbg !24 + %211 = insertelement <1 x float> undef, float %152, i64 0, !dbg !24 + store <1 x float> %211, ptr addrspace(3) %210, align 4, !dbg !24 + %212 = add nuw nsw i32 %200, 544, !dbg !24 + %213 = add nuw nsw i32 %212, %202, !dbg !24 + %214 = zext nneg i32 %213 to i64, !dbg !24 + %215 = getelementptr float, ptr addrspace(3) @global_smem, i64 %214, !dbg !24 + %216 = insertelement <1 x float> undef, float %153, i64 0, !dbg !24 + store <1 x float> %216, ptr addrspace(3) %215, align 4, !dbg !24 + %217 = add nuw nsw i32 %200, 816, !dbg !24 + %218 = add nuw nsw i32 %217, %202, !dbg !24 + %219 = zext nneg i32 %218 to i64, !dbg !24 + %220 = getelementptr float, ptr addrspace(3) @global_smem, i64 %219, !dbg !24 + %221 = insertelement <1 x float> undef, float %154, i64 0, !dbg !24 + store <1 x float> %221, ptr addrspace(3) %220, align 4, !dbg !24 + %222 = add nuw nsw i32 %200, 1088, !dbg !24 + %223 = add nuw nsw i32 %222, %202, !dbg !24 + %224 = zext nneg i32 %223 to i64, !dbg !24 + %225 = getelementptr float, ptr addrspace(3) @global_smem, i64 %224, !dbg !24 + %226 = insertelement <1 x float> undef, float %155, i64 0, !dbg !24 + store <1 x float> %226, ptr addrspace(3) %225, align 4, !dbg !24 + %227 = add nuw nsw i32 %200, 1360, !dbg !24 + %228 = add nuw nsw i32 %227, %202, !dbg !24 + %229 = zext nneg i32 %228 to i64, !dbg !24 + %230 = getelementptr float, ptr addrspace(3) @global_smem, i64 %229, !dbg !24 + %231 = insertelement <1 x float> undef, float %156, i64 0, !dbg !24 + store <1 x float> %231, ptr addrspace(3) %230, align 4, !dbg !24 + %232 = add nuw nsw i32 %200, 1632, !dbg !24 + %233 = add nuw nsw i32 %232, %202, !dbg !24 + %234 = zext nneg i32 %233 to i64, !dbg !24 + %235 = getelementptr float, ptr addrspace(3) @global_smem, i64 %234, !dbg !24 + %236 = insertelement <1 x float> undef, float %157, i64 0, !dbg !24 + store <1 x float> %236, ptr addrspace(3) %235, align 4, !dbg !24 + %237 = add nuw nsw i32 %200, 1904, !dbg !24 + %238 = add nuw nsw i32 %237, %202, !dbg !24 + %239 = zext nneg i32 %238 to i64, !dbg !24 + %240 = getelementptr float, ptr addrspace(3) @global_smem, i64 %239, !dbg !24 + %241 = insertelement <1 x float> undef, float %158, i64 0, !dbg !24 + store <1 x float> %241, ptr addrspace(3) %240, align 4, !dbg !24 + tail call void @llvm.nvvm.barrier0(), !dbg !24 + %242 = mul nuw nsw i32 %10, 136, !dbg !24 + %243 = add nuw nsw i32 %242, %12, !dbg !24 + %244 = zext nneg i32 %243 to i64, !dbg !24 + %245 = getelementptr float, ptr addrspace(3) @global_smem, i64 %244, !dbg !24 + %246 = load float, ptr addrspace(3) %245, align 32, !dbg !24 + %247 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 1, !dbg !24 + %248 = load float, ptr addrspace(3) %247, align 4, !dbg !24 + %249 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 2, !dbg !24 + %250 = load float, ptr addrspace(3) %249, align 8, !dbg !24 + %251 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 3, !dbg !24 + %252 = load float, ptr addrspace(3) %251, align 4, !dbg !24 + %253 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 4, !dbg !24 + %254 = load float, ptr addrspace(3) %253, align 16, !dbg !24 + %255 = getelementptr inbounds <8 x float>, ptr addrspace(3) 
%245, i64 0, i64 5, !dbg !24 + %256 = load float, ptr addrspace(3) %255, align 4, !dbg !24 + %257 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 6, !dbg !24 + %258 = load float, ptr addrspace(3) %257, align 8, !dbg !24 + %259 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 7, !dbg !24 + %260 = load float, ptr addrspace(3) %259, align 4, !dbg !24 + %261 = fsub float %168, %167, !dbg !44 + %262 = fadd float %246, %248, !dbg !48 + %263 = fcmp oeq float %262, 0.000000e+00, !dbg !49 + %264 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %262) #6, !dbg !50 + %265 = select i1 %263, float 0.000000e+00, float %264, !dbg !51 + %266 = fmul float %261, %265, !dbg !52 + %267 = fadd float %167, %266, !dbg !53 + %268 = fadd float %191, %192, !dbg !54 + %269 = fmul float %261, %261, !dbg !55 + %270 = fmul float %269, %246, !dbg !56 + %271 = fmul float %270, %265, !dbg !57 + %272 = fadd float %268, %271, !dbg !58 + %273 = fsub float %169, %267, !dbg !44 + %274 = fadd float %250, %262, !dbg !48 + %275 = fcmp oeq float %274, 0.000000e+00, !dbg !49 + %276 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %274) #6, !dbg !50 + %277 = select i1 %275, float 0.000000e+00, float %276, !dbg !51 + %278 = fmul float %277, %273, !dbg !52 + %279 = fadd float %267, %278, !dbg !53 + %280 = fadd float %193, %272, !dbg !54 + %281 = fmul float %273, %273, !dbg !55 + %282 = fmul float %262, %281, !dbg !56 + %283 = fmul float %277, %282, !dbg !57 + %284 = fadd float %280, %283, !dbg !58 + %285 = fsub float %170, %279, !dbg !44 + %286 = fadd float %252, %274, !dbg !48 + %287 = fcmp oeq float %286, 0.000000e+00, !dbg !49 + %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %286) #6, !dbg !50 + %289 = select i1 %287, float 0.000000e+00, float %288, !dbg !51 + %290 = fmul float %289, %285, !dbg !52 + %291 = fadd float %279, %290, !dbg !53 + %292 = fadd float %194, %284, !dbg !54 + %293 = fmul float %285, %285, !dbg !55 + %294 = fmul float %274, %293, !dbg !56 + %295 = fmul float %289, %294, !dbg !57 + %296 = fadd float %292, %295, !dbg !58 + %297 = fsub float %171, %291, !dbg !44 + %298 = fadd float %254, %286, !dbg !48 + %299 = fcmp oeq float %298, 0.000000e+00, !dbg !49 + %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %254, float %298) #6, !dbg !50 + %301 = select i1 %299, float 0.000000e+00, float %300, !dbg !51 + %302 = fmul float %301, %297, !dbg !52 + %303 = fadd float %291, %302, !dbg !53 + %304 = fadd float %195, %296, !dbg !54 + %305 = fmul float %297, %297, !dbg !55 + %306 = fmul float %286, %305, !dbg !56 + %307 = fmul float %301, %306, !dbg !57 + %308 = fadd float %304, %307, !dbg !58 + %309 = fsub float %172, %303, !dbg !44 + %310 = fadd float %256, %298, !dbg !48 + %311 = fcmp oeq float %310, 0.000000e+00, !dbg !49 + %312 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %256, float %310) #6, !dbg !50 + %313 = select i1 %311, float 0.000000e+00, float %312, !dbg !51 + %314 = fmul float %313, %309, !dbg !52 + %315 = fadd float %303, %314, !dbg !53 + %316 = fadd float %196, %308, !dbg !54 + %317 = fmul float %309, %309, !dbg !55 + %318 = fmul float %298, %317, !dbg !56 + %319 = fmul float %313, %318, !dbg !57 + %320 = fadd float %316, %319, !dbg !58 + %321 = fsub float %173, %315, !dbg !44 + %322 = fadd float %258, %310, !dbg !48 + %323 = fcmp oeq float %322, 0.000000e+00, !dbg !49 + %324 = tail call float asm "div.full.f32 $0, $1, $2;", 
"=r,r,r"(float %258, float %322) #6, !dbg !50 + %325 = select i1 %323, float 0.000000e+00, float %324, !dbg !51 + %326 = fmul float %325, %321, !dbg !52 + %327 = fadd float %315, %326, !dbg !53 + %328 = fadd float %197, %320, !dbg !54 + %329 = fmul float %321, %321, !dbg !55 + %330 = fmul float %310, %329, !dbg !56 + %331 = fmul float %325, %330, !dbg !57 + %332 = fadd float %328, %331, !dbg !58 + %333 = fsub float %174, %327, !dbg !44 + %334 = fadd float %260, %322, !dbg !48 + %335 = fcmp oeq float %334, 0.000000e+00, !dbg !49 + %336 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %260, float %334) #6, !dbg !50 + %337 = select i1 %335, float 0.000000e+00, float %336, !dbg !51 + %338 = fmul float %337, %333, !dbg !52 + %339 = fadd float %327, %338, !dbg !53 + %340 = fadd float %198, %332, !dbg !54 + %341 = fmul float %333, %333, !dbg !55 + %342 = fmul float %322, %341, !dbg !56 + %343 = fmul float %337, %342, !dbg !57 + %344 = fadd float %340, %343, !dbg !58 + %345 = bitcast float %339 to i32, !dbg !59 + %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 8, i32 31), !dbg !59 + %347 = bitcast i32 %346 to float, !dbg !59 + %348 = bitcast float %344 to i32, !dbg !59 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 8, i32 31), !dbg !59 + %350 = bitcast i32 %349 to float, !dbg !59 + %351 = bitcast float %334 to i32, !dbg !59 + %352 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %351, i32 8, i32 31), !dbg !59 + %353 = bitcast i32 %352 to float, !dbg !59 + %354 = fsub float %347, %339, !dbg !44 + %355 = fadd float %334, %353, !dbg !48 + %356 = fcmp oeq float %355, 0.000000e+00, !dbg !49 + %357 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %353, float %355) #6, !dbg !50 + %358 = select i1 %356, float 0.000000e+00, float %357, !dbg !51 + %359 = fmul float %358, %354, !dbg !52 + %360 = fadd float %339, %359, !dbg !53 + %361 = fadd float %344, %350, !dbg !54 + %362 = fmul float %354, %354, !dbg !55 + %363 = fmul float %334, %362, !dbg !56 + %364 = fmul float %358, %363, !dbg !57 + %365 = fadd float %361, %364, !dbg !58 + %366 = bitcast float %360 to i32, !dbg !59 + %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 4, i32 31), !dbg !59 + %368 = bitcast i32 %367 to float, !dbg !59 + %369 = bitcast float %365 to i32, !dbg !59 + %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 4, i32 31), !dbg !59 + %371 = bitcast i32 %370 to float, !dbg !59 + %372 = bitcast float %355 to i32, !dbg !59 + %373 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %372, i32 4, i32 31), !dbg !59 + %374 = bitcast i32 %373 to float, !dbg !59 + %375 = fsub float %368, %360, !dbg !44 + %376 = fadd float %355, %374, !dbg !48 + %377 = fcmp oeq float %376, 0.000000e+00, !dbg !49 + %378 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %374, float %376) #6, !dbg !50 + %379 = select i1 %377, float 0.000000e+00, float %378, !dbg !51 + %380 = fmul float %379, %375, !dbg !52 + %381 = fadd float %360, %380, !dbg !53 + %382 = fadd float %365, %371, !dbg !54 + %383 = fmul float %375, %375, !dbg !55 + %384 = fmul float %355, %383, !dbg !56 + %385 = fmul float %379, %384, !dbg !57 + %386 = fadd float %382, %385, !dbg !58 + %387 = bitcast float %381 to i32, !dbg !59 + %388 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 2, i32 31), !dbg !59 + %389 = bitcast i32 %388 to float, !dbg !59 + %390 = bitcast float %386 to i32, !dbg !59 + %391 = tail call i32 
@llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %390, i32 2, i32 31), !dbg !59 + %392 = bitcast i32 %391 to float, !dbg !59 + %393 = bitcast float %376 to i32, !dbg !59 + %394 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %393, i32 2, i32 31), !dbg !59 + %395 = bitcast i32 %394 to float, !dbg !59 + %396 = fsub float %389, %381, !dbg !44 + %397 = fadd float %376, %395, !dbg !48 + %398 = fcmp oeq float %397, 0.000000e+00, !dbg !49 + %399 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %395, float %397) #6, !dbg !50 + %400 = select i1 %398, float 0.000000e+00, float %399, !dbg !51 + %401 = fmul float %400, %396, !dbg !52 + %402 = fadd float %381, %401, !dbg !53 + %403 = fadd float %386, %392, !dbg !54 + %404 = fmul float %396, %396, !dbg !55 + %405 = fmul float %376, %404, !dbg !56 + %406 = fmul float %400, %405, !dbg !57 + %407 = fadd float %403, %406, !dbg !58 + %408 = bitcast float %402 to i32, !dbg !59 + %409 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %408, i32 1, i32 31), !dbg !59 + %410 = bitcast i32 %409 to float, !dbg !59 + %411 = bitcast float %407 to i32, !dbg !59 + %412 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %411, i32 1, i32 31), !dbg !59 + %413 = bitcast i32 %412 to float, !dbg !59 + %414 = bitcast float %397 to i32, !dbg !59 + %415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !59 + %416 = bitcast i32 %415 to float, !dbg !59 + %417 = fsub float %410, %402, !dbg !44 + %418 = fadd float %397, %416, !dbg !48 + %419 = fcmp oeq float %418, 0.000000e+00, !dbg !49 + %420 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %418) #6, !dbg !50 + %421 = select i1 %419, float 0.000000e+00, float %420, !dbg !51 + %422 = fmul float %421, %417, !dbg !52 + %423 = fadd float %402, %422, !dbg !53 + %424 = fadd float %407, %413, !dbg !54 + %425 = fmul float %417, %417, !dbg !55 + %426 = fmul float %397, %425, !dbg !56 + %427 = fmul float %421, %426, !dbg !57 + %428 = fadd float %424, %427, !dbg !58 + %429 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61 + %430 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61 + %431 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61 + %432 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61 + %433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61 + %434 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61 + %435 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61 + %436 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61 + %437 = fadd float %429, 0x3EE4F8B580000000, !dbg !62 + %438 = shl i32 %16, 8, !dbg !63 + br label %439, !dbg !64 + +439: ; preds = %199, %__nv_rsqrtf.exit + %440 = phi i1 [ true, %199 ], [ false, %__nv_rsqrtf.exit ] + %441 = phi i32 [ 0, %199 ], [ 128, %__nv_rsqrtf.exit ] + %442 = or i32 %441, %12, !dbg !65 + %443 = or i32 %441, %13, !dbg !65 + %444 = or i32 %442, %32, !dbg !66 + %445 = or i32 %443, %32, !dbg !66 + %446 = sext i32 %444 to i64, !dbg !67 + %447 = getelementptr float, ptr addrspace(1) %2, i64 %446, !dbg !67 + %448 = sext i32 %445 to i64, !dbg !67 + %449 = getelementptr float, ptr addrspace(1) %2, 
i64 %448, !dbg !67 + %450 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %447, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %451 = extractvalue { i32, i32, i32, i32 } %450, 0, !dbg !68 + %452 = extractvalue { i32, i32, i32, i32 } %450, 1, !dbg !68 + %453 = extractvalue { i32, i32, i32, i32 } %450, 2, !dbg !68 + %454 = extractvalue { i32, i32, i32, i32 } %450, 3, !dbg !68 + %455 = bitcast i32 %451 to float, !dbg !68 + %456 = bitcast i32 %452 to float, !dbg !68 + %457 = bitcast i32 %453 to float, !dbg !68 + %458 = bitcast i32 %454 to float, !dbg !68 + %459 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %449, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68 + %460 = extractvalue { i32, i32, i32, i32 } %459, 0, !dbg !68 + %461 = extractvalue { i32, i32, i32, i32 } %459, 1, !dbg !68 + %462 = extractvalue { i32, i32, i32, i32 } %459, 2, !dbg !68 + %463 = extractvalue { i32, i32, i32, i32 } %459, 3, !dbg !68 + %464 = bitcast i32 %460 to float, !dbg !68 + %465 = bitcast i32 %461 to float, !dbg !68 + %466 = bitcast i32 %462 to float, !dbg !68 + %467 = bitcast i32 %463 to float, !dbg !68 + %468 = zext nneg i32 %442 to i64, !dbg !69 + %469 = getelementptr float, ptr addrspace(1) %3, i64 %468, !dbg !69 + %470 = zext nneg i32 %443 to i64, !dbg !69 + %471 = getelementptr float, ptr addrspace(1) %3, i64 %470, !dbg !69 + %472 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %469, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70 + %473 = extractvalue { i32, i32, i32, i32 } %472, 0, !dbg !70 + %474 = extractvalue { i32, i32, i32, i32 } %472, 1, !dbg !70 + %475 = extractvalue { i32, i32, i32, i32 } %472, 2, !dbg !70 + %476 = extractvalue { i32, i32, i32, i32 } %472, 3, !dbg !70 + %477 = bitcast i32 %473 to float, !dbg !70 + %478 = bitcast i32 %474 to float, !dbg !70 + %479 = bitcast i32 %475 to float, !dbg !70 + %480 = bitcast i32 %476 to float, !dbg !70 + %481 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %471, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70 + %482 = extractvalue { i32, i32, i32, i32 } %481, 0, !dbg !70 + %483 = extractvalue { i32, i32, i32, i32 } %481, 1, !dbg !70 + %484 = 
extractvalue { i32, i32, i32, i32 } %481, 2, !dbg !70 + %485 = extractvalue { i32, i32, i32, i32 } %481, 3, !dbg !70 + %486 = bitcast i32 %482 to float, !dbg !70 + %487 = bitcast i32 %483 to float, !dbg !70 + %488 = bitcast i32 %484 to float, !dbg !70 + %489 = bitcast i32 %485 to float, !dbg !70 + br i1 %37, label %490, label %491, !dbg !71 + +490: ; preds = %439 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !71 + br label %491, !dbg !71 + +491: ; preds = %490, %439 + %492 = getelementptr float, ptr addrspace(1) %41, i64 %468, !dbg !72 + %493 = getelementptr float, ptr addrspace(1) %41, i64 %470, !dbg !72 + %494 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %492, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %495 = extractvalue { i32, i32, i32, i32 } %494, 0, !dbg !73 + %496 = extractvalue { i32, i32, i32, i32 } %494, 1, !dbg !73 + %497 = extractvalue { i32, i32, i32, i32 } %494, 2, !dbg !73 + %498 = extractvalue { i32, i32, i32, i32 } %494, 3, !dbg !73 + %499 = bitcast i32 %495 to float, !dbg !73 + %500 = bitcast i32 %496 to float, !dbg !73 + %501 = bitcast i32 %497 to float, !dbg !73 + %502 = bitcast i32 %498 to float, !dbg !73 + %503 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %493, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73 + %504 = extractvalue { i32, i32, i32, i32 } %503, 0, !dbg !73 + %505 = extractvalue { i32, i32, i32, i32 } %503, 1, !dbg !73 + %506 = extractvalue { i32, i32, i32, i32 } %503, 2, !dbg !73 + %507 = extractvalue { i32, i32, i32, i32 } %503, 3, !dbg !73 + %508 = bitcast i32 %504 to float, !dbg !73 + %509 = bitcast i32 %505 to float, !dbg !73 + %510 = bitcast i32 %506 to float, !dbg !73 + %511 = bitcast i32 %507 to float, !dbg !73 + %512 = fadd float %455, %499, !dbg !74 + %513 = fadd float %456, %500, !dbg !74 + %514 = fadd float %457, %501, !dbg !74 + %515 = fadd float %458, %502, !dbg !74 + %516 = fadd float %464, %508, !dbg !74 + %517 = fadd float %465, %509, !dbg !74 + %518 = fadd float %466, %510, !dbg !74 + %519 = fadd float %467, %511, !dbg !74 + %520 = fsub float %512, %423, !dbg !75 + %521 = fsub float %513, %423, !dbg !75 + %522 = fsub float %514, %423, !dbg !75 + %523 = fsub float %515, %423, !dbg !75 + %524 = fsub float %516, %423, !dbg !75 + %525 = fsub float %517, %423, !dbg !75 + %526 = fsub float %518, %423, !dbg !75 + %527 = fsub float %519, %423, !dbg !75 + %528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %.not.i = icmp eq i32 %528, 0, !dbg !76 + br i1 %.not.i, label %531, label %529, !dbg !76 + +529: ; preds = %491 + %530 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %437), !dbg !76 + br label %__nv_rsqrtf.exit, !dbg !76 + +531: ; preds = %491 + %532 = tail call float @llvm.nvvm.rsqrt.approx.f(float %437), !dbg 
!76 + br label %__nv_rsqrtf.exit, !dbg !76 + +__nv_rsqrtf.exit: ; preds = %529, %531 + %.0.i = phi float [ %530, %529 ], [ %532, %531 ], !dbg !76 + %533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76 + %540 = fmul float %520, %.0.i, !dbg !77 + %541 = fmul float %521, %.0.i, !dbg !77 + %542 = fmul float %522, %.0.i, !dbg !77 + %543 = fmul float %523, %.0.i, !dbg !77 + %544 = fmul float %524, %.0.i, !dbg !77 + %545 = fmul float %525, %.0.i, !dbg !77 + %546 = fmul float %526, %.0.i, !dbg !77 + %547 = fmul float %527, %.0.i, !dbg !77 + %548 = fmul float %540, %477, !dbg !78 + %549 = fmul float %541, %478, !dbg !78 + %550 = fmul float %542, %479, !dbg !78 + %551 = fmul float %543, %480, !dbg !78 + %552 = fmul float %544, %486, !dbg !78 + %553 = fmul float %545, %487, !dbg !78 + %554 = fmul float %546, %488, !dbg !78 + %555 = fmul float %547, %489, !dbg !78 + %556 = or i32 %442, %438, !dbg !79 + %557 = sext i32 %556 to i64, !dbg !80 + %558 = getelementptr i16, ptr addrspace(1) %4, i64 %557, !dbg !80 + %559 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %548) #6, !dbg !81 + %560 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %549) #6, !dbg !81 + %561 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %550) #6, !dbg !81 + %562 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %551) #6, !dbg !81 + %563 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %552) #6, !dbg !81 + %564 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %553) #6, !dbg !81 + %565 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %554) #6, !dbg !81 + %566 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %555) #6, !dbg !81 + %567 = insertelement <2 x i16> undef, i16 %559, i64 0, !dbg !81 + %568 = insertelement <2 x i16> %567, i16 %560, i64 1, !dbg !81 + %569 = bitcast <2 x i16> %568 to i32, !dbg !81 + %570 = insertelement <2 x i16> undef, i16 %561, i64 0, !dbg !81 + %571 = insertelement <2 x i16> %570, i16 %562, i64 1, !dbg !81 + %572 = bitcast <2 x i16> %571 to i32, !dbg !81 + %573 = insertelement <2 x i16> undef, i16 %563, i64 0, !dbg !81 + %574 = insertelement <2 x i16> %573, i16 %564, i64 1, !dbg !81 + %575 = bitcast <2 x i16> %574 to i32, !dbg !81 + %576 = insertelement <2 x i16> undef, i16 %565, i64 0, !dbg !81 + %577 = insertelement <2 x i16> %576, i16 %566, i64 1, !dbg !81 + %578 = bitcast <2 x i16> %577 to i32, !dbg !81 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %569, i32 %572, i32 %575, i32 %578, ptr addrspace(1) %558, i1 true) #6, !dbg !81 + br i1 %440, label %439, label %579, !dbg !64 + +579: ; preds = %__nv_rsqrtf.exit + ret void, !dbg !82 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, 
i32) #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh") +!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 22, column: 44, scope: !7) +!11 = !DILocation(line: 24, column: 33, scope: !7) +!12 = !DILocation(line: 21, column: 28, scope: !7) +!13 = !DILocation(line: 21, column: 33, scope: !7) +!14 = !DILocation(line: 22, column: 23, scope: !7) +!15 = !DILocation(line: 26, column: 30, scope: !7) +!16 = !DILocation(line: 26, column: 35, scope: !7) +!17 = !DILocation(line: 27, column: 18, scope: !7) +!18 = !DILocation(line: 35, column: 44, scope: !7) +!19 = !DILocation(line: 36, column: 22, scope: !7) +!20 = !DILocation(line: 37, column: 22, scope: !7) +!21 = !DILocation(line: 38, column: 36, scope: !7) +!22 = !DILocation(line: 39, column: 40, scope: !7) +!23 = !DILocation(line: 40, column: 44, scope: !7) +!24 = !DILocation(line: 31, column: 36, scope: !7) +!25 = !DILocation(line: 32, column: 27, scope: !7) +!26 = !DILocation(line: 35, column: 40, scope: !7) +!27 = 
!DILocation(line: 35, column: 34, scope: !7) +!28 = !DILocation(line: 35, column: 50, scope: !7) +!29 = !DILocation(line: 39, column: 55, scope: !7) +!30 = !DILocation(line: 40, column: 40, scope: !7) +!31 = !DILocation(line: 40, column: 34, scope: !7) +!32 = !DILocation(line: 40, column: 52, scope: !7) +!33 = !DILocation(line: 41, column: 22, scope: !7) +!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37) +!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) +!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!37 = !DILocation(line: 44, column: 38, scope: !35) +!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37) +!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37) +!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37) +!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37) +!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37) +!43 = !DILocation(line: 47, column: 48, scope: !7) +!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46) +!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0) +!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47) +!47 = !DILocation(line: 50, column: 41, scope: !45) +!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46) +!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46) +!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46) +!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46) +!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46) +!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46) +!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46) +!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46) +!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46) +!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46) +!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46) +!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60) +!60 = !DILocation(line: 50, column: 41, scope: !35) +!61 = !DILocation(line: 69, column: 23, scope: !7) +!62 = !DILocation(line: 71, column: 24, scope: !7) +!63 = !DILocation(line: 76, column: 39, scope: !7) +!64 = !DILocation(line: 55, column: 36, scope: !7) +!65 = !DILocation(line: 56, column: 27, scope: !7) +!66 = !DILocation(line: 59, column: 41, scope: !7) +!67 = !DILocation(line: 59, column: 35, scope: !7) +!68 = !DILocation(line: 59, column: 51, scope: !7) +!69 = !DILocation(line: 60, column: 35, scope: !7) +!70 = !DILocation(line: 60, column: 40, scope: !7) +!71 = !DILocation(line: 64, column: 57, scope: !7) +!72 = !DILocation(line: 65, column: 35, scope: !7) +!73 = !DILocation(line: 65, column: 54, scope: !7) +!74 = !DILocation(line: 66, column: 24, scope: !7) +!75 = !DILocation(line: 67, column: 24, scope: !7) +!76 = !DILocation(line: 72, column: 30, scope: !7) +!77 = !DILocation(line: 73, column: 24, scope: !7) +!78 = !DILocation(line: 74, column: 24, scope: !7) +!79 = !DILocation(line: 76, column: 35, scope: !7) +!80 = !DILocation(line: 76, column: 29, scope: !7) +!81 = !DILocation(line: 76, column: 52, scope: !7) +!82 = !DILocation(line: 55, column: 4, scope: !7) diff --git a/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ptx b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ptx new file mode 100644 index 
0000000000000000000000000000000000000000..fa0ee683e7de385f10c4577972bfdbc6b1832729 --- /dev/null +++ b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ptx @@ -0,0 +1,1278 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_1[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_0[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<84>; + .reg .b16 %rs<9>; + .reg .b32 %r<236>; + .reg .f32 %f<324>; + .reg .b64 %rd<89>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4]; + ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2]; + ld.param.u64 %rd29, [triton__0d1d2d3d4d5de6de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd30, [triton__0d1d2d3d4d5de6de_param_1]; + bfe.u32 %r2, %r1, 4, 4; + and.b32 %r12, %r1, 15; + .loc 1 24 33 + shl.b32 %r3, %r12, 3; + .loc 1 21 28 + mov.u32 %r10, %ctaid.x; + .loc 1 21 33 + shl.b32 %r13, %r10, 4; + .loc 1 22 23 + or.b32 %r4, %r13, %r2; + or.b32 %r14, %r13, %r12; + .loc 1 26 30 + mul.wide.s32 %rd31, %r4, 8; + add.s64 %rd12, %rd29, %rd31; + mul.wide.s32 %rd32, %r14, 8; + add.s64 %rd28, %rd29, %rd32; + mov.pred %p3, -1; + .loc 1 26 35 + mov.u64 %rd11, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd11 }, [ %rd12 + 0 ]; + mov.u64 %rd13, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd12 + 0 ]; + mov.u64 %rd15, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd12 + 0 ]; + mov.u64 %rd17, 0x0; + @%p3 
ld.global.L1::evict_last.b64 { %rd17 }, [ %rd12 + 0 ]; + mov.u64 %rd19, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd12 + 0 ]; + mov.u64 %rd21, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd12 + 0 ]; + mov.u64 %rd23, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd12 + 0 ]; + mov.u64 %rd25, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd12 + 0 ]; + mov.u64 %rd27, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd28 + 0 ]; + .loc 1 27 18 + bfe.s32 %r15, %r10, 27, 1; + shr.u32 %r16, %r15, 23; + add.s32 %r17, %r4, %r16; + and.b32 %r18, %r17, 16776704; + sub.s32 %r19, %r4, %r18; + .loc 1 35 44 + shl.b32 %r5, %r19, 8; + .loc 1 36 22 + add.s64 %rd33, %rd27, 50257; + .loc 1 37 22 + setp.lt.s64 %p13, %rd11, 0; + setp.lt.s64 %p14, %rd27, 0; + .loc 1 38 36 + selp.b64 %rd1, %rd33, %rd27, %p14; + .loc 1 40 44 + shl.b64 %rd34, %rd11, 8; + add.s64 %rd35, %rd34, 12865792; + selp.b64 %rd36, %rd35, %rd34, %p13; + shl.b64 %rd37, %rd36, 2; + add.s64 %rd2, %rd30, %rd37; + mov.b32 %r24, 0; + mov.f32 %f292, 0f00000000; + mov.f32 %f293, %f292; + mov.f32 %f294, %f292; + mov.f32 %f295, %f292; + mov.f32 %f296, %f292; + mov.f32 %f297, %f292; + mov.f32 %f298, %f292; + mov.f32 %f299, %f292; + mov.f32 %f300, %f292; + mov.f32 %f301, %f292; + mov.f32 %f302, %f292; + mov.f32 %f303, %f292; + mov.f32 %f304, %f292; + mov.f32 %f305, %f292; + mov.f32 %f306, %f292; + mov.f32 %f307, %f292; + mov.f32 %f308, %f292; + mov.f32 %f309, %f292; + mov.f32 %f310, %f292; + mov.f32 %f311, %f292; + mov.f32 %f312, %f292; + mov.f32 %f313, %f292; + mov.f32 %f314, %f292; + mov.f32 %f315, %f292; + mov.f32 %f316, %f292; + mov.f32 %f317, %f292; + mov.f32 %f318, %f292; + mov.f32 %f319, %f292; + mov.f32 %f320, %f292; + mov.f32 %f321, %f292; + mov.f32 %f322, %f292; + mov.f32 %f323, %f292; + mov.pred %p82, %p3; + mov.u32 %r234, %r24; + bra.uni $L__BB0_1; +$L__BB0_3: + .loc 1 0 0 + mov.b32 %f33, %r20; + mov.b32 %f34, %r21; + mov.b32 %f35, %r22; + mov.b32 %f36, %r23; + mov.b32 %f37, %r28; + mov.b32 %f38, %r29; + mov.b32 %f39, %r30; + mov.b32 %f40, %r31; + .loc 1 40 34 + mul.wide.u32 %rd55, %r7, 4; + add.s64 %rd53, %rd2, %rd55; + .loc 1 40 40 + cvt.u64.u32 %rd56, %r234; + add.s64 %rd57, %rd56, %rd4; + .loc 1 40 34 + shl.b64 %rd58, %rd57, 2; + add.s64 %rd59, %rd2, %rd58; + add.s64 %rd54, %rd59, 16; + mov.b32 %r167, 0; + mov.pred %p49, -1; + .loc 1 40 52 + mov.u32 %r38, 0x0; + mov.u32 %r39, 0x0; + mov.u32 %r40, 0x0; + mov.u32 %r41, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd53 + 0 ]; + @!%p49 mov.u32 %r38, %r167; + @!%p49 mov.u32 %r39, %r167; + @!%p49 mov.u32 %r40, %r167; + @!%p49 mov.u32 %r41, %r167; + mov.b32 %f92, %r38; + mov.b32 %f93, %r39; + mov.b32 %f94, %r40; + mov.b32 %f95, %r41; + mov.u32 %r46, 0x0; + mov.u32 %r47, 0x0; + mov.u32 %r48, 0x0; + mov.u32 %r49, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd54 + 0 ]; + @!%p49 mov.u32 %r46, %r167; + @!%p49 mov.u32 %r47, %r167; + @!%p49 mov.u32 %r48, %r167; + @!%p49 mov.u32 %r49, %r167; + mov.b32 %f96, %r46; + mov.b32 %f97, %r47; + mov.b32 %f98, %r48; + mov.b32 %f99, %r49; + .loc 1 41 22 + add.f32 %f100, %f33, %f92; + add.f32 %f101, %f34, %f93; + add.f32 %f102, %f35, %f94; + add.f32 %f103, %f36, %f95; + add.f32 %f104, %f37, %f96; + add.f32 %f105, %f38, %f97; + add.f32 %f106, %f39, %f98; + add.f32 %f107, %f40, %f99; +$L__tmp1: + .loc 2 96 20 + sub.f32 %f108, %f100, %f316; + sub.f32 %f109, %f101, %f317; + sub.f32 %f110, %f102, %f318; + sub.f32 %f111, %f103, %f319; + sub.f32 %f112, %f104, %f320; 
+ sub.f32 %f113, %f105, %f321; + sub.f32 %f114, %f106, %f322; + sub.f32 %f115, %f107, %f323; + .loc 2 97 26 + add.f32 %f292, %f292, 0f3F800000; + add.f32 %f293, %f293, 0f3F800000; + add.f32 %f294, %f294, 0f3F800000; + add.f32 %f295, %f295, 0f3F800000; + add.f32 %f296, %f296, 0f3F800000; + add.f32 %f297, %f297, 0f3F800000; + add.f32 %f298, %f298, 0f3F800000; + add.f32 %f299, %f299, 0f3F800000; + add.f32 %f300, %f300, 0f3F800000; + add.f32 %f301, %f301, 0f3F800000; + add.f32 %f302, %f302, 0f3F800000; + add.f32 %f303, %f303, 0f3F800000; + add.f32 %f304, %f304, 0f3F800000; + add.f32 %f305, %f305, 0f3F800000; + add.f32 %f306, %f306, 0f3F800000; + add.f32 %f307, %f307, 0f3F800000; + .loc 2 98 30 + mov.b32 %r55, %f108; + mov.b32 %r56, %f292; + div.full.f32 %r54, %r55, %r56; + mov.b32 %f116, %r54; + mov.b32 %r58, %f109; + mov.b32 %r59, %f293; + div.full.f32 %r57, %r58, %r59; + mov.b32 %f117, %r57; + mov.b32 %r61, %f110; + mov.b32 %r62, %f294; + div.full.f32 %r60, %r61, %r62; + mov.b32 %f118, %r60; + mov.b32 %r64, %f111; + mov.b32 %r65, %f295; + div.full.f32 %r63, %r64, %r65; + mov.b32 %f119, %r63; + mov.b32 %r67, %f112; + mov.b32 %r68, %f296; + div.full.f32 %r66, %r67, %r68; + mov.b32 %f120, %r66; + mov.b32 %r70, %f113; + mov.b32 %r71, %f297; + div.full.f32 %r69, %r70, %r71; + mov.b32 %f121, %r69; + mov.b32 %r73, %f114; + mov.b32 %r74, %f298; + div.full.f32 %r72, %r73, %r74; + mov.b32 %f122, %r72; + mov.b32 %r76, %f115; + mov.b32 %r77, %f299; + div.full.f32 %r75, %r76, %r77; + mov.b32 %f123, %r75; + .loc 2 98 22 + add.f32 %f316, %f316, %f116; + add.f32 %f317, %f317, %f117; + add.f32 %f318, %f318, %f118; + add.f32 %f319, %f319, %f119; + add.f32 %f320, %f320, %f120; + add.f32 %f321, %f321, %f121; + add.f32 %f322, %f322, %f122; + add.f32 %f323, %f323, %f123; + .loc 2 101 30 + sub.f32 %f124, %f100, %f316; + sub.f32 %f125, %f101, %f317; + sub.f32 %f126, %f102, %f318; + sub.f32 %f127, %f103, %f319; + sub.f32 %f128, %f104, %f320; + sub.f32 %f129, %f105, %f321; + sub.f32 %f130, %f106, %f322; + sub.f32 %f131, %f107, %f323; +$L__tmp2: + .loc 1 47 48 + fma.rn.f32 %f308, %f108, %f124, %f308; + fma.rn.f32 %f309, %f109, %f125, %f309; + fma.rn.f32 %f310, %f110, %f126, %f310; + fma.rn.f32 %f311, %f111, %f127, %f311; + fma.rn.f32 %f312, %f112, %f128, %f312; + fma.rn.f32 %f313, %f113, %f129, %f313; + fma.rn.f32 %f314, %f114, %f130, %f314; + fma.rn.f32 %f315, %f115, %f131, %f315; + mov.b32 %r234, 128; + mov.pred %p82, 0; + .loc 1 31 36 + @%p1 bra $L__BB0_1; + bra.uni $L__BB0_4; +$L__BB0_1: + .loc 1 0 36 + mov.pred %p1, %p82; + .loc 1 39 40 + setp.lt.u64 %p25, %rd1, 50257; + .loc 1 32 27 + or.b32 %r7, %r234, %r3; + .loc 1 35 40 + or.b32 %r36, %r7, %r5; + .loc 1 35 34 + mul.wide.s32 %rd40, %r36, 4; + add.s64 %rd38, %rd8, %rd40; + cvt.s64.s32 %rd3, %r5; + cvt.s64.s32 %rd41, %r234; + cvt.u64.u32 %rd4, %r3; + add.s64 %rd42, %rd41, %rd4; + add.s64 %rd43, %rd42, %rd3; + shl.b64 %rd44, %rd43, 2; + add.s64 %rd45, %rd8, %rd44; + add.s64 %rd39, %rd45, 16; + .loc 1 35 50 + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + mov.u32 %r22, 0x0; + mov.u32 %r23, 0x0; + @%p3 ld.global.L1::evict_last.v4.b32 { %r20, %r21, %r22, %r23 }, [ %rd38 + 0 ]; + @!%p3 mov.u32 %r20, %r24; + @!%p3 mov.u32 %r21, %r24; + @!%p3 mov.u32 %r22, %r24; + @!%p3 mov.u32 %r23, %r24; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + mov.u32 %r30, 0x0; + mov.u32 %r31, 0x0; + @%p3 ld.global.L1::evict_last.v4.b32 { %r28, %r29, %r30, %r31 }, [ %rd39 + 0 ]; + @!%p3 mov.u32 %r28, %r24; + @!%p3 mov.u32 %r29, %r24; + @!%p3 mov.u32 %r30, %r24; + @!%p3 mov.u32 %r31, %r24; + mov.b32 
%r233, 1892; + mov.u64 %rd88, 1; + .loc 1 39 55 + @%p25 bra $L__BB0_3; + mov.u64 %rd46, assertMessage_0; + cvta.global.u64 %rd47, %rd46; + mov.u64 %rd48, assertFile_0; + cvta.global.u64 %rd49, %rd48; + mov.u64 %rd50, assertFunc_0; + cvta.global.u64 %rd51, %rd50; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd47; + .param .b64 param1; + st.param.b64 [param1+0], %rd49; + .param .b32 param2; + st.param.b32 [param2+0], %r233; + .param .b64 param3; + st.param.b64 [param3+0], %rd51; + .param .b64 param4; + st.param.b64 [param4+0], %rd88; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 0 + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 24 33 + and.b32 %r137, %r1, 127; + .loc 1 31 36 + bfe.s32 %r138, %r1, 7, 1; + and.b32 %r139, %r138, 136; + add.s32 %r140, %r139, %r137; + shl.b32 %r141, %r140, 2; + mov.u32 %r142, global_smem; + add.s32 %r143, %r142, %r141; + st.shared.f32 [%r143], %f300; + st.shared.f32 [%r143+1088], %f301; + st.shared.f32 [%r143+2176], %f302; + st.shared.f32 [%r143+3264], %f303; + st.shared.f32 [%r143+4352], %f304; + st.shared.f32 [%r143+5440], %f305; + st.shared.f32 [%r143+6528], %f306; + st.shared.f32 [%r143+7616], %f307; + bar.sync 0; + mad.lo.s32 %r144, %r2, 136, %r3; + shl.b32 %r145, %r144, 2; + add.s32 %r146, %r142, %r145; + ld.shared.v4.f32 {%f132, %f133, %f134, %f135}, [%r146]; + ld.shared.v4.f32 {%f136, %f137, %f138, %f139}, [%r146+16]; +$L__tmp3: + .loc 2 108 21 + sub.f32 %f140, %f317, %f316; + .loc 2 109 28 + add.f32 %f141, %f132, %f133; + .loc 2 110 39 + setp.eq.f32 %p38, %f141, 0f00000000; + .loc 2 110 60 + mov.b32 %r80, %f133; + mov.b32 %r81, %f141; + div.full.f32 %r79, %r80, %r81; + mov.b32 %f142, %r79; + .loc 2 110 49 + selp.f32 %f143, 0f00000000, %f142, %p38; + .loc 2 112 17 + fma.rn.f32 %f144, %f140, %f143, %f316; + .loc 2 113 15 + add.f32 %f145, %f308, %f309; + .loc 2 113 30 + mul.f32 %f146, %f140, %f140; + .loc 2 113 38 + mul.f32 %f147, %f146, %f132; + .loc 2 113 22 + fma.rn.f32 %f148, %f147, %f143, %f145; + .loc 2 108 21 + sub.f32 %f149, %f318, %f144; + .loc 2 109 28 + add.f32 %f150, %f134, %f141; + .loc 2 110 39 + setp.eq.f32 %p39, %f150, 0f00000000; + .loc 2 110 60 + mov.b32 %r84, %f150; + mov.b32 %r83, %f134; + div.full.f32 %r82, %r83, %r84; + mov.b32 %f151, %r82; + .loc 2 110 49 + selp.f32 %f152, 0f00000000, %f151, %p39; + .loc 2 112 17 + fma.rn.f32 %f153, %f152, %f149, %f144; + .loc 2 113 15 + add.f32 %f154, %f310, %f148; + .loc 2 113 30 + mul.f32 %f155, %f149, %f149; + .loc 2 113 38 + mul.f32 %f156, %f141, %f155; + .loc 2 113 22 + fma.rn.f32 %f157, %f152, %f156, %f154; + .loc 2 108 21 + sub.f32 %f158, %f319, %f153; + .loc 2 109 28 + add.f32 %f159, %f135, %f150; + .loc 2 110 39 + setp.eq.f32 %p40, %f159, 0f00000000; + .loc 2 110 60 + mov.b32 %r87, %f159; + mov.b32 %r86, %f135; + div.full.f32 %r85, %r86, %r87; + mov.b32 %f160, %r85; + .loc 2 110 49 + selp.f32 %f161, 0f00000000, %f160, %p40; + .loc 2 112 17 + fma.rn.f32 %f162, %f161, %f158, %f153; + .loc 2 113 15 + add.f32 %f163, %f311, %f157; + .loc 2 113 30 + mul.f32 %f164, %f158, %f158; + .loc 2 113 38 + mul.f32 %f165, %f150, %f164; + .loc 2 113 22 + fma.rn.f32 %f166, %f161, %f165, %f163; + .loc 2 108 21 + sub.f32 %f167, %f320, %f162; + .loc 2 109 28 + add.f32 %f168, %f136, %f159; + .loc 2 110 39 + setp.eq.f32 %p41, %f168, 0f00000000; + .loc 2 110 60 + mov.b32 %r90, %f168; + mov.b32 %r89, %f136; + div.full.f32 %r88, %r89, %r90; + mov.b32 %f169, %r88; + .loc 2 110 49 + selp.f32 %f170, 0f00000000, %f169, 
%p41; + .loc 2 112 17 + fma.rn.f32 %f171, %f170, %f167, %f162; + .loc 2 113 15 + add.f32 %f172, %f312, %f166; + .loc 2 113 30 + mul.f32 %f173, %f167, %f167; + .loc 2 113 38 + mul.f32 %f174, %f159, %f173; + .loc 2 113 22 + fma.rn.f32 %f175, %f170, %f174, %f172; + .loc 2 108 21 + sub.f32 %f176, %f321, %f171; + .loc 2 109 28 + add.f32 %f177, %f137, %f168; + .loc 2 110 39 + setp.eq.f32 %p42, %f177, 0f00000000; + .loc 2 110 60 + mov.b32 %r93, %f177; + mov.b32 %r92, %f137; + div.full.f32 %r91, %r92, %r93; + mov.b32 %f178, %r91; + .loc 2 110 49 + selp.f32 %f179, 0f00000000, %f178, %p42; + .loc 2 112 17 + fma.rn.f32 %f180, %f179, %f176, %f171; + .loc 2 113 15 + add.f32 %f181, %f313, %f175; + .loc 2 113 30 + mul.f32 %f182, %f176, %f176; + .loc 2 113 38 + mul.f32 %f183, %f168, %f182; + .loc 2 113 22 + fma.rn.f32 %f184, %f179, %f183, %f181; + .loc 2 108 21 + sub.f32 %f185, %f322, %f180; + .loc 2 109 28 + add.f32 %f186, %f138, %f177; + .loc 2 110 39 + setp.eq.f32 %p43, %f186, 0f00000000; + .loc 2 110 60 + mov.b32 %r96, %f186; + mov.b32 %r95, %f138; + div.full.f32 %r94, %r95, %r96; + mov.b32 %f187, %r94; + .loc 2 110 49 + selp.f32 %f188, 0f00000000, %f187, %p43; + .loc 2 112 17 + fma.rn.f32 %f189, %f188, %f185, %f180; + .loc 2 113 15 + add.f32 %f190, %f314, %f184; + .loc 2 113 30 + mul.f32 %f191, %f185, %f185; + .loc 2 113 38 + mul.f32 %f192, %f177, %f191; + .loc 2 113 22 + fma.rn.f32 %f193, %f188, %f192, %f190; + .loc 2 108 21 + sub.f32 %f194, %f323, %f189; + .loc 2 109 28 + add.f32 %f195, %f139, %f186; + .loc 2 110 39 + setp.eq.f32 %p44, %f195, 0f00000000; + .loc 2 110 60 + mov.b32 %r99, %f195; + mov.b32 %r98, %f139; + div.full.f32 %r97, %r98, %r99; + mov.b32 %f196, %r97; + .loc 2 110 49 + selp.f32 %f197, 0f00000000, %f196, %p44; + .loc 2 112 17 + fma.rn.f32 %f198, %f197, %f194, %f189; + .loc 2 113 15 + add.f32 %f199, %f315, %f193; + .loc 2 113 30 + mul.f32 %f200, %f194, %f194; + .loc 2 113 38 + mul.f32 %f201, %f186, %f200; + .loc 2 113 22 + fma.rn.f32 %f202, %f197, %f201, %f199; +$L__tmp4: + .loc 2 120 46 + mov.b32 %r147, %f198; + shfl.sync.bfly.b32 %r148, %r147, 8, 31, -1; + mov.b32 %f203, %r148; + mov.b32 %r149, %f202; + shfl.sync.bfly.b32 %r150, %r149, 8, 31, -1; + mov.b32 %f204, %r150; + shfl.sync.bfly.b32 %r101, %r99, 8, 31, -1; + mov.b32 %f205, %r101; +$L__tmp5: + .loc 2 108 21 + sub.f32 %f206, %f203, %f198; + .loc 2 109 28 + add.f32 %f207, %f195, %f205; + .loc 2 110 39 + setp.eq.f32 %p45, %f207, 0f00000000; + .loc 2 110 60 + mov.b32 %r102, %f207; + div.full.f32 %r100, %r101, %r102; + mov.b32 %f208, %r100; + .loc 2 110 49 + selp.f32 %f209, 0f00000000, %f208, %p45; + .loc 2 112 17 + fma.rn.f32 %f210, %f209, %f206, %f198; + .loc 2 113 15 + add.f32 %f211, %f202, %f204; + .loc 2 113 30 + mul.f32 %f212, %f206, %f206; + .loc 2 113 38 + mul.f32 %f213, %f195, %f212; + .loc 2 113 22 + fma.rn.f32 %f214, %f209, %f213, %f211; +$L__tmp6: + .loc 2 120 46 + mov.b32 %r151, %f210; + shfl.sync.bfly.b32 %r152, %r151, 4, 31, -1; + mov.b32 %f215, %r152; + mov.b32 %r153, %f214; + shfl.sync.bfly.b32 %r154, %r153, 4, 31, -1; + mov.b32 %f216, %r154; + shfl.sync.bfly.b32 %r104, %r102, 4, 31, -1; + mov.b32 %f217, %r104; +$L__tmp7: + .loc 2 108 21 + sub.f32 %f218, %f215, %f210; + .loc 2 109 28 + add.f32 %f219, %f207, %f217; + .loc 2 110 39 + setp.eq.f32 %p46, %f219, 0f00000000; + .loc 2 110 60 + mov.b32 %r105, %f219; + div.full.f32 %r103, %r104, %r105; + mov.b32 %f220, %r103; + .loc 2 110 49 + selp.f32 %f221, 0f00000000, %f220, %p46; + .loc 2 112 17 + fma.rn.f32 %f222, %f221, %f218, %f210; + .loc 2 113 15 + add.f32 
%f223, %f214, %f216; + .loc 2 113 30 + mul.f32 %f224, %f218, %f218; + .loc 2 113 38 + mul.f32 %f225, %f207, %f224; + .loc 2 113 22 + fma.rn.f32 %f226, %f221, %f225, %f223; +$L__tmp8: + .loc 2 120 46 + mov.b32 %r155, %f222; + shfl.sync.bfly.b32 %r156, %r155, 2, 31, -1; + mov.b32 %f227, %r156; + mov.b32 %r157, %f226; + shfl.sync.bfly.b32 %r158, %r157, 2, 31, -1; + mov.b32 %f228, %r158; + shfl.sync.bfly.b32 %r107, %r105, 2, 31, -1; + mov.b32 %f229, %r107; +$L__tmp9: + .loc 2 108 21 + sub.f32 %f230, %f227, %f222; + .loc 2 109 28 + add.f32 %f231, %f219, %f229; + .loc 2 110 39 + setp.eq.f32 %p47, %f231, 0f00000000; + .loc 2 110 60 + mov.b32 %r108, %f231; + div.full.f32 %r106, %r107, %r108; + mov.b32 %f232, %r106; + .loc 2 110 49 + selp.f32 %f233, 0f00000000, %f232, %p47; + .loc 2 112 17 + fma.rn.f32 %f234, %f233, %f230, %f222; + .loc 2 113 15 + add.f32 %f235, %f226, %f228; + .loc 2 113 30 + mul.f32 %f236, %f230, %f230; + .loc 2 113 38 + mul.f32 %f237, %f219, %f236; + .loc 2 113 22 + fma.rn.f32 %f238, %f233, %f237, %f235; +$L__tmp10: + .loc 2 120 46 + mov.b32 %r159, %f234; + shfl.sync.bfly.b32 %r160, %r159, 1, 31, -1; + mov.b32 %f239, %r160; + mov.b32 %r161, %f238; + shfl.sync.bfly.b32 %r162, %r161, 1, 31, -1; + mov.b32 %f240, %r162; + shfl.sync.bfly.b32 %r110, %r108, 1, 31, -1; + mov.b32 %f241, %r110; +$L__tmp11: + .loc 2 108 21 + sub.f32 %f242, %f239, %f234; + .loc 2 109 28 + add.f32 %f243, %f231, %f241; + .loc 2 110 39 + setp.eq.f32 %p48, %f243, 0f00000000; + .loc 2 110 60 + mov.b32 %r111, %f243; + div.full.f32 %r109, %r110, %r111; + mov.b32 %f244, %r109; + .loc 2 110 49 + selp.f32 %f245, 0f00000000, %f244, %p48; + .loc 2 112 17 + fma.rn.f32 %f73, %f245, %f242, %f234; + .loc 2 113 15 + add.f32 %f246, %f238, %f240; + .loc 2 113 30 + mul.f32 %f247, %f242, %f242; + .loc 2 113 38 + mul.f32 %f248, %f231, %f247; + .loc 2 113 22 + fma.rn.f32 %f249, %f245, %f248, %f246; +$L__tmp12: + .loc 1 69 23 + mov.b32 %r113, %f249; + mov.b32 %r114, 1132462080; + div.full.f32 %r112, %r113, %r114; + mov.b32 %f250, %r112; + .loc 1 71 24 + add.f32 %f74, %f250, 0f3727C5AC; + .loc 1 76 39 + shl.b32 %r8, %r4, 8; + rsqrt.approx.ftz.f32 %f275, %f74; + mov.pred %p83, %p49; + mov.u32 %r235, %r167; + bra.uni $L__BB0_5; +$L__BB0_7: + .loc 1 65 35 + shl.b64 %rd84, %rd6, 2; + add.s64 %rd81, %rd2, %rd84; + add.s64 %rd86, %rd2, %rd72; + add.s64 %rd82, %rd86, 16; + mov.b32 %r202, 0; + mov.pred %p70, -1; + .loc 1 65 54 + mov.u32 %r198, 0x0; + mov.u32 %r199, 0x0; + mov.u32 %r200, 0x0; + mov.u32 %r201, 0x0; + @%p70 ld.global.L1::evict_first.v4.b32 { %r198, %r199, %r200, %r201 }, [ %rd81 + 0 ]; + @!%p70 mov.u32 %r198, %r202; + @!%p70 mov.u32 %r199, %r202; + @!%p70 mov.u32 %r200, %r202; + @!%p70 mov.u32 %r201, %r202; + mov.b32 %f251, %r198; + mov.b32 %f252, %r199; + mov.b32 %f253, %r200; + mov.b32 %f254, %r201; + mov.u32 %r206, 0x0; + mov.u32 %r207, 0x0; + mov.u32 %r208, 0x0; + mov.u32 %r209, 0x0; + @%p70 ld.global.L1::evict_first.v4.b32 { %r206, %r207, %r208, %r209 }, [ %rd82 + 0 ]; + @!%p70 mov.u32 %r206, %r202; + @!%p70 mov.u32 %r207, %r202; + @!%p70 mov.u32 %r208, %r202; + @!%p70 mov.u32 %r209, %r202; + mov.b32 %f255, %r206; + mov.b32 %f256, %r207; + mov.b32 %f257, %r208; + mov.b32 %f258, %r209; + .loc 1 66 24 + add.f32 %f259, %f75, %f251; + add.f32 %f260, %f76, %f252; + add.f32 %f261, %f77, %f253; + add.f32 %f262, %f78, %f254; + add.f32 %f263, %f79, %f255; + add.f32 %f264, %f80, %f256; + add.f32 %f265, %f81, %f257; + add.f32 %f266, %f82, %f258; + .loc 1 67 24 + sub.f32 %f267, %f259, %f73; + sub.f32 %f268, %f260, %f73; + sub.f32 
%f269, %f261, %f73; + sub.f32 %f270, %f262, %f73; + sub.f32 %f271, %f263, %f73; + sub.f32 %f272, %f264, %f73; + sub.f32 %f273, %f265, %f73; + sub.f32 %f274, %f266, %f73; + cvt.u32.u64 %r227, %rd6; + .loc 1 73 24 + mul.f32 %f276, %f267, %f275; + mul.f32 %f277, %f268, %f275; + mul.f32 %f278, %f269, %f275; + mul.f32 %f279, %f270, %f275; + mul.f32 %f280, %f271, %f275; + mul.f32 %f281, %f272, %f275; + mul.f32 %f282, %f273, %f275; + mul.f32 %f283, %f274, %f275; + .loc 1 74 24 + mul.f32 %f284, %f276, %f83; + mul.f32 %f285, %f277, %f84; + mul.f32 %f286, %f278, %f85; + mul.f32 %f287, %f279, %f86; + mul.f32 %f288, %f280, %f87; + mul.f32 %f289, %f281, %f88; + mul.f32 %f290, %f282, %f89; + mul.f32 %f291, %f283, %f90; + .loc 1 76 35 + or.b32 %r228, %r227, %r8; + .loc 1 76 29 + mul.wide.s32 %rd87, %r228, 2; + add.s64 %rd83, %rd10, %rd87; + .loc 1 76 52 + mov.b32 %r214, %f284; + cvt.rn.bf16.f32 %rs1, %r214; + mov.b32 %r215, %f285; + cvt.rn.bf16.f32 %rs2, %r215; + mov.b32 %r216, %f286; + cvt.rn.bf16.f32 %rs3, %r216; + mov.b32 %r217, %f287; + cvt.rn.bf16.f32 %rs4, %r217; + mov.b32 %r218, %f288; + cvt.rn.bf16.f32 %rs5, %r218; + mov.b32 %r219, %f289; + cvt.rn.bf16.f32 %rs6, %r219; + mov.b32 %r220, %f290; + cvt.rn.bf16.f32 %rs7, %r220; + mov.b32 %r221, %f291; + cvt.rn.bf16.f32 %rs8, %r221; + mov.b32 %r229, {%rs1, %rs2}; + mov.b32 %r230, {%rs3, %rs4}; + mov.b32 %r231, {%rs5, %rs6}; + mov.b32 %r232, {%rs7, %rs8}; + @%p70 st.global.v4.b32 [ %rd83 + 0 ], { %r229, %r230, %r231, %r232 }; + mov.b32 %r235, 128; + mov.pred %p83, 0; + .loc 1 55 36 + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_8; +$L__BB0_5: + .loc 1 0 36 + mov.pred %p2, %p83; + .loc 1 56 27 + or.b32 %r195, %r235, %r3; + .loc 1 59 41 + or.b32 %r196, %r195, %r5; + .loc 1 59 35 + mul.wide.s32 %rd64, %r196, 4; + add.s64 %rd60, %rd8, %rd64; + cvt.s64.s32 %rd65, %r235; + add.s64 %rd66, %rd65, %rd4; + add.s64 %rd67, %rd66, %rd3; + shl.b64 %rd68, %rd67, 2; + add.s64 %rd69, %rd8, %rd68; + add.s64 %rd61, %rd69, 16; + .loc 1 59 51 + mov.u32 %r163, 0x0; + mov.u32 %r164, 0x0; + mov.u32 %r165, 0x0; + mov.u32 %r166, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r163, %r164, %r165, %r166 }, [ %rd60 + 0 ]; + @!%p49 mov.u32 %r163, %r167; + @!%p49 mov.u32 %r164, %r167; + @!%p49 mov.u32 %r165, %r167; + @!%p49 mov.u32 %r166, %r167; + mov.b32 %f75, %r163; + mov.b32 %f76, %r164; + mov.b32 %f77, %r165; + mov.b32 %f78, %r166; + mov.u32 %r171, 0x0; + mov.u32 %r172, 0x0; + mov.u32 %r173, 0x0; + mov.u32 %r174, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r171, %r172, %r173, %r174 }, [ %rd61 + 0 ]; + @!%p49 mov.u32 %r171, %r167; + @!%p49 mov.u32 %r172, %r167; + @!%p49 mov.u32 %r173, %r167; + @!%p49 mov.u32 %r174, %r167; + mov.b32 %f79, %r171; + mov.b32 %f80, %r172; + mov.b32 %f81, %r173; + mov.b32 %f82, %r174; + .loc 1 60 35 + cvt.u64.u32 %rd6, %r195; + mul.wide.u32 %rd70, %r195, 4; + add.s64 %rd62, %rd9, %rd70; + cvt.u64.u32 %rd71, %r235; + add.s64 %rd7, %rd71, %rd4; + shl.b64 %rd72, %rd7, 2; + add.s64 %rd73, %rd9, %rd72; + add.s64 %rd63, %rd73, 16; + .loc 1 60 40 + mov.u32 %r179, 0x0; + mov.u32 %r180, 0x0; + mov.u32 %r181, 0x0; + mov.u32 %r182, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r179, %r180, %r181, %r182 }, [ %rd62 + 0 ]; + @!%p49 mov.u32 %r179, %r167; + @!%p49 mov.u32 %r180, %r167; + @!%p49 mov.u32 %r181, %r167; + @!%p49 mov.u32 %r182, %r167; + mov.b32 %f83, %r179; + mov.b32 %f84, %r180; + mov.b32 %f85, %r181; + mov.b32 %f86, %r182; + mov.u32 %r187, 0x0; + mov.u32 %r188, 0x0; + mov.u32 %r189, 0x0; + mov.u32 %r190, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { 
%r187, %r188, %r189, %r190 }, [ %rd63 + 0 ]; + @!%p49 mov.u32 %r187, %r167; + @!%p49 mov.u32 %r188, %r167; + @!%p49 mov.u32 %r189, %r167; + @!%p49 mov.u32 %r190, %r167; + mov.b32 %f87, %r187; + mov.b32 %f88, %r188; + mov.b32 %f89, %r189; + mov.b32 %f90, %r190; + .loc 1 64 57 + @%p25 bra $L__BB0_7; + mov.u64 %rd74, assertMessage_1; + cvta.global.u64 %rd75, %rd74; + mov.u64 %rd76, assertFile_1; + cvta.global.u64 %rd77, %rd76; + mov.u64 %rd78, assertFunc_1; + cvta.global.u64 %rd79, %rd78; + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd75; + .param .b64 param1; + st.param.b64 [param1+0], %rd77; + .param .b32 param2; + st.param.b32 [param2+0], %r233; + .param .b64 param3; + st.param.b64 [param3+0], %rd79; + .param .b64 param4; + st.param.b64 [param4+0], %rd88; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 1 + bra.uni $L__BB0_7; +$L__BB0_8: + .loc 1 55 4 + ret; +$L__tmp13: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/lh/clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 298 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 108 +.b8 104 +.b8 101 +.b8 52 +.b8 97 +.b8 51 +.b8 115 +.b8 116 +.b8 118 +.b8 117 +.b8 102 +.b8 120 +.b8 97 +.b8 102 +.b8 109 +.b8 113 +.b8 51 +.b8 107 +.b8 107 +.b8 53 +.b8 104 +.b8 111 +.b8 100 +.b8 97 +.b8 122 +.b8 122 +.b8 50 +.b8 101 +.b8 102 +.b8 99 +.b8 116 +.b8 102 +.b8 102 +.b8 116 +.b8 101 +.b8 54 +.b8 52 +.b8 54 +.b8 122 +.b8 110 +.b8 106 +.b8 100 +.b8 110 +.b8 118 +.b8 51 +.b8 108 +.b8 113 +.b8 105 +.b8 53 +.b8 111 +.b8 97 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 108 +.b8 104 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 
100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 44 +.b8 38 +.b8 5 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp12 +.b8 2 +.b8 50 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp12 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp4 +.b64 $L__tmp11 +.b8 2 +.b8 50 +.b8 41 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttir b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5b4589d516d59c82c67286e9e46db7ee19b695a3 --- /dev/null +++ b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttir @@ -0,0 +1,139 @@ +module { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant dense<1.000000e+00> : tensor<16x128xf32> + %c256_i32 = arith.constant 256 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<256> : tensor<16x1xi64> + %cst_2 = arith.constant dense<0> : tensor<16x1xi64> + %cst_3 = arith.constant dense<50257> : tensor<16x1xi64> + %cst_4 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32> + %cst_5 = arith.constant dense<2.560000e+02> : tensor<16x1xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<16x128xf32> + %cst_8 = arith.constant dense<256> : tensor<16x1xi32> + %cst_9 = arith.constant dense<256> : tensor<1x128xi32> + %cst_10 = arith.constant dense<512> : tensor<16x1xi32> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<16x1xi32> + %5 = arith.addi %4, %3 : tensor<16x1xi32> + %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr> + %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64> + %11 = arith.remsi %5, %cst_10 : tensor<16x1xi32> + %12 = arith.muli %11, %cst_8 : tensor<16x1xi32> + %13 = tt.broadcast %12 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %14 = 
tt.splat %arg2 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %15 = arith.addi %10, %cst_3 : tensor<16x1xi64> + %16 = arith.cmpi slt, %10, %cst_2 : tensor<16x1xi64> + %17 = arith.select %16, %15, %10 : tensor<16x1xi1>, tensor<16x1xi64> + %18 = arith.cmpi sge, %17, %cst_2 : tensor<16x1xi64> + %19 = arith.cmpi slt, %17, %cst_3 : tensor<16x1xi64> + %20 = arith.andi %18, %19 : tensor<16x1xi1> + %21 = arith.muli %17, %cst_1 : tensor<16x1xi64> + %22 = tt.broadcast %21 : (tensor<16x1xi64>) -> tensor<16x128xi64> + %23 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %24:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) : i32 { + %47 = tt.splat %arg7 : (i32) -> tensor<1x128xi32> + %48 = arith.addi %47, %7 : tensor<1x128xi32> + %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x128xi32> + %50 = tt.broadcast %48 : (tensor<1x128xi32>) -> tensor<16x128xi32> + %51 = arith.addi %50, %13 : tensor<16x128xi32> + %52 = tt.addptr %14, %51 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %53 = tt.broadcast %49 : (tensor<1x128xi1>) -> tensor<16x128xi1> + %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> + tt.assert %20, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1> + %55 = arith.extsi %48 : tensor<1x128xi32> to tensor<1x128xi64> + %56 = tt.broadcast %55 : (tensor<1x128xi64>) -> tensor<16x128xi64> + %57 = arith.addi %56, %22 : tensor<16x128xi64> + %58 = tt.addptr %23, %57 : tensor<16x128x!tt.ptr>, tensor<16x128xi64> + %59 = tt.load %58, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> + %60 = arith.addf %59, %54 : tensor<16x128xf32> + %61 = arith.subf %60, %arg8 : tensor<16x128xf32> + %62 = arith.addf %arg10, %cst_0 : tensor<16x128xf32> + %63 = arith.divf %61, %62 : tensor<16x128xf32> + %64 = arith.addf %arg8, %63 : tensor<16x128xf32> + %65 = arith.subf %60, %64 : tensor<16x128xf32> + %66 = arith.mulf %61, %65 : tensor<16x128xf32> + %67 = arith.addf %arg9, %66 : tensor<16x128xf32> + %68 = arith.select %53, %64, %arg8 : tensor<16x128xi1>, tensor<16x128xf32> + %69 = arith.select %53, %67, %arg9 : tensor<16x128xi1>, tensor<16x128xf32> + %70 = arith.select %53, %62, %arg10 : tensor<16x128xi1>, tensor<16x128xf32> + scf.yield %68, %69, %70 : tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32> + } + %25:3 = "tt.reduce"(%24#0, %24#1, %24#2) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %47 = arith.subf %arg10, %arg7 : f32 + %48 = arith.addf %arg9, %arg12 : f32 + %49 = arith.cmpf oeq, %48, %cst : f32 + %50 = arith.divf %arg12, %48 : f32 + %51 = arith.select %49, %cst, %50 : f32 + %52 = arith.mulf %47, %51 : f32 + %53 = arith.addf %arg7, %52 : f32 + %54 = arith.addf %arg8, %arg11 : f32 + %55 = arith.mulf %47, %47 : f32 + %56 = arith.mulf %55, %arg9 : f32 + %57 = arith.mulf %56, %51 : f32 + %58 = arith.addf %54, %57 : f32 + tt.reduce.return %53, %58, %48 : f32, f32, f32 + }) : (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>) + %26 = tt.expand_dims %25#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %27 = tt.expand_dims %25#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %28 = arith.muli %11, %cst_8 : tensor<16x1xi32> + %29 = tt.broadcast %28 : (tensor<16x1xi32>) 
-> tensor<16x128xi32> + %30 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %31 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x128x!tt.ptr> + %32 = arith.addi %10, %cst_3 : tensor<16x1xi64> + %33 = arith.cmpi slt, %10, %cst_2 : tensor<16x1xi64> + %34 = arith.select %33, %32, %10 : tensor<16x1xi1>, tensor<16x1xi64> + %35 = arith.cmpi sge, %34, %cst_2 : tensor<16x1xi64> + %36 = arith.cmpi slt, %34, %cst_3 : tensor<16x1xi64> + %37 = arith.andi %35, %36 : tensor<16x1xi1> + %38 = arith.muli %34, %cst_1 : tensor<16x1xi64> + %39 = tt.broadcast %38 : (tensor<16x1xi64>) -> tensor<16x128xi64> + %40 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %41 = tt.broadcast %26 : (tensor<16x1xf32>) -> tensor<16x128xf32> + %42 = arith.divf %27, %cst_5 : tensor<16x1xf32> + %43 = arith.addf %42, %cst_4 : tensor<16x1xf32> + %44 = arith.muli %5, %cst_8 : tensor<16x1xi32> + %45 = tt.broadcast %44 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %46 = tt.splat %arg4 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + scf.for %arg7 = %c0_i32 to %c256_i32 step %c128_i32 : i32 { + %47 = tt.splat %arg7 : (i32) -> tensor<1x128xi32> + %48 = arith.addi %47, %7 : tensor<1x128xi32> + %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x128xi32> + %50 = tt.broadcast %48 : (tensor<1x128xi32>) -> tensor<16x128xi32> + %51 = arith.addi %50, %29 : tensor<16x128xi32> + %52 = tt.addptr %30, %51 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %53 = tt.broadcast %49 : (tensor<1x128xi1>) -> tensor<16x128xi1> + %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> + %55 = tt.addptr %31, %48 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> + %56 = tt.load %55, %49, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32> + tt.assert %37, "index out of bounds: 0 <= tmp13 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1> + %57 = arith.extsi %48 : tensor<1x128xi32> to tensor<1x128xi64> + %58 = tt.broadcast %57 : (tensor<1x128xi64>) -> tensor<16x128xi64> + %59 = arith.addi %58, %39 : tensor<16x128xi64> + %60 = tt.addptr %40, %59 : tensor<16x128x!tt.ptr>, tensor<16x128xi64> + %61 = tt.load %60, %53, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32> + %62 = arith.addf %61, %54 : tensor<16x128xf32> + %63 = arith.subf %62, %41 : tensor<16x128xf32> + %64 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32> + %65 = tt.broadcast %64 : (tensor<16x1xf32>) -> tensor<16x128xf32> + %66 = arith.mulf %63, %65 : tensor<16x128xf32> + %67 = tt.broadcast %56 : (tensor<1x128xf32>) -> tensor<16x128xf32> + %68 = arith.mulf %66, %67 : tensor<16x128xf32> + %69 = arith.addi %50, %45 : tensor<16x128xi32> + %70 = tt.addptr %46, %69 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %71 = arith.truncf %68 : tensor<16x128xf32> to tensor<16x128xbf16> + tt.store %70, %71, %53 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16> + } + tt.return + } +} diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..50602f214a3439fdc49ae67bf8b80948941d601b --- /dev/null +++ b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr 
{tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> + %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> + tt.return + } +} diff --git a/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.cubin b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..9923385c7c15be7d90a7a67d767f94a648ad6dc4 Binary files /dev/null and b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.cubin differ diff --git a/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.llir b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..49c104e15168cb73a83e6e9a5fd23fc3c8388490 --- /dev/null +++ b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.llir @@ -0,0 +1,310 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %9 = and i32 %8, 31, !dbg !10 + %10 = lshr i32 %8, 5, !dbg !10 + %11 = and i32 %10, 1, !dbg !10 + %urem = shl i32 %8, 2, !dbg !10 + %12 = and i32 %urem, 252, !dbg !10 + %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %14 = shl i32 %13, 8, !dbg !12 + %15 = or i32 %14, %12, !dbg !13 + %16 = sext i32 %15 to i64, !dbg !14 + %17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !14 + %18 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %17, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 + %19 = extractvalue { i32, i32, i32, i32 } %18, 0, !dbg !15 + %20 = extractvalue { i32, i32, i32, i32 } %18, 1, !dbg !15 + %21 = extractvalue { i32, i32, i32, i32 } %18, 2, !dbg !15 + %22 = extractvalue { i32, i32, i32, i32 } %18, 3, !dbg !15 + %23 = bitcast i32 %21 to float, !dbg !15 + %24 = bitcast i32 %22 to float, !dbg !15 + %25 = getelementptr i16, ptr addrspace(1) %1, i64 %16, !dbg !16 + %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr 
addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 + %27 = extractvalue { i32, i32 } %26, 0, !dbg !17 + %28 = extractvalue { i32, i32 } %26, 1, !dbg !17 + %29 = trunc i32 %27 to i16, !dbg !17 + %extelt.offset = lshr i32 %27, 16, !dbg !17 + %30 = trunc i32 %extelt.offset to i16, !dbg !17 + %31 = trunc i32 %28 to i16, !dbg !17 + %extelt.offset1 = lshr i32 %28, 16, !dbg !17 + %32 = trunc i32 %extelt.offset1 to i16, !dbg !17 + %33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #6, !dbg !18 + %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18 + %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18 + %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18 + %37 = getelementptr i16, ptr addrspace(1) %2, i64 %16, !dbg !19 + %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 + %39 = extractvalue { i32, i32 } %38, 0, !dbg !20 + %40 = extractvalue { i32, i32 } %38, 1, !dbg !20 + %41 = trunc i32 %39 to i16, !dbg !20 + %extelt.offset2 = lshr i32 %39, 16, !dbg !20 + %42 = trunc i32 %extelt.offset2 to i16, !dbg !20 + %43 = trunc i32 %40 to i16, !dbg !20 + %extelt.offset3 = lshr i32 %40, 16, !dbg !20 + %44 = trunc i32 %extelt.offset3 to i16, !dbg !20 + %45 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #6, !dbg !21 + %46 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #6, !dbg !21 + %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21 + %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21 + %49 = zext nneg i32 %12 to i64, !dbg !22 + %50 = getelementptr float, ptr addrspace(1) %3, i64 %49, !dbg !22 + %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23 + %52 = fadd float %35, %23, !dbg !24 + %53 = fadd float %36, %24, !dbg !24 + %54 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !15 + %55 = insertelement <2 x i32> %54, i32 %20, i64 1, !dbg !15 + %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !15 + %57 = insertelement <2 x float> poison, float %33, i64 0, !dbg !24 + %58 = insertelement <2 x float> %57, float %34, i64 1, !dbg !24 + %59 = fadd <2 x float> %58, %56, !dbg !24 + %60 = insertelement <2 x float> poison, float %45, i64 0, !dbg !25 + %61 = insertelement <2 x float> %60, float %46, i64 1, !dbg !25 + %62 = fadd <2 x float> %59, %61, !dbg !25 + %63 = fadd float %52, %47, !dbg !25 + %64 = fadd float %53, %48, !dbg !25 + %65 = extractelement <2 x float> %62, i64 0, !dbg !26 + %66 = extractelement <2 x float> %62, i64 1, !dbg !26 + %67 = fadd float %65, %66, !dbg !26 + %68 = fadd float %67, %63, !dbg !26 + %69 = fadd float %68, %64, !dbg !26 + %70 = bitcast float %69 to i32, !dbg !32 + %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 16, i32 31), !dbg !32 + %72 = bitcast i32 %71 to float, !dbg !32 + %73 = fadd float %69, %72, !dbg !26 + %74 = bitcast 
float %73 to i32, !dbg !32 + %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !32 + %76 = bitcast i32 %75 to float, !dbg !32 + %77 = fadd float %73, %76, !dbg !26 + %78 = bitcast float %77 to i32, !dbg !32 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 4, i32 31), !dbg !32 + %80 = bitcast i32 %79 to float, !dbg !32 + %81 = fadd float %77, %80, !dbg !26 + %82 = bitcast float %81 to i32, !dbg !32 + %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 2, i32 31), !dbg !32 + %84 = bitcast i32 %83 to float, !dbg !32 + %85 = fadd float %81, %84, !dbg !26 + %86 = bitcast float %85 to i32, !dbg !32 + %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !32 + %88 = bitcast i32 %87 to float, !dbg !32 + %89 = fadd float %85, %88, !dbg !26 + %90 = icmp eq i32 %9, 0, !dbg !32 + %91 = zext nneg i32 %11 to i64, !dbg !32 + %92 = getelementptr float, ptr addrspace(3) @global_smem, i64 %91, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %89, i1 %90) #6, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %93 = icmp slt i32 %8, 2, !dbg !32 + %94 = sext i32 %8 to i64, !dbg !32 + %95 = getelementptr float, ptr addrspace(3) @global_smem, i64 %94, !dbg !32 + %96 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !32 + %97 = bitcast float %96 to i32, !dbg !32 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 1, i32 31), !dbg !32 + %99 = bitcast i32 %98 to float, !dbg !32 + %100 = fadd float %96, %99, !dbg !26 + %101 = and i32 %8, 1, !dbg !32 + %102 = icmp eq i32 %101, 0, !dbg !32 + %103 = and i1 %93, %102, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %100, i1 %103) #6, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %104 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32 + %105 = fadd float %104, 0.000000e+00, !dbg !34 + %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %105, float 2.560000e+02) #6, !dbg !38 + %107 = fsub float %65, %106, !dbg !39 + %108 = fsub float %66, %106, !dbg !39 + %109 = fsub float %63, %106, !dbg !39 + %110 = fsub float %64, %106, !dbg !39 + %111 = fmul float %107, %107, !dbg !40 + %112 = fmul float %108, %108, !dbg !40 + %113 = fmul float %109, %109, !dbg !40 + %114 = fmul float %110, %110, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %115 = fadd float %111, %112, !dbg !43 + %116 = fadd float %113, %115, !dbg !43 + %117 = fadd float %114, %116, !dbg !43 + %118 = bitcast float %117 to i32, !dbg !41 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 16, i32 31), !dbg !41 + %120 = bitcast i32 %119 to float, !dbg !41 + %121 = fadd float %117, %120, !dbg !43 + %122 = bitcast float %121 to i32, !dbg !41 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 8, i32 31), !dbg !41 + %124 = bitcast i32 %123 to float, !dbg !41 + %125 = fadd float %121, %124, !dbg !43 + %126 = bitcast float %125 to i32, !dbg !41 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 4, i32 31), !dbg !41 + %128 = bitcast i32 %127 to float, !dbg !41 + %129 = fadd float %125, %128, !dbg !43 + %130 = bitcast float %129 to i32, !dbg !41 + %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !41 + %132 = bitcast i32 
%131 to float, !dbg !41 + %133 = fadd float %129, %132, !dbg !43 + %134 = bitcast float %133 to i32, !dbg !41 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 1, i32 31), !dbg !41 + %136 = bitcast i32 %135 to float, !dbg !41 + %137 = fadd float %133, %136, !dbg !43 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %92, float %137, i1 %90) #6, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %138 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %95, i1 %93) #6, !dbg !41 + %139 = bitcast float %138 to i32, !dbg !41 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !41 + %141 = bitcast i32 %140 to float, !dbg !41 + %142 = fadd float %138, %141, !dbg !43 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %95, float %142, i1 %103) #6, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41 + %144 = fadd float %143, 0.000000e+00, !dbg !46 + %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !48 + %146 = fadd float %145, 0x3EE4F8B580000000, !dbg !49 + %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50 + %.not.i = icmp eq i32 %147, 0, !dbg !50 + br i1 %.not.i, label %150, label %148, !dbg !50 + +148: ; preds = %7 + %149 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %146), !dbg !50 + br label %__nv_rsqrtf.exit, !dbg !50 + +150: ; preds = %7 + %151 = tail call float @llvm.nvvm.rsqrt.approx.f(float %146), !dbg !50 + br label %__nv_rsqrtf.exit, !dbg !50 + +__nv_rsqrtf.exit: ; preds = %148, %150 + %.0.i = phi float [ %149, %148 ], [ %151, %150 ], !dbg !50 + %152 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !23 + %153 = bitcast i32 %152 to float, !dbg !23 + %154 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !23 + %155 = bitcast i32 %154 to float, !dbg !23 + %156 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !23 + %157 = bitcast i32 %156 to float, !dbg !23 + %158 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !23 + %159 = bitcast i32 %158 to float, !dbg !23 + %160 = fmul float %107, %.0.i, !dbg !51 + %161 = fmul float %108, %.0.i, !dbg !51 + %162 = fmul float %109, %.0.i, !dbg !51 + %163 = fmul float %110, %.0.i, !dbg !51 + %164 = fmul float %160, %159, !dbg !52 + %165 = fmul float %161, %157, !dbg !52 + %166 = fmul float %162, %155, !dbg !52 + %167 = fmul float %163, %153, !dbg !52 + %168 = getelementptr i16, ptr addrspace(1) %4, i64 %16, !dbg !53 + %169 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %164) #6, !dbg !54 + %170 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %165) #6, !dbg !54 + %171 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %166) #6, !dbg !54 + %172 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %167) #6, !dbg !54 + %173 = insertelement <2 x i16> undef, i16 %169, i64 0, !dbg !54 + %174 = insertelement <2 x i16> %173, i16 %170, i64 1, !dbg !54 + %175 = bitcast <2 x i16> %174 to i32, !dbg !54 + %176 = insertelement <2 x i16> undef, i16 %171, i64 0, !dbg !54 + %177 = insertelement <2 x i16> %176, i16 %172, i64 1, !dbg !54 + %178 = bitcast <2 x i16> %177 to i32, !dbg !54 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %175, i32 %178, ptr addrspace(1) %168, i1 true) #6, !dbg !54 + ret void, !dbg !55 +} + +; 
Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cfrefrv25bx2ibxbe2x4lejxbs5umypl6khaq66iftrtmjo55mug.py", directory: "/tmp/torchinductor_root/fr") +!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 40, scope: !7) +!13 = !DILocation(line: 30, column: 36, scope: !7) +!14 = !DILocation(line: 30, column: 30, scope: !7) +!15 = !DILocation(line: 30, column: 46, scope: !7) +!16 = !DILocation(line: 31, column: 30, scope: !7) +!17 = !DILocation(line: 31, column: 46, scope: !7) +!18 = !DILocation(line: 31, column: 67, scope: !7) +!19 = !DILocation(line: 32, column: 30, scope: 
!7) +!20 = !DILocation(line: 32, column: 46, scope: !7) +!21 = !DILocation(line: 32, column: 67, scope: !7) +!22 = !DILocation(line: 33, column: 31, scope: !7) +!23 = !DILocation(line: 33, column: 36, scope: !7) +!24 = !DILocation(line: 35, column: 18, scope: !7) +!25 = !DILocation(line: 37, column: 18, scope: !7) +!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30) +!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0) +!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0) +!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31) +!31 = !DILocation(line: 42, column: 59, scope: !27) +!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33) +!33 = !DILocation(line: 42, column: 59, scope: !29) +!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37) +!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) +!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!37 = !DILocation(line: 42, column: 45, scope: !35) +!38 = !DILocation(line: 45, column: 20, scope: !7) +!39 = !DILocation(line: 46, column: 19, scope: !7) +!40 = !DILocation(line: 47, column: 20, scope: !7) +!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42) +!42 = !DILocation(line: 50, column: 59, scope: !29) +!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44) +!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45) +!45 = !DILocation(line: 50, column: 59, scope: !27) +!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47) +!47 = !DILocation(line: 50, column: 45, scope: !35) +!48 = !DILocation(line: 53, column: 20, scope: !7) +!49 = !DILocation(line: 55, column: 20, scope: !7) +!50 = !DILocation(line: 56, column: 26, scope: !7) +!51 = !DILocation(line: 57, column: 20, scope: !7) +!52 = !DILocation(line: 58, column: 20, scope: !7) +!53 = !DILocation(line: 60, column: 25, scope: !7) +!54 = !DILocation(line: 60, column: 48, scope: !7) +!55 = !DILocation(line: 60, column: 4, scope: !7) diff --git a/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ptx b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..4a64078ac23c738ec3332cf6701e1fe69b4737ea --- /dev/null +++ b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ptx @@ -0,0 +1,723 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<26>; + .reg .b16 %rs<13>; + .reg .b32 %r<92>; + .reg .f32 %f<78>; + .reg .b64 %rd<14>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_0]; + ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r58, %tid.x; + and.b32 %r59, %r58, 
31; + ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2]; + ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4]; + shl.b32 %r60, %r58, 2; + and.b32 %r61, %r60, 252; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r62, %r1, 8; + .loc 1 30 36 + or.b32 %r63, %r62, %r61; + .loc 1 30 30 + mul.wide.s32 %rd11, %r63, 4; + add.s64 %rd1, %rd6, %rd11; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r4; + mov.b32 %f2, %r5; + .loc 1 31 30 + mul.wide.s32 %rd12, %r63, 2; + add.s64 %rd2, %rd7, %rd12; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f3, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f4, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f5, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f6, %r17; + .loc 1 32 30 + add.s64 %rd3, %rd8, %rd12; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + .loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f7, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f8, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f9, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f10, %r25; + .loc 1 33 31 + mul.wide.u32 %rd13, %r61, 4; + add.s64 %rd4, %rd9, %rd13; + .loc 1 33 36 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + @!%p1 mov.u32 %r28, %r6; + @!%p1 mov.u32 %r29, %r6; + .loc 1 35 18 + add.f32 %f11, %f5, %f1; + add.f32 %f12, %f6, %f2; + .loc 1 30 46 + mov.b32 %f13, %r3; + mov.b32 %f14, %r2; + .loc 1 35 18 + add.f32 %f15, %f3, %f14; + add.f32 %f16, %f4, %f13; + .loc 1 37 18 + add.f32 %f17, %f16, %f8; + add.f32 %f18, %f15, %f7; + add.f32 %f19, %f11, %f9; + add.f32 %f20, %f12, %f10; +$L__tmp1: + .loc 2 233 15 + add.f32 %f21, %f18, %f17; + add.f32 %f22, %f21, %f19; + add.f32 %f23, %f22, %f20; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r64, %f23; + shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1; + mov.b32 %f24, %r65; +$L__tmp3: + .loc 2 233 15 + add.f32 %f25, %f23, %f24; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r66, %f25; + shfl.sync.bfly.b32 %r67, %r66, 8, 31, -1; + mov.b32 %f26, %r67; +$L__tmp5: + .loc 2 233 15 + add.f32 %f27, %f25, %f26; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r68, %f27; + shfl.sync.bfly.b32 %r69, %r68, 4, 31, -1; + mov.b32 %f28, %r69; +$L__tmp7: + .loc 2 233 15 + add.f32 %f29, %f27, %f28; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r70, %f29; + shfl.sync.bfly.b32 %r71, %r70, 2, 31, -1; + mov.b32 %f30, %r71; +$L__tmp9: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r72, %f31; + shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1; + mov.b32 %f32, %r73; +$L__tmp11: + .loc 2 233 15 + add.f32 %f33, 
%f31, %f32; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p17, %r59, 0; + shr.u32 %r74, %r58, 3; + and.b32 %r75, %r74, 4; + mov.u32 %r76, global_smem; + add.s32 %r34, %r76, %r75; + mov.b32 %r35, %f33; + @%p17 st.shared.b32 [ %r34 + 0 ], %r35; + bar.sync 0; + setp.lt.s32 %p18, %r58, 2; + add.s32 %r37, %r76, %r60; + @%p18 ld.shared.b32 %r36, [ %r37 + 0 ]; + mov.b32 %f34, %r36; + shfl.sync.bfly.b32 %r77, %r36, 1, 31, -1; + mov.b32 %f35, %r77; +$L__tmp13: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp14: + .loc 2 243 36 + and.b32 %r78, %r58, 1; + setp.eq.b32 %p24, %r78, 1; + not.pred %p25, %p24; + and.pred %p19, %p18, %p25; + mov.b32 %r39, %f36; + @%p19 st.shared.b32 [ %r37 + 0 ], %r39; + bar.sync 0; + ld.shared.f32 %f37, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f38, %f37, 0f00000000; +$L__tmp16: + .loc 1 45 20 + mov.b32 %r41, %f38; + mov.b32 %r42, 1132462080; + div.full.f32 %r40, %r41, %r42; + mov.b32 %f39, %r40; + .loc 1 46 19 + sub.f32 %f40, %f18, %f39; + sub.f32 %f41, %f17, %f39; + sub.f32 %f42, %f19, %f39; + sub.f32 %f43, %f20, %f39; + .loc 1 47 20 + mul.f32 %f44, %f41, %f41; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f45, %f40, %f40, %f44; + fma.rn.f32 %f46, %f42, %f42, %f45; + fma.rn.f32 %f47, %f43, %f43, %f46; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r79, %f47; + shfl.sync.bfly.b32 %r80, %r79, 16, 31, -1; + mov.b32 %f48, %r80; +$L__tmp20: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r81, %f49; + shfl.sync.bfly.b32 %r82, %r81, 8, 31, -1; + mov.b32 %f50, %r82; +$L__tmp22: + .loc 2 233 15 + add.f32 %f51, %f49, %f50; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r83, %f51; + shfl.sync.bfly.b32 %r84, %r83, 4, 31, -1; + mov.b32 %f52, %r84; +$L__tmp24: + .loc 2 233 15 + add.f32 %f53, %f51, %f52; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r85, %f53; + shfl.sync.bfly.b32 %r86, %r85, 2, 31, -1; + mov.b32 %f54, %r86; +$L__tmp26: + .loc 2 233 15 + add.f32 %f55, %f53, %f54; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r87, %f55; + shfl.sync.bfly.b32 %r88, %r87, 1, 31, -1; + mov.b32 %f56, %r88; +$L__tmp28: + .loc 2 233 15 + add.f32 %f57, %f55, %f56; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r44, %f57; + @%p17 st.shared.b32 [ %r34 + 0 ], %r44; + bar.sync 0; + @%p18 ld.shared.b32 %r45, [ %r37 + 0 ]; + mov.b32 %f58, %r45; + shfl.sync.bfly.b32 %r89, %r45, 1, 31, -1; + mov.b32 %f59, %r89; +$L__tmp30: + .loc 2 233 15 + add.f32 %f60, %f58, %f59; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r48, %f60; + @%p19 st.shared.b32 [ %r37 + 0 ], %r48; + bar.sync 0; + ld.shared.f32 %f61, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f62, %f61, 0f00000000; +$L__tmp33: + .loc 1 53 20 + mov.b32 %r50, %f62; + div.full.f32 %r49, %r50, %r42; + mov.b32 %f63, %r49; + .loc 1 55 20 + add.f32 %f64, %f63, 0f3727C5AC; + .loc 1 56 26 + rsqrt.approx.ftz.f32 %f65, %f64; + .loc 1 33 36 + mov.b32 %f66, %r29; + mov.b32 %f67, %r28; + mov.b32 %f68, %r27; + mov.b32 %f69, %r26; + .loc 1 57 20 + mul.f32 %f70, %f40, %f65; + mul.f32 %f71, %f41, %f65; + mul.f32 %f72, %f42, %f65; + mul.f32 %f73, %f43, %f65; + .loc 1 58 20 + mul.f32 %f74, %f70, %f69; + mul.f32 %f75, %f71, %f68; + mul.f32 %f76, %f72, %f67; + mul.f32 %f77, %f73, %f66; + .loc 1 60 25 + add.s64 %rd5, %rd10, %rd12; + .loc 1 60 48 + mov.b32 %r52, %f74; + cvt.rn.bf16.f32 %rs9, %r52; + mov.b32 %r53, %f75; + cvt.rn.bf16.f32 %rs10, %r53; + mov.b32 %r54, %f76; + cvt.rn.bf16.f32 %rs11, %r54; + mov.b32 %r55, %f77; + cvt.rn.bf16.f32 %rs12, %r55; + mov.b32 %r90, {%rs9, %rs10}; + mov.b32 %r91, {%rs11, %rs12}; + @%p1 
st.global.v2.b32 [ %rd5 + 0 ], { %r90, %r91 }; + .loc 1 60 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/fr/cfrefrv25bx2ibxbe2x4lejxbs5umypl6khaq66iftrtmjo55mug.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 395 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 102 +.b8 114 +.b8 101 +.b8 102 +.b8 114 +.b8 118 +.b8 50 +.b8 53 +.b8 98 +.b8 120 +.b8 50 +.b8 105 +.b8 98 +.b8 120 +.b8 98 +.b8 101 +.b8 50 +.b8 120 +.b8 52 +.b8 108 +.b8 101 +.b8 106 +.b8 120 +.b8 98 +.b8 115 +.b8 53 +.b8 117 +.b8 109 +.b8 121 +.b8 112 +.b8 108 +.b8 54 +.b8 107 +.b8 104 +.b8 97 +.b8 113 +.b8 54 +.b8 54 +.b8 105 +.b8 102 +.b8 116 +.b8 114 +.b8 116 +.b8 109 +.b8 106 +.b8 111 +.b8 53 +.b8 53 +.b8 109 +.b8 117 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 102 +.b8 114 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 42 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 42 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 42 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 50 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 50 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 50 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: 
+.b8 2 +.b8 0 +.b32 .debug_info +.b32 399 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 399 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttgir b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..1b80454a261c05bdbafbd096b9b705bbebca392c --- /dev/null +++ b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttgir @@ -0,0 +1,63 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant 9.99999974E-6 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, 
#blocked> + %20 = arith.addf %8, %12 : tensor<256xf32, #blocked> + %21 = arith.addf %20, %16 : tensor<256xf32, #blocked> + %22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32): + %41 = arith.addf %arg7, %arg8 : f32 + tt.reduce.return %41 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %24 = arith.addf %23, %cst_2 : f32 + %25 = arith.divf %24, %cst_1 : f32 + %26 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked> + %27 = arith.subf %21, %26 : tensor<256xf32, #blocked> + %28 = arith.mulf %27, %27 : tensor<256xf32, #blocked> + %29 = arith.select %2, %28, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32): + %41 = arith.addf %arg7, %arg8 : f32 + tt.reduce.return %41 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %31 = arith.addf %30, %cst_2 : f32 + %32 = arith.divf %31, %cst_1 : f32 + %33 = arith.addf %32, %cst_0 : f32 + %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %35 = tt.splat %34 : (f32) -> tensor<256xf32, #blocked> + %36 = arith.mulf %27, %35 : tensor<256xf32, #blocked> + %37 = arith.mulf %36, %19 : tensor<256xf32, #blocked> + %38 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %40 = arith.truncf %37 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %39, %40, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..678dfd5327d867e46e71eab8d5ce7a215016f7ae --- /dev/null +++ b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttir @@ -0,0 +1,62 @@ +module { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 
= tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %20 = arith.addf %8, %12 : tensor<256xf32> + %21 = arith.addf %20, %16 : tensor<256xf32> + %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32> + %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32): + %41 = arith.addf %arg7, %arg8 : f32 + tt.reduce.return %41 : f32 + }) : (tensor<256xf32>) -> f32 + %24 = arith.addf %23, %cst_0 : f32 + %25 = arith.divf %24, %cst_1 : f32 + %26 = tt.splat %25 : (f32) -> tensor<256xf32> + %27 = arith.subf %21, %26 : tensor<256xf32> + %28 = arith.mulf %27, %27 : tensor<256xf32> + %29 = arith.select %2, %28, %cst_3 : tensor<256xi1>, tensor<256xf32> + %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32): + %41 = arith.addf %arg7, %arg8 : f32 + tt.reduce.return %41 : f32 + }) : (tensor<256xf32>) -> f32 + %31 = arith.addf %30, %cst_0 : f32 + %32 = arith.divf %31, %cst_1 : f32 + %33 = arith.addf %32, %cst_2 : f32 + %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %35 = tt.splat %34 : (f32) -> tensor<256xf32> + %36 = arith.mulf %27, %35 : tensor<256xf32> + %37 = arith.mulf %36, %19 : tensor<256xf32> + %38 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr> + %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %40 = arith.truncf %37 : tensor<256xf32> to tensor<256xbf16> + tt.store %39, %40, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttgir b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8dfc219ffbb3021e19f3b35e9be96086e23c9c4b --- /dev/null +++ b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttgir @@ -0,0 +1,24 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : 
i32} : tensor<1024xi32, #blocked1> + %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1> + %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked> + %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1> + %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1> + %13 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked1> + %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr, #blocked1>, tensor<1024xi32, #blocked1> + tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttir b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d2a9a5cb9370160003226fec8c61be7abfe7c35e --- /dev/null +++ b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16> + %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32> + tt.return + } +} diff --git a/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttgir b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9f1cbda056c2c304428aa1a5f21bdeb5856f9f54 --- /dev/null +++ b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttgir @@ -0,0 +1,165 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr 
{tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<1x128xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked> + %cst_4 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked> + %cst_5 = arith.constant dense<256> : tensor<16x1xi64, #blocked> + %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked> + %cst_7 = arith.constant dense<50257> : tensor<16x1xi64, #blocked> + %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1> + %cst_9 = arith.constant dense<0> : tensor<16x1xi64, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 + %c256_i32 = arith.constant 256 : i32 + %cst_10 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked2> + %cst_11 = arith.constant 0.000000e+00 : f32 + %cst_12 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked2> + %cst_13 = arith.constant dense<256> : tensor<1x128xi32, #blocked2> + %cst_14 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked> + %cst_15 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked> + %cst_16 = arith.constant dense<0.000000e+00> : tensor<16x128xbf16, #blocked> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1> + %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> + %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x128xi32, #blocked> + %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked> + %15 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked1> + %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr, #blocked1>, tensor<16x1xi32, #blocked1> + %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked> + %19 = tt.load %17 
{cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1> + %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked> + %21 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked> + %22 = tt.broadcast %21 : (tensor<16x1xi32, #blocked>) -> tensor<16x128xi32, #blocked> + %23 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked> + %24 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked> + %25 = tt.broadcast %24 : (tensor<16x1xi32, #blocked>) -> tensor<16x128xi32, #blocked> + %26 = tt.splat %arg3 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked> + %27 = arith.addi %18, %cst_7 : tensor<16x1xi64, #blocked> + %28 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1> + %29 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked> + %30 = arith.cmpi slt, %19, %cst_9 : tensor<16x1xi64, #blocked1> + %31 = arith.select %29, %27, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked> + %32 = arith.select %30, %28, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1> + %33 = arith.cmpi sge, %32, %cst_9 : tensor<16x1xi64, #blocked1> + %34 = arith.cmpi slt, %32, %cst_8 : tensor<16x1xi64, #blocked1> + %35 = arith.andi %33, %34 : tensor<16x1xi1, #blocked1> + %36 = arith.muli %31, %cst_5 : tensor<16x1xi64, #blocked> + %37 = tt.broadcast %36 : (tensor<16x1xi64, #blocked>) -> tensor<16x128xi64, #blocked> + %38 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked> + %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked2>, tensor<16x128xf32, #blocked>) : i32 { + %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked> + %50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2> + %51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked> + %52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2> + %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked> + %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2> + %55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<16x128xi32, #blocked> + %56 = arith.addi %55, %22 : tensor<16x128xi32, #blocked> + %57 = tt.addptr %23, %56 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<16x128xi1, #blocked> + %59 = tt.broadcast %54 : (tensor<1x128xi1, #blocked2>) -> tensor<16x128xi1, #blocked2> + %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked> + %61 = arith.addi %55, %25 : tensor<16x128xi32, #blocked> + %62 = tt.addptr %26, %61 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xbf16, #blocked> + %64 = arith.extf %63 : tensor<16x128xbf16, #blocked> to tensor<16x128xf32, #blocked> + tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1, #blocked1> + %65 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> + %66 = tt.broadcast %65 : (tensor<1x128xi64, #blocked>) -> tensor<16x128xi64, #blocked> + %67 = arith.addi %66, %37 : tensor<16x128xi64, #blocked> + %68 = tt.addptr %38, %67 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi64, #blocked> + %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : 
tensor<16x128xf32, #blocked> + %70 = arith.addf %69, %60 : tensor<16x128xf32, #blocked> + %71 = arith.addf %70, %64 : tensor<16x128xf32, #blocked> + %72 = arith.subf %71, %arg9 : tensor<16x128xf32, #blocked> + %73 = arith.addf %arg12, %cst_4 : tensor<16x128xf32, #blocked> + %74 = arith.addf %arg11, %cst_10 : tensor<16x128xf32, #blocked2> + %75 = arith.divf %72, %73 : tensor<16x128xf32, #blocked> + %76 = arith.addf %arg9, %75 : tensor<16x128xf32, #blocked> + %77 = arith.subf %71, %76 : tensor<16x128xf32, #blocked> + %78 = arith.mulf %72, %77 : tensor<16x128xf32, #blocked> + %79 = arith.addf %arg10, %78 : tensor<16x128xf32, #blocked> + %80 = arith.select %58, %76, %arg9 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked> + %81 = arith.select %58, %79, %arg10 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked> + %82 = arith.select %58, %73, %arg12 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked> + %83 = arith.select %59, %74, %arg11 : tensor<16x128xi1, #blocked2>, tensor<16x128xf32, #blocked2> + scf.yield %80, %81, %83, %82 : tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked2>, tensor<16x128xf32, #blocked> + } + %40 = triton_gpu.convert_layout %39#2 : (tensor<16x128xf32, #blocked2>) -> tensor<16x128xf32, #blocked> + %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %49 = arith.subf %arg11, %arg8 : f32 + %50 = arith.addf %arg10, %arg13 : f32 + %51 = arith.cmpf oeq, %50, %cst_11 : f32 + %52 = arith.divf %arg13, %50 : f32 + %53 = arith.select %51, %cst_11, %52 : f32 + %54 = arith.mulf %49, %53 : f32 + %55 = arith.addf %arg8, %54 : f32 + %56 = arith.addf %arg9, %arg12 : f32 + %57 = arith.mulf %49, %49 : f32 + %58 = arith.mulf %57, %arg10 : f32 + %59 = arith.mulf %58, %53 : f32 + %60 = arith.addf %56, %59 : f32 + tt.reduce.return %55, %60, %50 : f32, f32, f32 + }) : (tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %44 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x128x!tt.ptr, #blocked> + %45 = tt.broadcast %42 : (tensor<16x1xf32, #blocked>) -> tensor<16x128xf32, #blocked> + %46 = arith.divf %43, %cst_15 : tensor<16x1xf32, #blocked> + %47 = arith.addf %46, %cst_14 : tensor<16x1xf32, #blocked> + %48 = tt.splat %arg5 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked> + scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 : i32 { + %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked> + %50 = arith.addi %49, %12 : tensor<1x128xi32, #blocked> + %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x128xi32, #blocked> + %52 = tt.broadcast %50 : (tensor<1x128xi32, #blocked>) -> tensor<16x128xi32, #blocked> + %53 = arith.addi %52, %22 : tensor<16x128xi32, #blocked> + %54 = tt.addptr %23, %53 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %55 = tt.broadcast %51 : (tensor<1x128xi1, #blocked>) -> tensor<16x128xi1, #blocked> + %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : 
tensor<16x128xf32, #blocked> + %57 = arith.addi %52, %25 : tensor<16x128xi32, #blocked> + %58 = tt.addptr %26, %57 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xbf16, #blocked> + %60 = arith.extf %59 : tensor<16x128xbf16, #blocked> to tensor<16x128xf32, #blocked> + %61 = tt.addptr %44, %50 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> + %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32, #blocked> + tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1, #blocked1> + %63 = arith.extsi %50 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> + %64 = tt.broadcast %63 : (tensor<1x128xi64, #blocked>) -> tensor<16x128xi64, #blocked> + %65 = arith.addi %64, %37 : tensor<16x128xi64, #blocked> + %66 = tt.addptr %38, %65 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi64, #blocked> + %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32, #blocked> + %68 = arith.addf %67, %56 : tensor<16x128xf32, #blocked> + %69 = arith.addf %68, %60 : tensor<16x128xf32, #blocked> + %70 = arith.subf %69, %45 : tensor<16x128xf32, #blocked> + %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked> + %72 = tt.broadcast %71 : (tensor<16x1xf32, #blocked>) -> tensor<16x128xf32, #blocked> + %73 = arith.mulf %70, %72 : tensor<16x128xf32, #blocked> + %74 = tt.broadcast %62 : (tensor<1x128xf32, #blocked>) -> tensor<16x128xf32, #blocked> + %75 = arith.mulf %73, %74 : tensor<16x128xf32, #blocked> + %76 = tt.addptr %48, %57 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %77 = arith.truncf %75 : tensor<16x128xf32, #blocked> to tensor<16x128xbf16, #blocked> + tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16, #blocked> + } + tt.return + } +} diff --git a/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttir b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..086209f02fa414eb7f71d0671a0195683c0ed573 --- /dev/null +++ b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttir @@ -0,0 +1,153 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<16x128xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<1.000000e+00> : tensor<16x128xf32> + %c256_i32 = arith.constant 256 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_2 = arith.constant dense<256> : tensor<16x1xi64> + %cst_3 = arith.constant dense<0> : tensor<16x1xi64> + %cst_4 = arith.constant dense<50257> : tensor<16x1xi64> + %cst_5 
= arith.constant dense<9.99999974E-6> : tensor<16x1xf32> + %cst_6 = arith.constant dense<2.560000e+02> : tensor<16x1xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<16x128xf32> + %cst_9 = arith.constant dense<256> : tensor<16x1xi32> + %cst_10 = arith.constant dense<256> : tensor<1x128xi32> + %cst_11 = arith.constant dense<512> : tensor<16x1xi32> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<16x1xi32> + %5 = arith.addi %4, %3 : tensor<16x1xi32> + %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr> + %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64> + %11 = arith.remsi %5, %cst_11 : tensor<16x1xi32> + %12 = arith.muli %11, %cst_9 : tensor<16x1xi32> + %13 = tt.broadcast %12 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %15 = arith.muli %5, %cst_9 : tensor<16x1xi32> + %16 = tt.broadcast %15 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %18 = arith.addi %10, %cst_4 : tensor<16x1xi64> + %19 = arith.cmpi slt, %10, %cst_3 : tensor<16x1xi64> + %20 = arith.select %19, %18, %10 : tensor<16x1xi1>, tensor<16x1xi64> + %21 = arith.cmpi sge, %20, %cst_3 : tensor<16x1xi64> + %22 = arith.cmpi slt, %20, %cst_4 : tensor<16x1xi64> + %23 = arith.andi %21, %22 : tensor<16x1xi1> + %24 = arith.muli %20, %cst_2 : tensor<16x1xi64> + %25 = tt.broadcast %24 : (tensor<16x1xi64>) -> tensor<16x128xi64> + %26 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) : i32 { + %51 = tt.splat %arg8 : (i32) -> tensor<1x128xi32> + %52 = arith.addi %51, %7 : tensor<1x128xi32> + %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x128xi32> + %54 = tt.broadcast %52 : (tensor<1x128xi32>) -> tensor<16x128xi32> + %55 = arith.addi %54, %13 : tensor<16x128xi32> + %56 = tt.addptr %14, %55 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %57 = tt.broadcast %53 : (tensor<1x128xi1>) -> tensor<16x128xi1> + %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> + %59 = arith.addi %54, %16 : tensor<16x128xi32> + %60 = tt.addptr %17, %59 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xbf16> + %62 = arith.extf %61 : tensor<16x128xbf16> to tensor<16x128xf32> + tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1> + %63 = arith.extsi %52 : tensor<1x128xi32> to tensor<1x128xi64> + %64 = tt.broadcast %63 : (tensor<1x128xi64>) -> tensor<16x128xi64> + %65 = arith.addi %64, %25 : tensor<16x128xi64> + %66 = tt.addptr %26, %65 : tensor<16x128x!tt.ptr>, tensor<16x128xi64> + %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 
: i32, isVolatile = false} : tensor<16x128xf32> + %68 = arith.addf %67, %58 : tensor<16x128xf32> + %69 = arith.addf %68, %62 : tensor<16x128xf32> + %70 = arith.subf %69, %arg9 : tensor<16x128xf32> + %71 = arith.addf %arg11, %cst_1 : tensor<16x128xf32> + %72 = arith.divf %70, %71 : tensor<16x128xf32> + %73 = arith.addf %arg9, %72 : tensor<16x128xf32> + %74 = arith.subf %69, %73 : tensor<16x128xf32> + %75 = arith.mulf %70, %74 : tensor<16x128xf32> + %76 = arith.addf %arg10, %75 : tensor<16x128xf32> + %77 = arith.select %57, %73, %arg9 : tensor<16x128xi1>, tensor<16x128xf32> + %78 = arith.select %57, %76, %arg10 : tensor<16x128xi1>, tensor<16x128xf32> + %79 = arith.select %57, %71, %arg11 : tensor<16x128xi1>, tensor<16x128xf32> + scf.yield %77, %78, %79 : tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32> + } + %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %51 = arith.subf %arg11, %arg8 : f32 + %52 = arith.addf %arg10, %arg13 : f32 + %53 = arith.cmpf oeq, %52, %cst_0 : f32 + %54 = arith.divf %arg13, %52 : f32 + %55 = arith.select %53, %cst_0, %54 : f32 + %56 = arith.mulf %51, %55 : f32 + %57 = arith.addf %arg8, %56 : f32 + %58 = arith.addf %arg9, %arg12 : f32 + %59 = arith.mulf %51, %51 : f32 + %60 = arith.mulf %59, %arg10 : f32 + %61 = arith.mulf %60, %55 : f32 + %62 = arith.addf %58, %61 : f32 + tt.reduce.return %57, %62, %52 : f32, f32, f32 + }) : (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>) + %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %31 = arith.muli %11, %cst_9 : tensor<16x1xi32> + %32 = tt.broadcast %31 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %33 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %34 = arith.muli %5, %cst_9 : tensor<16x1xi32> + %35 = tt.broadcast %34 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %36 = tt.splat %arg3 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %37 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x128x!tt.ptr> + %38 = arith.addi %10, %cst_4 : tensor<16x1xi64> + %39 = arith.cmpi slt, %10, %cst_3 : tensor<16x1xi64> + %40 = arith.select %39, %38, %10 : tensor<16x1xi1>, tensor<16x1xi64> + %41 = arith.cmpi sge, %40, %cst_3 : tensor<16x1xi64> + %42 = arith.cmpi slt, %40, %cst_4 : tensor<16x1xi64> + %43 = arith.andi %41, %42 : tensor<16x1xi1> + %44 = arith.muli %40, %cst_2 : tensor<16x1xi64> + %45 = tt.broadcast %44 : (tensor<16x1xi64>) -> tensor<16x128xi64> + %46 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %47 = tt.broadcast %29 : (tensor<16x1xf32>) -> tensor<16x128xf32> + %48 = arith.divf %30, %cst_6 : tensor<16x1xf32> + %49 = arith.addf %48, %cst_5 : tensor<16x1xf32> + %50 = tt.splat %arg5 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 : i32 { + %51 = tt.splat %arg8 : (i32) -> tensor<1x128xi32> + %52 = arith.addi %51, %7 : tensor<1x128xi32> + %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x128xi32> + %54 = tt.broadcast %52 : (tensor<1x128xi32>) -> tensor<16x128xi32> + %55 = arith.addi %54, %32 : tensor<16x128xi32> + %56 = tt.addptr %33, %55 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %57 = tt.broadcast %53 : (tensor<1x128xi1>) -> tensor<16x128xi1> + %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> + %59 = arith.addi %54, %35 : tensor<16x128xi32> 
+ %60 = tt.addptr %36, %59 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xbf16> + %62 = arith.extf %61 : tensor<16x128xbf16> to tensor<16x128xf32> + %63 = tt.addptr %37, %52 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> + %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32> + tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1> + %65 = arith.extsi %52 : tensor<1x128xi32> to tensor<1x128xi64> + %66 = tt.broadcast %65 : (tensor<1x128xi64>) -> tensor<16x128xi64> + %67 = arith.addi %66, %45 : tensor<16x128xi64> + %68 = tt.addptr %46, %67 : tensor<16x128x!tt.ptr>, tensor<16x128xi64> + %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32> + %70 = arith.addf %69, %58 : tensor<16x128xf32> + %71 = arith.addf %70, %62 : tensor<16x128xf32> + %72 = arith.subf %71, %47 : tensor<16x128xf32> + %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32> + %74 = tt.broadcast %73 : (tensor<16x1xf32>) -> tensor<16x128xf32> + %75 = arith.mulf %72, %74 : tensor<16x128xf32> + %76 = tt.broadcast %64 : (tensor<1x128xf32>) -> tensor<16x128xf32> + %77 = arith.mulf %75, %76 : tensor<16x128xf32> + %78 = tt.addptr %50, %59 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %79 = arith.truncf %77 : tensor<16x128xf32> to tensor<16x128xbf16> + tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16> + } + tt.return + } +} diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.cubin b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..eebb6bcc60fcb762d4622fb278da207b219409f2 Binary files /dev/null and b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.cubin differ diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6c93281712f9e553538000e6a6a99e22f723d4fa --- /dev/null +++ b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx @@ -0,0 +1,1360 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de( + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_4, + .param .u64 
triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_7, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_8, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_9, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_10, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_11, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_12, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_13, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_14, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_15, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_16, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_17, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_18, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_19, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_20, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_21, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_22, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_23, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_24, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_25, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_26, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_27, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_28, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_29, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_30 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<96>; + .reg .b16 %rs<37>; + .reg .b32 %r<222>; + .reg .f32 %f<186>; + .reg .b64 %rd<93>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_0]; + ld.param.u64 %rd61, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r188, %tid.x; + and.b32 %r189, %r188, 
31; + ld.param.u64 %rd62, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_2]; + ld.param.u64 %rd63, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_3]; + ld.param.u64 %rd64, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_4]; + shl.b32 %r190, %r188, 2; + ld.param.u64 %rd65, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_5]; + and.b32 %r191, %r190, 252; + ld.param.u64 %rd66, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_6]; + ld.param.u64 %rd67, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_7]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r192, %r1, 8; + ld.param.u64 %rd68, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_8]; + .loc 1 30 36 + or.b32 %r193, %r192, %r191; + ld.param.u64 %rd69, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_9]; + ld.param.u64 %rd70, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_10]; + .loc 1 30 30 + mul.wide.s32 %rd71, %r193, 4; + add.s64 %rd1, %rd60, %rd71; + ld.param.u64 %rd72, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_11]; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + ld.param.u64 %rd73, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_12]; + ld.param.u64 %rd74, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_13]; + ld.param.u64 %rd75, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_14]; + ld.param.u64 %rd76, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_15]; + ld.param.u64 %rd77, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_16]; + mov.b32 %f1, %r2; + ld.param.u64 %rd78, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_17]; + mov.b32 %f2, %r3; + ld.param.u64 %rd79, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_18]; + mov.b32 %f3, %r4; + ld.param.u64 %rd80, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_19]; + mov.b32 %f4, %r5; + ld.param.u64 %rd81, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_20]; + .loc 1 31 30 + mul.wide.s32 %rd82, %r193, 2; + add.s64 %rd2, %rd61, %rd82; + ld.param.u64 %rd83, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_21]; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + ld.param.u64 %rd84, 
[triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_22]; + ld.param.u64 %rd85, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_23]; + ld.param.u64 %rd86, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_24]; + cvt.u16.u32 %rs1, %r10; + ld.param.u64 %rd87, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_25]; + ld.param.u64 %rd88, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_26]; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + ld.param.u64 %rd89, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_27]; + cvt.u16.u32 %rs3, %r11; + ld.param.u64 %rd90, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_28]; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f5, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f6, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f7, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f8, %r17; + .loc 1 32 30 + add.s64 %rd3, %rd62, %rd82; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + .loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f9, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f10, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f11, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f12, %r25; + .loc 1 33 30 + mul.wide.s32 %rd91, %r1, 4; + add.s64 %rd4, %rd63, %rd91; + .loc 1 33 35 + mov.u32 %r26, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ]; + mov.b32 %f13, %r26; + mov.u32 %r27, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ]; + mov.u32 %r28, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ]; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ]; + .loc 1 34 30 + add.s64 %rd8, %rd64, %rd91; + .loc 1 34 35 + mov.u32 %r30, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r30 }, [ %rd8 + 0 ]; + mov.b32 %f14, %r30; + mov.u32 %r31, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ]; + mov.u32 %r32, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ]; + mov.u32 %r33, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ]; + .loc 1 35 31 + add.s64 %rd12, %rd65, %rd82; + .loc 1 35 47 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + @%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd12 + 0 ]; + @!%p1 mov.u32 %r34, %r6; + @!%p1 mov.u32 %r35, %r6; + cvt.u16.u32 %rs9, %r34; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r34; } + cvt.u16.u32 %rs11, %r35; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r35; } + .loc 1 35 68 + cvt.f32.bf16 %r38, %rs9; + mov.b32 %f15, %r38; + cvt.f32.bf16 %r39, %rs10; + mov.b32 %f16, %r39; + cvt.f32.bf16 %r40, %rs11; + mov.b32 %f17, %r40; + cvt.f32.bf16 %r41, %rs12; + mov.b32 %f18, %r41; + .loc 1 36 31 + add.s64 %rd13, %rd66, %rd91; + .loc 1 36 36 + mov.u32 %r42, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r42 }, [ %rd13 + 0 ]; + mov.b32 %f19, %r42; + mov.u32 %r43, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r43 }, [ %rd13 + 0 ]; + mov.u32 %r44, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r44 }, [ %rd13 + 0 ]; + mov.u32 
%r45, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r45 }, [ %rd13 + 0 ]; + .loc 1 37 31 + add.s64 %rd17, %rd67, %rd91; + .loc 1 37 36 + mov.u32 %r46, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r46 }, [ %rd17 + 0 ]; + mov.b32 %f20, %r46; + mov.u32 %r47, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r47 }, [ %rd17 + 0 ]; + mov.u32 %r48, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r48 }, [ %rd17 + 0 ]; + mov.u32 %r49, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r49 }, [ %rd17 + 0 ]; + .loc 1 38 31 + add.s64 %rd21, %rd68, %rd82; + .loc 1 38 47 + mov.u32 %r50, 0x0; + mov.u32 %r51, 0x0; + @%p1 ld.global.v2.b32 { %r50, %r51 }, [ %rd21 + 0 ]; + @!%p1 mov.u32 %r50, %r6; + @!%p1 mov.u32 %r51, %r6; + cvt.u16.u32 %rs13, %r50; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r50; } + cvt.u16.u32 %rs15, %r51; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r51; } + .loc 1 38 68 + cvt.f32.bf16 %r54, %rs13; + mov.b32 %f21, %r54; + cvt.f32.bf16 %r55, %rs14; + mov.b32 %f22, %r55; + cvt.f32.bf16 %r56, %rs15; + mov.b32 %f23, %r56; + cvt.f32.bf16 %r57, %rs16; + mov.b32 %f24, %r57; + .loc 1 39 31 + add.s64 %rd22, %rd69, %rd82; + .loc 1 39 47 + mov.u32 %r58, 0x0; + mov.u32 %r59, 0x0; + @%p1 ld.global.v2.b32 { %r58, %r59 }, [ %rd22 + 0 ]; + @!%p1 mov.u32 %r58, %r6; + @!%p1 mov.u32 %r59, %r6; + cvt.u16.u32 %rs17, %r58; + { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r58; } + cvt.u16.u32 %rs19, %r59; + { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r59; } + .loc 1 39 68 + cvt.f32.bf16 %r62, %rs17; + mov.b32 %f25, %r62; + cvt.f32.bf16 %r63, %rs18; + mov.b32 %f26, %r63; + cvt.f32.bf16 %r64, %rs19; + mov.b32 %f27, %r64; + cvt.f32.bf16 %r65, %rs20; + mov.b32 %f28, %r65; + .loc 1 40 32 + add.s64 %rd23, %rd70, %rd82; + .loc 1 40 48 + mov.u32 %r66, 0x0; + mov.u32 %r67, 0x0; + @%p1 ld.global.v2.b32 { %r66, %r67 }, [ %rd23 + 0 ]; + @!%p1 mov.u32 %r66, %r6; + @!%p1 mov.u32 %r67, %r6; + cvt.u16.u32 %rs21, %r66; + { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r66; } + cvt.u16.u32 %rs23, %r67; + { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r67; } + .loc 1 40 69 + cvt.f32.bf16 %r70, %rs21; + mov.b32 %f29, %r70; + cvt.f32.bf16 %r71, %rs22; + mov.b32 %f30, %r71; + cvt.f32.bf16 %r72, %rs23; + mov.b32 %f31, %r72; + cvt.f32.bf16 %r73, %rs24; + mov.b32 %f32, %r73; + .loc 1 41 32 + add.s64 %rd24, %rd72, %rd91; + .loc 1 41 37 + mov.u32 %r74, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r74 }, [ %rd24 + 0 ]; + mov.b32 %f33, %r74; + mov.u32 %r75, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r75 }, [ %rd24 + 0 ]; + mov.u32 %r76, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r76 }, [ %rd24 + 0 ]; + mov.u32 %r77, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r77 }, [ %rd24 + 0 ]; + .loc 1 42 32 + add.s64 %rd28, %rd73, %rd91; + .loc 1 42 37 + mov.u32 %r78, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r78 }, [ %rd28 + 0 ]; + mov.b32 %f34, %r78; + mov.u32 %r79, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r79 }, [ %rd28 + 0 ]; + mov.u32 %r80, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r80 }, [ %rd28 + 0 ]; + mov.u32 %r81, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r81 }, [ %rd28 + 0 ]; + .loc 1 43 32 + add.s64 %rd32, %rd74, %rd82; + .loc 1 43 48 + mov.u32 %r82, 0x0; + mov.u32 %r83, 0x0; + @%p1 ld.global.v2.b32 { %r82, %r83 }, [ %rd32 + 0 ]; + @!%p1 mov.u32 %r82, %r6; + @!%p1 mov.u32 %r83, %r6; + cvt.u16.u32 %rs25, %r82; + { .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r82; } + cvt.u16.u32 %rs27, %r83; + { .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r83; } + .loc 1 43 69 + cvt.f32.bf16 %r86, %rs25; + mov.b32 %f35, %r86; + cvt.f32.bf16 %r87, %rs26; + mov.b32 %f36, %r87; + cvt.f32.bf16 %r88, %rs27; + 
mov.b32 %f37, %r88; + cvt.f32.bf16 %r89, %rs28; + mov.b32 %f38, %r89; + .loc 1 44 32 + add.s64 %rd33, %rd75, %rd91; + .loc 1 44 37 + mov.u32 %r90, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r90 }, [ %rd33 + 0 ]; + mov.b32 %f39, %r90; + mov.u32 %r91, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r91 }, [ %rd33 + 0 ]; + mov.u32 %r92, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r92 }, [ %rd33 + 0 ]; + mov.u32 %r93, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r93 }, [ %rd33 + 0 ]; + .loc 1 45 32 + add.s64 %rd37, %rd76, %rd91; + .loc 1 45 37 + mov.u32 %r94, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r94 }, [ %rd37 + 0 ]; + mov.b32 %f40, %r94; + mov.u32 %r95, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r95 }, [ %rd37 + 0 ]; + mov.u32 %r96, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r96 }, [ %rd37 + 0 ]; + mov.u32 %r97, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r97 }, [ %rd37 + 0 ]; + .loc 1 46 32 + add.s64 %rd41, %rd77, %rd82; + .loc 1 46 48 + mov.u32 %r98, 0x0; + mov.u32 %r99, 0x0; + @%p1 ld.global.v2.b32 { %r98, %r99 }, [ %rd41 + 0 ]; + @!%p1 mov.u32 %r98, %r6; + @!%p1 mov.u32 %r99, %r6; + cvt.u16.u32 %rs29, %r98; + { .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r98; } + cvt.u16.u32 %rs31, %r99; + { .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r99; } + .loc 1 46 69 + cvt.f32.bf16 %r102, %rs29; + mov.b32 %f41, %r102; + cvt.f32.bf16 %r103, %rs30; + mov.b32 %f42, %r103; + cvt.f32.bf16 %r104, %rs31; + mov.b32 %f43, %r104; + cvt.f32.bf16 %r105, %rs32; + mov.b32 %f44, %r105; + .loc 1 47 32 + add.s64 %rd42, %rd78, %rd91; + .loc 1 47 37 + mov.u32 %r106, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r106 }, [ %rd42 + 0 ]; + mov.b32 %f45, %r106; + mov.u32 %r107, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r107 }, [ %rd42 + 0 ]; + mov.u32 %r108, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r108 }, [ %rd42 + 0 ]; + mov.u32 %r109, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r109 }, [ %rd42 + 0 ]; + .loc 1 48 32 + add.s64 %rd46, %rd79, %rd91; + .loc 1 48 37 + mov.u32 %r143, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r143 }, [ %rd46 + 0 ]; + mov.b32 %f46, %r143; + mov.u32 %r111, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r111 }, [ %rd46 + 0 ]; + mov.u32 %r112, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r112 }, [ %rd46 + 0 ]; + mov.u32 %r113, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r113 }, [ %rd46 + 0 ]; + .loc 1 49 32 + add.s64 %rd50, %rd80, %rd71; + .loc 1 49 48 + mov.u32 %r114, 0x0; + mov.u32 %r115, 0x0; + mov.u32 %r116, 0x0; + mov.u32 %r117, 0x0; + @%p1 ld.global.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd50 + 0 ]; + @!%p1 mov.u32 %r114, %r6; + @!%p1 mov.u32 %r115, %r6; + @!%p1 mov.u32 %r116, %r6; + @!%p1 mov.u32 %r117, %r6; + .loc 1 50 32 + mul.wide.u32 %rd92, %r191, 4; + add.s64 %rd51, %rd81, %rd92; + .loc 1 50 37 + mov.u32 %r122, 0x0; + mov.u32 %r123, 0x0; + mov.u32 %r124, 0x0; + mov.u32 %r125, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd51 + 0 ]; + @!%p1 mov.u32 %r122, %r6; + @!%p1 mov.u32 %r123, %r6; + @!%p1 mov.u32 %r124, %r6; + @!%p1 mov.u32 %r125, %r6; + .loc 1 52 18 + add.f32 %f47, %f5, %f1; + add.f32 %f48, %f6, %f2; + add.f32 %f49, %f7, %f3; + add.f32 %f50, %f8, %f4; + .loc 1 54 18 + add.f32 %f51, %f47, %f9; + add.f32 %f52, %f48, %f10; + add.f32 %f53, %f49, %f11; + add.f32 %f54, %f50, %f12; + .loc 1 55 18 + sub.f32 %f55, %f51, %f13; + sub.f32 %f56, %f52, %f13; + sub.f32 %f57, %f53, %f13; + sub.f32 %f58, %f54, %f13; + .loc 1 56 19 + mul.f32 %f59, %f55, %f14; + mul.f32 %f60, %f56, %f14; + mul.f32 %f61, %f57, %f14; + mul.f32 %f62, %f58, %f14; + .loc 1 58 19 + add.f32 
%f63, %f51, %f15; + add.f32 %f64, %f52, %f16; + add.f32 %f65, %f53, %f17; + add.f32 %f66, %f54, %f18; + .loc 1 59 20 + sub.f32 %f67, %f63, %f19; + sub.f32 %f68, %f64, %f19; + sub.f32 %f69, %f65, %f19; + sub.f32 %f70, %f66, %f19; + .loc 1 60 20 + mul.f32 %f71, %f67, %f20; + mul.f32 %f72, %f68, %f20; + mul.f32 %f73, %f69, %f20; + mul.f32 %f74, %f70, %f20; + .loc 1 62 20 + add.f32 %f75, %f63, %f21; + add.f32 %f76, %f64, %f22; + add.f32 %f77, %f65, %f23; + add.f32 %f78, %f66, %f24; + .loc 1 64 20 + add.f32 %f79, %f75, %f25; + add.f32 %f80, %f76, %f26; + add.f32 %f81, %f77, %f27; + add.f32 %f82, %f78, %f28; + .loc 1 66 20 + add.f32 %f83, %f79, %f29; + add.f32 %f84, %f80, %f30; + add.f32 %f85, %f81, %f31; + add.f32 %f86, %f82, %f32; + .loc 1 67 20 + sub.f32 %f87, %f83, %f33; + sub.f32 %f88, %f84, %f33; + sub.f32 %f89, %f85, %f33; + sub.f32 %f90, %f86, %f33; + .loc 1 68 20 + mul.f32 %f91, %f87, %f34; + mul.f32 %f92, %f88, %f34; + mul.f32 %f93, %f89, %f34; + mul.f32 %f94, %f90, %f34; + .loc 1 70 20 + add.f32 %f95, %f83, %f35; + add.f32 %f96, %f84, %f36; + add.f32 %f97, %f85, %f37; + add.f32 %f98, %f86, %f38; + .loc 1 71 20 + sub.f32 %f99, %f95, %f39; + sub.f32 %f100, %f96, %f39; + sub.f32 %f101, %f97, %f39; + sub.f32 %f102, %f98, %f39; + .loc 1 72 20 + mul.f32 %f103, %f99, %f40; + mul.f32 %f104, %f100, %f40; + mul.f32 %f105, %f101, %f40; + mul.f32 %f106, %f102, %f40; + .loc 1 74 20 + add.f32 %f107, %f95, %f41; + add.f32 %f108, %f96, %f42; + add.f32 %f109, %f97, %f43; + add.f32 %f110, %f98, %f44; + .loc 1 75 20 + sub.f32 %f111, %f107, %f45; + sub.f32 %f112, %f108, %f45; + sub.f32 %f113, %f109, %f45; + sub.f32 %f114, %f110, %f45; + .loc 1 76 20 + mul.f32 %f115, %f111, %f46; + mul.f32 %f116, %f112, %f46; + mul.f32 %f117, %f113, %f46; + mul.f32 %f118, %f114, %f46; + .loc 1 49 48 + mov.b32 %f119, %r115; + mov.b32 %f120, %r114; + .loc 1 50 37 + mov.b32 %f121, %r123; + mov.b32 %f122, %r122; + .loc 1 77 20 + mul.f32 %f123, %f120, %f122; + mul.f32 %f124, %f119, %f121; + .loc 1 49 48 + mov.b32 %f125, %r116; + mov.b32 %f126, %r117; + .loc 1 50 37 + mov.b32 %f127, %r124; + mov.b32 %f128, %r125; + .loc 1 77 20 + mul.f32 %f129, %f126, %f128; + mul.f32 %f130, %f125, %f127; +$L__tmp1: + .loc 2 233 15 + fma.rn.f32 %f131, %f120, %f122, %f124; + fma.rn.f32 %f132, %f125, %f127, %f131; + fma.rn.f32 %f133, %f126, %f128, %f132; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r194, %f133; + shfl.sync.bfly.b32 %r195, %r194, 16, 31, -1; + mov.b32 %f134, %r195; +$L__tmp3: + .loc 2 233 15 + add.f32 %f135, %f133, %f134; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r196, %f135; + shfl.sync.bfly.b32 %r197, %r196, 8, 31, -1; + mov.b32 %f136, %r197; +$L__tmp5: + .loc 2 233 15 + add.f32 %f137, %f135, %f136; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r198, %f137; + shfl.sync.bfly.b32 %r199, %r198, 4, 31, -1; + mov.b32 %f138, %r199; +$L__tmp7: + .loc 2 233 15 + add.f32 %f139, %f137, %f138; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r200, %f139; + shfl.sync.bfly.b32 %r201, %r200, 2, 31, -1; + mov.b32 %f140, %r201; +$L__tmp9: + .loc 2 233 15 + add.f32 %f141, %f139, %f140; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r202, %f141; + shfl.sync.bfly.b32 %r203, %r202, 1, 31, -1; + mov.b32 %f142, %r203; +$L__tmp11: + .loc 2 233 15 + add.f32 %f143, %f141, %f142; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p80, %r189, 0; + shr.u32 %r204, %r188, 3; + and.b32 %r205, %r204, 4; + mov.u32 %r206, global_smem; + add.s32 %r130, %r206, %r205; + mov.b32 %r131, %f143; + @%p80 st.shared.b32 [ %r130 + 0 ], %r131; + bar.sync 0; + setp.lt.s32 %p81, %r188, 2; + add.s32 %r133, %r206, %r190; + 
@%p81 ld.shared.b32 %r132, [ %r133 + 0 ]; + mov.b32 %f144, %r132; + shfl.sync.bfly.b32 %r207, %r132, 1, 31, -1; + mov.b32 %f145, %r207; +$L__tmp13: + .loc 2 233 15 + add.f32 %f146, %f144, %f145; +$L__tmp14: + .loc 2 243 36 + and.b32 %r208, %r188, 1; + setp.eq.b32 %p94, %r208, 1; + not.pred %p95, %p94; + and.pred %p82, %p81, %p95; + mov.b32 %r135, %f146; + @%p82 st.shared.b32 [ %r133 + 0 ], %r135; + bar.sync 0; + ld.shared.f32 %f147, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f148, %f147, 0f00000000; +$L__tmp16: + .loc 1 81 20 + mul.f32 %f149, %f116, %f124; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f150, %f115, %f123, %f149; + fma.rn.f32 %f151, %f117, %f130, %f150; + fma.rn.f32 %f152, %f118, %f129, %f151; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r209, %f152; + shfl.sync.bfly.b32 %r210, %r209, 16, 31, -1; + mov.b32 %f153, %r210; +$L__tmp20: + .loc 2 233 15 + add.f32 %f154, %f152, %f153; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r211, %f154; + shfl.sync.bfly.b32 %r212, %r211, 8, 31, -1; + mov.b32 %f155, %r212; +$L__tmp22: + .loc 2 233 15 + add.f32 %f156, %f154, %f155; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r213, %f156; + shfl.sync.bfly.b32 %r214, %r213, 4, 31, -1; + mov.b32 %f157, %r214; +$L__tmp24: + .loc 2 233 15 + add.f32 %f158, %f156, %f157; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r215, %f158; + shfl.sync.bfly.b32 %r216, %r215, 2, 31, -1; + mov.b32 %f159, %r216; +$L__tmp26: + .loc 2 233 15 + add.f32 %f160, %f158, %f159; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r217, %f160; + shfl.sync.bfly.b32 %r218, %r217, 1, 31, -1; + mov.b32 %f161, %r218; +$L__tmp28: + .loc 2 233 15 + add.f32 %f162, %f160, %f161; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r137, %f162; + @%p80 st.shared.b32 [ %r130 + 0 ], %r137; + bar.sync 0; + @%p81 ld.shared.b32 %r138, [ %r133 + 0 ]; + mov.b32 %f163, %r138; + shfl.sync.bfly.b32 %r219, %r138, 1, 31, -1; + mov.b32 %f164, %r219; +$L__tmp30: + .loc 2 233 15 + add.f32 %f165, %f163, %f164; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r141, %f165; + @%p82 st.shared.b32 [ %r133 + 0 ], %r141; + bar.sync 0; + ld.shared.f32 %f166, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f167, %f166, 0f00000000; + mov.b32 %r144, 1132462080; +$L__tmp33: + .loc 1 86 20 + div.full.f32 %r142, %r143, %r144; + mov.b32 %f168, %r142; + .loc 1 88 20 + neg.f32 %f169, %f148; + fma.rn.f32 %f170, %f123, 0f43800000, %f169; + fma.rn.f32 %f171, %f124, 0f43800000, %f169; + fma.rn.f32 %f172, %f130, 0f43800000, %f169; + fma.rn.f32 %f173, %f129, 0f43800000, %f169; + .loc 1 90 20 + neg.f32 %f174, %f115; + fma.rn.f32 %f175, %f174, %f167, %f170; + neg.f32 %f176, %f116; + fma.rn.f32 %f177, %f176, %f167, %f171; + neg.f32 %f178, %f117; + fma.rn.f32 %f179, %f178, %f167, %f172; + neg.f32 %f180, %f118; + fma.rn.f32 %f181, %f180, %f167, %f173; + .loc 1 91 20 + mul.f32 %f182, %f168, %f175; + mul.f32 %f183, %f168, %f177; + mul.f32 %f184, %f168, %f179; + mul.f32 %f185, %f168, %f181; + .loc 1 93 25 + add.s64 %rd52, %rd83, %rd71; + .loc 1 93 48 + mov.b32 %r154, %f59; + mov.b32 %r155, %f60; + mov.b32 %r156, %f61; + mov.b32 %r157, %f62; + @%p1 st.global.v4.b32 [ %rd52 + 0 ], { %r154, %r155, %r156, %r157 }; + .loc 1 94 25 + add.s64 %rd53, %rd84, %rd71; + .loc 1 94 48 + mov.b32 %r158, %f71; + mov.b32 %r159, %f72; + mov.b32 %r160, %f73; + mov.b32 %r161, %f74; + @%p1 st.global.v4.b32 [ %rd53 + 0 ], { %r158, %r159, %r160, %r161 }; + .loc 1 95 25 + add.s64 %rd54, %rd85, %rd71; + .loc 1 95 48 + mov.b32 %r162, %f75; + mov.b32 %r163, %f76; + mov.b32 %r164, %f77; + mov.b32 %r165, %f78; + @%p1 
st.global.v4.b32 [ %rd54 + 0 ], { %r162, %r163, %r164, %r165 }; + .loc 1 96 25 + add.s64 %rd55, %rd86, %rd71; + .loc 1 96 48 + mov.b32 %r166, %f91; + mov.b32 %r167, %f92; + mov.b32 %r168, %f93; + mov.b32 %r169, %f94; + @%p1 st.global.v4.b32 [ %rd55 + 0 ], { %r166, %r167, %r168, %r169 }; + .loc 1 97 25 + add.s64 %rd56, %rd87, %rd71; + .loc 1 97 48 + mov.b32 %r170, %f103; + mov.b32 %r171, %f104; + mov.b32 %r172, %f105; + mov.b32 %r173, %f106; + @%p1 st.global.v4.b32 [ %rd56 + 0 ], { %r170, %r171, %r172, %r173 }; + .loc 1 98 25 + add.s64 %rd57, %rd88, %rd71; + .loc 1 98 48 + mov.b32 %r174, %f115; + mov.b32 %r175, %f116; + mov.b32 %r176, %f117; + mov.b32 %r177, %f118; + @%p1 st.global.v4.b32 [ %rd57 + 0 ], { %r174, %r175, %r176, %r177 }; + .loc 1 99 25 + add.s64 %rd58, %rd89, %rd71; + .loc 1 99 48 + mov.b32 %r178, %f182; + mov.b32 %r179, %f183; + mov.b32 %r180, %f184; + mov.b32 %r181, %f185; + @%p1 st.global.v4.b32 [ %rd58 + 0 ], { %r178, %r179, %r180, %r181 }; + .loc 1 100 25 + add.s64 %rd59, %rd90, %rd82; + .loc 1 100 48 + cvt.rn.bf16.f32 %rs33, %r178; + cvt.rn.bf16.f32 %rs34, %r179; + cvt.rn.bf16.f32 %rs35, %r180; + cvt.rn.bf16.f32 %rs36, %r181; + mov.b32 %r220, {%rs33, %rs34}; + mov.b32 %r221, {%rs35, %rs36}; + @%p1 st.global.v2.b32 [ %rd59 + 0 ], { %r220, %r221 }; + .loc 1 100 4 + ret; +$L__tmp34: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/yo/cyo4ksjyladdfw6jgu5nyxbapyihb5b54nc6mogi76rx2lajsiff.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 533 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 121 +.b8 111 +.b8 52 +.b8 107 +.b8 115 +.b8 106 +.b8 121 +.b8 108 +.b8 97 +.b8 100 +.b8 100 +.b8 102 +.b8 119 +.b8 54 +.b8 106 +.b8 103 +.b8 117 +.b8 53 +.b8 110 +.b8 121 +.b8 120 +.b8 98 +.b8 97 +.b8 112 +.b8 121 +.b8 105 +.b8 104 +.b8 98 +.b8 53 +.b8 98 +.b8 53 +.b8 52 +.b8 110 +.b8 99 +.b8 54 +.b8 109 +.b8 111 +.b8 103 +.b8 105 +.b8 55 +.b8 54 +.b8 114 +.b8 120 +.b8 50 +.b8 108 +.b8 97 +.b8 106 +.b8 115 +.b8 105 +.b8 102 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 121 +.b8 111 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 49 +.b8 49 +.b8 100 +.b8 49 
+.b8 50 +.b8 100 +.b8 49 +.b8 51 +.b8 100 +.b8 49 +.b8 52 +.b8 100 +.b8 49 +.b8 53 +.b8 100 +.b8 49 +.b8 54 +.b8 100 +.b8 49 +.b8 55 +.b8 100 +.b8 49 +.b8 56 +.b8 100 +.b8 49 +.b8 57 +.b8 100 +.b8 50 +.b8 48 +.b8 100 +.b8 50 +.b8 49 +.b8 100 +.b8 50 +.b8 50 +.b8 100 +.b8 50 +.b8 51 +.b8 100 +.b8 50 +.b8 52 +.b8 100 +.b8 50 +.b8 53 +.b8 100 +.b8 50 +.b8 54 +.b8 100 +.b8 50 +.b8 55 +.b8 100 +.b8 50 +.b8 56 +.b8 100 +.b8 50 +.b8 57 +.b8 100 +.b8 101 +.b8 51 +.b8 48 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 49 +.b8 49 +.b8 100 +.b8 49 +.b8 50 +.b8 100 +.b8 49 +.b8 51 +.b8 100 +.b8 49 +.b8 52 +.b8 100 +.b8 49 +.b8 53 +.b8 100 +.b8 49 +.b8 54 +.b8 100 +.b8 49 +.b8 55 +.b8 100 +.b8 49 +.b8 56 +.b8 100 +.b8 49 +.b8 57 +.b8 100 +.b8 50 +.b8 48 +.b8 100 +.b8 50 +.b8 49 +.b8 100 +.b8 50 +.b8 50 +.b8 100 +.b8 50 +.b8 51 +.b8 100 +.b8 50 +.b8 52 +.b8 100 +.b8 50 +.b8 53 +.b8 100 +.b8 50 +.b8 54 +.b8 100 +.b8 50 +.b8 55 +.b8 100 +.b8 50 +.b8 56 +.b8 100 +.b8 50 +.b8 57 +.b8 100 +.b8 101 +.b8 51 +.b8 48 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 80 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 80 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 80 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 84 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 84 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 84 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 537 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 49 +.b8 49 +.b8 100 +.b8 49 +.b8 50 +.b8 100 +.b8 49 +.b8 51 +.b8 100 +.b8 49 +.b8 52 +.b8 100 +.b8 49 +.b8 53 +.b8 100 +.b8 49 +.b8 54 +.b8 100 +.b8 49 +.b8 55 +.b8 100 +.b8 49 +.b8 56 +.b8 100 +.b8 49 +.b8 57 +.b8 100 +.b8 50 +.b8 48 +.b8 100 +.b8 50 +.b8 49 +.b8 100 +.b8 50 +.b8 50 +.b8 100 +.b8 50 +.b8 51 +.b8 100 +.b8 50 +.b8 52 +.b8 100 +.b8 50 +.b8 53 +.b8 100 +.b8 50 +.b8 54 +.b8 100 +.b8 50 +.b8 55 +.b8 100 +.b8 50 +.b8 56 +.b8 100 +.b8 50 +.b8 57 +.b8 100 +.b8 101 +.b8 51 +.b8 48 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 537 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.cubin b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..98b8843640d8f4a6c3f43f28200d8db47faa455c Binary files /dev/null and b/.triton/dump/fc9988fe599aa093558c9034385b9a0b/triton_.cubin differ
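
Note on the triton_.ttir / triton_.ttgir dumps above: they are the same torchinductor-generated kernel before and after GPU layout assignment. It fuses an embedding gather (vocabulary size 50257, with negative ids wrapped once and then bounds-checked by tt.assert) and two residual adds, followed by a 256-feature LayerNorm whose row statistics are accumulated with Welford's parallel algorithm; the three-value tt.reduce combine region is exactly that merge step. The NumPy sketch below is a reconstruction for orientation only, not the kernel's source, and every name in it is illustrative; it also omits the column masking the real loops apply.

    # Hypothetical NumPy reconstruction of the fused kernel in the dumps above.
    import numpy as np

    def welford_combine(mean_a, m2_a, w_a, mean_b, m2_b, w_b):
        # Mirrors the tt.reduce combine region: merge two (mean, m2, weight)
        # partial statistics (the Chan et al. parallel-variance update).
        delta = mean_b - mean_a
        w = w_a + w_b
        frac = 0.0 if w == 0.0 else w_b / w       # the guarded divf/select pair
        mean = mean_a + delta * frac
        m2 = m2_a + m2_b + delta * delta * w_a * frac  # w_a*w_b/w == w_a*frac
        return mean, m2, w

    def wrap_index(idx, vocab=50257):
        # Negative token ids wrap once; the bounds check is what lowers to
        # tt.assert in the dump.
        safe = idx + vocab if idx < 0 else idx
        assert 0 <= safe < vocab, "index out of bounds: 0 <= tmp3 < 50257"
        return safe

    def layernorm_row(x, weight, eps=9.99999974e-6):  # eps is 1e-5 rounded to f32
        # Second loop of the kernel: normalize with the reduced statistics.
        x = x.astype(np.float32)
        mean = x.mean()
        m2 = ((x - mean) ** 2).sum()
        rstd = 1.0 / np.sqrt(m2 / x.size + eps)   # x.size is 256 in the dump
        return (x - mean) * rstd * weight         # the kernel stores this as bf16

    # Quick check that the merge reproduces whole-row statistics.
    rng = np.random.default_rng(0)
    x = rng.standard_normal(256).astype(np.float32)
    a, b = x[:128], x[128:]
    mean, m2, _ = welford_combine(a.mean(), ((a - a.mean()) ** 2).sum(), 128.0,
                                  b.mean(), ((b - b.mean()) ** 2).sum(), 128.0)
    assert np.allclose([mean, m2], [x.mean(), ((x - x.mean()) ** 2).sum()],
                       atol=1e-3)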
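
Note on the triton_.ptx dump above: it is a different kernel (31 parameters, a chain of residual adds and (x - mean) * rstd normalizations), but its reduction machinery is the usual pattern: each 32-lane warp folds its partial sums with shfl.sync.bfly.b32 at offsets 16, 8, 4, 2 and 1, and the block's two warps then combine through global_smem. A minimal pure-Python model of the intra-warp butterfly stage, assuming one value per lane:

    def butterfly_reduce(values):
        # values[i] stands for lane i of a 32-lane warp.
        lanes = [float(v) for v in values]
        for offset in (16, 8, 4, 2, 1):          # the shfl.sync.bfly.b32 offsets
            # Every lane exchanges with lane (i ^ offset) and adds, in lockstep.
            lanes = [lanes[i] + lanes[i ^ offset] for i in range(32)]
        return lanes                             # each lane holds the warp total

    assert butterfly_reduce(range(32))[0] == sum(range(32))

Because the pairing index is i ^ offset, every lane finishes with the same total, which is why the PTX can have lane 0 of each warp store its register straight to shared memory without an extra broadcast.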