0-hero
/

gpt2-pos-encoding-experiment-10B

Model card Files Files and versions Community

gpt2-pos-encoding-experiment-10B / .triton /dump /1ed98b0d136db679153ca6a42fff755c /triton_.llir

0-hero

Add files using upload-large-folder tool

d742687 verified 5 months ago

raw

history blame

30.6 kB

	; ModuleID = 'LLVMDialectModule'
	source_filename = "LLVMDialectModule"

	; Strings passed to @__assertfail when an index bounds check in the kernel
	; fails. @__assertfail receives plain pointers with no length, so each string
	; carries an explicit trailing NUL (\00) and its array length includes it;
	; without the terminator the reader would run past the end of the constant.
	@assertFunc_1 = internal constant [26 x i8] c"_call_with_frames_removed\00"
	@assertFile_1 = internal constant [39 x i8] c"<frozen importlib._bootstrap_external>\00"
	@assertMessage_1 = internal constant [40 x i8] c"index out of bounds: 0 <= tmp13 < 50257\00"
	@assertFunc_0 = internal constant [26 x i8] c"_call_with_frames_removed\00"
	@assertFile_0 = internal constant [39 x i8] c"<frozen importlib._bootstrap_external>\00"
	@assertMessage_0 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp3 < 50257\00"
	; Shared-memory (addrspace 3) scratch buffer; declared external with zero
	; length, so its real size is supplied from outside this module.
	@global_smem = external addrspace(3) global [0 x i8]
	; Key string handed to @__nvvm_reflect to query the flush-to-zero setting.
	@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1

	; Assertion-failure handler, declared here and defined outside this module.
	; The kernel calls it as (message string, file string, i32 883, function-name
	; string, i64 1) whenever its index bounds check fails.
	declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr

	; Kernel entry point (tagged "kernel" with maxntidx = 128 in !nvvm.annotations).
	;
	; Arguments, as they are used by the body:
	;   %0  global pointer read as i64 elements (one index per row).
	;   %1  global float table, addressed at index*256 + column; a negative index
	;       is wrapped by adding 50257 rows (12865792 = 50257 * 256 elements).
	;   %2  global float table, addressed at (row srem 512)*256 + column.
	;   %3  global float vector, addressed at offsets 0..254 (two floats per thread).
	;   %4  global 16-bit output, addressed at row*256 + column, written as bf16.
	;   %5, %6  i32 arguments that the body never references.
	;
	; Per row the kernel computes x = %2[...] + %1[...] over 256 columns, reduces a
	; (mean, m2, weight) triple across the row, and stores
	; (x - mean) * rsqrt(m2/256 + eps) * %3[column] converted to bf16.
	; NOTE(review): this has the shape of an embedding lookup fused with a
	; weight-only layer norm — confirm against the generating Python source.
	define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
	; --- Thread-index decomposition -------------------------------------------
	; %9  = tid & 31 (lane), %10 = tid >> 5 (warp), %12 = bit 6 of tid,
	; %13 = bit 0 of tid, %14 = bit 0 of the warp number,
	; %15 = (tid * 4) & 252 (column offset, 4 columns per thread),
	; %17 = (tid * 2) & 254 (offset used for the %3 load).
	%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
	%9 = and i32 %8, 31, !dbg !10
	%10 = lshr i32 %8, 5, !dbg !10
	%11 = lshr i32 %8, 6, !dbg !10
	%12 = and i32 %11, 1, !dbg !10
	%13 = and i32 %8, 1, !dbg !10
	%14 = and i32 %10, 1, !dbg !11
	%urem = shl i32 %8, 2, !dbg !11
	%15 = and i32 %urem, 252, !dbg !11
	%16 = shl i32 %8, 1, !dbg !11
	%17 = and i32 %16, 254, !dbg !11
	; --- Row selection: each block covers rows 2*ctaid and 2*ctaid + 1 ---------
	; %20 picks the row from tid bit 6, %21 picks it from tid bit 0.
	%18 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
	%19 = shl i32 %18, 1, !dbg !13
	%20 = or i32 %19, %12, !dbg !14
	%21 = or i32 %19, %13, !dbg !14
	%22 = sext i32 %20 to i64, !dbg !15
	%23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !15
	%24 = sext i32 %21 to i64, !dbg !15
	%25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !15
	; Load the 64-bit index from %0. %26..%29 all read the same address (%23);
	; only %26 is used later. %30 reads the %21-based address and feeds the
	; bounds check below.
	%26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
	%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
	%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
	%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !16
	%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !16
	; --- Load four floats from %2 at (row srem 512) * 256 + column -------------
	%31 = srem i32 %20, 512, !dbg !17
	%32 = shl nsw i32 %31, 8, !dbg !18
	%33 = or i32 %32, %15, !dbg !19
	%34 = sext i32 %33 to i64, !dbg !20
	%35 = getelementptr float, ptr addrspace(1) %2, i64 %34, !dbg !20
	%36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
	%37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !21
	%38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !21
	%39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !21
	%40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !21
	%41 = bitcast i32 %37 to float, !dbg !21
	%42 = bitcast i32 %38 to float, !dbg !21
	%43 = bitcast i32 %39 to float, !dbg !21
	%44 = bitcast i32 %40 to float, !dbg !21
	; --- Bounds check on the index ---------------------------------------------
	; Negative indices are wrapped by +50257; the single unsigned compare then
	; rejects anything outside [0, 50257).
	%45 = add i64 %30, 50257, !dbg !22
	%46 = icmp slt i64 %26, 0, !dbg !23
	%47 = icmp slt i64 %30, 0, !dbg !23
	%48 = select i1 %47, i64 %45, i64 %30, !dbg !24
	%49 = icmp ugt i64 %48, 50256, !dbg !25
	br i1 %49, label %50, label %51, !dbg !26

	; Failure path: report via @__assertfail, then fall through to %51 (the IR
	; does not mark this path unreachable).
	50: ; preds = %7
	tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !26
	br label %51, !dbg !26

	; --- Gather four floats from %1 at wrapped_index * 256 + column ------------
	51: ; preds = %50, %7
	%52 = shl i64 %26, 8, !dbg !27
	%53 = add i64 %52, 12865792, !dbg !27
	%54 = select i1 %46, i64 %53, i64 %52, !dbg !27
	%55 = zext nneg i32 %15 to i64
	%56 = or i64 %54, %55, !dbg !28
	%57 = getelementptr float, ptr addrspace(1) %1, i64 %56, !dbg !29
	%58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !30
	%59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !30
	%60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !30
	%61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !30
	%62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !30
	%63 = bitcast i32 %59 to float, !dbg !30
	%64 = bitcast i32 %60 to float, !dbg !30
	%65 = bitcast i32 %61 to float, !dbg !30
	%66 = bitcast i32 %62 to float, !dbg !30
	; x[i] = value from %2 + value from %1, for this thread's four columns.
	%67 = fadd float %41, %63, !dbg !31
	%68 = fadd float %42, %64, !dbg !31
	%69 = fadd float %43, %65, !dbg !31
	%70 = fadd float %44, %66, !dbg !31
	; --- Per-element initial statistics ----------------------------------------
	; mean[i] = x[i] + 0.0 and m2[i] = x[i] * (x[i] - mean[i]) + 0.0.
	%71 = fadd float %67, 0.000000e+00, !dbg !32
	%72 = fadd float %68, 0.000000e+00, !dbg !32
	%73 = fadd float %69, 0.000000e+00, !dbg !32
	%74 = fadd float %70, 0.000000e+00, !dbg !32
	%75 = fsub float %67, %71, !dbg !36
	%76 = fsub float %68, %72, !dbg !36
	%77 = fsub float %69, %73, !dbg !36
	%78 = fsub float %70, %74, !dbg !36
	%79 = fmul float %67, %75, !dbg !37
	%80 = fmul float %68, %76, !dbg !37
	%81 = fmul float %69, %77, !dbg !37
	%82 = fmul float %70, %78, !dbg !37
	%83 = fadd float %79, 0.000000e+00, !dbg !38
	%84 = fadd float %80, 0.000000e+00, !dbg !38
	%85 = fadd float %81, 0.000000e+00, !dbg !38
	%86 = fadd float %82, 0.000000e+00, !dbg !38
	; --- Combine the four per-element statistics inside the thread -------------
	; Each step folds one more element into the running (mean, m2): the ratios
	; 1/2, 1/3 and 1/4 are new_weight / total_weight, and the m2 update adds
	; delta^2 * running_weight * ratio. The result is mean %107, m2 %112,
	; weight 4.
	%87 = fsub float %72, %71, !dbg !39
	%88 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !43
	%89 = fmul float %88, %87, !dbg !44
	%90 = fadd float %71, %89, !dbg !45
	%91 = fadd float %83, %84, !dbg !46
	%92 = fmul float %87, %87, !dbg !47
	%93 = fmul float %88, %92, !dbg !48
	%94 = fadd float %93, %91, !dbg !49
	%95 = fsub float %73, %90, !dbg !39
	%96 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !43
	%97 = fmul float %96, %95, !dbg !44
	%98 = fadd float %90, %97, !dbg !45
	%99 = fadd float %85, %94, !dbg !46
	%100 = fmul float %95, %95, !dbg !47
	%101 = fmul float %100, 2.000000e+00, !dbg !50
	%102 = fmul float %96, %101, !dbg !48
	%103 = fadd float %99, %102, !dbg !49
	%104 = fsub float %74, %98, !dbg !39
	%105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !43
	%106 = fmul float %105, %104, !dbg !44
	%107 = fadd float %98, %106, !dbg !45
	%108 = fadd float %86, %103, !dbg !46
	%109 = fmul float %104, %104, !dbg !47
	%110 = fmul float %109, 3.000000e+00, !dbg !50
	%111 = fmul float %105, %110, !dbg !48
	%112 = fadd float %108, %111, !dbg !49
	; --- Warp-level reduction with butterfly shuffles (offsets 16, 8, 4, 2, 1) --
	; Every step exchanges (mean, m2, weight) with the partner lane and merges:
	;   delta = mean_other - mean_self
	;   w     = weight_self + weight_other
	;   ratio = (w == 0) ? 0 : weight_other / w
	;   mean  = mean_self + delta * ratio
	;   m2    = m2_self + m2_other + delta^2 * weight_self * ratio
	; Offset 16: the shuffled weight is the constant 1082130432 (bit pattern of
	; 4.0f), since every lane holds weight 4 at this point.
	%113 = bitcast float %107 to i32, !dbg !51
	%114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 16, i32 31), !dbg !51
	%115 = bitcast i32 %114 to float, !dbg !51
	%116 = bitcast float %112 to i32, !dbg !51
	%117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !51
	%118 = bitcast i32 %117 to float, !dbg !51
	%119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !51
	%120 = bitcast i32 %119 to float, !dbg !51
	%121 = fsub float %115, %107, !dbg !39
	%122 = fadd float %120, 4.000000e+00, !dbg !53
	%123 = fcmp oeq float %122, 0.000000e+00, !dbg !54
	%124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %120, float %122) #6, !dbg !43
	%125 = select i1 %123, float 0.000000e+00, float %124, !dbg !55
	%126 = fmul float %125, %121, !dbg !44
	%127 = fadd float %107, %126, !dbg !45
	%128 = fadd float %112, %118, !dbg !46
	%129 = fmul float %121, %121, !dbg !47
	%130 = fmul float %129, 4.000000e+00, !dbg !50
	%131 = fmul float %125, %130, !dbg !48
	%132 = fadd float %128, %131, !dbg !49
	; Offset 8.
	%133 = bitcast float %127 to i32, !dbg !51
	%134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 8, i32 31), !dbg !51
	%135 = bitcast i32 %134 to float, !dbg !51
	%136 = bitcast float %132 to i32, !dbg !51
	%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !51
	%138 = bitcast i32 %137 to float, !dbg !51
	%139 = bitcast float %122 to i32, !dbg !51
	%140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 8, i32 31), !dbg !51
	%141 = bitcast i32 %140 to float, !dbg !51
	%142 = fsub float %135, %127, !dbg !39
	%143 = fadd float %122, %141, !dbg !53
	%144 = fcmp oeq float %143, 0.000000e+00, !dbg !54
	%145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %143) #6, !dbg !43
	%146 = select i1 %144, float 0.000000e+00, float %145, !dbg !55
	%147 = fmul float %146, %142, !dbg !44
	%148 = fadd float %127, %147, !dbg !45
	%149 = fadd float %132, %138, !dbg !46
	%150 = fmul float %142, %142, !dbg !47
	%151 = fmul float %122, %150, !dbg !50
	%152 = fmul float %146, %151, !dbg !48
	%153 = fadd float %149, %152, !dbg !49
	; Offset 4.
	%154 = bitcast float %148 to i32, !dbg !51
	%155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 4, i32 31), !dbg !51
	%156 = bitcast i32 %155 to float, !dbg !51
	%157 = bitcast float %153 to i32, !dbg !51
	%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 4, i32 31), !dbg !51
	%159 = bitcast i32 %158 to float, !dbg !51
	%160 = bitcast float %143 to i32, !dbg !51
	%161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 4, i32 31), !dbg !51
	%162 = bitcast i32 %161 to float, !dbg !51
	%163 = fsub float %156, %148, !dbg !39
	%164 = fadd float %143, %162, !dbg !53
	%165 = fcmp oeq float %164, 0.000000e+00, !dbg !54
	%166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %162, float %164) #6, !dbg !43
	%167 = select i1 %165, float 0.000000e+00, float %166, !dbg !55
	%168 = fmul float %167, %163, !dbg !44
	%169 = fadd float %148, %168, !dbg !45
	%170 = fadd float %153, %159, !dbg !46
	%171 = fmul float %163, %163, !dbg !47
	%172 = fmul float %143, %171, !dbg !50
	%173 = fmul float %167, %172, !dbg !48
	%174 = fadd float %170, %173, !dbg !49
	; Offset 2.
	%175 = bitcast float %169 to i32, !dbg !51
	%176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 2, i32 31), !dbg !51
	%177 = bitcast i32 %176 to float, !dbg !51
	%178 = bitcast float %174 to i32, !dbg !51
	%179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 2, i32 31), !dbg !51
	%180 = bitcast i32 %179 to float, !dbg !51
	%181 = bitcast float %164 to i32, !dbg !51
	%182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 2, i32 31), !dbg !51
	%183 = bitcast i32 %182 to float, !dbg !51
	%184 = fsub float %177, %169, !dbg !39
	%185 = fadd float %164, %183, !dbg !53
	%186 = fcmp oeq float %185, 0.000000e+00, !dbg !54
	%187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float %185) #6, !dbg !43
	%188 = select i1 %186, float 0.000000e+00, float %187, !dbg !55
	%189 = fmul float %188, %184, !dbg !44
	%190 = fadd float %169, %189, !dbg !45
	%191 = fadd float %174, %180, !dbg !46
	%192 = fmul float %184, %184, !dbg !47
	%193 = fmul float %164, %192, !dbg !50
	%194 = fmul float %188, %193, !dbg !48
	%195 = fadd float %191, %194, !dbg !49
	; Offset 1.
	%196 = bitcast float %190 to i32, !dbg !51
	%197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !51
	%198 = bitcast i32 %197 to float, !dbg !51
	%199 = bitcast float %195 to i32, !dbg !51
	%200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !51
	%201 = bitcast i32 %200 to float, !dbg !51
	%202 = bitcast float %185 to i32, !dbg !51
	%203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !51
	%204 = bitcast i32 %203 to float, !dbg !51
	%205 = fsub float %198, %190, !dbg !39
	%206 = fadd float %185, %204, !dbg !53
	%207 = fcmp oeq float %206, 0.000000e+00, !dbg !54
	%208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !43
	%209 = select i1 %207, float 0.000000e+00, float %208, !dbg !55
	%210 = fmul float %205, %209, !dbg !44
	%211 = fadd float %190, %210, !dbg !45
	%212 = fadd float %195, %201, !dbg !46
	%213 = fmul float %205, %205, !dbg !47
	%214 = fmul float %185, %213, !dbg !50
	%215 = fmul float %209, %214, !dbg !48
	%216 = fadd float %212, %215, !dbg !49
	; --- Cross-warp reduction through shared memory ----------------------------
	; Lane 0 of each warp stores its (mean, m2, weight) into three float arrays
	; at byte offsets 0, 16 and 32 of @global_smem, at slot %219 =
	; (tid bit 6) * 2 + (warp bit 0).
	%217 = icmp eq i32 %9, 0, !dbg !51
	%218 = shl nuw nsw i32 %12, 1, !dbg !51
	%219 = or i32 %218, %14, !dbg !51
	%220 = zext nneg i32 %219 to i64, !dbg !51
	%221 = getelementptr float, ptr addrspace(3) @global_smem, i64 %220, !dbg !51
	tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %221, float %211, i1 %217) #6, !dbg !51
	%222 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %220, !dbg !51
	tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %222, float %216, i1 %217) #6, !dbg !51
	%223 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %220, !dbg !51
	tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %223, float %206, i1 %217) #6, !dbg !51
	tail call void @llvm.nvvm.barrier0(), !dbg !51
	; Threads with tid < 4 reload one slot each (loads are predicated on %224),
	; merge with the neighbouring slot via an offset-1 shuffle, and the
	; even-numbered ones write the merged triple back.
	%224 = icmp slt i32 %8, 4, !dbg !51
	%225 = sext i32 %8 to i64, !dbg !51
	%226 = getelementptr float, ptr addrspace(3) @global_smem, i64 %225, !dbg !51
	%227 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %226, i1 %224) #6, !dbg !51
	%228 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %225, !dbg !51
	%229 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %228, i1 %224) #6, !dbg !51
	%230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %225, !dbg !51
	%231 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %230, i1 %224) #6, !dbg !51
	%232 = bitcast float %227 to i32, !dbg !51
	%233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !51
	%234 = bitcast i32 %233 to float, !dbg !51
	%235 = bitcast float %229 to i32, !dbg !51
	%236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !51
	%237 = bitcast i32 %236 to float, !dbg !51
	%238 = bitcast float %231 to i32, !dbg !51
	%239 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !51
	%240 = bitcast i32 %239 to float, !dbg !51
	%241 = fsub float %234, %227, !dbg !39
	%242 = fadd float %231, %240, !dbg !53
	%243 = fcmp oeq float %242, 0.000000e+00, !dbg !54
	%244 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %242) #6, !dbg !43
	%245 = select i1 %243, float 0.000000e+00, float %244, !dbg !55
	%246 = fmul float %241, %245, !dbg !44
	%247 = fadd float %227, %246, !dbg !45
	%248 = fadd float %229, %237, !dbg !46
	%249 = fmul float %241, %241, !dbg !47
	%250 = fmul float %231, %249, !dbg !50
	%251 = fmul float %250, %245, !dbg !48
	%252 = fadd float %248, %251, !dbg !49
	%253 = icmp eq i32 %13, 0, !dbg !51
	%254 = and i1 %224, %253, !dbg !51
	tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %226, float %247, i1 %254) #6, !dbg !51
	tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %228, float %252, i1 %254) #6, !dbg !51
	tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %230, float %242, i1 %254) #6, !dbg !51
	tail call void @llvm.nvvm.barrier0(), !dbg !51
	; Every thread reads back its row's final mean (%257) and m2 (%259) from
	; slot 2 * (tid bit 6).
	%255 = zext nneg i32 %218 to i64, !dbg !51
	%256 = getelementptr float, ptr addrspace(3) @global_smem, i64 %255, !dbg !51
	%257 = load float, ptr addrspace(3) %256, align 4, !dbg !51
	%258 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %255, !dbg !51
	%259 = load float, ptr addrspace(3) %258, align 4, !dbg !51
	; --- Second pass: reload the inputs for the normalisation -------------------
	; Same %2 address as the first pass (%35).
	%260 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !56
	; Two consecutive floats from %3 at offset %17 = (tid * 2) & 254.
	%261 = zext nneg i32 %17 to i64, !dbg !57
	%262 = getelementptr float, ptr addrspace(1) %3, i64 %261, !dbg !57
	%263 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %262, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !58
	; The bounds-check result %49 is tested a second time before the second
	; gather from %1.
	br i1 %49, label %264, label %265, !dbg !59

	; Failure path of the second check; like %50 it continues afterwards.
	264: ; preds = %51
	tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !59
	br label %265, !dbg !59

	265: ; preds = %264, %51
	; Reload the %1 values (same address %57, this time with evict_first).
	%266 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %57, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
	; m2 / 256, computed four times with identical operands; only %267 is used.
	%267 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
	%268 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
	%269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
	%270 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %259, float 2.560000e+02) #6, !dbg !61
	; Add a small epsilon (0x3EE4F8B580000000, approximately 1e-5) before the
	; reciprocal square root.
	%271 = fadd float %267, 0x3EE4F8B580000000, !dbg !62
	; Inlined body of @__nv_rsqrtf: choose the ftz or non-ftz rsqrt intrinsic
	; from the "__CUDA_FTZ" reflect query.
	%272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
	%.not.i = icmp eq i32 %272, 0, !dbg !63
	br i1 %.not.i, label %275, label %273, !dbg !63

	273: ; preds = %265
	%274 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %271), !dbg !63
	br label %__nv_rsqrtf.exit, !dbg !63

	275: ; preds = %265
	%276 = tail call float @llvm.nvvm.rsqrt.approx.f(float %271), !dbg !63
	br label %__nv_rsqrtf.exit, !dbg !63

	__nv_rsqrtf.exit: ; preds = %273, %275
	; %.0.i = rsqrt(m2/256 + eps). The three reflect calls below have unused
	; results.
	%.0.i = phi float [ %274, %273 ], [ %276, %275 ], !dbg !63
	%277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
	%278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
	%279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !63
	; Recompute x = (%2 value) + (%1 value) and centre it: x - mean, for
	; elements 3, 2, 1, 0 in that order.
	%280 = extractvalue { i32, i32, i32, i32 } %266, 3, !dbg !60
	%281 = bitcast i32 %280 to float, !dbg !60
	%282 = extractvalue { i32, i32, i32, i32 } %260, 3, !dbg !56
	%283 = bitcast i32 %282 to float, !dbg !56
	%284 = fadd float %283, %281, !dbg !64
	%285 = fsub float %284, %257, !dbg !65
	%286 = extractvalue { i32, i32, i32, i32 } %266, 2, !dbg !60
	%287 = bitcast i32 %286 to float, !dbg !60
	%288 = extractvalue { i32, i32, i32, i32 } %260, 2, !dbg !56
	%289 = bitcast i32 %288 to float, !dbg !56
	%290 = fadd float %289, %287, !dbg !64
	%291 = fsub float %290, %257, !dbg !65
	%292 = extractvalue { i32, i32, i32, i32 } %266, 1, !dbg !60
	%293 = bitcast i32 %292 to float, !dbg !60
	%294 = extractvalue { i32, i32, i32, i32 } %260, 1, !dbg !56
	%295 = bitcast i32 %294 to float, !dbg !56
	%296 = fadd float %295, %293, !dbg !64
	%297 = fsub float %296, %257, !dbg !65
	%298 = extractvalue { i32, i32, i32, i32 } %266, 0, !dbg !60
	%299 = bitcast i32 %298 to float, !dbg !60
	%300 = extractvalue { i32, i32, i32, i32 } %260, 0, !dbg !56
	%301 = bitcast i32 %300 to float, !dbg !56
	%302 = fadd float %301, %299, !dbg !64
	%303 = fsub float %302, %257, !dbg !65
	%304 = extractvalue { i32, i32 } %263, 0, !dbg !58
	%305 = extractvalue { i32, i32 } %263, 1, !dbg !58
	; Scale the centred values by the reciprocal square root.
	%306 = fmul float %303, %.0.i, !dbg !66
	%307 = fmul float %297, %.0.i, !dbg !66
	%308 = fmul float %291, %.0.i, !dbg !66
	%309 = fmul float %285, %.0.i, !dbg !66
	; --- Redistribute the %3 values through shared memory -----------------------
	; Each thread loaded two floats at offset %17 but needs the four at its
	; column offset %15, so the values are written to @global_smem and read back
	; with the column indexing. The barriers separate this reuse of the buffer
	; from the reduction above and the stores from the loads.
	tail call void @llvm.nvvm.barrier0(), !dbg !67
	%310 = getelementptr float, ptr addrspace(3) @global_smem, i64 %261, !dbg !67
	%311 = insertelement <2 x i32> undef, i32 %304, i64 0, !dbg !67
	%312 = insertelement <2 x i32> %311, i32 %305, i64 1, !dbg !67
	store <2 x i32> %312, ptr addrspace(3) %310, align 8, !dbg !67
	tail call void @llvm.nvvm.barrier0(), !dbg !67
	%313 = getelementptr float, ptr addrspace(3) @global_smem, i64 %55, !dbg !67
	%314 = load float, ptr addrspace(3) %313, align 16, !dbg !67
	%315 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 1, !dbg !67
	%316 = load float, ptr addrspace(3) %315, align 4, !dbg !67
	%317 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 2, !dbg !67
	%318 = load float, ptr addrspace(3) %317, align 8, !dbg !67
	%319 = getelementptr inbounds <4 x float>, ptr addrspace(3) %313, i64 0, i64 3, !dbg !67
	%320 = load float, ptr addrspace(3) %319, align 4, !dbg !67
	; Multiply the normalised values by the per-column %3 values.
	%321 = fmul float %306, %314, !dbg !67
	%322 = fmul float %307, %316, !dbg !67
	%323 = fmul float %308, %318, !dbg !67
	%324 = fmul float %309, %320, !dbg !67
	; --- Output: convert to bf16 and store at %4[row * 256 + column] -----------
	; Four bf16 values are packed into two 32-bit words and written with one
	; v2.b32 store.
	%325 = shl i32 %20, 8, !dbg !68
	%326 = or i32 %325, %15, !dbg !69
	%327 = sext i32 %326 to i64, !dbg !70
	%328 = getelementptr i16, ptr addrspace(1) %4, i64 %327, !dbg !70
	%329 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !71
	%330 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %322) #6, !dbg !71
	%331 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %323) #6, !dbg !71
	%332 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %324) #6, !dbg !71
	%333 = insertelement <2 x i16> undef, i16 %329, i64 0, !dbg !71
	%334 = insertelement <2 x i16> %333, i16 %330, i64 1, !dbg !71
	%335 = bitcast <2 x i16> %334 to i32, !dbg !71
	%336 = insertelement <2 x i16> undef, i16 %331, i64 0, !dbg !71
	%337 = insertelement <2 x i16> %336, i16 %332, i64 1, !dbg !71
	%338 = bitcast <2 x i16> %337 to i32, !dbg !71
	tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %335, i32 %338, ptr addrspace(1) %328, i1 true) #6, !dbg !71
	ret void, !dbg !72
	}

	; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
	declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

	; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
	declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1

	; Function Attrs: convergent nocallback nounwind
	declare void @llvm.nvvm.barrier0() #2

	; Function Attrs: alwaysinline nounwind
	; float __nv_rsqrtf(float x)
	; Forwards %x to one of the two llvm.nvvm.rsqrt.approx intrinsics. The
	; "__CUDA_FTZ" reflect query decides which: a non-zero answer selects the
	; .ftz variant, zero selects the plain variant.
	define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
	entry:
	  %ftz.setting = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
	  %ftz.enabled = icmp ne i32 %ftz.setting, 0
	  br i1 %ftz.enabled, label %with.ftz, label %without.ftz

	with.ftz: ; preds = %entry
	  %rsqrt.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
	  br label %done

	without.ftz: ; preds = %entry
	  %rsqrt.plain = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
	  br label %done

	done: ; preds = %without.ftz, %with.ftz
	  %result = phi float [ %rsqrt.ftz, %with.ftz ], [ %rsqrt.plain, %without.ftz ]
	  ret float %result
	}

	; Configuration query, declared here without a body. This module only calls
	; it with the "__CUDA_FTZ" string (@.str) and tests the result against zero.
	; NOTE(review): presumably folded to a constant by the NVVM reflect pass using
	; the "nvvm-reflect-ftz" module flag — confirm against the NVPTX backend docs.
	declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4

	; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
	declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5

	; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
	declare float @llvm.nvvm.rsqrt.approx.f(float) #5

	attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
	attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
	attributes #2 = { convergent nocallback nounwind }
	attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
	attributes #6 = { nounwind }

	; Named module-level metadata: module flags, the debug compile unit, the
	; NVVM kernel annotations and the producer ident string.
	!llvm.module.flags = !{!0, !1}
	!llvm.dbg.cu = !{!2}
	!nvvm.annotations = !{!4, !5, !5, !4}
	!llvm.ident = !{!6}

	; Module flags: debug-info version 3 and nvvm-reflect-ftz = 1.
	!0 = !{i32 2, !"Debug Info Version", i32 3}
	!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
	!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
	!3 = !DIFile(filename: "clhe4a3stvufxafmq3kk5hodazz2efctffte646znjdnv3lqi5oa.py", directory: "/tmp/torchinductor_root/lh")
	; NVVM annotations: mark the function as a kernel and cap it at 128 threads
	; in x (maxntidx).
	!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
	!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128}
	!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
	; Subprogram record for the kernel. spFlags values are joined with a bare
	; '|'; an escaped "\|" is not valid LLVM IR and fails to parse.
	!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
	!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
	!9 = !{}
	!10 = !DILocation(line: 22, column: 44, scope: !7)
	!11 = !DILocation(line: 24, column: 33, scope: !7)
	!12 = !DILocation(line: 21, column: 28, scope: !7)
	!13 = !DILocation(line: 21, column: 33, scope: !7)
	!14 = !DILocation(line: 22, column: 23, scope: !7)
	!15 = !DILocation(line: 26, column: 30, scope: !7)
	!16 = !DILocation(line: 26, column: 35, scope: !7)
	!17 = !DILocation(line: 27, column: 18, scope: !7)
	!18 = !DILocation(line: 35, column: 44, scope: !7)
	!19 = !DILocation(line: 35, column: 40, scope: !7)
	!20 = !DILocation(line: 35, column: 34, scope: !7)
	!21 = !DILocation(line: 35, column: 50, scope: !7)
	!22 = !DILocation(line: 36, column: 22, scope: !7)
	!23 = !DILocation(line: 37, column: 22, scope: !7)
	!24 = !DILocation(line: 38, column: 36, scope: !7)
	!25 = !DILocation(line: 39, column: 40, scope: !7)
	!26 = !DILocation(line: 39, column: 55, scope: !7)
	!27 = !DILocation(line: 40, column: 44, scope: !7)
	!28 = !DILocation(line: 40, column: 40, scope: !7)
	!29 = !DILocation(line: 40, column: 34, scope: !7)
	!30 = !DILocation(line: 40, column: 52, scope: !7)
	!31 = !DILocation(line: 41, column: 22, scope: !7)
	!32 = !DILocation(line: 98, column: 22, scope: !33, inlinedAt: !35)
	!33 = distinct !DILexicalBlockFile(scope: !7, file: !34, discriminator: 0)
	!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
	!35 = !DILocation(line: 44, column: 38, scope: !33)
	!36 = !DILocation(line: 101, column: 30, scope: !33, inlinedAt: !35)
	!37 = !DILocation(line: 101, column: 22, scope: !33, inlinedAt: !35)
	!38 = !DILocation(line: 101, column: 13, scope: !33, inlinedAt: !35)
	!39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
	!40 = distinct !DILexicalBlockFile(scope: !33, file: !34, discriminator: 0)
	!41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
	!42 = !DILocation(line: 50, column: 41, scope: !40)
	!43 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
	!44 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
	!45 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
	!46 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
	!47 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
	!48 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
	!49 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
	!50 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
	!51 = !DILocation(line: 120, column: 46, scope: !33, inlinedAt: !52)
	!52 = !DILocation(line: 50, column: 41, scope: !33)
	!53 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
	!54 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
	!55 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
	!56 = !DILocation(line: 59, column: 51, scope: !7)
	!57 = !DILocation(line: 60, column: 35, scope: !7)
	!58 = !DILocation(line: 60, column: 40, scope: !7)
	!59 = !DILocation(line: 64, column: 57, scope: !7)
	!60 = !DILocation(line: 65, column: 54, scope: !7)
	!61 = !DILocation(line: 69, column: 23, scope: !7)
	!62 = !DILocation(line: 71, column: 24, scope: !7)
	!63 = !DILocation(line: 72, column: 30, scope: !7)
	!64 = !DILocation(line: 66, column: 24, scope: !7)
	!65 = !DILocation(line: 67, column: 24, scope: !7)
	!66 = !DILocation(line: 73, column: 24, scope: !7)
	!67 = !DILocation(line: 74, column: 24, scope: !7)
	!68 = !DILocation(line: 76, column: 39, scope: !7)
	!69 = !DILocation(line: 76, column: 35, scope: !7)
	!70 = !DILocation(line: 76, column: 29, scope: !7)
	!71 = !DILocation(line: 76, column: 52, scope: !7)
	!72 = !DILocation(line: 55, column: 4, scope: !7)