; 0-hero's picture
; Add files using upload-large-folder tool
; 0def249 verified
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Argument strings for the second @__assertfail call in the kernel
; (function name, file name, message). None of the three is NUL-terminated:
; the array lengths equal the character counts.
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
; Argument strings for the first @__assertfail call in the kernel (same layout).
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
; Externally defined, zero-length byte array in address space 3. The kernel
; accesses it with st.shared / ld.shared and plain loads as reduction scratch.
@global_smem = external addrspace(3) global [0 x i8]
; NUL-terminated key passed to @__nvvm_reflect to choose the rsqrt variant.
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
; Assertion-failure hook. The kernel calls it as
; (message, file, i32 line, function, i64 1).
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
; Kernel entry point (marked "kernel" with maxntidx 64 in !nvvm.annotations).
;
; As used by the body below:
;   %0  i64 values; one element is read at index ctaid.x (the row index).
;   %1  float data, read at (idx * 256 + col), where idx is the value from %0
;       with 50257 added when it is negative.
;   %2  float data, read at ((ctaid.x srem 512) * 256 + col).
;   %3  16-bit data converted with cvt.f32.bf16, read at (ctaid.x * 256 + col).
;   %4  float data, read at col.
;   %5  16-bit output, written at (ctaid.x * 256 + col) after cvt.rn.bf16.f32.
;   %6, %7  i32 arguments that are not referenced anywhere in the body.
; col = (tid.x << 2) & 252, and each thread handles 4 consecutive elements.
;
; Overall shape: sum the three inputs element-wise, reduce a (mean, M2, weight)
; triple across the threads of the CTA, then write
; (sum - mean) * rsqrt(M2 / 256 + eps) * %4[col] as bf16.
; NOTE(review): this looks like a fused embedding-gather + add + layer-norm
; (row width 256, vocabulary 50257) -- confirm against the generating Python.
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
; --- Thread / block indexing ------------------------------------------------
; %10 = tid & 31, %12 = (tid >> 5) & 1, %13 = (tid << 2) & 252 (column offset).
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
%10 = and i32 %9, 31, !dbg !10
%11 = lshr i32 %9, 5, !dbg !10
%12 = and i32 %11, 1, !dbg !10
%urem = shl i32 %9, 2, !dbg !10
%13 = and i32 %urem, 252, !dbg !10
; %14 = ctaid.x, read through inline PTX.
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
%15 = sext i32 %14 to i64, !dbg !12
%16 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !12
; Five identical predicated 64-bit loads of %0[ctaid.x]; only %17 and %21 are
; used later. They are marked sideeffect, so they are kept as emitted.
%17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
%18 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
%19 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
%20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
%21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !13
; --- Load 4 floats from %2 at ((ctaid.x srem 512) << 8) | col ----------------
%22 = srem i32 %14, 512, !dbg !14
%23 = shl nsw i32 %22, 8, !dbg !15
%24 = or i32 %23, %13, !dbg !16
%25 = sext i32 %24 to i64, !dbg !17
%26 = getelementptr float, ptr addrspace(1) %2, i64 %25, !dbg !17
%27 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !18
%28 = extractvalue { i32, i32, i32, i32 } %27, 0, !dbg !18
%29 = extractvalue { i32, i32, i32, i32 } %27, 1, !dbg !18
%30 = extractvalue { i32, i32, i32, i32 } %27, 2, !dbg !18
%31 = extractvalue { i32, i32, i32, i32 } %27, 3, !dbg !18
; Elements 0 and 1 are packed into a <2 x float> in swapped order
; (vector lane 0 = element 1, vector lane 1 = element 0).
%32 = insertelement <2 x i32> poison, i32 %29, i64 0, !dbg !18
%33 = insertelement <2 x i32> %32, i32 %28, i64 1, !dbg !18
%34 = bitcast <2 x i32> %33 to <2 x float>, !dbg !18
%35 = bitcast i32 %30 to float, !dbg !18
%36 = bitcast i32 %31 to float, !dbg !18
; --- Load 4 bf16 values from %3 at (ctaid.x << 8) | col and widen to f32 -----
%37 = shl i32 %14, 8, !dbg !19
%38 = or i32 %37, %13, !dbg !20
%39 = sext i32 %38 to i64, !dbg !21
%40 = getelementptr i16, ptr addrspace(1) %3, i64 %39, !dbg !21
%41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !22
%42 = extractvalue { i32, i32 } %41, 0, !dbg !22
%43 = extractvalue { i32, i32 } %41, 1, !dbg !22
; Each 32-bit word holds two 16-bit values: low half and high half.
%44 = trunc i32 %42 to i16, !dbg !22
%extelt.offset = lshr i32 %42, 16, !dbg !22
%45 = trunc i32 %extelt.offset to i16, !dbg !22
%46 = trunc i32 %43 to i16, !dbg !22
%extelt.offset1 = lshr i32 %43, 16, !dbg !22
%47 = trunc i32 %extelt.offset1 to i16, !dbg !22
%48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !23
%49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !23
%50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !23
%51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !23
; --- Bounds check on the row index -------------------------------------------
; Negative indices are wrapped by adding 50257; %56 is true when the wrapped
; value is outside [0, 50256]. Note that %53 tests %17 while %54 tests %21;
; both come from loads of the same address.
%52 = add i64 %21, 50257, !dbg !24
%53 = icmp slt i64 %17, 0, !dbg !25
%54 = icmp slt i64 %21, 0, !dbg !25
%55 = select i1 %54, i64 %52, i64 %21, !dbg !26
%56 = icmp ugt i64 %55, 50256, !dbg !27
br i1 %56, label %57, label %58, !dbg !28
57: ; preds = %8
; Out-of-range index: report it, then fall through to the normal path.
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !28
br label %58, !dbg !28
58: ; preds = %57, %8
; --- Load 4 floats from %1 at (wrapped index * 256) | col --------------------
; 12865792 = 50257 * 256, i.e. the negative-index wrap applied after scaling.
%59 = shl i64 %17, 8, !dbg !29
%60 = add i64 %59, 12865792, !dbg !29
%61 = select i1 %53, i64 %60, i64 %59, !dbg !29
%62 = zext nneg i32 %13 to i64
%63 = or i64 %61, %62, !dbg !30
%64 = getelementptr float, ptr addrspace(1) %1, i64 %63, !dbg !31
%65 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %64, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
%66 = extractvalue { i32, i32, i32, i32 } %65, 0, !dbg !32
%67 = extractvalue { i32, i32, i32, i32 } %65, 1, !dbg !32
%68 = extractvalue { i32, i32, i32, i32 } %65, 2, !dbg !32
%69 = extractvalue { i32, i32, i32, i32 } %65, 3, !dbg !32
%70 = bitcast i32 %68 to float, !dbg !32
%71 = bitcast i32 %69 to float, !dbg !32
; --- Element-wise sum x = %2 value + %1 value + bf16 value -------------------
; Elements 2 and 3 are scalar (%74, %75); elements 1 and 0 live in lanes 0 and
; 1 of the vector %82.
%72 = fadd float %35, %70, !dbg !33
%73 = fadd float %36, %71, !dbg !33
%74 = fadd float %50, %72, !dbg !34
%75 = fadd float %51, %73, !dbg !34
%76 = insertelement <2 x i32> poison, i32 %67, i64 0, !dbg !32
%77 = insertelement <2 x i32> %76, i32 %66, i64 1, !dbg !32
%78 = bitcast <2 x i32> %77 to <2 x float>, !dbg !32
%79 = fadd <2 x float> %34, %78, !dbg !33
%80 = insertelement <2 x float> poison, float %49, i64 0, !dbg !34
%81 = insertelement <2 x float> %80, float %48, i64 1, !dbg !34
%82 = fadd <2 x float> %81, %79, !dbg !34
; --- Per-element initial statistics -------------------------------------------
; For each element: mean = x + 0, M2 = x * (x - mean) + 0.
; NOTE(review): the debug locations point into triton_helpers.py; this appears
; to be the first step of a Welford-style reduction -- confirm.
%83 = fadd <2 x float> %82, zeroinitializer, !dbg !35
%84 = fadd float %74, 0.000000e+00, !dbg !35
%85 = fadd float %75, 0.000000e+00, !dbg !35
%86 = extractelement <2 x float> %83, i64 1, !dbg !39
%87 = extractelement <2 x float> %82, i64 1, !dbg !43
%88 = fsub float %87, %86, !dbg !44
%89 = extractelement <2 x float> %83, i64 0, !dbg !39
%90 = extractelement <2 x float> %82, i64 0, !dbg !43
%91 = fsub float %90, %89, !dbg !44
%92 = fsub float %74, %84, !dbg !44
%93 = fsub float %75, %85, !dbg !44
%94 = fmul float %87, %88, !dbg !43
%95 = fmul float %90, %91, !dbg !43
%96 = fmul float %74, %92, !dbg !43
%97 = fmul float %75, %93, !dbg !43
%98 = fadd float %94, 0.000000e+00, !dbg !45
%99 = fadd float %95, 0.000000e+00, !dbg !45
%100 = fadd float %96, 0.000000e+00, !dbg !45
%101 = fadd float %97, 0.000000e+00, !dbg !45
; --- Fold the thread's 4 elements into one (mean, M2) pair --------------------
; Each step: delta = next_mean - mean; mean += delta * (1/k);
; M2 = M2 + next_M2 + delta^2 * (k-1) * (1/k), for k = 2, 3, 4.
%102 = fsub float %89, %86, !dbg !39
%103 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !46
%104 = fmul float %103, %102, !dbg !47
%105 = fadd float %86, %104, !dbg !48
%106 = fadd float %98, %99, !dbg !49
%107 = fmul float %102, %102, !dbg !50
%108 = fmul float %103, %107, !dbg !51
%109 = fadd float %108, %106, !dbg !52
%110 = fsub float %84, %105, !dbg !39
%111 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !46
%112 = fmul float %111, %110, !dbg !47
%113 = fadd float %105, %112, !dbg !48
%114 = fadd float %100, %109, !dbg !49
%115 = fmul float %110, %110, !dbg !50
%116 = fmul float %115, 2.000000e+00, !dbg !53
%117 = fmul float %111, %116, !dbg !51
%118 = fadd float %114, %117, !dbg !52
%119 = fsub float %85, %113, !dbg !39
%120 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !46
%121 = fmul float %120, %119, !dbg !47
%122 = fadd float %113, %121, !dbg !48
%123 = fadd float %101, %118, !dbg !49
%124 = fmul float %119, %119, !dbg !50
%125 = fmul float %124, 3.000000e+00, !dbg !53
%126 = fmul float %120, %125, !dbg !51
%127 = fadd float %123, %126, !dbg !52
; --- Cross-thread combine via butterfly shuffles ------------------------------
; Five rounds with third shuffle operand 16, 8, 4, 2, 1. Each round exchanges
; (mean, M2, weight) and combines:
;   delta = other_mean - mean;  w = weight + other_weight
;   r     = (w == 0) ? 0 : other_weight / w
;   mean += delta * r;  M2 = M2 + other_M2 + delta^2 * weight * r;  weight = w
; Round 1 (operand 16): the local weight is the constant 4.0, and
; 1082130432 is the bit pattern 0x40800000 of 4.0f.
%128 = bitcast float %122 to i32, !dbg !54
%129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 16, i32 31), !dbg !54
%130 = bitcast i32 %129 to float, !dbg !54
%131 = bitcast float %127 to i32, !dbg !54
%132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 16, i32 31), !dbg !54
%133 = bitcast i32 %132 to float, !dbg !54
%134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !54
%135 = bitcast i32 %134 to float, !dbg !54
%136 = fsub float %130, %122, !dbg !39
%137 = fadd float %135, 4.000000e+00, !dbg !56
%138 = fcmp oeq float %137, 0.000000e+00, !dbg !57
%139 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %135, float %137) #6, !dbg !46
%140 = select i1 %138, float 0.000000e+00, float %139, !dbg !58
%141 = fmul float %140, %136, !dbg !47
%142 = fadd float %122, %141, !dbg !48
%143 = fadd float %127, %133, !dbg !49
%144 = fmul float %136, %136, !dbg !50
%145 = fmul float %144, 4.000000e+00, !dbg !53
%146 = fmul float %140, %145, !dbg !51
%147 = fadd float %143, %146, !dbg !52
; Round 2 (operand 8).
%148 = bitcast float %142 to i32, !dbg !54
%149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 8, i32 31), !dbg !54
%150 = bitcast i32 %149 to float, !dbg !54
%151 = bitcast float %147 to i32, !dbg !54
%152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 8, i32 31), !dbg !54
%153 = bitcast i32 %152 to float, !dbg !54
%154 = bitcast float %137 to i32, !dbg !54
%155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !54
%156 = bitcast i32 %155 to float, !dbg !54
%157 = fsub float %150, %142, !dbg !39
%158 = fadd float %137, %156, !dbg !56
%159 = fcmp oeq float %158, 0.000000e+00, !dbg !57
%160 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %156, float %158) #6, !dbg !46
%161 = select i1 %159, float 0.000000e+00, float %160, !dbg !58
%162 = fmul float %161, %157, !dbg !47
%163 = fadd float %142, %162, !dbg !48
%164 = fadd float %147, %153, !dbg !49
%165 = fmul float %157, %157, !dbg !50
%166 = fmul float %137, %165, !dbg !53
%167 = fmul float %161, %166, !dbg !51
%168 = fadd float %164, %167, !dbg !52
; Round 3 (operand 4).
%169 = bitcast float %163 to i32, !dbg !54
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 4, i32 31), !dbg !54
%171 = bitcast i32 %170 to float, !dbg !54
%172 = bitcast float %168 to i32, !dbg !54
%173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 4, i32 31), !dbg !54
%174 = bitcast i32 %173 to float, !dbg !54
%175 = bitcast float %158 to i32, !dbg !54
%176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 4, i32 31), !dbg !54
%177 = bitcast i32 %176 to float, !dbg !54
%178 = fsub float %171, %163, !dbg !39
%179 = fadd float %158, %177, !dbg !56
%180 = fcmp oeq float %179, 0.000000e+00, !dbg !57
%181 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %179) #6, !dbg !46
%182 = select i1 %180, float 0.000000e+00, float %181, !dbg !58
%183 = fmul float %182, %178, !dbg !47
%184 = fadd float %163, %183, !dbg !48
%185 = fadd float %168, %174, !dbg !49
%186 = fmul float %178, %178, !dbg !50
%187 = fmul float %158, %186, !dbg !53
%188 = fmul float %182, %187, !dbg !51
%189 = fadd float %185, %188, !dbg !52
; Round 4 (operand 2).
%190 = bitcast float %184 to i32, !dbg !54
%191 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 2, i32 31), !dbg !54
%192 = bitcast i32 %191 to float, !dbg !54
%193 = bitcast float %189 to i32, !dbg !54
%194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 2, i32 31), !dbg !54
%195 = bitcast i32 %194 to float, !dbg !54
%196 = bitcast float %179 to i32, !dbg !54
%197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 2, i32 31), !dbg !54
%198 = bitcast i32 %197 to float, !dbg !54
%199 = fsub float %192, %184, !dbg !39
%200 = fadd float %179, %198, !dbg !56
%201 = fcmp oeq float %200, 0.000000e+00, !dbg !57
%202 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %198, float %200) #6, !dbg !46
%203 = select i1 %201, float 0.000000e+00, float %202, !dbg !58
%204 = fmul float %203, %199, !dbg !47
%205 = fadd float %184, %204, !dbg !48
%206 = fadd float %189, %195, !dbg !49
%207 = fmul float %199, %199, !dbg !50
%208 = fmul float %179, %207, !dbg !53
%209 = fmul float %203, %208, !dbg !51
%210 = fadd float %206, %209, !dbg !52
; Round 5 (operand 1).
%211 = bitcast float %205 to i32, !dbg !54
%212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 1, i32 31), !dbg !54
%213 = bitcast i32 %212 to float, !dbg !54
%214 = bitcast float %210 to i32, !dbg !54
%215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !54
%216 = bitcast i32 %215 to float, !dbg !54
%217 = bitcast float %200 to i32, !dbg !54
%218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !54
%219 = bitcast i32 %218 to float, !dbg !54
%220 = fsub float %213, %205, !dbg !39
%221 = fadd float %200, %219, !dbg !56
%222 = fcmp oeq float %221, 0.000000e+00, !dbg !57
%223 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %219, float %221) #6, !dbg !46
%224 = select i1 %222, float 0.000000e+00, float %223, !dbg !58
%225 = fmul float %224, %220, !dbg !47
%226 = fadd float %205, %225, !dbg !48
%227 = fadd float %210, %216, !dbg !49
%228 = fmul float %220, %220, !dbg !50
%229 = fmul float %200, %228, !dbg !53
%230 = fmul float %224, %229, !dbg !51
%231 = fadd float %227, %230, !dbg !52
; --- Publish partial results through @global_smem ----------------------------
; Threads with (tid & 31) == 0 store mean, M2 and weight at float index
; (tid >> 5) & 1 relative to byte offsets 0, 8 and 16 respectively.
%232 = icmp eq i32 %10, 0, !dbg !54
%233 = zext nneg i32 %12 to i64, !dbg !54
%234 = getelementptr float, ptr addrspace(3) @global_smem, i64 %233, !dbg !54
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, float %226, i1 %232) #6, !dbg !54
%235 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %233, !dbg !54
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %235, float %231, i1 %232) #6, !dbg !54
%236 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %233, !dbg !54
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, float %221, i1 %232) #6, !dbg !54
tail call void @llvm.nvvm.barrier0(), !dbg !54
; Threads with tid < 2 read the stored triples back (loads are predicated).
%237 = icmp slt i32 %9, 2, !dbg !54
%238 = sext i32 %9 to i64, !dbg !54
%239 = getelementptr float, ptr addrspace(3) @global_smem, i64 %238, !dbg !54
%240 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %237) #6, !dbg !54
%241 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %238, !dbg !54
%242 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %241, i1 %237) #6, !dbg !54
%243 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %238, !dbg !54
%244 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %243, i1 %237) #6, !dbg !54
; One more butterfly round (operand 1) using the same combine formula.
%245 = bitcast float %240 to i32, !dbg !54
%246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 1, i32 31), !dbg !54
%247 = bitcast i32 %246 to float, !dbg !54
%248 = bitcast float %242 to i32, !dbg !54
%249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 1, i32 31), !dbg !54
%250 = bitcast i32 %249 to float, !dbg !54
%251 = bitcast float %244 to i32, !dbg !54
%252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 1, i32 31), !dbg !54
%253 = bitcast i32 %252 to float, !dbg !54
%254 = fsub float %247, %240, !dbg !39
%255 = fadd float %244, %253, !dbg !56
%256 = fcmp oeq float %255, 0.000000e+00, !dbg !57
%257 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %255) #6, !dbg !46
%258 = select i1 %256, float 0.000000e+00, float %257, !dbg !58
%259 = fmul float %254, %258, !dbg !47
%260 = fadd float %240, %259, !dbg !48
%261 = fadd float %242, %250, !dbg !49
%262 = fmul float %254, %254, !dbg !50
%263 = fmul float %244, %262, !dbg !53
%264 = fmul float %263, %258, !dbg !51
%265 = fadd float %261, %264, !dbg !52
; Write the final triple back; the predicate is tid < 2 and (tid & 1) == 0.
%266 = and i32 %9, 1, !dbg !54
%267 = icmp eq i32 %266, 0, !dbg !54
%268 = and i1 %237, %267, !dbg !54
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, float %260, i1 %268) #6, !dbg !54
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %241, float %265, i1 %268) #6, !dbg !54
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %243, float %255, i1 %268) #6, !dbg !54
tail call void @llvm.nvvm.barrier0(), !dbg !54
; Every thread reads the final mean (byte offset 0) and M2 (byte offset 8).
%269 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !54
%270 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !54
; --- Second pass: reload the inputs ------------------------------------------
; Same addresses as the first pass (%26, %40), plus 4 floats from %4 at col.
; The %40 reload uses the evict_first cache hint instead of evict_last.
%271 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !59
%272 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !60
%273 = extractvalue { i32, i32 } %272, 0, !dbg !60
%274 = extractvalue { i32, i32 } %272, 1, !dbg !60
%275 = trunc i32 %273 to i16, !dbg !60
%extelt.offset2 = lshr i32 %273, 16, !dbg !60
%276 = trunc i32 %extelt.offset2 to i16, !dbg !60
%277 = trunc i32 %274 to i16, !dbg !60
%extelt.offset3 = lshr i32 %274, 16, !dbg !60
%278 = trunc i32 %extelt.offset3 to i16, !dbg !60
%279 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %275) #6, !dbg !61
%280 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %276) #6, !dbg !61
%281 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %277) #6, !dbg !61
%282 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %278) #6, !dbg !61
%283 = getelementptr float, ptr addrspace(1) %4, i64 %62, !dbg !62
%284 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %283, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
; Second bounds check: re-uses the predicate %56 computed in the entry block.
br i1 %56, label %285, label %286, !dbg !64
285: ; preds = %58
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !64
br label %286, !dbg !64
286: ; preds = %285, %58
; Reload the 4 floats from %1 (address %64), this time with evict_first.
%287 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %64, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !65
; --- Normalisation factor -------------------------------------------------------
; M2 / 256 is computed four times; only %288 is used afterwards.
%288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
%289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
%290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
%291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %270, float 2.560000e+02) #6, !dbg !66
; 0x3EE4F8B580000000 is approximately 1.0e-5 (epsilon added before rsqrt).
%292 = fadd float %288, 0x3EE4F8B580000000, !dbg !67
; Inlined copy of @__nv_rsqrtf: select the .ftz or non-.ftz approximate rsqrt
; depending on @__nvvm_reflect("__CUDA_FTZ").
%293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
%.not.i = icmp eq i32 %293, 0, !dbg !68
br i1 %.not.i, label %296, label %294, !dbg !68
294: ; preds = %286
%295 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %292), !dbg !68
br label %__nv_rsqrtf.exit, !dbg !68
296: ; preds = %286
%297 = tail call float @llvm.nvvm.rsqrt.approx.f(float %292), !dbg !68
br label %__nv_rsqrtf.exit, !dbg !68
__nv_rsqrtf.exit: ; preds = %294, %296
%.0.i = phi float [ %295, %294 ], [ %297, %296 ], !dbg !68
; Three further reflect calls whose results are never used.
%298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
%299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
%300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !68
; --- Recompute x for each element and subtract the mean (%269) ---------------
; Element 3.
%301 = extractvalue { i32, i32, i32, i32 } %287, 3, !dbg !65
%302 = bitcast i32 %301 to float, !dbg !65
%303 = extractvalue { i32, i32, i32, i32 } %271, 3, !dbg !59
%304 = bitcast i32 %303 to float, !dbg !59
%305 = fadd float %304, %302, !dbg !69
%306 = fadd float %282, %305, !dbg !70
%307 = fsub float %306, %269, !dbg !71
; Element 2.
%308 = extractvalue { i32, i32, i32, i32 } %287, 2, !dbg !65
%309 = bitcast i32 %308 to float, !dbg !65
%310 = extractvalue { i32, i32, i32, i32 } %271, 2, !dbg !59
%311 = bitcast i32 %310 to float, !dbg !59
%312 = fadd float %311, %309, !dbg !69
%313 = fadd float %281, %312, !dbg !70
%314 = fsub float %313, %269, !dbg !71
; Element 1.
%315 = extractvalue { i32, i32, i32, i32 } %287, 1, !dbg !65
%316 = bitcast i32 %315 to float, !dbg !65
%317 = extractvalue { i32, i32, i32, i32 } %271, 1, !dbg !59
%318 = bitcast i32 %317 to float, !dbg !59
%319 = fadd float %318, %316, !dbg !69
%320 = fadd float %280, %319, !dbg !70
%321 = fsub float %320, %269, !dbg !71
; Element 0.
%322 = extractvalue { i32, i32, i32, i32 } %287, 0, !dbg !65
%323 = bitcast i32 %322 to float, !dbg !65
%324 = extractvalue { i32, i32, i32, i32 } %271, 0, !dbg !59
%325 = bitcast i32 %324 to float, !dbg !59
%326 = fadd float %325, %323, !dbg !69
%327 = fadd float %279, %326, !dbg !70
%328 = fsub float %327, %269, !dbg !71
; Per-column multipliers loaded from %4.
%329 = extractvalue { i32, i32, i32, i32 } %284, 0, !dbg !63
%330 = bitcast i32 %329 to float, !dbg !63
%331 = extractvalue { i32, i32, i32, i32 } %284, 1, !dbg !63
%332 = bitcast i32 %331 to float, !dbg !63
%333 = extractvalue { i32, i32, i32, i32 } %284, 2, !dbg !63
%334 = bitcast i32 %333 to float, !dbg !63
%335 = extractvalue { i32, i32, i32, i32 } %284, 3, !dbg !63
%336 = bitcast i32 %335 to float, !dbg !63
; (x - mean) * rsqrt(...) * multiplier, for elements 0..3.
%337 = fmul float %328, %.0.i, !dbg !72
%338 = fmul float %321, %.0.i, !dbg !72
%339 = fmul float %314, %.0.i, !dbg !72
%340 = fmul float %307, %.0.i, !dbg !72
%341 = fmul float %337, %330, !dbg !73
%342 = fmul float %338, %332, !dbg !73
%343 = fmul float %339, %334, !dbg !73
%344 = fmul float %340, %336, !dbg !73
; --- Convert to bf16, pack two values per 32-bit word, and store to %5 -------
%345 = getelementptr i16, ptr addrspace(1) %5, i64 %39, !dbg !74
%346 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %341) #6, !dbg !75
%347 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %342) #6, !dbg !75
%348 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %343) #6, !dbg !75
%349 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %344) #6, !dbg !75
%350 = insertelement <2 x i16> undef, i16 %346, i64 0, !dbg !75
%351 = insertelement <2 x i16> %350, i16 %347, i64 1, !dbg !75
%352 = bitcast <2 x i16> %351 to i32, !dbg !75
%353 = insertelement <2 x i16> undef, i16 %348, i64 0, !dbg !75
%354 = insertelement <2 x i16> %353, i16 %349, i64 1, !dbg !75
%355 = bitcast <2 x i16> %354 to i32, !dbg !75
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %352, i32 %355, ptr addrspace(1) %345, i1 true) #6, !dbg !75
ret void, !dbg !76
}
; Returns the x component of the thread index; the kernel derives its
; per-thread column offset and its (tid & 31) / (tid >> 5) values from it.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Butterfly shuffle intrinsic. The kernel always passes i32 -1 as the first
; operand and i32 31 as the last, varying only the value and the third operand.
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; Barrier intrinsic; the kernel calls it after each group of st.shared writes.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2
; Approximate reciprocal square root of %x.
; Asks @__nvvm_reflect about the "__CUDA_FTZ" key (@.str): a non-zero answer
; selects the .ftz intrinsic, a zero answer selects the non-.ftz intrinsic.
; Function Attrs: alwaysinline nounwind
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
entry:
  %ftz.flag = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
  %ftz.enabled = icmp ne i32 %ftz.flag, 0
  br i1 %ftz.enabled, label %with.ftz, label %without.ftz

with.ftz:                                         ; preds = %entry
  %rsqrt.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
  br label %done

without.ftz:                                      ; preds = %entry
  %rsqrt.plain = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
  br label %done

done:                                             ; preds = %without.ftz, %with.ftz
  ; Merge the result of whichever variant ran.
  %result = phi float [ %rsqrt.ftz, %with.ftz ], [ %rsqrt.plain, %without.ftz ]
  ret float %result
}
; Query hook taking a pointer to a key string; this module only ever passes
; @.str ("__CUDA_FTZ") and compares the result against zero.
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
; Approximate reciprocal square root, .ftz variant (used when the reflect
; query returns non-zero).
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
; Approximate reciprocal square root, non-.ftz variant (used when the reflect
; query returns zero).
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
; Attribute groups referenced by the declarations, definitions and call sites
; in this module.
; #0: @llvm.nvvm.read.ptx.sreg.tid.x
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #1: @llvm.nvvm.shfl.sync.bfly.i32
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
; #2: @llvm.nvvm.barrier0
attributes #2 = { convergent nocallback nounwind }
; #3: @__nv_rsqrtf (alwaysinline)
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #4: @__nvvm_reflect
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #5: the two @llvm.nvvm.rsqrt.approx.* intrinsics
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
; #6: applied at call sites (inline asm and @__nvvm_reflect calls)
attributes #6 = { nounwind }
; Named metadata: module flags, debug compile unit, NVVM annotations, ident.
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
; Note: !4 and !5 are each listed twice in the annotation list.
!nvvm.annotations = !{!4, !5, !5, !4}
!llvm.ident = !{!6}
!0 = !{i32 2, !"Debug Info Version", i32 3}
; Sets the "nvvm-reflect-ftz" module flag to 1.
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
; Source file the kernel's debug locations refer to.
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
; Marks the function as a kernel and sets maxntidx to 64.
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
!9 = !{}
; Debug locations in the kernel's own source file (scope !7).
!10 = !DILocation(line: 24, column: 33, scope: !7)
!11 = !DILocation(line: 21, column: 28, scope: !7)
!12 = !DILocation(line: 26, column: 30, scope: !7)
!13 = !DILocation(line: 26, column: 35, scope: !7)
!14 = !DILocation(line: 27, column: 18, scope: !7)
!15 = !DILocation(line: 35, column: 44, scope: !7)
!16 = !DILocation(line: 35, column: 40, scope: !7)
!17 = !DILocation(line: 35, column: 34, scope: !7)
!18 = !DILocation(line: 35, column: 50, scope: !7)
!19 = !DILocation(line: 36, column: 44, scope: !7)
!20 = !DILocation(line: 36, column: 40, scope: !7)
!21 = !DILocation(line: 36, column: 34, scope: !7)
!22 = !DILocation(line: 36, column: 50, scope: !7)
!23 = !DILocation(line: 36, column: 101, scope: !7)
!24 = !DILocation(line: 37, column: 22, scope: !7)
!25 = !DILocation(line: 38, column: 22, scope: !7)
!26 = !DILocation(line: 39, column: 36, scope: !7)
!27 = !DILocation(line: 40, column: 40, scope: !7)
!28 = !DILocation(line: 40, column: 55, scope: !7)
!29 = !DILocation(line: 41, column: 44, scope: !7)
!30 = !DILocation(line: 41, column: 40, scope: !7)
!31 = !DILocation(line: 41, column: 34, scope: !7)
!32 = !DILocation(line: 41, column: 52, scope: !7)
!33 = !DILocation(line: 42, column: 22, scope: !7)
!34 = !DILocation(line: 44, column: 22, scope: !7)
; Debug locations inside triton_helpers.py (file !37), recorded as inlined
; into the kernel at the locations named by inlinedAt.
!35 = !DILocation(line: 98, column: 22, scope: !36, inlinedAt: !38)
!36 = distinct !DILexicalBlockFile(scope: !7, file: !37, discriminator: 0)
!37 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
!38 = !DILocation(line: 47, column: 41, scope: !36)
!39 = !DILocation(line: 108, column: 21, scope: !40, inlinedAt: !41)
!40 = distinct !DILexicalBlockFile(scope: !36, file: !37, discriminator: 0)
!41 = !DILocation(line: 120, column: 46, scope: !40, inlinedAt: !42)
!42 = !DILocation(line: 53, column: 44, scope: !40)
!43 = !DILocation(line: 101, column: 22, scope: !36, inlinedAt: !38)
!44 = !DILocation(line: 101, column: 30, scope: !36, inlinedAt: !38)
!45 = !DILocation(line: 101, column: 13, scope: !36, inlinedAt: !38)
!46 = !DILocation(line: 110, column: 60, scope: !40, inlinedAt: !41)
!47 = !DILocation(line: 112, column: 25, scope: !40, inlinedAt: !41)
!48 = !DILocation(line: 112, column: 17, scope: !40, inlinedAt: !41)
!49 = !DILocation(line: 113, column: 15, scope: !40, inlinedAt: !41)
!50 = !DILocation(line: 113, column: 30, scope: !40, inlinedAt: !41)
!51 = !DILocation(line: 113, column: 49, scope: !40, inlinedAt: !41)
!52 = !DILocation(line: 113, column: 22, scope: !40, inlinedAt: !41)
!53 = !DILocation(line: 113, column: 38, scope: !40, inlinedAt: !41)
!54 = !DILocation(line: 120, column: 46, scope: !36, inlinedAt: !55)
!55 = !DILocation(line: 53, column: 44, scope: !36)
!56 = !DILocation(line: 109, column: 28, scope: !40, inlinedAt: !41)
!57 = !DILocation(line: 110, column: 39, scope: !40, inlinedAt: !41)
!58 = !DILocation(line: 110, column: 49, scope: !40, inlinedAt: !41)
; Debug locations for the second pass of the kernel (scope !7).
!59 = !DILocation(line: 62, column: 51, scope: !7)
!60 = !DILocation(line: 63, column: 51, scope: !7)
!61 = !DILocation(line: 63, column: 103, scope: !7)
!62 = !DILocation(line: 64, column: 35, scope: !7)
!63 = !DILocation(line: 64, column: 40, scope: !7)
!64 = !DILocation(line: 68, column: 57, scope: !7)
!65 = !DILocation(line: 69, column: 54, scope: !7)
!66 = !DILocation(line: 75, column: 24, scope: !7)
!67 = !DILocation(line: 77, column: 24, scope: !7)
!68 = !DILocation(line: 78, column: 30, scope: !7)
!69 = !DILocation(line: 70, column: 24, scope: !7)
!70 = !DILocation(line: 72, column: 24, scope: !7)
!71 = !DILocation(line: 73, column: 24, scope: !7)
!72 = !DILocation(line: 79, column: 24, scope: !7)
!73 = !DILocation(line: 80, column: 24, scope: !7)
!74 = !DILocation(line: 82, column: 29, scope: !7)
!75 = !DILocation(line: 82, column: 52, scope: !7)
!76 = !DILocation(line: 58, column: 4, scope: !7)