; 0-hero's picture
; Add files using upload-large-folder tool
; 0def249 verified
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; String constants handed to @__assertfail by the kernel below. The *_0 set is
; used by the first bounds-check failure path and the *_1 set by the second.
; NOTE(review): the array lengths equal the visible text length, so none of
; these strings carries a trailing \00 — confirm that the consumer of
; @__assertfail does not require NUL-terminated strings.
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
; Externally defined, zero-length byte array in address space 3. The kernel
; addresses it through getelementptr as scratch storage for its reductions;
; its real size is not visible in this module.
@global_smem = external addrspace(3) global [0 x i8]
; NUL-terminated key passed to @__nvvm_reflect to choose between the ftz and
; non-ftz rsqrt intrinsics.
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
; Assertion-failure hook called from the kernel with (message, file, line,
; function, i64 1) — argument roles inferred from the globals passed at the
; call sites.
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
; Kernel entry point (listed as "kernel" with maxntidx 128 in !nvvm.annotations).
; Arguments, described by how the body uses them:
;   %0     - global pointer indexed in i64 elements by the row number.
;   %1     - global pointer indexed in float elements; the row part of the index
;            is the value loaded from %0 shifted left by 8.
;   %2     - global pointer indexed in float elements at ((row srem 512) << 8) | column.
;   %3     - global pointer indexed in i16 elements at (row << 8) | column; the
;            loaded halves are widened with cvt.f32.bf16.
;   %4     - global pointer indexed in float elements by (tid << 1) & 254.
;   %5     - global pointer indexed in i16 elements at (row << 8) | column; receives
;            the cvt.rn.bf16.f32 results.
;   %6, %7 - i32 arguments that the body never references.
; NOTE(review): the data flow looks like a fused table gather + add followed by
; a mean/variance normalization scaled by %4 — confirm against the generating
; Python source named in !3.
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
; --- Thread index decomposition -------------------------------------------
; %10 = tid & 31, %11 = tid >> 5, %13 = bit 6 of tid, %14 = bit 0 of tid,
; %15 = bit 5 of tid.
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
%10 = and i32 %9, 31, !dbg !10
%11 = lshr i32 %9, 5, !dbg !10
%12 = lshr i32 %9, 6, !dbg !10
%13 = and i32 %12, 1, !dbg !10
%14 = and i32 %9, 1, !dbg !10
%15 = and i32 %11, 1, !dbg !11
; %16 = (tid * 4) & 252: this thread's 4-element column offset.
%urem = shl i32 %9, 2, !dbg !11
%16 = and i32 %urem, 252, !dbg !11
; %18 = (tid * 2) & 254: 2-element offset used later for the load from %4.
%17 = shl i32 %9, 1, !dbg !11
%18 = and i32 %17, 254, !dbg !11
; --- Row selection ----------------------------------------------------------
; Each block covers rows 2*ctaid.x and 2*ctaid.x + 1. %21 picks the row from
; bit 6 of tid, %22 picks it from bit 0 of tid.
%19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
%20 = shl i32 %19, 1, !dbg !13
%21 = or i32 %20, %13, !dbg !14
%22 = or i32 %20, %14, !dbg !14
%23 = sext i32 %21 to i64, !dbg !15
%24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !15
%25 = sext i32 %22 to i64, !dbg !15
%26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !15
; Load the i64 at %0[%21] (the same always-predicated load is issued four
; times; only %27 is used below) and the i64 at %0[%22] (%31).
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !16
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !16
; --- First-pass load from %2 ------------------------------------------------
; Float index ((row srem 512) << 8) | column, then a 4 x b32 vector load.
%32 = srem i32 %21, 512, !dbg !17
%33 = shl nsw i32 %32, 8, !dbg !18
%34 = or i32 %33, %16, !dbg !19
%35 = sext i32 %34 to i64, !dbg !20
%36 = getelementptr float, ptr addrspace(1) %2, i64 %35, !dbg !20
%37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !21
%38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !21
%39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !21
%40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !21
%41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !21
; Elements 1 and 0 are packed (in that order) into a <2 x float>; elements 2
; and 3 are kept as scalars.
%42 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !21
%43 = insertelement <2 x i32> %42, i32 %38, i64 1, !dbg !21
%44 = bitcast <2 x i32> %43 to <2 x float>, !dbg !21
%45 = bitcast i32 %40 to float, !dbg !21
%46 = bitcast i32 %41 to float, !dbg !21
; --- First-pass load from %3 ------------------------------------------------
; i16 index (row << 8) | column; two b32 words hold four 16-bit values.
%47 = shl i32 %21, 8, !dbg !22
%48 = or i32 %47, %16, !dbg !23
%49 = sext i32 %48 to i64, !dbg !24
%50 = getelementptr i16, ptr addrspace(1) %3, i64 %49, !dbg !24
%51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25
%52 = extractvalue { i32, i32 } %51, 0, !dbg !25
%53 = extractvalue { i32, i32 } %51, 1, !dbg !25
; Split each 32-bit word into its low and high 16-bit halves.
%54 = trunc i32 %52 to i16, !dbg !25
%extelt.offset = lshr i32 %52, 16, !dbg !25
%55 = trunc i32 %extelt.offset to i16, !dbg !25
%56 = trunc i32 %53 to i16, !dbg !25
%extelt.offset1 = lshr i32 %53, 16, !dbg !25
%57 = trunc i32 %extelt.offset1 to i16, !dbg !25
; Widen the four bf16 values to f32.
%58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #6, !dbg !26
%59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #6, !dbg !26
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #6, !dbg !26
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #6, !dbg !26
; --- Bounds check -----------------------------------------------------------
; Applied to %31 (the value loaded via %26): add 50257 when negative, then
; take the failure path if the result is above 50256 (unsigned compare).
; %63 is the sign test of %27, used further down when forming the %1 index.
%62 = add i64 %31, 50257, !dbg !27
%63 = icmp slt i64 %27, 0, !dbg !28
%64 = icmp slt i64 %31, 0, !dbg !28
%65 = select i1 %64, i64 %62, i64 %31, !dbg !29
%66 = icmp ugt i64 %65, 50256, !dbg !30
br i1 %66, label %67, label %68, !dbg !31
67: ; preds = %8
; Failure path: report through @__assertfail, then fall through to %68.
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !31
br label %68, !dbg !31
68: ; preds = %67, %8
; --- First-pass load from %1 ------------------------------------------------
; Row offset = %27 << 8, plus 12865792 (= 50257 << 8) when %27 is negative;
; OR-ed with the column offset to form the float index.
%69 = shl i64 %27, 8, !dbg !32
%70 = add i64 %69, 12865792, !dbg !32
%71 = select i1 %63, i64 %70, i64 %69, !dbg !32
%72 = zext nneg i32 %16 to i64
%73 = or i64 %71, %72, !dbg !33
%74 = getelementptr float, ptr addrspace(1) %1, i64 %73, !dbg !34
%75 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
%76 = extractvalue { i32, i32, i32, i32 } %75, 0, !dbg !35
%77 = extractvalue { i32, i32, i32, i32 } %75, 1, !dbg !35
%78 = extractvalue { i32, i32, i32, i32 } %75, 2, !dbg !35
%79 = extractvalue { i32, i32, i32, i32 } %75, 3, !dbg !35
%80 = bitcast i32 %78 to float, !dbg !35
%81 = bitcast i32 %79 to float, !dbg !35
; --- Element-wise sum of the three inputs -----------------------------------
; For each of the four elements: (value from %2 + value from %1) + value from %3.
; Elements 2 and 3 are scalar; elements 1 and 0 go through <2 x float>.
%82 = fadd float %45, %80, !dbg !36
%83 = fadd float %46, %81, !dbg !36
%84 = fadd float %60, %82, !dbg !37
%85 = fadd float %61, %83, !dbg !37
%86 = insertelement <2 x i32> poison, i32 %77, i64 0, !dbg !35
%87 = insertelement <2 x i32> %86, i32 %76, i64 1, !dbg !35
%88 = bitcast <2 x i32> %87 to <2 x float>, !dbg !35
%89 = fadd <2 x float> %44, %88, !dbg !36
%90 = insertelement <2 x float> poison, float %59, i64 0, !dbg !37
%91 = insertelement <2 x float> %90, float %58, i64 1, !dbg !37
%92 = fadd <2 x float> %91, %89, !dbg !37
; --- Per-thread statistics over the four sums -------------------------------
; The !dbg locations in this section point into triton_helpers.py (!40).
; NOTE(review): the arithmetic (running first value, second accumulator, and
; weights 1/2, 1/3, 1/4) looks like a Welford-style mean / M2 update — confirm
; against triton_helpers.py.
%93 = fadd <2 x float> %92, zeroinitializer, !dbg !38
%94 = fadd float %84, 0.000000e+00, !dbg !38
%95 = fadd float %85, 0.000000e+00, !dbg !38
%96 = extractelement <2 x float> %93, i64 1, !dbg !42
%97 = extractelement <2 x float> %92, i64 1, !dbg !46
%98 = fsub float %97, %96, !dbg !47
%99 = extractelement <2 x float> %93, i64 0, !dbg !42
%100 = extractelement <2 x float> %92, i64 0, !dbg !46
%101 = fsub float %100, %99, !dbg !47
%102 = fsub float %84, %94, !dbg !47
%103 = fsub float %85, %95, !dbg !47
%104 = fmul float %97, %98, !dbg !46
%105 = fmul float %100, %101, !dbg !46
%106 = fmul float %84, %102, !dbg !46
%107 = fmul float %85, %103, !dbg !46
%108 = fadd float %104, 0.000000e+00, !dbg !48
%109 = fadd float %105, 0.000000e+00, !dbg !48
%110 = fadd float %106, 0.000000e+00, !dbg !48
%111 = fadd float %107, 0.000000e+00, !dbg !48
; Fold in the second element with weight 1/2.
%112 = fsub float %99, %96, !dbg !42
%113 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !49
%114 = fmul float %113, %112, !dbg !50
%115 = fadd float %96, %114, !dbg !51
%116 = fadd float %108, %109, !dbg !52
%117 = fmul float %112, %112, !dbg !53
%118 = fmul float %113, %117, !dbg !54
%119 = fadd float %118, %116, !dbg !55
; Fold in the third element with weight 1/3.
%120 = fsub float %94, %115, !dbg !42
%121 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !49
%122 = fmul float %121, %120, !dbg !50
%123 = fadd float %115, %122, !dbg !51
%124 = fadd float %110, %119, !dbg !52
%125 = fmul float %120, %120, !dbg !53
%126 = fmul float %125, 2.000000e+00, !dbg !56
%127 = fmul float %121, %126, !dbg !54
%128 = fadd float %124, %127, !dbg !55
; Fold in the fourth element with weight 1/4. %132 and %137 are the per-thread
; results carried into the cross-thread reduction.
%129 = fsub float %95, %123, !dbg !42
%130 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !49
%131 = fmul float %130, %129, !dbg !50
%132 = fadd float %123, %131, !dbg !51
%133 = fadd float %111, %128, !dbg !52
%134 = fmul float %129, %129, !dbg !53
%135 = fmul float %134, 3.000000e+00, !dbg !56
%136 = fmul float %130, %135, !dbg !54
%137 = fadd float %133, %136, !dbg !55
; --- Butterfly shuffle reduction, offset 16 ---------------------------------
; Exchanges %132, %137 and the constant 1082130432 (bit pattern of float 4.0,
; this thread's weight) with the partner lane, then merges the two triples.
; The select guards the division when the combined weight is 0.
%138 = bitcast float %132 to i32, !dbg !57
%139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !57
%140 = bitcast i32 %139 to float, !dbg !57
%141 = bitcast float %137 to i32, !dbg !57
%142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 16, i32 31), !dbg !57
%143 = bitcast i32 %142 to float, !dbg !57
%144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !57
%145 = bitcast i32 %144 to float, !dbg !57
%146 = fsub float %140, %132, !dbg !42
%147 = fadd float %145, 4.000000e+00, !dbg !59
%148 = fcmp oeq float %147, 0.000000e+00, !dbg !60
%149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %145, float %147) #6, !dbg !49
%150 = select i1 %148, float 0.000000e+00, float %149, !dbg !61
%151 = fmul float %150, %146, !dbg !50
%152 = fadd float %132, %151, !dbg !51
%153 = fadd float %137, %143, !dbg !52
%154 = fmul float %146, %146, !dbg !53
%155 = fmul float %154, 4.000000e+00, !dbg !56
%156 = fmul float %150, %155, !dbg !54
%157 = fadd float %153, %156, !dbg !55
; --- Butterfly shuffle reduction, offset 8 ----------------------------------
%158 = bitcast float %152 to i32, !dbg !57
%159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 8, i32 31), !dbg !57
%160 = bitcast i32 %159 to float, !dbg !57
%161 = bitcast float %157 to i32, !dbg !57
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !57
%163 = bitcast i32 %162 to float, !dbg !57
%164 = bitcast float %147 to i32, !dbg !57
%165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 8, i32 31), !dbg !57
%166 = bitcast i32 %165 to float, !dbg !57
%167 = fsub float %160, %152, !dbg !42
%168 = fadd float %147, %166, !dbg !59
%169 = fcmp oeq float %168, 0.000000e+00, !dbg !60
%170 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %166, float %168) #6, !dbg !49
%171 = select i1 %169, float 0.000000e+00, float %170, !dbg !61
%172 = fmul float %171, %167, !dbg !50
%173 = fadd float %152, %172, !dbg !51
%174 = fadd float %157, %163, !dbg !52
%175 = fmul float %167, %167, !dbg !53
%176 = fmul float %147, %175, !dbg !56
%177 = fmul float %171, %176, !dbg !54
%178 = fadd float %174, %177, !dbg !55
; --- Butterfly shuffle reduction, offset 4 ----------------------------------
%179 = bitcast float %173 to i32, !dbg !57
%180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 4, i32 31), !dbg !57
%181 = bitcast i32 %180 to float, !dbg !57
%182 = bitcast float %178 to i32, !dbg !57
%183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 4, i32 31), !dbg !57
%184 = bitcast i32 %183 to float, !dbg !57
%185 = bitcast float %168 to i32, !dbg !57
%186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 4, i32 31), !dbg !57
%187 = bitcast i32 %186 to float, !dbg !57
%188 = fsub float %181, %173, !dbg !42
%189 = fadd float %168, %187, !dbg !59
%190 = fcmp oeq float %189, 0.000000e+00, !dbg !60
%191 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %187, float %189) #6, !dbg !49
%192 = select i1 %190, float 0.000000e+00, float %191, !dbg !61
%193 = fmul float %192, %188, !dbg !50
%194 = fadd float %173, %193, !dbg !51
%195 = fadd float %178, %184, !dbg !52
%196 = fmul float %188, %188, !dbg !53
%197 = fmul float %168, %196, !dbg !56
%198 = fmul float %192, %197, !dbg !54
%199 = fadd float %195, %198, !dbg !55
; --- Butterfly shuffle reduction, offset 2 ----------------------------------
%200 = bitcast float %194 to i32, !dbg !57
%201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 2, i32 31), !dbg !57
%202 = bitcast i32 %201 to float, !dbg !57
%203 = bitcast float %199 to i32, !dbg !57
%204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 2, i32 31), !dbg !57
%205 = bitcast i32 %204 to float, !dbg !57
%206 = bitcast float %189 to i32, !dbg !57
%207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 2, i32 31), !dbg !57
%208 = bitcast i32 %207 to float, !dbg !57
%209 = fsub float %202, %194, !dbg !42
%210 = fadd float %189, %208, !dbg !59
%211 = fcmp oeq float %210, 0.000000e+00, !dbg !60
%212 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %208, float %210) #6, !dbg !49
%213 = select i1 %211, float 0.000000e+00, float %212, !dbg !61
%214 = fmul float %213, %209, !dbg !50
%215 = fadd float %194, %214, !dbg !51
%216 = fadd float %199, %205, !dbg !52
%217 = fmul float %209, %209, !dbg !53
%218 = fmul float %189, %217, !dbg !56
%219 = fmul float %213, %218, !dbg !54
%220 = fadd float %216, %219, !dbg !55
; --- Butterfly shuffle reduction, offset 1 ----------------------------------
%221 = bitcast float %215 to i32, !dbg !57
%222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !57
%223 = bitcast i32 %222 to float, !dbg !57
%224 = bitcast float %220 to i32, !dbg !57
%225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 1, i32 31), !dbg !57
%226 = bitcast i32 %225 to float, !dbg !57
%227 = bitcast float %210 to i32, !dbg !57
%228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 1, i32 31), !dbg !57
%229 = bitcast i32 %228 to float, !dbg !57
%230 = fsub float %223, %215, !dbg !42
%231 = fadd float %210, %229, !dbg !59
%232 = fcmp oeq float %231, 0.000000e+00, !dbg !60
%233 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %229, float %231) #6, !dbg !49
%234 = select i1 %232, float 0.000000e+00, float %233, !dbg !61
%235 = fmul float %234, %230, !dbg !50
%236 = fadd float %215, %235, !dbg !51
%237 = fadd float %220, %226, !dbg !52
%238 = fmul float %230, %230, !dbg !53
%239 = fmul float %210, %238, !dbg !56
%240 = fmul float %234, %239, !dbg !54
%241 = fadd float %237, %240, !dbg !55
; --- Stage the 32-lane results in @global_smem ------------------------------
; Threads with (tid & 31) == 0 store their triple (%236, %241, %231) at float
; index 2*bit6 + bit5 of tid, in three regions at byte offsets 0, 16 and 32.
%242 = icmp eq i32 %10, 0, !dbg !57
%243 = shl nuw nsw i32 %13, 1, !dbg !57
%244 = or i32 %243, %15, !dbg !57
%245 = zext nneg i32 %244 to i64, !dbg !57
%246 = getelementptr float, ptr addrspace(3) @global_smem, i64 %245, !dbg !57
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %246, float %236, i1 %242) #6, !dbg !57
%247 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %245, !dbg !57
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %247, float %241, i1 %242) #6, !dbg !57
%248 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %245, !dbg !57
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, float %231, i1 %242) #6, !dbg !57
tail call void @llvm.nvvm.barrier0(), !dbg !57
; Threads with tid < 4 read the staged triples back (the loads are predicated
; on %249; other threads' results are not stored).
%249 = icmp slt i32 %9, 4, !dbg !57
%250 = sext i32 %9 to i64, !dbg !57
%251 = getelementptr float, ptr addrspace(3) @global_smem, i64 %250, !dbg !57
%252 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %249) #6, !dbg !57
%253 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %250, !dbg !57
%254 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %253, i1 %249) #6, !dbg !57
%255 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %250, !dbg !57
%256 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %255, i1 %249) #6, !dbg !57
; One more offset-1 butterfly step merges the two staged triples of each row.
%257 = bitcast float %252 to i32, !dbg !57
%258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !57
%259 = bitcast i32 %258 to float, !dbg !57
%260 = bitcast float %254 to i32, !dbg !57
%261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 1, i32 31), !dbg !57
%262 = bitcast i32 %261 to float, !dbg !57
%263 = bitcast float %256 to i32, !dbg !57
%264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !57
%265 = bitcast i32 %264 to float, !dbg !57
%266 = fsub float %259, %252, !dbg !42
%267 = fadd float %256, %265, !dbg !59
%268 = fcmp oeq float %267, 0.000000e+00, !dbg !60
%269 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %265, float %267) #6, !dbg !49
%270 = select i1 %268, float 0.000000e+00, float %269, !dbg !61
%271 = fmul float %266, %270, !dbg !50
%272 = fadd float %252, %271, !dbg !51
%273 = fadd float %254, %262, !dbg !52
%274 = fmul float %266, %266, !dbg !53
%275 = fmul float %256, %274, !dbg !56
%276 = fmul float %275, %270, !dbg !54
%277 = fadd float %273, %276, !dbg !55
; Threads with tid < 4 and an even tid write the merged triple back in place.
%278 = icmp eq i32 %14, 0, !dbg !57
%279 = and i1 %249, %278, !dbg !57
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, float %272, i1 %279) #6, !dbg !57
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %253, float %277, i1 %279) #6, !dbg !57
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, float %267, i1 %279) #6, !dbg !57
tail call void @llvm.nvvm.barrier0(), !dbg !57
; Every thread reads its row's final values at float index 2*bit6:
; %282 from the first region and %284 from the region at byte offset 16.
%280 = zext nneg i32 %243 to i64, !dbg !57
%281 = getelementptr float, ptr addrspace(3) @global_smem, i64 %280, !dbg !57
%282 = load float, ptr addrspace(3) %281, align 4, !dbg !57
%283 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %280, !dbg !57
%284 = load float, ptr addrspace(3) %283, align 4, !dbg !57
; --- Second pass: reload the inputs -----------------------------------------
; Same addresses as the first pass (%36 and %50); the %3 load now uses the
; evict_first cache hint.
%285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !62
%286 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %50, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !63
%287 = extractvalue { i32, i32 } %286, 0, !dbg !63
%288 = extractvalue { i32, i32 } %286, 1, !dbg !63
%289 = trunc i32 %287 to i16, !dbg !63
%extelt.offset2 = lshr i32 %287, 16, !dbg !63
%290 = trunc i32 %extelt.offset2 to i16, !dbg !63
%291 = trunc i32 %288 to i16, !dbg !63
%extelt.offset3 = lshr i32 %288, 16, !dbg !63
%292 = trunc i32 %extelt.offset3 to i16, !dbg !63
%293 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %289) #6, !dbg !64
%294 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %290) #6, !dbg !64
%295 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %291) #6, !dbg !64
%296 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %292) #6, !dbg !64
; Load two b32 words from %4 at float index (tid << 1) & 254.
%297 = zext nneg i32 %18 to i64, !dbg !65
%298 = getelementptr float, ptr addrspace(1) %4, i64 %297, !dbg !65
%299 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %298, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !66
; Second assertion site: branches on the same condition %66 as the first one
; but reports @assertMessage_1.
br i1 %66, label %300, label %301, !dbg !67
300: ; preds = %68
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !67
br label %301, !dbg !67
301: ; preds = %300, %68
; Reload from %1 at the same address %74, now with the evict_first hint.
%302 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %74, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
; %284 / 256.0 is computed four times; only %303 is consumed below
; (%304-%306 have no uses in this function).
%303 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
%304 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
%305 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
%306 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float 2.560000e+02) #6, !dbg !69
; Add a small constant (0x3EE4F8B580000000 is the float nearest 1.0e-5)
; before taking the reciprocal square root.
%307 = fadd float %303, 0x3EE4F8B580000000, !dbg !70
; Inlined body of @__nv_rsqrtf: __nvvm_reflect("__CUDA_FTZ") selects the ftz
; or non-ftz approximate rsqrt intrinsic.
%308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
%.not.i = icmp eq i32 %308, 0, !dbg !71
br i1 %.not.i, label %311, label %309, !dbg !71
309: ; preds = %301
%310 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %307), !dbg !71
br label %__nv_rsqrtf.exit, !dbg !71
311: ; preds = %301
%312 = tail call float @llvm.nvvm.rsqrt.approx.f(float %307), !dbg !71
br label %__nv_rsqrtf.exit, !dbg !71
__nv_rsqrtf.exit: ; preds = %309, %311
%.0.i = phi float [ %310, %309 ], [ %312, %311 ], !dbg !71
; The next three reflect calls produce values that are never used.
%313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
%314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
%315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !71
; --- Normalize --------------------------------------------------------------
; For each element (3, 2, 1, 0 in that order): recompute the three-way sum
; from the reloaded inputs and subtract %282.
%316 = extractvalue { i32, i32, i32, i32 } %302, 3, !dbg !68
%317 = bitcast i32 %316 to float, !dbg !68
%318 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !62
%319 = bitcast i32 %318 to float, !dbg !62
%320 = fadd float %319, %317, !dbg !72
%321 = fadd float %296, %320, !dbg !73
%322 = fsub float %321, %282, !dbg !74
%323 = extractvalue { i32, i32, i32, i32 } %302, 2, !dbg !68
%324 = bitcast i32 %323 to float, !dbg !68
%325 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !62
%326 = bitcast i32 %325 to float, !dbg !62
%327 = fadd float %326, %324, !dbg !72
%328 = fadd float %295, %327, !dbg !73
%329 = fsub float %328, %282, !dbg !74
%330 = extractvalue { i32, i32, i32, i32 } %302, 1, !dbg !68
%331 = bitcast i32 %330 to float, !dbg !68
%332 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !62
%333 = bitcast i32 %332 to float, !dbg !62
%334 = fadd float %333, %331, !dbg !72
%335 = fadd float %294, %334, !dbg !73
%336 = fsub float %335, %282, !dbg !74
%337 = extractvalue { i32, i32, i32, i32 } %302, 0, !dbg !68
%338 = bitcast i32 %337 to float, !dbg !68
%339 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !62
%340 = bitcast i32 %339 to float, !dbg !62
%341 = fadd float %340, %338, !dbg !72
%342 = fadd float %293, %341, !dbg !73
%343 = fsub float %342, %282, !dbg !74
%344 = extractvalue { i32, i32 } %299, 0, !dbg !66
%345 = extractvalue { i32, i32 } %299, 1, !dbg !66
; Scale the centered values by the rsqrt result.
%346 = fmul float %343, %.0.i, !dbg !75
%347 = fmul float %336, %.0.i, !dbg !75
%348 = fmul float %329, %.0.i, !dbg !75
%349 = fmul float %322, %.0.i, !dbg !75
; --- Redistribute the %4 values through @global_smem ------------------------
; Each thread stores its two words loaded from %4 at float index
; (tid << 1) & 254, then, after the barrier, reads four consecutive floats
; starting at its column offset %72. The leading barrier separates this reuse
; of @global_smem from the earlier reduction reads.
tail call void @llvm.nvvm.barrier0(), !dbg !76
%350 = getelementptr float, ptr addrspace(3) @global_smem, i64 %297, !dbg !76
%351 = insertelement <2 x i32> undef, i32 %344, i64 0, !dbg !76
%352 = insertelement <2 x i32> %351, i32 %345, i64 1, !dbg !76
store <2 x i32> %352, ptr addrspace(3) %350, align 8, !dbg !76
tail call void @llvm.nvvm.barrier0(), !dbg !76
%353 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !76
%354 = load float, ptr addrspace(3) %353, align 16, !dbg !76
%355 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 1, !dbg !76
%356 = load float, ptr addrspace(3) %355, align 4, !dbg !76
%357 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 2, !dbg !76
%358 = load float, ptr addrspace(3) %357, align 8, !dbg !76
%359 = getelementptr inbounds <4 x float>, ptr addrspace(3) %353, i64 0, i64 3, !dbg !76
%360 = load float, ptr addrspace(3) %359, align 4, !dbg !76
; Multiply each normalized element by the matching value read from shared memory.
%361 = fmul float %346, %354, !dbg !76
%362 = fmul float %347, %356, !dbg !76
%363 = fmul float %348, %358, !dbg !76
%364 = fmul float %349, %360, !dbg !76
; --- Store ------------------------------------------------------------------
; Round the four results to bf16, pack them pairwise into two 32-bit words and
; write them to %5 at i16 index (row << 8) | column (same index %49 as the %3 load).
%365 = getelementptr i16, ptr addrspace(1) %5, i64 %49, !dbg !77
%366 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %361) #6, !dbg !78
%367 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %362) #6, !dbg !78
%368 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %363) #6, !dbg !78
%369 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %364) #6, !dbg !78
%370 = insertelement <2 x i16> undef, i16 %366, i64 0, !dbg !78
%371 = insertelement <2 x i16> %370, i16 %367, i64 1, !dbg !78
%372 = bitcast <2 x i16> %371 to i32, !dbg !78
%373 = insertelement <2 x i16> undef, i16 %368, i64 0, !dbg !78
%374 = insertelement <2 x i16> %373, i16 %369, i64 1, !dbg !78
%375 = bitcast <2 x i16> %374 to i32, !dbg !78
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %372, i32 %375, ptr addrspace(1) %365, i1 true) #6, !dbg !78
ret void, !dbg !79
}
; NVVM intrinsics called by the kernel above.
; Reads the x component of the thread index (the kernel's %9).
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Butterfly shuffle; the kernel always passes -1 as the first argument and 31
; as the last, with lane offsets 16, 8, 4, 2 and 1.
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; Barrier used around the kernel's @global_smem stores and loads.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2
; Reciprocal-square-root helper (attribute group #3: alwaysinline nounwind).
; Queries __nvvm_reflect with the "__CUDA_FTZ" key held in @.str and forwards
; %x to the ftz intrinsic when the answer is non-zero, otherwise to the
; non-ftz intrinsic. Returns whichever result was produced.
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
entry:
  %ftz.mode = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
  %ftz.disabled = icmp eq i32 %ftz.mode, 0
  br i1 %ftz.disabled, label %rsqrt.noftz, label %rsqrt.ftz

rsqrt.ftz:                                        ; preds = %entry
  ; Reflect value was non-zero: use the flush-to-zero variant.
  %r.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
  br label %done

rsqrt.noftz:                                      ; preds = %entry
  ; Reflect value was zero: use the non-ftz variant.
  %r.noftz = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
  br label %done

done:                                             ; preds = %rsqrt.noftz, %rsqrt.ftz
  %result = phi float [ %r.ftz, %rsqrt.ftz ], [ %r.noftz, %rsqrt.noftz ]
  ret float %result
}
; Reflection query; every call in this module passes @.str ("__CUDA_FTZ").
; NOTE(review): presumably resolved to a constant by the NVPTX backend using
; the "nvvm-reflect-ftz" module flag (!1) — confirm against the toolchain in use.
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
; Approximate reciprocal square root, flush-to-zero variant.
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
; Approximate reciprocal square root, non-ftz variant.
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
; Attribute groups referenced above.
; #0: @llvm.nvvm.read.ptx.sreg.tid.x
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #1: @llvm.nvvm.shfl.sync.bfly.i32
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
; #2: @llvm.nvvm.barrier0
attributes #2 = { convergent nocallback nounwind }
; #3: @__nv_rsqrtf (definition)
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #4: @__nvvm_reflect (declaration)
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #5: the two @llvm.nvvm.rsqrt.approx.* intrinsics
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
; #6: call-site attributes on the inline-asm and @__nvvm_reflect calls
attributes #6 = { nounwind }
; Named module-level metadata.
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
; Note: !4 and !5 each appear twice in this list.
!nvvm.annotations = !{!4, !5, !5, !4}
!llvm.ident = !{!6}
!0 = !{i32 2, !"Debug Info Version", i32 3}
; Module flag "nvvm-reflect-ftz" set to 1.
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
; Compile unit for the debug info; the producer string is "triton".
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
; Source file that the kernel's !dbg locations with scope !7 refer to.
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
; NVVM annotations: the function is a kernel, with maxntidx = 128.
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
; NOTE(review): this ident string presumably comes from a linked-in bitcode
; library rather than from the tool that emitted the kernel — confirm.
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
; Subprogram for the kernel; its subroutine type (!8) has an empty type list (!9).
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
!9 = !{}
; Source locations referenced by the !dbg attachments in the kernel.
; Entries with scope !7 refer to the kernel's own source file (!3). Entries
; with scope !39 or !43 refer to triton_helpers.py (!40) and carry an
; inlinedAt chain that leads back to a call site.
!10 = !DILocation(line: 22, column: 44, scope: !7)
!11 = !DILocation(line: 24, column: 33, scope: !7)
!12 = !DILocation(line: 21, column: 28, scope: !7)
!13 = !DILocation(line: 21, column: 33, scope: !7)
!14 = !DILocation(line: 22, column: 23, scope: !7)
!15 = !DILocation(line: 26, column: 30, scope: !7)
!16 = !DILocation(line: 26, column: 35, scope: !7)
!17 = !DILocation(line: 27, column: 18, scope: !7)
!18 = !DILocation(line: 35, column: 44, scope: !7)
!19 = !DILocation(line: 35, column: 40, scope: !7)
!20 = !DILocation(line: 35, column: 34, scope: !7)
!21 = !DILocation(line: 35, column: 50, scope: !7)
!22 = !DILocation(line: 36, column: 44, scope: !7)
!23 = !DILocation(line: 36, column: 40, scope: !7)
!24 = !DILocation(line: 36, column: 34, scope: !7)
!25 = !DILocation(line: 36, column: 50, scope: !7)
!26 = !DILocation(line: 36, column: 101, scope: !7)
!27 = !DILocation(line: 37, column: 22, scope: !7)
!28 = !DILocation(line: 38, column: 22, scope: !7)
!29 = !DILocation(line: 39, column: 36, scope: !7)
!30 = !DILocation(line: 40, column: 40, scope: !7)
!31 = !DILocation(line: 40, column: 55, scope: !7)
!32 = !DILocation(line: 41, column: 44, scope: !7)
!33 = !DILocation(line: 41, column: 40, scope: !7)
!34 = !DILocation(line: 41, column: 34, scope: !7)
!35 = !DILocation(line: 41, column: 52, scope: !7)
!36 = !DILocation(line: 42, column: 22, scope: !7)
!37 = !DILocation(line: 44, column: 22, scope: !7)
; Locations inside triton_helpers.py, used by the statistics and reduction code.
!38 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
!41 = !DILocation(line: 47, column: 41, scope: !39)
!42 = !DILocation(line: 108, column: 21, scope: !43, inlinedAt: !44)
!43 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
!44 = !DILocation(line: 120, column: 46, scope: !43, inlinedAt: !45)
!45 = !DILocation(line: 53, column: 44, scope: !43)
!46 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
!47 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
!48 = !DILocation(line: 101, column: 13, scope: !39, inlinedAt: !41)
!49 = !DILocation(line: 110, column: 60, scope: !43, inlinedAt: !44)
!50 = !DILocation(line: 112, column: 25, scope: !43, inlinedAt: !44)
!51 = !DILocation(line: 112, column: 17, scope: !43, inlinedAt: !44)
!52 = !DILocation(line: 113, column: 15, scope: !43, inlinedAt: !44)
!53 = !DILocation(line: 113, column: 30, scope: !43, inlinedAt: !44)
!54 = !DILocation(line: 113, column: 49, scope: !43, inlinedAt: !44)
!55 = !DILocation(line: 113, column: 22, scope: !43, inlinedAt: !44)
!56 = !DILocation(line: 113, column: 38, scope: !43, inlinedAt: !44)
!57 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !58)
!58 = !DILocation(line: 53, column: 44, scope: !39)
!59 = !DILocation(line: 109, column: 28, scope: !43, inlinedAt: !44)
!60 = !DILocation(line: 110, column: 39, scope: !43, inlinedAt: !44)
!61 = !DILocation(line: 110, column: 49, scope: !43, inlinedAt: !44)
; Locations for the second pass (reloads, normalization and store).
!62 = !DILocation(line: 62, column: 51, scope: !7)
!63 = !DILocation(line: 63, column: 51, scope: !7)
!64 = !DILocation(line: 63, column: 103, scope: !7)
!65 = !DILocation(line: 64, column: 35, scope: !7)
!66 = !DILocation(line: 64, column: 40, scope: !7)
!67 = !DILocation(line: 68, column: 57, scope: !7)
!68 = !DILocation(line: 69, column: 54, scope: !7)
!69 = !DILocation(line: 75, column: 24, scope: !7)
!70 = !DILocation(line: 77, column: 24, scope: !7)
!71 = !DILocation(line: 78, column: 30, scope: !7)
!72 = !DILocation(line: 70, column: 24, scope: !7)
!73 = !DILocation(line: 72, column: 24, scope: !7)
!74 = !DILocation(line: 73, column: 24, scope: !7)
!75 = !DILocation(line: 79, column: 24, scope: !7)
!76 = !DILocation(line: 80, column: 24, scope: !7)
!77 = !DILocation(line: 82, column: 29, scope: !7)
!78 = !DILocation(line: 82, column: 52, scope: !7)
; Location attached to the kernel's `ret void`.
!79 = !DILocation(line: 58, column: 4, scope: !7)