; 0-hero's picture
; Add files using upload-large-folder tool
; afafe68 verified
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Externally defined, zero-length byte array in address space 3. The kernel
; below indexes it as an array of floats (via st.shared / ld.shared inline asm
; and one plain load) to exchange per-warp partial sums during its reduction.
; Its actual size is not visible in this module.
@global_smem = external addrspace(3) global [0 x i8]
; Kernel @triton__0d1d2d3d4d5d6d7de8 -- row-wise reduction followed by an
; element-wise update; one thread block (ctaid.x) handles one row of 50257
; elements.
;
; Arguments (all pointers are in address space 1):
;   %0  float buffer, read at [row * 50257 + col] in both passes
;   %1  i64 buffer, read once at [row]
;   %2  float, read once at [0] (numerator of the scale)
;   %3  float, read once at [0] (denominator of the scale)
;   %4  16-bit buffer, read at [row * 50257 + col], converted with cvt.f32.bf16
;   %5  16-bit buffer, read at [row * 50257 + col], converted with cvt.f32.bf16
;   %6  16-bit buffer, written at [row * 50257 + col] after cvt.rn.bf16.f32
;   %7, %8  i64 values that are not referenced anywhere in this body
;
; Computation:
;   scale = (%1[row] == -1) ? 0.0 : %2[0] / %3[0]
;   pass 1: total = sum over col < 50257 of scale * %0[row, col]
;           (per-thread 8-wide accumulation, then warp shuffles + @global_smem)
;   pass 2: %6[row, col] = bf16( f32(%4[row, col])
;                                + (scale * %0[row, col] - total * exp(f32(%5[row, col]))) )
;
; NOTE(review): this has the shape of a fused loss / log-softmax backward with
; -1 as an "ignore" marker -- confirm against the generating Python source.
define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
; --- Entry block: thread/block indices, per-thread column offsets, scale ---
; %10 = thread index x; %11 = %10 >> 5 (used later as the warp slot).
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
%11 = lshr i32 %10, 5, !dbg !8
; %urem = %10 & 255. The seven `or`s below set bits 8..10, which are clear in
; %urem, so they are equivalent to adding 256, 512, ..., 1792. Each thread thus
; owns eight column offsets spaced 256 apart inside a 2048-wide tile.
%urem = and i32 %10, 255, !dbg !8
%12 = or i32 %urem, 256, !dbg !8
%13 = or i32 %urem, 512, !dbg !8
%14 = or i32 %urem, 768, !dbg !8
%15 = or i32 %urem, 1024, !dbg !8
%16 = or i32 %urem, 1280, !dbg !8
%17 = or i32 %urem, 1536, !dbg !8
%18 = or i32 %urem, 1792, !dbg !8
; Block index x read through inline PTX; sign-extended to form the row index.
%19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
%20 = sext i32 %19 to i64, !dbg !10
; Pack the eight column offsets into a vector and widen to i64 (%29).
%21 = insertelement <8 x i32> poison, i32 %urem, i64 0
%22 = insertelement <8 x i32> %21, i32 %12, i64 1
%23 = insertelement <8 x i32> %22, i32 %13, i64 2
%24 = insertelement <8 x i32> %23, i32 %14, i64 3
%25 = insertelement <8 x i32> %24, i32 %15, i64 4
%26 = insertelement <8 x i32> %25, i32 %16, i64 5
%27 = insertelement <8 x i32> %26, i32 %17, i64 6
%28 = insertelement <8 x i32> %27, i32 %18, i64 7
%29 = zext <8 x i32> %28 to <8 x i64>
; Unconditional (predicate = true) loads: %31 = %1[row] as i64,
; %33 = %2[0] as float, %35 = %3[0] as float.
%30 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !11
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !12
%32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !13
%33 = bitcast i32 %32 to float, !dbg !13
%34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !14
%35 = bitcast i32 %34 to float, !dbg !14
; %36 = row * 50257: element offset of this row's first column.
%36 = mul nsw i64 %20, 50257, !dbg !15
; scale (%38) = 0.0 when %1[row] == -1, otherwise %2[0] / %3[0] computed with
; the PTX div.full.f32 instruction.
%.not = icmp eq i64 %31, -1, !dbg !16
%37 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %33, float %35) #3, !dbg !17
%38 = select i1 %.not, float 0.000000e+00, float %37, !dbg !18
; Row base pointer into %0, hoisted out of the first loop; %40 = splat(scale).
%invariant.gep = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !19
%39 = insertelement <8 x float> poison, float %38, i64 0, !dbg !20
%40 = shufflevector <8 x float> %39, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !20
br label %41, !dbg !19
; --- Loop 1: accumulate scale * %0[row, col] over the row, 2048 cols/iter ---
; %42 is the tile base (0, 2048, ...); %43 is the 8-lane running sum.
41: ; preds = %9, %41
%42 = phi i32 [ 0, %9 ], [ %85, %41 ]
%43 = phi <8 x float> [ zeroinitializer, %9 ], [ %84, %41 ]
%44 = zext nneg i32 %42 to i64, !dbg !21
%45 = insertelement <8 x i64> poison, i64 %44, i64 0, !dbg !21
%46 = shufflevector <8 x i64> %45, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !21
; Column indices = tile base | per-thread offsets (base is a multiple of 2048
; and offsets are below 2048, so `or` acts as addition). %48 masks cols < 50257.
%47 = or <8 x i64> %46, %29, !dbg !21
%48 = icmp ult <8 x i64> %47, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
; Per-lane element addresses within the row.
%49 = extractelement <8 x i64> %47, i64 0, !dbg !23
%gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %49, !dbg !23
%50 = extractelement <8 x i64> %47, i64 1, !dbg !23
%gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %50, !dbg !23
%51 = extractelement <8 x i64> %47, i64 2, !dbg !23
%gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %51, !dbg !23
%52 = extractelement <8 x i64> %47, i64 3, !dbg !23
%gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %52, !dbg !23
%53 = extractelement <8 x i64> %47, i64 4, !dbg !23
%gep9 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %53, !dbg !23
%54 = extractelement <8 x i64> %47, i64 5, !dbg !23
%gep11 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %54, !dbg !23
%55 = extractelement <8 x i64> %47, i64 6, !dbg !23
%gep13 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %55, !dbg !23
%56 = extractelement <8 x i64> %47, i64 7, !dbg !23
%gep15 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %56, !dbg !23
; Eight predicated 32-bit global loads (evict_last hint). The asm zeroes the
; destination first and, when the predicate is false, moves the fallback
; operand (0) into it, so out-of-range lanes read as 0.
%57 = extractelement <8 x i1> %48, i64 0, !dbg !24
%58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep, i1 %57, i32 0, i1 %57) #3, !dbg !24
%59 = extractelement <8 x i1> %48, i64 1, !dbg !24
%60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep3, i1 %59, i32 0, i1 %59) #3, !dbg !24
%61 = extractelement <8 x i1> %48, i64 2, !dbg !24
%62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep5, i1 %61, i32 0, i1 %61) #3, !dbg !24
%63 = extractelement <8 x i1> %48, i64 3, !dbg !24
%64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep7, i1 %63, i32 0, i1 %63) #3, !dbg !24
%65 = extractelement <8 x i1> %48, i64 4, !dbg !24
%66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep9, i1 %65, i32 0, i1 %65) #3, !dbg !24
%67 = extractelement <8 x i1> %48, i64 5, !dbg !24
%68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep11, i1 %67, i32 0, i1 %67) #3, !dbg !24
%69 = extractelement <8 x i1> %48, i64 6, !dbg !24
%70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep13, i1 %69, i32 0, i1 %69) #3, !dbg !24
%71 = extractelement <8 x i1> %48, i64 7, !dbg !24
%72 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep15, i1 %71, i32 0, i1 %71) #3, !dbg !24
; Reassemble the loaded words into a vector and reinterpret as floats.
%73 = insertelement <8 x i32> poison, i32 %58, i64 0, !dbg !24
%74 = insertelement <8 x i32> %73, i32 %60, i64 1, !dbg !24
%75 = insertelement <8 x i32> %74, i32 %62, i64 2, !dbg !24
%76 = insertelement <8 x i32> %75, i32 %64, i64 3, !dbg !24
%77 = insertelement <8 x i32> %76, i32 %66, i64 4, !dbg !24
%78 = insertelement <8 x i32> %77, i32 %68, i64 5, !dbg !24
%79 = insertelement <8 x i32> %78, i32 %70, i64 6, !dbg !24
%80 = insertelement <8 x i32> %79, i32 %72, i64 7, !dbg !24
%81 = bitcast <8 x i32> %80 to <8 x float>, !dbg !24
; acc += mask ? scale * value : -0.0 (-0.0 leaves the sum unchanged).
%82 = fmul <8 x float> %40, %81, !dbg !20
%83 = select <8 x i1> %48, <8 x float> %82, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !25
%84 = fadd <8 x float> %43, %83, !dbg !25
; Advance by one tile. The test uses the pre-increment base: continue while
; %42 < 48209 (= 50257 - 2048), so the final iteration runs with base 49152.
%85 = add nuw nsw i32 %42, 2048, !dbg !19
%86 = icmp ult i32 %42, 48209, !dbg !19
br i1 %86, label %41, label %87, !dbg !19
; --- Block-wide sum of the per-thread accumulators ---
87: ; preds = %41
; %88 = tid & 31 (lane within the 32-wide shuffle group); %89 = (tid >> 5) & 7
; (slot index for this warp's partial sum).
%88 = and i32 %10, 31, !dbg !8
%89 = and i32 %11, 7, !dbg !8
; Horizontal add of the 8 accumulator lanes into element 0 (only element 0 of
; each intermediate vector is meaningful; the rest are poison).
%shift = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
%90 = fadd <8 x float> %84, %shift, !dbg !26
%shift37 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
%91 = fadd <8 x float> %shift37, %90, !dbg !26
%shift38 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
%92 = fadd <8 x float> %shift38, %91, !dbg !26
%shift39 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
%93 = fadd <8 x float> %shift39, %92, !dbg !26
%shift40 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
%94 = fadd <8 x float> %shift40, %93, !dbg !26
%shift41 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
%95 = fadd <8 x float> %shift41, %94, !dbg !26
%shift42 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
%96 = fadd <8 x float> %shift42, %95, !dbg !26
%97 = extractelement <8 x float> %96, i64 0, !dbg !26
; Warp-level sum: butterfly shuffles with lane masks 16, 8, 4, 2, 1 and member
; mask -1; each step adds the partner lane's value. %117 is the warp total.
%98 = bitcast float %97 to i32, !dbg !32
%99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !32
%100 = bitcast i32 %99 to float, !dbg !32
%101 = fadd float %97, %100, !dbg !26
%102 = bitcast float %101 to i32, !dbg !32
%103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !32
%104 = bitcast i32 %103 to float, !dbg !32
%105 = fadd float %101, %104, !dbg !26
%106 = bitcast float %105 to i32, !dbg !32
%107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !32
%108 = bitcast i32 %107 to float, !dbg !32
%109 = fadd float %105, %108, !dbg !26
%110 = bitcast float %109 to i32, !dbg !32
%111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !32
%112 = bitcast i32 %111 to float, !dbg !32
%113 = fadd float %109, %112, !dbg !26
%114 = bitcast float %113 to i32, !dbg !32
%115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !32
%116 = bitcast i32 %115 to float, !dbg !32
%117 = fadd float %113, %116, !dbg !26
; Lane 0 of each warp stores its warp total to @global_smem[%89] (float slots),
; then all threads synchronize.
%118 = icmp eq i32 %88, 0, !dbg !32
%119 = zext nneg i32 %89 to i64, !dbg !32
%120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !32
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !32
tail call void @llvm.nvvm.barrier0(), !dbg !32
; Second stage: threads with tid < 8 load slot [tid]; the load is predicated
; and has no fallback move, so %124 is unspecified for other threads.
%121 = icmp slt i32 %10, 8, !dbg !32
%122 = sext i32 %10 to i64, !dbg !32
%123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !32
%124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !32
; Butterfly shuffles with masks 4, 2, 1 combine the eight slots; partners of
; lanes 0..7 under these masks stay within lanes 0..7.
%125 = bitcast float %124 to i32, !dbg !32
%126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 4, i32 31), !dbg !32
%127 = bitcast i32 %126 to float, !dbg !32
%128 = fadd float %124, %127, !dbg !26
%129 = bitcast float %128 to i32, !dbg !32
%130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 2, i32 31), !dbg !32
%131 = bitcast i32 %130 to float, !dbg !32
%132 = fadd float %128, %131, !dbg !26
%133 = bitcast float %132 to i32, !dbg !32
%134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 1, i32 31), !dbg !32
%135 = bitcast i32 %134 to float, !dbg !32
%136 = fadd float %132, %135, !dbg !26
; Store predicate: tid < 8 and (tid & 7) == 0. For non-negative tid this is
; only tid 0, which writes the block total to slot 0.
%137 = and i32 %10, 7, !dbg !32
%138 = icmp eq i32 %137, 0, !dbg !32
%139 = and i1 %121, %138, !dbg !32
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %136, i1 %139) #3, !dbg !32
tail call void @llvm.nvvm.barrier0(), !dbg !32
; Every thread reads the block total (%140) back from slot 0.
%140 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
; Scalar copies of the eight per-thread column offsets for the second loop.
%141 = extractelement <8 x i64> %29, i64 0, !dbg !34
%142 = extractelement <8 x i64> %29, i64 1, !dbg !34
%143 = extractelement <8 x i64> %29, i64 2, !dbg !34
%144 = extractelement <8 x i64> %29, i64 3, !dbg !34
%145 = extractelement <8 x i64> %29, i64 4, !dbg !34
%146 = extractelement <8 x i64> %29, i64 5, !dbg !34
%147 = extractelement <8 x i64> %29, i64 6, !dbg !34
%148 = extractelement <8 x i64> %29, i64 7, !dbg !34
br label %149, !dbg !35
; --- Loop 2: element-wise update and store, 2048 cols/iter ---
149: ; preds = %87, %149
%150 = phi i32 [ 0, %87 ], [ %312, %149 ]
%151 = zext nneg i32 %150 to i64, !dbg !34
; Column indices for this tile (offset | base).
%152 = or i64 %141, %151, !dbg !34
%153 = or i64 %142, %151, !dbg !34
%154 = or i64 %143, %151, !dbg !34
%155 = or i64 %144, %151, !dbg !34
%156 = or i64 %145, %151, !dbg !34
%157 = or i64 %146, %151, !dbg !34
%158 = or i64 %147, %151, !dbg !34
%159 = or i64 %148, %151, !dbg !34
; In-range masks: col < 50257.
%160 = icmp ult i64 %152, 50257, !dbg !36
%161 = icmp ult i64 %153, 50257, !dbg !36
%162 = icmp ult i64 %154, 50257, !dbg !36
%163 = icmp ult i64 %155, 50257, !dbg !36
%164 = icmp ult i64 %156, 50257, !dbg !36
%165 = icmp ult i64 %157, 50257, !dbg !36
%166 = icmp ult i64 %158, 50257, !dbg !36
%167 = icmp ult i64 %159, 50257, !dbg !36
; Flat element indices: col + row * 50257 (shared by %0, %4, %5 and %6).
%168 = add nsw i64 %152, %36, !dbg !37
%169 = add nsw i64 %153, %36, !dbg !37
%170 = add nsw i64 %154, %36, !dbg !37
%171 = add nsw i64 %155, %36, !dbg !37
%172 = add nsw i64 %156, %36, !dbg !37
%173 = add nsw i64 %157, %36, !dbg !37
%174 = add nsw i64 %158, %36, !dbg !37
%175 = add nsw i64 %159, %36, !dbg !37
; Addresses into %4 (16-bit elements).
%176 = getelementptr i16, ptr addrspace(1) %4, i64 %168, !dbg !38
%177 = getelementptr i16, ptr addrspace(1) %4, i64 %169, !dbg !38
%178 = getelementptr i16, ptr addrspace(1) %4, i64 %170, !dbg !38
%179 = getelementptr i16, ptr addrspace(1) %4, i64 %171, !dbg !38
%180 = getelementptr i16, ptr addrspace(1) %4, i64 %172, !dbg !38
%181 = getelementptr i16, ptr addrspace(1) %4, i64 %173, !dbg !38
%182 = getelementptr i16, ptr addrspace(1) %4, i64 %174, !dbg !38
%183 = getelementptr i16, ptr addrspace(1) %4, i64 %175, !dbg !38
; Predicated 16-bit loads from %4 (evict_first hint; 0 when masked off).
%184 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %176, i1 %160, i16 0, i1 %160) #3, !dbg !39
%185 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %177, i1 %161, i16 0, i1 %161) #3, !dbg !39
%186 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %178, i1 %162, i16 0, i1 %162) #3, !dbg !39
%187 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %179, i1 %163, i16 0, i1 %163) #3, !dbg !39
%188 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %180, i1 %164, i16 0, i1 %164) #3, !dbg !39
%189 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %181, i1 %165, i16 0, i1 %165) #3, !dbg !39
%190 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %182, i1 %166, i16 0, i1 %166) #3, !dbg !39
%191 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %183, i1 %167, i16 0, i1 %167) #3, !dbg !39
; Widen the %4 values from bf16 to f32.
%192 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %184) #3, !dbg !40
%193 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %185) #3, !dbg !40
%194 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %186) #3, !dbg !40
%195 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %187) #3, !dbg !40
%196 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %188) #3, !dbg !40
%197 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %189) #3, !dbg !40
%198 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %190) #3, !dbg !40
%199 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %191) #3, !dbg !40
; Addresses into %0 (float elements) for the same flat indices.
%200 = getelementptr float, ptr addrspace(1) %0, i64 %168, !dbg !41
%201 = getelementptr float, ptr addrspace(1) %0, i64 %169, !dbg !41
%202 = getelementptr float, ptr addrspace(1) %0, i64 %170, !dbg !41
%203 = getelementptr float, ptr addrspace(1) %0, i64 %171, !dbg !41
%204 = getelementptr float, ptr addrspace(1) %0, i64 %172, !dbg !41
%205 = getelementptr float, ptr addrspace(1) %0, i64 %173, !dbg !41
%206 = getelementptr float, ptr addrspace(1) %0, i64 %174, !dbg !41
%207 = getelementptr float, ptr addrspace(1) %0, i64 %175, !dbg !41
; Predicated 32-bit re-loads of %0 (evict_first hint; 0 when masked off).
%208 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %200, i1 %160, i32 0, i1 %160) #3, !dbg !42
%209 = bitcast i32 %208 to float, !dbg !42
%210 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %201, i1 %161, i32 0, i1 %161) #3, !dbg !42
%211 = bitcast i32 %210 to float, !dbg !42
%212 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %202, i1 %162, i32 0, i1 %162) #3, !dbg !42
%213 = bitcast i32 %212 to float, !dbg !42
%214 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %203, i1 %163, i32 0, i1 %163) #3, !dbg !42
%215 = bitcast i32 %214 to float, !dbg !42
%216 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %204, i1 %164, i32 0, i1 %164) #3, !dbg !42
%217 = bitcast i32 %216 to float, !dbg !42
%218 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %205, i1 %165, i32 0, i1 %165) #3, !dbg !42
%219 = bitcast i32 %218 to float, !dbg !42
%220 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %206, i1 %166, i32 0, i1 %166) #3, !dbg !42
%221 = bitcast i32 %220 to float, !dbg !42
%222 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %207, i1 %167, i32 0, i1 %167) #3, !dbg !42
%223 = bitcast i32 %222 to float, !dbg !42
; Addresses into %5 (16-bit elements).
%224 = getelementptr i16, ptr addrspace(1) %5, i64 %168, !dbg !43
%225 = getelementptr i16, ptr addrspace(1) %5, i64 %169, !dbg !43
%226 = getelementptr i16, ptr addrspace(1) %5, i64 %170, !dbg !43
%227 = getelementptr i16, ptr addrspace(1) %5, i64 %171, !dbg !43
%228 = getelementptr i16, ptr addrspace(1) %5, i64 %172, !dbg !43
%229 = getelementptr i16, ptr addrspace(1) %5, i64 %173, !dbg !43
%230 = getelementptr i16, ptr addrspace(1) %5, i64 %174, !dbg !43
%231 = getelementptr i16, ptr addrspace(1) %5, i64 %175, !dbg !43
; Predicated 16-bit loads from %5 (evict_first hint; 0 when masked off).
%232 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %224, i1 %160, i16 0, i1 %160) #3, !dbg !44
%233 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %225, i1 %161, i16 0, i1 %161) #3, !dbg !44
%234 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %226, i1 %162, i16 0, i1 %162) #3, !dbg !44
%235 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %227, i1 %163, i16 0, i1 %163) #3, !dbg !44
%236 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %228, i1 %164, i16 0, i1 %164) #3, !dbg !44
%237 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %229, i1 %165, i16 0, i1 %165) #3, !dbg !44
%238 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %230, i1 %166, i16 0, i1 %166) #3, !dbg !44
%239 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %231, i1 %167, i16 0, i1 %167) #3, !dbg !44
; Widen the %5 values from bf16 to f32.
%240 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %232) #3, !dbg !45
%241 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %233) #3, !dbg !45
%242 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %234) #3, !dbg !45
%243 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %235) #3, !dbg !45
%244 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %236) #3, !dbg !45
%245 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %237) #3, !dbg !45
%246 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %238) #3, !dbg !45
%247 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %239) #3, !dbg !45
; scaled = scale * %0 value.
%248 = fmul float %38, %209, !dbg !46
%249 = fmul float %38, %211, !dbg !46
%250 = fmul float %38, %213, !dbg !46
%251 = fmul float %38, %215, !dbg !46
%252 = fmul float %38, %217, !dbg !46
%253 = fmul float %38, %219, !dbg !46
%254 = fmul float %38, %221, !dbg !46
%255 = fmul float %38, %223, !dbg !46
; exp(x) of the %5 value, computed as ex2.approx(x * 1.442695...): the
; constant 0x3FF7154760000000 is log2(e) at float precision.
%256 = fmul float %240, 0x3FF7154760000000, !dbg !47
%257 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %256) #3, !dbg !47
%258 = fmul float %241, 0x3FF7154760000000, !dbg !47
%259 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %258) #3, !dbg !47
%260 = fmul float %242, 0x3FF7154760000000, !dbg !47
%261 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %260) #3, !dbg !47
%262 = fmul float %243, 0x3FF7154760000000, !dbg !47
%263 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %262) #3, !dbg !47
%264 = fmul float %244, 0x3FF7154760000000, !dbg !47
%265 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %264) #3, !dbg !47
%266 = fmul float %245, 0x3FF7154760000000, !dbg !47
%267 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %266) #3, !dbg !47
%268 = fmul float %246, 0x3FF7154760000000, !dbg !47
%269 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %268) #3, !dbg !47
%270 = fmul float %247, 0x3FF7154760000000, !dbg !47
%271 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %270) #3, !dbg !47
; total * exp(x), where total (%140) is the block-wide sum from pass 1.
%272 = fmul float %140, %257, !dbg !48
%273 = fmul float %140, %259, !dbg !48
%274 = fmul float %140, %261, !dbg !48
%275 = fmul float %140, %263, !dbg !48
%276 = fmul float %140, %265, !dbg !48
%277 = fmul float %140, %267, !dbg !48
%278 = fmul float %140, %269, !dbg !48
%279 = fmul float %140, %271, !dbg !48
; delta = scaled - total * exp(x).
%280 = fsub float %248, %272, !dbg !49
%281 = fsub float %249, %273, !dbg !49
%282 = fsub float %250, %274, !dbg !49
%283 = fsub float %251, %275, !dbg !49
%284 = fsub float %252, %276, !dbg !49
%285 = fsub float %253, %277, !dbg !49
%286 = fsub float %254, %278, !dbg !49
%287 = fsub float %255, %279, !dbg !49
; result = f32(%4 value) + delta.
%288 = fadd float %192, %280, !dbg !50
%289 = fadd float %193, %281, !dbg !50
%290 = fadd float %194, %282, !dbg !50
%291 = fadd float %195, %283, !dbg !50
%292 = fadd float %196, %284, !dbg !50
%293 = fadd float %197, %285, !dbg !50
%294 = fadd float %198, %286, !dbg !50
%295 = fadd float %199, %287, !dbg !50
; Addresses into the output buffer %6 (16-bit elements).
%296 = getelementptr i16, ptr addrspace(1) %6, i64 %168, !dbg !51
%297 = getelementptr i16, ptr addrspace(1) %6, i64 %169, !dbg !51
%298 = getelementptr i16, ptr addrspace(1) %6, i64 %170, !dbg !51
%299 = getelementptr i16, ptr addrspace(1) %6, i64 %171, !dbg !51
%300 = getelementptr i16, ptr addrspace(1) %6, i64 %172, !dbg !51
%301 = getelementptr i16, ptr addrspace(1) %6, i64 %173, !dbg !51
%302 = getelementptr i16, ptr addrspace(1) %6, i64 %174, !dbg !51
%303 = getelementptr i16, ptr addrspace(1) %6, i64 %175, !dbg !51
; Narrow the results to bf16 with round-to-nearest (cvt.rn).
%304 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %288) #3, !dbg !52
%305 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %289) #3, !dbg !52
%306 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %290) #3, !dbg !52
%307 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %291) #3, !dbg !52
%308 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %292) #3, !dbg !52
%309 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %293) #3, !dbg !52
%310 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %294) #3, !dbg !52
%311 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %295) #3, !dbg !52
; Predicated 16-bit stores to %6; lanes with col >= 50257 write nothing.
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %304, ptr addrspace(1) %296, i1 %160) #3, !dbg !52
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %305, ptr addrspace(1) %297, i1 %161) #3, !dbg !52
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %306, ptr addrspace(1) %298, i1 %162) #3, !dbg !52
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %307, ptr addrspace(1) %299, i1 %163) #3, !dbg !52
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %308, ptr addrspace(1) %300, i1 %164) #3, !dbg !52
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %309, ptr addrspace(1) %301, i1 %165) #3, !dbg !52
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %310, ptr addrspace(1) %302, i1 %166) #3, !dbg !52
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %311, ptr addrspace(1) %303, i1 %167) #3, !dbg !52
; Same trip structure as loop 1: test the pre-increment base against 48209.
%312 = add nuw nsw i32 %150, 2048, !dbg !35
%313 = icmp ult i32 %150, 48209, !dbg !35
br i1 %313, label %149, label %314, !dbg !35
314: ; preds = %149
ret void, !dbg !53
}
; NVVM intrinsics used by the kernel above.
; Reads the thread index x special register (called once at kernel entry).
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Butterfly warp shuffle; the kernel calls it as (member mask, value, lane
; mask, clamp) with member mask -1 and clamp 31.
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; Barrier used between the staged writes and reads of @global_smem.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2
; Attribute groups: #0-#2 belong to the declarations above; #3 is attached to
; every inline-asm call site in the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { nounwind }
; Module-level named metadata: debug-info version flag, the debug compile
; unit, and the NVVM annotations for the kernel.
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; The annotation list references !3 and !4 twice each; the entries themselves
; are defined once below.
!nvvm.annotations = !{!3, !4, !4, !3}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
; !3 marks the function as a kernel entry point; !4 sets maxntidx to 256.
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
; Debug info: !5 is the kernel's subprogram; the !DILocation entries that
; follow map instructions to line/column positions in the Python file !2.
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
!8 = !DILocation(line: 24, column: 33, scope: !5)
!9 = !DILocation(line: 21, column: 28, scope: !5)
!10 = !DILocation(line: 21, column: 34, scope: !5)
!11 = !DILocation(line: 26, column: 30, scope: !5)
!12 = !DILocation(line: 26, column: 35, scope: !5)
!13 = !DILocation(line: 27, column: 19, scope: !5)
!14 = !DILocation(line: 29, column: 19, scope: !5)
!15 = !DILocation(line: 36, column: 46, scope: !5)
!16 = !DILocation(line: 38, column: 23, scope: !5)
!17 = !DILocation(line: 39, column: 22, scope: !5)
!18 = !DILocation(line: 41, column: 37, scope: !5)
!19 = !DILocation(line: 32, column: 36, scope: !5)
!20 = !DILocation(line: 42, column: 23, scope: !5)
!21 = !DILocation(line: 33, column: 27, scope: !5)
!22 = !DILocation(line: 34, column: 25, scope: !5)
!23 = !DILocation(line: 36, column: 34, scope: !5)
!24 = !DILocation(line: 36, column: 52, scope: !5)
!25 = !DILocation(line: 45, column: 40, scope: !5)
; !26-!33 describe the reduction code: locations in triton's standard.py (!28)
; recorded as inlined at line 46 of the kernel source.
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
!31 = !DILocation(line: 46, column: 27, scope: !27)
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
!33 = !DILocation(line: 46, column: 27, scope: !29)
; Locations for the second loop (source lines 51-70).
!34 = !DILocation(line: 52, column: 27, scope: !5)
!35 = !DILocation(line: 51, column: 36, scope: !5)
!36 = !DILocation(line: 53, column: 25, scope: !5)
!37 = !DILocation(line: 55, column: 41, scope: !5)
!38 = !DILocation(line: 55, column: 35, scope: !5)
!39 = !DILocation(line: 55, column: 53, scope: !5)
!40 = !DILocation(line: 55, column: 105, scope: !5)
!41 = !DILocation(line: 56, column: 35, scope: !5)
!42 = !DILocation(line: 56, column: 53, scope: !5)
!43 = !DILocation(line: 57, column: 35, scope: !5)
!44 = !DILocation(line: 57, column: 53, scope: !5)
!45 = !DILocation(line: 57, column: 105, scope: !5)
!46 = !DILocation(line: 63, column: 24, scope: !5)
!47 = !DILocation(line: 65, column: 23, scope: !5)
!48 = !DILocation(line: 66, column: 24, scope: !5)
!49 = !DILocation(line: 67, column: 24, scope: !5)
!50 = !DILocation(line: 69, column: 24, scope: !5)
!51 = !DILocation(line: 70, column: 29, scope: !5)
!52 = !DILocation(line: 70, column: 54, scope: !5)
!53 = !DILocation(line: 51, column: 4, scope: !5)