; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

; Shared-memory scratch (addrspace 3) used by the cross-warp reduction below.
@global_smem = external addrspace(3) global [0 x i8]

; -----------------------------------------------------------------------------
; triton__0d1d2d3d4d5d6d7de8
;
; Triton-generated NVPTX kernel (see !nvvm.annotations: "kernel", maxntidx 256).
; Each program (%ctaid.x) handles one row of 50257 elements; each thread handles
; 8 columns (tid, tid+256, ..., tid+1792) of every 2048-wide tile.
;
; Arguments, as used by the body:
;   %0  float array, indexed by row * 50257 + col (read in both loops)
;   %1  i64 array, one element per row (read at index %ctaid.x)
;   %2  pointer to a single f32 (numerator of the scale factor)
;   %3  pointer to a single f32 (denominator of the scale factor)
;   %4  16-bit array, same indexing as %0; values are converted bf16 -> f32
;   %5  16-bit array, same indexing as %0; values are converted bf16 -> f32
;   %6  16-bit output array, same indexing as %0; written as bf16
;   %7, %8  i64 values that are never referenced by the body
;
; Computation:
;   scale = (%1[row] == -1) ? 0.0 : (*%2 / *%3)                 ; %38
;   total = sum over valid cols of scale * %0[row, col]         ; %140
;   %6[row, col] = bf16( f32(%4[row, col])
;                        + ( scale * %0[row, col]
;                            - total * exp(f32(%5[row, col])) ) )
; exp() is realised as ex2.approx(x * log2(e)).
;
; NOTE(review): the vector constant literals in this function (the icmp bound,
; the select fallback value and the seven shufflevector masks) were missing from
; the text this file was recovered from. They have been reconstructed; each
; reconstruction is marked below and should be confirmed against the original
; Triton dump.
; -----------------------------------------------------------------------------
define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
  ; --- Thread / program indices and the 8 per-thread column offsets -----------
  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
  %11 = lshr i32 %10, 5, !dbg !8
  %urem = and i32 %10, 255, !dbg !8
  %12 = or i32 %urem, 256, !dbg !8
  %13 = or i32 %urem, 512, !dbg !8
  %14 = or i32 %urem, 768, !dbg !8
  %15 = or i32 %urem, 1024, !dbg !8
  %16 = or i32 %urem, 1280, !dbg !8
  %17 = or i32 %urem, 1536, !dbg !8
  %18 = or i32 %urem, 1792, !dbg !8
  %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
  %20 = sext i32 %19 to i64, !dbg !10
  %21 = insertelement <8 x i32> poison, i32 %urem, i64 0
  %22 = insertelement <8 x i32> %21, i32 %12, i64 1
  %23 = insertelement <8 x i32> %22, i32 %13, i64 2
  %24 = insertelement <8 x i32> %23, i32 %14, i64 3
  %25 = insertelement <8 x i32> %24, i32 %15, i64 4
  %26 = insertelement <8 x i32> %25, i32 %16, i64 5
  %27 = insertelement <8 x i32> %26, i32 %17, i64 6
  %28 = insertelement <8 x i32> %27, i32 %18, i64 7
  %29 = zext <8 x i32> %28 to <8 x i64>

  ; --- Per-row scalars: %31 = %1[row], %33 = *%2, %35 = *%3 --------------------
  %30 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !11
  %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !12
  %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !13
  %33 = bitcast i32 %32 to float, !dbg !13
  %34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !14
  %35 = bitcast i32 %34 to float, !dbg !14
  ; Row base offset in elements (row * 50257).
  %36 = mul nsw i64 %20, 50257, !dbg !15
  ; scale (%38) is forced to 0.0 when %1[row] == -1, otherwise *%2 / *%3.
  %.not = icmp eq i64 %31, -1, !dbg !16
  %37 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %33, float %35) #3, !dbg !17
  %38 = select i1 %.not, float 0.000000e+00, float %37, !dbg !18
  %invariant.gep = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !19
  ; Splat of scale across the 8 per-thread lanes.
  %39 = insertelement <8 x float> poison, float %38, i64 0, !dbg !20
  %40 = shufflevector <8 x float> %39, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !20
  br label %41, !dbg !19

; --- Loop 1: accumulate scale * %0[row, col] over the row, 2048 cols per trip --
41:                                               ; preds = %9, %41
  %42 = phi i32 [ 0, %9 ], [ %85, %41 ]
  %43 = phi <8 x float> [ zeroinitializer, %9 ], [ %84, %41 ]
  %44 = zext nneg i32 %42 to i64, !dbg !21
  %45 = insertelement <8 x i64> poison, i64 %44, i64 0, !dbg !21
  %46 = shufflevector <8 x i64> %45, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !21
  ; Tile base is a multiple of 2048 and lane offsets are < 2048, so OR == ADD.
  %47 = or <8 x i64> %46, %29, !dbg !21
  ; NOTE(review): bound reconstructed as a splat of 50257, matching the scalar
  ; bounds checks in the second loop and the row stride above.
  %48 = icmp ult <8 x i64> %47, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
  %49 = extractelement <8 x i64> %47, i64 0, !dbg !23
  %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %49, !dbg !23
  %50 = extractelement <8 x i64> %47, i64 1, !dbg !23
  %gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %50, !dbg !23
  %51 = extractelement <8 x i64> %47, i64 2, !dbg !23
  %gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %51, !dbg !23
  %52 = extractelement <8 x i64> %47, i64 3, !dbg !23
  %gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %52, !dbg !23
  %53 = extractelement <8 x i64> %47, i64 4, !dbg !23
  %gep9 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %53, !dbg !23
  %54 = extractelement <8 x i64> %47, i64 5, !dbg !23
  %gep11 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %54, !dbg !23
  %55 = extractelement <8 x i64> %47, i64 6, !dbg !23
  %gep13 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %55, !dbg !23
  %56 = extractelement <8 x i64> %47, i64 7, !dbg !23
  %gep15 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %56, !dbg !23
  ; Predicated loads: out-of-range lanes yield 0.
  %57 = extractelement <8 x i1> %48, i64 0, !dbg !24
  %58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep, i1 %57, i32 0, i1 %57) #3, !dbg !24
  %59 = extractelement <8 x i1> %48, i64 1, !dbg !24
  %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep3, i1 %59, i32 0, i1 %59) #3, !dbg !24
  %61 = extractelement <8 x i1> %48, i64 2, !dbg !24
  %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep5, i1 %61, i32 0, i1 %61) #3, !dbg !24
  %63 = extractelement <8 x i1> %48, i64 3, !dbg !24
  %64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep7, i1 %63, i32 0, i1 %63) #3, !dbg !24
  %65 = extractelement <8 x i1> %48, i64 4, !dbg !24
  %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep9, i1 %65, i32 0, i1 %65) #3, !dbg !24
  %67 = extractelement <8 x i1> %48, i64 5, !dbg !24
  %68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep11, i1 %67, i32 0, i1 %67) #3, !dbg !24
  %69 = extractelement <8 x i1> %48, i64 6, !dbg !24
  %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep13, i1 %69, i32 0, i1 %69) #3, !dbg !24
  %71 = extractelement <8 x i1> %48, i64 7, !dbg !24
  %72 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep15, i1 %71, i32 0, i1 %71) #3, !dbg !24
  %73 = insertelement <8 x i32> poison, i32 %58, i64 0, !dbg !24
  %74 = insertelement <8 x i32> %73, i32 %60, i64 1, !dbg !24
  %75 = insertelement <8 x i32> %74, i32 %62, i64 2, !dbg !24
  %76 = insertelement <8 x i32> %75, i32 %64, i64 3, !dbg !24
  %77 = insertelement <8 x i32> %76, i32 %66, i64 4, !dbg !24
  %78 = insertelement <8 x i32> %77, i32 %68, i64 5, !dbg !24
  %79 = insertelement <8 x i32> %78, i32 %70, i64 6, !dbg !24
  %80 = insertelement <8 x i32> %79, i32 %72, i64 7, !dbg !24
  %81 = bitcast <8 x i32> %80 to <8 x float>, !dbg !24
  %82 = fmul <8 x float> %40, %81, !dbg !20
  ; NOTE(review): fallback operand reconstructed. It was a non-zeroinitializer
  ; vector literal; presumably the fadd identity (-0.0 splat) so that masked-off
  ; lanes leave the accumulator unchanged. Confirm against the original dump.
  %83 = select <8 x i1> %48, <8 x float> %82, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !25
  %84 = fadd <8 x float> %43, %83, !dbg !25
  %85 = add nuw nsw i32 %42, 2048, !dbg !19
  ; Continue while the next tile base (%42 + 2048) is still below 50257.
  %86 = icmp ult i32 %42, 48209, !dbg !19
  br i1 %86, label %41, label %87, !dbg !19

; --- Block-wide sum of the per-thread partial sums -----------------------------
87:                                               ; preds = %41
  %88 = and i32 %10, 31, !dbg !8
  %89 = and i32 %11, 7, !dbg !8
  ; Horizontal add of the 8 lanes of %84 into lane 0.
  ; NOTE(review): the seven shuffle masks were reconstructed as "move lane k to
  ; lane 0" for k = 1..7 in order; the lane order is an assumption.
  %shift = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
  %90 = fadd <8 x float> %84, %shift, !dbg !26
  %shift37 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
  %91 = fadd <8 x float> %shift37, %90, !dbg !26
  %shift38 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
  %92 = fadd <8 x float> %shift38, %91, !dbg !26
  %shift39 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
  %93 = fadd <8 x float> %shift39, %92, !dbg !26
  %shift40 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
  %94 = fadd <8 x float> %shift40, %93, !dbg !26
  %shift41 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
  %95 = fadd <8 x float> %shift41, %94, !dbg !26
  %shift42 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
  %96 = fadd <8 x float> %shift42, %95, !dbg !26
  %97 = extractelement <8 x float> %96, i64 0, !dbg !26
  ; Intra-warp butterfly reduction (xor offsets 16, 8, 4, 2, 1).
  %98 = bitcast float %97 to i32, !dbg !32
  %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !32
  %100 = bitcast i32 %99 to float, !dbg !32
  %101 = fadd float %97, %100, !dbg !26
  %102 = bitcast float %101 to i32, !dbg !32
  %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !32
  %104 = bitcast i32 %103 to float, !dbg !32
  %105 = fadd float %101, %104, !dbg !26
  %106 = bitcast float %105 to i32, !dbg !32
  %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !32
  %108 = bitcast i32 %107 to float, !dbg !32
  %109 = fadd float %105, %108, !dbg !26
  %110 = bitcast float %109 to i32, !dbg !32
  %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !32
  %112 = bitcast i32 %111 to float, !dbg !32
  %113 = fadd float %109, %112, !dbg !26
  %114 = bitcast float %113 to i32, !dbg !32
  %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !32
  %116 = bitcast i32 %115 to float, !dbg !32
  %117 = fadd float %113, %116, !dbg !26
  ; Lane 0 of each warp publishes its warp sum to global_smem[(tid >> 5) & 7].
  %118 = icmp eq i32 %88, 0, !dbg !32
  %119 = zext nneg i32 %89 to i64, !dbg !32
  %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !32
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !32
  tail call void @llvm.nvvm.barrier0(), !dbg !32
  ; Threads with tid < 8 reload the 8 warp sums and reduce them (offsets 4, 2, 1).
  %121 = icmp slt i32 %10, 8, !dbg !32
  %122 = sext i32 %10 to i64, !dbg !32
  %123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !32
  %124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !32
  %125 = bitcast float %124 to i32, !dbg !32
  %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 4, i32 31), !dbg !32
  %127 = bitcast i32 %126 to float, !dbg !32
  %128 = fadd float %124, %127, !dbg !26
  %129 = bitcast float %128 to i32, !dbg !32
  %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 2, i32 31), !dbg !32
  %131 = bitcast i32 %130 to float, !dbg !32
  %132 = fadd float %128, %131, !dbg !26
  %133 = bitcast float %132 to i32, !dbg !32
  %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 1, i32 31), !dbg !32
  %135 = bitcast i32 %134 to float, !dbg !32
  %136 = fadd float %132, %135, !dbg !26
  ; Thread 0 (tid < 8 and tid & 7 == 0) writes the block total to global_smem[0].
  %137 = and i32 %10, 7, !dbg !32
  %138 = icmp eq i32 %137, 0, !dbg !32
  %139 = and i1 %121, %138, !dbg !32
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %136, i1 %139) #3, !dbg !32
  tail call void @llvm.nvvm.barrier0(), !dbg !32
  ; Every thread reads the block total.
  %140 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
  %141 = extractelement <8 x i64> %29, i64 0, !dbg !34
  %142 = extractelement <8 x i64> %29, i64 1, !dbg !34
  %143 = extractelement <8 x i64> %29, i64 2, !dbg !34
  %144 = extractelement <8 x i64> %29, i64 3, !dbg !34
  %145 = extractelement <8 x i64> %29, i64 4, !dbg !34
  %146 = extractelement <8 x i64> %29, i64 5, !dbg !34
  %147 = extractelement <8 x i64> %29, i64 6, !dbg !34
  %148 = extractelement <8 x i64> %29, i64 7, !dbg !34
  br label %149, !dbg !35

; --- Loop 2: elementwise epilogue, 2048 cols per trip --------------------------
149:                                              ; preds = %87, %149
  %150 = phi i32 [ 0, %87 ], [ %312, %149 ]
  %151 = zext nneg i32 %150 to i64, !dbg !34
  ; Column indices for the 8 lanes of this tile.
  %152 = or i64 %141, %151, !dbg !34
  %153 = or i64 %142, %151, !dbg !34
  %154 = or i64 %143, %151, !dbg !34
  %155 = or i64 %144, %151, !dbg !34
  %156 = or i64 %145, %151, !dbg !34
  %157 = or i64 %146, %151, !dbg !34
  %158 = or i64 %147, %151, !dbg !34
  %159 = or i64 %148, %151, !dbg !34
  ; Bounds predicates (col < 50257).
  %160 = icmp ult i64 %152, 50257, !dbg !36
  %161 = icmp ult i64 %153, 50257, !dbg !36
  %162 = icmp ult i64 %154, 50257, !dbg !36
  %163 = icmp ult i64 %155, 50257, !dbg !36
  %164 = icmp ult i64 %156, 50257, !dbg !36
  %165 = icmp ult i64 %157, 50257, !dbg !36
  %166 = icmp ult i64 %158, 50257, !dbg !36
  %167 = icmp ult i64 %159, 50257, !dbg !36
  ; Flat element indices (row * 50257 + col).
  %168 = add nsw i64 %152, %36, !dbg !37
  %169 = add nsw i64 %153, %36, !dbg !37
  %170 = add nsw i64 %154, %36, !dbg !37
  %171 = add nsw i64 %155, %36, !dbg !37
  %172 = add nsw i64 %156, %36, !dbg !37
  %173 = add nsw i64 %157, %36, !dbg !37
  %174 = add nsw i64 %158, %36, !dbg !37
  %175 = add nsw i64 %159, %36, !dbg !37
  ; Load %4[idx] (16-bit) and widen bf16 -> f32.
  %176 = getelementptr i16, ptr addrspace(1) %4, i64 %168, !dbg !38
  %177 = getelementptr i16, ptr addrspace(1) %4, i64 %169, !dbg !38
  %178 = getelementptr i16, ptr addrspace(1) %4, i64 %170, !dbg !38
  %179 = getelementptr i16, ptr addrspace(1) %4, i64 %171, !dbg !38
  %180 = getelementptr i16, ptr addrspace(1) %4, i64 %172, !dbg !38
  %181 = getelementptr i16, ptr addrspace(1) %4, i64 %173, !dbg !38
  %182 = getelementptr i16, ptr addrspace(1) %4, i64 %174, !dbg !38
  %183 = getelementptr i16, ptr addrspace(1) %4, i64 %175, !dbg !38
  %184 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %176, i1 %160, i16 0, i1 %160) #3, !dbg !39
  %185 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %177, i1 %161, i16 0, i1 %161) #3, !dbg !39
  %186 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %178, i1 %162, i16 0, i1 %162) #3, !dbg !39
  %187 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %179, i1 %163, i16 0, i1 %163) #3, !dbg !39
  %188 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %180, i1 %164, i16 0, i1 %164) #3, !dbg !39
  %189 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %181, i1 %165, i16 0, i1 %165) #3, !dbg !39
  %190 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %182, i1 %166, i16 0, i1 %166) #3, !dbg !39
  %191 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %183, i1 %167, i16 0, i1 %167) #3, !dbg !39
  %192 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %184) #3, !dbg !40
  %193 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %185) #3, !dbg !40
  %194 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %186) #3, !dbg !40
  %195 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %187) #3, !dbg !40
  %196 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %188) #3, !dbg !40
  %197 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %189) #3, !dbg !40
  %198 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %190) #3, !dbg !40
  %199 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %191) #3, !dbg !40
  ; Load %0[idx] (f32).
  %200 = getelementptr float, ptr addrspace(1) %0, i64 %168, !dbg !41
  %201 = getelementptr float, ptr addrspace(1) %0, i64 %169, !dbg !41
  %202 = getelementptr float, ptr addrspace(1) %0, i64 %170, !dbg !41
  %203 = getelementptr float, ptr addrspace(1) %0, i64 %171, !dbg !41
  %204 = getelementptr float, ptr addrspace(1) %0, i64 %172, !dbg !41
  %205 = getelementptr float, ptr addrspace(1) %0, i64 %173, !dbg !41
  %206 = getelementptr float, ptr addrspace(1) %0, i64 %174, !dbg !41
  %207 = getelementptr float, ptr addrspace(1) %0, i64 %175, !dbg !41
  %208 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %200, i1 %160, i32 0, i1 %160) #3, !dbg !42
  %209 = bitcast i32 %208 to float, !dbg !42
  %210 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %201, i1 %161, i32 0, i1 %161) #3, !dbg !42
  %211 = bitcast i32 %210 to float, !dbg !42
  %212 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %202, i1 %162, i32 0, i1 %162) #3, !dbg !42
  %213 = bitcast i32 %212 to float, !dbg !42
  %214 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %203, i1 %163, i32 0, i1 %163) #3, !dbg !42
  %215 = bitcast i32 %214 to float, !dbg !42
  %216 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %204, i1 %164, i32 0, i1 %164) #3, !dbg !42
  %217 = bitcast i32 %216 to float, !dbg !42
  %218 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %205, i1 %165, i32 0, i1 %165) #3, !dbg !42
  %219 = bitcast i32 %218 to float, !dbg !42
  %220 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %206, i1 %166, i32 0, i1 %166) #3, !dbg !42
  %221 = bitcast i32 %220 to float, !dbg !42
  %222 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %207, i1 %167, i32 0, i1 %167) #3, !dbg !42
  %223 = bitcast i32 %222 to float, !dbg !42
  ; Load %5[idx] (16-bit) and widen bf16 -> f32.
  %224 = getelementptr i16, ptr addrspace(1) %5, i64 %168, !dbg !43
  %225 = getelementptr i16, ptr addrspace(1) %5, i64 %169, !dbg !43
  %226 = getelementptr i16, ptr addrspace(1) %5, i64 %170, !dbg !43
  %227 = getelementptr i16, ptr addrspace(1) %5, i64 %171, !dbg !43
  %228 = getelementptr i16, ptr addrspace(1) %5, i64 %172, !dbg !43
  %229 = getelementptr i16, ptr addrspace(1) %5, i64 %173, !dbg !43
  %230 = getelementptr i16, ptr addrspace(1) %5, i64 %174, !dbg !43
  %231 = getelementptr i16, ptr addrspace(1) %5, i64 %175, !dbg !43
  %232 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %224, i1 %160, i16 0, i1 %160) #3, !dbg !44
  %233 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %225, i1 %161, i16 0, i1 %161) #3, !dbg !44
  %234 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %226, i1 %162, i16 0, i1 %162) #3, !dbg !44
  %235 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %227, i1 %163, i16 0, i1 %163) #3, !dbg !44
  %236 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %228, i1 %164, i16 0, i1 %164) #3, !dbg !44
  %237 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %229, i1 %165, i16 0, i1 %165) #3, !dbg !44
  %238 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %230, i1 %166, i16 0, i1 %166) #3, !dbg !44
  %239 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %231, i1 %167, i16 0, i1 %167) #3, !dbg !44
  %240 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %232) #3, !dbg !45
  %241 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %233) #3, !dbg !45
  %242 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %234) #3, !dbg !45
  %243 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %235) #3, !dbg !45
  %244 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %236) #3, !dbg !45
  %245 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %237) #3, !dbg !45
  %246 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %238) #3, !dbg !45
  %247 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %239) #3, !dbg !45
  ; scale * %0[idx]
  %248 = fmul float %38, %209, !dbg !46
  %249 = fmul float %38, %211, !dbg !46
  %250 = fmul float %38, %213, !dbg !46
  %251 = fmul float %38, %215, !dbg !46
  %252 = fmul float %38, %217, !dbg !46
  %253 = fmul float %38, %219, !dbg !46
  %254 = fmul float %38, %221, !dbg !46
  %255 = fmul float %38, %223, !dbg !46
  ; exp(x) as ex2.approx(x * log2(e)); 0x3FF7154760000000 ~= 1.442695.
  %256 = fmul float %240, 0x3FF7154760000000, !dbg !47
  %257 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %256) #3, !dbg !47
  %258 = fmul float %241, 0x3FF7154760000000, !dbg !47
  %259 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %258) #3, !dbg !47
  %260 = fmul float %242, 0x3FF7154760000000, !dbg !47
  %261 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %260) #3, !dbg !47
  %262 = fmul float %243, 0x3FF7154760000000, !dbg !47
  %263 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %262) #3, !dbg !47
  %264 = fmul float %244, 0x3FF7154760000000, !dbg !47
  %265 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %264) #3, !dbg !47
  %266 = fmul float %245, 0x3FF7154760000000, !dbg !47
  %267 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %266) #3, !dbg !47
  %268 = fmul float %246, 0x3FF7154760000000, !dbg !47
  %269 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %268) #3, !dbg !47
  %270 = fmul float %247, 0x3FF7154760000000, !dbg !47
  %271 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %270) #3, !dbg !47
  ; total * exp(%5[idx])
  %272 = fmul float %140, %257, !dbg !48
  %273 = fmul float %140, %259, !dbg !48
  %274 = fmul float %140, %261, !dbg !48
  %275 = fmul float %140, %263, !dbg !48
  %276 = fmul float %140, %265, !dbg !48
  %277 = fmul float %140, %267, !dbg !48
  %278 = fmul float %140, %269, !dbg !48
  %279 = fmul float %140, %271, !dbg !48
  ; scale * %0[idx] - total * exp(%5[idx])
  %280 = fsub float %248, %272, !dbg !49
  %281 = fsub float %249, %273, !dbg !49
  %282 = fsub float %250, %274, !dbg !49
  %283 = fsub float %251, %275, !dbg !49
  %284 = fsub float %252, %276, !dbg !49
  %285 = fsub float %253, %277, !dbg !49
  %286 = fsub float %254, %278, !dbg !49
  %287 = fsub float %255, %279, !dbg !49
  ; ... + %4[idx]
  %288 = fadd float %192, %280, !dbg !50
  %289 = fadd float %193, %281, !dbg !50
  %290 = fadd float %194, %282, !dbg !50
  %291 = fadd float %195, %283, !dbg !50
  %292 = fadd float %196, %284, !dbg !50
  %293 = fadd float %197, %285, !dbg !50
  %294 = fadd float %198, %286, !dbg !50
  %295 = fadd float %199, %287, !dbg !50
  ; Narrow to bf16 (round-to-nearest) and store to %6[idx] under the bounds mask.
  %296 = getelementptr i16, ptr addrspace(1) %6, i64 %168, !dbg !51
  %297 = getelementptr i16, ptr addrspace(1) %6, i64 %169, !dbg !51
  %298 = getelementptr i16, ptr addrspace(1) %6, i64 %170, !dbg !51
  %299 = getelementptr i16, ptr addrspace(1) %6, i64 %171, !dbg !51
  %300 = getelementptr i16, ptr addrspace(1) %6, i64 %172, !dbg !51
  %301 = getelementptr i16, ptr addrspace(1) %6, i64 %173, !dbg !51
  %302 = getelementptr i16, ptr addrspace(1) %6, i64 %174, !dbg !51
  %303 = getelementptr i16, ptr addrspace(1) %6, i64 %175, !dbg !51
  %304 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %288) #3, !dbg !52
  %305 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %289) #3, !dbg !52
  %306 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %290) #3, !dbg !52
  %307 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %291) #3, !dbg !52
  %308 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %292) #3, !dbg !52
  %309 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %293) #3, !dbg !52
  %310 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %294) #3, !dbg !52
  %311 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %295) #3, !dbg !52
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %304, ptr addrspace(1) %296, i1 %160) #3, !dbg !52
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %305, ptr addrspace(1) %297, i1 %161) #3, !dbg !52
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %306, ptr addrspace(1) %298, i1 %162) #3, !dbg !52
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %307, ptr addrspace(1) %299, i1 %163) #3, !dbg !52
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %308, ptr addrspace(1) %300, i1 %164) #3, !dbg !52
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %309, ptr addrspace(1) %301, i1 %165) #3, !dbg !52
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %310, ptr addrspace(1) %302, i1 %166) #3, !dbg !52
  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %311, ptr addrspace(1) %303, i1 %167) #3, !dbg !52
  %312 = add nuw nsw i32 %150, 2048, !dbg !35
  ; Same trip condition as loop 1: continue while %150 + 2048 < 50257.
  %313 = icmp ult i32 %150, 48209, !dbg !35
  br i1 %313, label %149, label %314, !dbg !35

314:                                              ; preds = %149
  ret void, !dbg !53
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1

; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2

attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { nounwind }

!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
!nvvm.annotations = !{!3, !4, !4, !3}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
!8 = !DILocation(line: 24, column: 33, scope: !5)
!9 = !DILocation(line: 21, column: 28, scope: !5)
!10 = !DILocation(line: 21, column: 34, scope: !5)
!11 = !DILocation(line: 26, column: 30, scope: !5)
!12 = !DILocation(line: 26, column: 35, scope: !5)
!13 = !DILocation(line: 27, column: 19, scope: !5)
!14 = !DILocation(line: 29, column: 19, scope: !5)
!15 = !DILocation(line: 36, column: 46, scope: !5)
!16 = !DILocation(line: 38, column: 23, scope: !5)
!17 = !DILocation(line: 39, column: 22, scope: !5)
!18 = !DILocation(line: 41, column: 37, scope: !5)
!19 = !DILocation(line: 32, column: 36, scope: !5)
!20 = !DILocation(line: 42, column: 23, scope: !5)
!21 = !DILocation(line: 33, column: 27, scope: !5)
!22 = !DILocation(line: 34, column: 25, scope: !5)
!23 = !DILocation(line: 36, column: 34, scope: !5)
!24 = !DILocation(line: 36, column: 52, scope: !5)
!25 = !DILocation(line: 45, column: 40, scope: !5)
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
!31 = !DILocation(line: 46, column: 27, scope: !27)
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
!33 = !DILocation(line: 46, column: 27, scope: !29)
!34 = !DILocation(line: 52, column: 27, scope: !5)
!35 = !DILocation(line: 51, column: 36, scope: !5)
!36 = !DILocation(line: 53, column: 25, scope: !5)
!37 = !DILocation(line: 55, column: 41, scope: !5)
!38 = !DILocation(line: 55, column: 35, scope: !5)
!39 = !DILocation(line: 55, column: 53, scope: !5)
!40 = !DILocation(line: 55, column: 105, scope: !5)
!41 = !DILocation(line: 56, column: 35, scope: !5)
!42 = !DILocation(line: 56, column: 53, scope: !5)
!43 = !DILocation(line: 57, column: 35, scope: !5)
!44 = !DILocation(line: 57, column: 53, scope: !5)
!45 = !DILocation(line: 57, column: 105, scope: !5)
!46 = !DILocation(line: 63, column: 24, scope: !5)
!47 = !DILocation(line: 65, column: 23, scope: !5)
!48 = !DILocation(line: 66, column: 24, scope: !5)
!49 = !DILocation(line: 67, column: 24, scope: !5)
!50 = !DILocation(line: 69, column: 24, scope: !5)
!51 = !DILocation(line: 70, column: 29, scope: !5)
!52 = !DILocation(line: 70, column: 54, scope: !5)
!53 = !DILocation(line: 51, column: 4, scope: !5)