|
|
|
; Module header: source name recorded for this module. |
source_filename = "LLVMDialectModule" |
|
|
|
; Externally defined, zero-length byte array in address space 3. |
; The kernel below accesses it with st.shared / ld.shared inline PTX as the |
; scratch buffer for its cross-warp reduction. |
@global_smem = external addrspace(3) global [0 x i8] |
|
|
|
; Kernel @triton__0d1d2d3d4d5d6d7de8. |
; |
; Each CTA (selected by %ctaid.x) works on one row of 50257 elements. A thread |
; covers 8 columns per loop trip (tid & 255, plus 256*k for k = 1..7), so one |
; trip spans 2048 columns. |
; |
;   Pass 1: sum   = sum over the row of scale * %0[row, col] |
;   Pass 2: %6[row, col] = bf16( f32(%4[row, col]) |
;                                + (scale * %0[row, col] - sum * exp(f32(%5[row, col]))) ) |
; |
; where scale = (%1[row] == -1) ? 0.0 : (*%2 / *%3). |
; |
; Arguments as they are used in this body: |
;   %0      f32 data, read in both passes |
;   %1      i64 per-row value, compared against -1 |
;   %2, %3  single 32-bit values, reinterpreted as f32 (numerator / denominator) |
;   %4, %5  16-bit data converted with cvt.f32.bf16 |
;   %6      16-bit output written from cvt.rn.bf16.f32 |
;   %7, %8  not referenced in this body |
; NOTE(review): the -1 comparison looks like an "ignored row" sentinel; confirm |
; against the Python source that generated this kernel. |
define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 { |
|
; --- Entry: thread indexing ------------------------------------------------ |
; %10 = thread id, %11 = thread id / 32 (warp index), %urem = thread id & 255. |
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 |
|
%11 = lshr i32 %10, 5, !dbg !8 |
|
%urem = and i32 %10, 255, !dbg !8 |
|
; The 8 per-thread column offsets: %urem + {256, 512, ..., 1792} (OR is used |
; because %urem only occupies the low 8 bits). |
%12 = or i32 %urem, 256, !dbg !8 |
|
%13 = or i32 %urem, 512, !dbg !8 |
|
%14 = or i32 %urem, 768, !dbg !8 |
|
%15 = or i32 %urem, 1024, !dbg !8 |
|
%16 = or i32 %urem, 1280, !dbg !8 |
|
%17 = or i32 %urem, 1536, !dbg !8 |
|
%18 = or i32 %urem, 1792, !dbg !8 |
|
; Row index = CTA id in x, widened to i64. |
%19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 |
|
%20 = sext i32 %19 to i64, !dbg !10 |
|
; Pack the 8 column offsets into a vector and widen to i64 (%29). |
%21 = insertelement <8 x i32> poison, i32 %urem, i64 0 |
|
%22 = insertelement <8 x i32> %21, i32 %12, i64 1 |
|
%23 = insertelement <8 x i32> %22, i32 %13, i64 2 |
|
%24 = insertelement <8 x i32> %23, i32 %14, i64 3 |
|
%25 = insertelement <8 x i32> %24, i32 %15, i64 4 |
|
%26 = insertelement <8 x i32> %25, i32 %16, i64 5 |
|
%27 = insertelement <8 x i32> %26, i32 %17, i64 6 |
|
%28 = insertelement <8 x i32> %27, i32 %18, i64 7 |
|
%29 = zext <8 x i32> %28 to <8 x i64> |
|
; --- Entry: per-row scalars ------------------------------------------------ |
; %31 = %1[row] (64-bit load, always executed: predicate is the constant true). |
%30 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !11 |
|
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !12 |
|
; %33 = *%2 and %35 = *%3, each loaded as 32 bits and reinterpreted as f32. |
%32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !13 |
|
%33 = bitcast i32 %32 to float, !dbg !13 |
|
%34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !14 |
|
%35 = bitcast i32 %34 to float, !dbg !14 |
|
; %36 = row * 50257: element offset of the start of this CTA's row. |
%36 = mul nsw i64 %20, 50257, !dbg !15 |
|
; scale (%38) = 0.0 when %1[row] == -1, otherwise %33 / %35 via div.full.f32. |
%.not = icmp eq i64 %31, -1, !dbg !16 |
|
%37 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %33, float %35) #3, !dbg !17 |
|
%38 = select i1 %.not, float 0.000000e+00, float %37, !dbg !18 |
|
; Row base pointer into %0, computed once outside the loop. |
%invariant.gep = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !19 |
|
; Splat scale across 8 lanes (%40) for the vector multiply in the loop. |
%39 = insertelement <8 x float> poison, float %38, i64 0, !dbg !20 |
|
%40 = shufflevector <8 x float> %39, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !20 |
|
br label %41, !dbg !19 |
|
|
|
; --- Pass 1 loop: accumulate scale * %0[row, col] --------------------------- |
; %42 = column base offset (0, 2048, 4096, ...); %43 = 8-lane running sum. |
41: |
|
%42 = phi i32 [ 0, %9 ], [ %85, %41 ] |
|
%43 = phi <8 x float> [ zeroinitializer, %9 ], [ %84, %41 ] |
|
%44 = zext nneg i32 %42 to i64, !dbg !21 |
|
%45 = insertelement <8 x i64> poison, i64 %44, i64 0, !dbg !21 |
|
%46 = shufflevector <8 x i64> %45, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !21 |
|
; %47 = column indices for this trip; %48 = in-bounds mask (col < 50257). |
%47 = or <8 x i64> %46, %29, !dbg !21 |
|
%48 = icmp ult <8 x i64> %47, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22 |
|
; Element addresses in %0 for each of the 8 columns. |
%49 = extractelement <8 x i64> %47, i64 0, !dbg !23 |
|
%gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %49, !dbg !23 |
|
%50 = extractelement <8 x i64> %47, i64 1, !dbg !23 |
|
%gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %50, !dbg !23 |
|
%51 = extractelement <8 x i64> %47, i64 2, !dbg !23 |
|
%gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %51, !dbg !23 |
|
%52 = extractelement <8 x i64> %47, i64 3, !dbg !23 |
|
%gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %52, !dbg !23 |
|
%53 = extractelement <8 x i64> %47, i64 4, !dbg !23 |
|
%gep9 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %53, !dbg !23 |
|
%54 = extractelement <8 x i64> %47, i64 5, !dbg !23 |
|
%gep11 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %54, !dbg !23 |
|
%55 = extractelement <8 x i64> %47, i64 6, !dbg !23 |
|
%gep13 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %55, !dbg !23 |
|
%56 = extractelement <8 x i64> %47, i64 7, !dbg !23 |
|
%gep15 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %56, !dbg !23 |
|
; Eight predicated 32-bit loads (L1::evict_last). When the mask bit is false the |
; load is skipped and the result is the fallback operand, here the constant 0. |
%57 = extractelement <8 x i1> %48, i64 0, !dbg !24 |
|
%58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep, i1 %57, i32 0, i1 %57) #3, !dbg !24 |
|
%59 = extractelement <8 x i1> %48, i64 1, !dbg !24 |
|
%60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep3, i1 %59, i32 0, i1 %59) #3, !dbg !24 |
|
%61 = extractelement <8 x i1> %48, i64 2, !dbg !24 |
|
%62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep5, i1 %61, i32 0, i1 %61) #3, !dbg !24 |
|
%63 = extractelement <8 x i1> %48, i64 3, !dbg !24 |
|
%64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep7, i1 %63, i32 0, i1 %63) #3, !dbg !24 |
|
%65 = extractelement <8 x i1> %48, i64 4, !dbg !24 |
|
%66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep9, i1 %65, i32 0, i1 %65) #3, !dbg !24 |
|
%67 = extractelement <8 x i1> %48, i64 5, !dbg !24 |
|
%68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep11, i1 %67, i32 0, i1 %67) #3, !dbg !24 |
|
%69 = extractelement <8 x i1> %48, i64 6, !dbg !24 |
|
%70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep13, i1 %69, i32 0, i1 %69) #3, !dbg !24 |
|
%71 = extractelement <8 x i1> %48, i64 7, !dbg !24 |
|
%72 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep15, i1 %71, i32 0, i1 %71) #3, !dbg !24 |
|
; Re-pack the loaded words and reinterpret them as <8 x float>. |
%73 = insertelement <8 x i32> poison, i32 %58, i64 0, !dbg !24 |
|
%74 = insertelement <8 x i32> %73, i32 %60, i64 1, !dbg !24 |
|
%75 = insertelement <8 x i32> %74, i32 %62, i64 2, !dbg !24 |
|
%76 = insertelement <8 x i32> %75, i32 %64, i64 3, !dbg !24 |
|
%77 = insertelement <8 x i32> %76, i32 %66, i64 4, !dbg !24 |
|
%78 = insertelement <8 x i32> %77, i32 %68, i64 5, !dbg !24 |
|
%79 = insertelement <8 x i32> %78, i32 %70, i64 6, !dbg !24 |
|
%80 = insertelement <8 x i32> %79, i32 %72, i64 7, !dbg !24 |
|
%81 = bitcast <8 x i32> %80 to <8 x float>, !dbg !24 |
|
; acc += mask ? scale * x : -0.0   (-0.0 is the identity for fadd). |
%82 = fmul <8 x float> %40, %81, !dbg !20 |
|
%83 = select <8 x i1> %48, <8 x float> %82, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !25 |
|
%84 = fadd <8 x float> %43, %83, !dbg !25 |
|
; Advance by 2048 columns; keep looping while the current base is < 48209 |
; (= 50257 - 2048), i.e. while the next base is still inside the row. |
%85 = add nuw nsw i32 %42, 2048, !dbg !19 |
|
%86 = icmp ult i32 %42, 48209, !dbg !19 |
|
br i1 %86, label %41, label %87, !dbg !19 |
|
|
|
; --- Block-wide sum of the pass-1 accumulators ------------------------------ |
87: |
|
; %88 = lane within the warp (tid & 31); %89 = warp index masked to 0..7. |
%88 = and i32 %10, 31, !dbg !8 |
|
%89 = and i32 %11, 7, !dbg !8 |
|
; Step 1: fold the thread's 8 accumulator lanes into lane 0. Each shuffle moves |
; element k into position 0 (other positions are poison and never read). |
%shift = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26 |
|
%90 = fadd <8 x float> %84, %shift, !dbg !26 |
|
%shift37 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26 |
|
%91 = fadd <8 x float> %shift37, %90, !dbg !26 |
|
%shift38 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26 |
|
%92 = fadd <8 x float> %shift38, %91, !dbg !26 |
|
%shift39 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26 |
|
%93 = fadd <8 x float> %shift39, %92, !dbg !26 |
|
%shift40 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26 |
|
%94 = fadd <8 x float> %shift40, %93, !dbg !26 |
|
%shift41 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26 |
|
%95 = fadd <8 x float> %shift41, %94, !dbg !26 |
|
%shift42 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26 |
|
%96 = fadd <8 x float> %shift42, %95, !dbg !26 |
|
; %97 = this thread's partial sum. |
%97 = extractelement <8 x float> %96, i64 0, !dbg !26 |
|
; Step 2: butterfly shuffles with lane masks 16, 8, 4, 2, 1 (member mask -1), |
; adding the partner's value each time; %117 is the resulting per-warp sum. |
%98 = bitcast float %97 to i32, !dbg !32 |
|
%99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !32 |
|
%100 = bitcast i32 %99 to float, !dbg !32 |
|
%101 = fadd float %97, %100, !dbg !26 |
|
%102 = bitcast float %101 to i32, !dbg !32 |
|
%103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !32 |
|
%104 = bitcast i32 %103 to float, !dbg !32 |
|
%105 = fadd float %101, %104, !dbg !26 |
|
%106 = bitcast float %105 to i32, !dbg !32 |
|
%107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !32 |
|
%108 = bitcast i32 %107 to float, !dbg !32 |
|
%109 = fadd float %105, %108, !dbg !26 |
|
%110 = bitcast float %109 to i32, !dbg !32 |
|
%111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !32 |
|
%112 = bitcast i32 %111 to float, !dbg !32 |
|
%113 = fadd float %109, %112, !dbg !26 |
|
%114 = bitcast float %113 to i32, !dbg !32 |
|
%115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !32 |
|
%116 = bitcast i32 %115 to float, !dbg !32 |
|
%117 = fadd float %113, %116, !dbg !26 |
|
; Step 3: lane 0 of each warp writes its warp sum to @global_smem[warp index], |
; then every thread waits at the barrier. |
%118 = icmp eq i32 %88, 0, !dbg !32 |
|
%119 = zext nneg i32 %89 to i64, !dbg !32 |
|
%120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !32 |
|
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !32 |
|
tail call void @llvm.nvvm.barrier0(), !dbg !32 |
|
; Step 4: threads with tid < 8 read back one warp sum each. The load is |
; predicated with no fallback move, so %124 is only written for those threads. |
%121 = icmp slt i32 %10, 8, !dbg !32 |
|
%122 = sext i32 %10 to i64, !dbg !32 |
|
%123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !32 |
|
%124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !32 |
|
; Butterfly shuffles with lane masks 4, 2, 1 combine the 8 warp sums. |
%125 = bitcast float %124 to i32, !dbg !32 |
|
%126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 4, i32 31), !dbg !32 |
|
%127 = bitcast i32 %126 to float, !dbg !32 |
|
%128 = fadd float %124, %127, !dbg !26 |
|
%129 = bitcast float %128 to i32, !dbg !32 |
|
%130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 2, i32 31), !dbg !32 |
|
%131 = bitcast i32 %130 to float, !dbg !32 |
|
%132 = fadd float %128, %131, !dbg !26 |
|
%133 = bitcast float %132 to i32, !dbg !32 |
|
%134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 1, i32 31), !dbg !32 |
|
%135 = bitcast i32 %134 to float, !dbg !32 |
|
%136 = fadd float %132, %135, !dbg !26 |
|
; Step 5: the store predicate is (tid < 8) && (tid & 7 == 0), so only thread 0 |
; writes the total back to @global_smem[0]; a second barrier follows. |
%137 = and i32 %10, 7, !dbg !32 |
|
%138 = icmp eq i32 %137, 0, !dbg !32 |
|
%139 = and i1 %121, %138, !dbg !32 |
|
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %136, i1 %139) #3, !dbg !32 |
|
tail call void @llvm.nvvm.barrier0(), !dbg !32 |
|
; %140 = row total, read by every thread from @global_smem[0]. |
%140 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32 |
|
; Scalar copies of the 8 per-thread column offsets for the second loop. |
%141 = extractelement <8 x i64> %29, i64 0, !dbg !34 |
|
%142 = extractelement <8 x i64> %29, i64 1, !dbg !34 |
|
%143 = extractelement <8 x i64> %29, i64 2, !dbg !34 |
|
%144 = extractelement <8 x i64> %29, i64 3, !dbg !34 |
|
%145 = extractelement <8 x i64> %29, i64 4, !dbg !34 |
|
%146 = extractelement <8 x i64> %29, i64 5, !dbg !34 |
|
%147 = extractelement <8 x i64> %29, i64 6, !dbg !34 |
|
%148 = extractelement <8 x i64> %29, i64 7, !dbg !34 |
|
br label %149, !dbg !35 |
|
|
|
; --- Pass 2 loop: elementwise output ---------------------------------------- |
; %150 = column base offset (0, 2048, 4096, ...), same trip pattern as pass 1. |
149: |
|
%150 = phi i32 [ 0, %87 ], [ %312, %149 ] |
|
%151 = zext nneg i32 %150 to i64, !dbg !34 |
|
; Column indices for this trip. |
%152 = or i64 %141, %151, !dbg !34 |
|
%153 = or i64 %142, %151, !dbg !34 |
|
%154 = or i64 %143, %151, !dbg !34 |
|
%155 = or i64 %144, %151, !dbg !34 |
|
%156 = or i64 %145, %151, !dbg !34 |
|
%157 = or i64 %146, %151, !dbg !34 |
|
%158 = or i64 %147, %151, !dbg !34 |
|
%159 = or i64 %148, %151, !dbg !34 |
|
; In-bounds predicates (col < 50257); they guard every load and store below. |
%160 = icmp ult i64 %152, 50257, !dbg !36 |
|
%161 = icmp ult i64 %153, 50257, !dbg !36 |
|
%162 = icmp ult i64 %154, 50257, !dbg !36 |
|
%163 = icmp ult i64 %155, 50257, !dbg !36 |
|
%164 = icmp ult i64 %156, 50257, !dbg !36 |
|
%165 = icmp ult i64 %157, 50257, !dbg !36 |
|
%166 = icmp ult i64 %158, 50257, !dbg !36 |
|
%167 = icmp ult i64 %159, 50257, !dbg !36 |
|
; Flat element indices: col + row * 50257 (shared by %0, %4, %5 and %6). |
%168 = add nsw i64 %152, %36, !dbg !37 |
|
%169 = add nsw i64 %153, %36, !dbg !37 |
|
%170 = add nsw i64 %154, %36, !dbg !37 |
|
%171 = add nsw i64 %155, %36, !dbg !37 |
|
%172 = add nsw i64 %156, %36, !dbg !37 |
|
%173 = add nsw i64 %157, %36, !dbg !37 |
|
%174 = add nsw i64 %158, %36, !dbg !37 |
|
%175 = add nsw i64 %159, %36, !dbg !37 |
|
; Load 16-bit elements from %4 (L1::evict_first, 0 when masked off) ... |
%176 = getelementptr i16, ptr addrspace(1) %4, i64 %168, !dbg !38 |
|
%177 = getelementptr i16, ptr addrspace(1) %4, i64 %169, !dbg !38 |
|
%178 = getelementptr i16, ptr addrspace(1) %4, i64 %170, !dbg !38 |
|
%179 = getelementptr i16, ptr addrspace(1) %4, i64 %171, !dbg !38 |
|
%180 = getelementptr i16, ptr addrspace(1) %4, i64 %172, !dbg !38 |
|
%181 = getelementptr i16, ptr addrspace(1) %4, i64 %173, !dbg !38 |
|
%182 = getelementptr i16, ptr addrspace(1) %4, i64 %174, !dbg !38 |
|
%183 = getelementptr i16, ptr addrspace(1) %4, i64 %175, !dbg !38 |
|
%184 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %176, i1 %160, i16 0, i1 %160) #3, !dbg !39 |
|
%185 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %177, i1 %161, i16 0, i1 %161) #3, !dbg !39 |
|
%186 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %178, i1 %162, i16 0, i1 %162) #3, !dbg !39 |
|
%187 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %179, i1 %163, i16 0, i1 %163) #3, !dbg !39 |
|
%188 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %180, i1 %164, i16 0, i1 %164) #3, !dbg !39 |
|
%189 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %181, i1 %165, i16 0, i1 %165) #3, !dbg !39 |
|
%190 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %182, i1 %166, i16 0, i1 %166) #3, !dbg !39 |
|
%191 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %183, i1 %167, i16 0, i1 %167) #3, !dbg !39 |
|
; ... and widen them from bf16 to f32 (%192..%199). |
%192 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %184) #3, !dbg !40 |
|
%193 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %185) #3, !dbg !40 |
|
%194 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %186) #3, !dbg !40 |
|
%195 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %187) #3, !dbg !40 |
|
%196 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %188) #3, !dbg !40 |
|
%197 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %189) #3, !dbg !40 |
|
%198 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %190) #3, !dbg !40 |
|
%199 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %191) #3, !dbg !40 |
|
; Reload the f32 elements of %0 for the same indices (L1::evict_first). |
%200 = getelementptr float, ptr addrspace(1) %0, i64 %168, !dbg !41 |
|
%201 = getelementptr float, ptr addrspace(1) %0, i64 %169, !dbg !41 |
|
%202 = getelementptr float, ptr addrspace(1) %0, i64 %170, !dbg !41 |
|
%203 = getelementptr float, ptr addrspace(1) %0, i64 %171, !dbg !41 |
|
%204 = getelementptr float, ptr addrspace(1) %0, i64 %172, !dbg !41 |
|
%205 = getelementptr float, ptr addrspace(1) %0, i64 %173, !dbg !41 |
|
%206 = getelementptr float, ptr addrspace(1) %0, i64 %174, !dbg !41 |
|
%207 = getelementptr float, ptr addrspace(1) %0, i64 %175, !dbg !41 |
|
%208 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %200, i1 %160, i32 0, i1 %160) #3, !dbg !42 |
|
%209 = bitcast i32 %208 to float, !dbg !42 |
|
%210 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %201, i1 %161, i32 0, i1 %161) #3, !dbg !42 |
|
%211 = bitcast i32 %210 to float, !dbg !42 |
|
%212 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %202, i1 %162, i32 0, i1 %162) #3, !dbg !42 |
|
%213 = bitcast i32 %212 to float, !dbg !42 |
|
%214 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %203, i1 %163, i32 0, i1 %163) #3, !dbg !42 |
|
%215 = bitcast i32 %214 to float, !dbg !42 |
|
%216 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %204, i1 %164, i32 0, i1 %164) #3, !dbg !42 |
|
%217 = bitcast i32 %216 to float, !dbg !42 |
|
%218 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %205, i1 %165, i32 0, i1 %165) #3, !dbg !42 |
|
%219 = bitcast i32 %218 to float, !dbg !42 |
|
%220 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %206, i1 %166, i32 0, i1 %166) #3, !dbg !42 |
|
%221 = bitcast i32 %220 to float, !dbg !42 |
|
%222 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %207, i1 %167, i32 0, i1 %167) #3, !dbg !42 |
|
%223 = bitcast i32 %222 to float, !dbg !42 |
|
; Load 16-bit elements from %5 (L1::evict_first, 0 when masked off) ... |
%224 = getelementptr i16, ptr addrspace(1) %5, i64 %168, !dbg !43 |
|
%225 = getelementptr i16, ptr addrspace(1) %5, i64 %169, !dbg !43 |
|
%226 = getelementptr i16, ptr addrspace(1) %5, i64 %170, !dbg !43 |
|
%227 = getelementptr i16, ptr addrspace(1) %5, i64 %171, !dbg !43 |
|
%228 = getelementptr i16, ptr addrspace(1) %5, i64 %172, !dbg !43 |
|
%229 = getelementptr i16, ptr addrspace(1) %5, i64 %173, !dbg !43 |
|
%230 = getelementptr i16, ptr addrspace(1) %5, i64 %174, !dbg !43 |
|
%231 = getelementptr i16, ptr addrspace(1) %5, i64 %175, !dbg !43 |
|
%232 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %224, i1 %160, i16 0, i1 %160) #3, !dbg !44 |
|
%233 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %225, i1 %161, i16 0, i1 %161) #3, !dbg !44 |
|
%234 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %226, i1 %162, i16 0, i1 %162) #3, !dbg !44 |
|
%235 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %227, i1 %163, i16 0, i1 %163) #3, !dbg !44 |
|
%236 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %228, i1 %164, i16 0, i1 %164) #3, !dbg !44 |
|
%237 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %229, i1 %165, i16 0, i1 %165) #3, !dbg !44 |
|
%238 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %230, i1 %166, i16 0, i1 %166) #3, !dbg !44 |
|
%239 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %231, i1 %167, i16 0, i1 %167) #3, !dbg !44 |
|
; ... and widen them from bf16 to f32 (%240..%247). |
%240 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %232) #3, !dbg !45 |
|
%241 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %233) #3, !dbg !45 |
|
%242 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %234) #3, !dbg !45 |
|
%243 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %235) #3, !dbg !45 |
|
%244 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %236) #3, !dbg !45 |
|
%245 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %237) #3, !dbg !45 |
|
%246 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %238) #3, !dbg !45 |
|
%247 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %239) #3, !dbg !45 |
|
; a = scale * %0[idx] |
%248 = fmul float %38, %209, !dbg !46 |
|
%249 = fmul float %38, %211, !dbg !46 |
|
%250 = fmul float %38, %213, !dbg !46 |
|
%251 = fmul float %38, %215, !dbg !46 |
|
%252 = fmul float %38, %217, !dbg !46 |
|
%253 = fmul float %38, %219, !dbg !46 |
|
%254 = fmul float %38, %221, !dbg !46 |
|
%255 = fmul float %38, %223, !dbg !46 |
|
; e = exp(f32(%5[idx])), computed as ex2.approx(x * log2(e)); the constant |
; 0x3FF7154760000000 is log2(e) (about 1.442695) at f32 precision. |
%256 = fmul float %240, 0x3FF7154760000000, !dbg !47 |
|
%257 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %256) #3, !dbg !47 |
|
%258 = fmul float %241, 0x3FF7154760000000, !dbg !47 |
|
%259 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %258) #3, !dbg !47 |
|
%260 = fmul float %242, 0x3FF7154760000000, !dbg !47 |
|
%261 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %260) #3, !dbg !47 |
|
%262 = fmul float %243, 0x3FF7154760000000, !dbg !47 |
|
%263 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %262) #3, !dbg !47 |
|
%264 = fmul float %244, 0x3FF7154760000000, !dbg !47 |
|
%265 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %264) #3, !dbg !47 |
|
%266 = fmul float %245, 0x3FF7154760000000, !dbg !47 |
|
%267 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %266) #3, !dbg !47 |
|
%268 = fmul float %246, 0x3FF7154760000000, !dbg !47 |
|
%269 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %268) #3, !dbg !47 |
|
%270 = fmul float %247, 0x3FF7154760000000, !dbg !47 |
|
%271 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %270) #3, !dbg !47 |
|
; b = row_total * e |
%272 = fmul float %140, %257, !dbg !48 |
|
%273 = fmul float %140, %259, !dbg !48 |
|
%274 = fmul float %140, %261, !dbg !48 |
|
%275 = fmul float %140, %263, !dbg !48 |
|
%276 = fmul float %140, %265, !dbg !48 |
|
%277 = fmul float %140, %267, !dbg !48 |
|
%278 = fmul float %140, %269, !dbg !48 |
|
%279 = fmul float %140, %271, !dbg !48 |
|
; d = a - b |
%280 = fsub float %248, %272, !dbg !49 |
|
%281 = fsub float %249, %273, !dbg !49 |
|
%282 = fsub float %250, %274, !dbg !49 |
|
%283 = fsub float %251, %275, !dbg !49 |
|
%284 = fsub float %252, %276, !dbg !49 |
|
%285 = fsub float %253, %277, !dbg !49 |
|
%286 = fsub float %254, %278, !dbg !49 |
|
%287 = fsub float %255, %279, !dbg !49 |
|
; result = f32(%4[idx]) + d |
%288 = fadd float %192, %280, !dbg !50 |
|
%289 = fadd float %193, %281, !dbg !50 |
|
%290 = fadd float %194, %282, !dbg !50 |
|
%291 = fadd float %195, %283, !dbg !50 |
|
%292 = fadd float %196, %284, !dbg !50 |
|
%293 = fadd float %197, %285, !dbg !50 |
|
%294 = fadd float %198, %286, !dbg !50 |
|
%295 = fadd float %199, %287, !dbg !50 |
|
; Destination addresses in %6 (16-bit elements, same flat indices). |
%296 = getelementptr i16, ptr addrspace(1) %6, i64 %168, !dbg !51 |
|
%297 = getelementptr i16, ptr addrspace(1) %6, i64 %169, !dbg !51 |
|
%298 = getelementptr i16, ptr addrspace(1) %6, i64 %170, !dbg !51 |
|
%299 = getelementptr i16, ptr addrspace(1) %6, i64 %171, !dbg !51 |
|
%300 = getelementptr i16, ptr addrspace(1) %6, i64 %172, !dbg !51 |
|
%301 = getelementptr i16, ptr addrspace(1) %6, i64 %173, !dbg !51 |
|
%302 = getelementptr i16, ptr addrspace(1) %6, i64 %174, !dbg !51 |
|
%303 = getelementptr i16, ptr addrspace(1) %6, i64 %175, !dbg !51 |
|
; Narrow the results to bf16 with round-to-nearest (cvt.rn). |
%304 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %288) #3, !dbg !52 |
|
%305 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %289) #3, !dbg !52 |
|
%306 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %290) #3, !dbg !52 |
|
%307 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %291) #3, !dbg !52 |
|
%308 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %292) #3, !dbg !52 |
|
%309 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %293) #3, !dbg !52 |
|
%310 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %294) #3, !dbg !52 |
|
%311 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %295) #3, !dbg !52 |
|
; Predicated 16-bit stores: out-of-range columns write nothing. |
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %304, ptr addrspace(1) %296, i1 %160) #3, !dbg !52 |
|
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %305, ptr addrspace(1) %297, i1 %161) #3, !dbg !52 |
|
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %306, ptr addrspace(1) %298, i1 %162) #3, !dbg !52 |
|
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %307, ptr addrspace(1) %299, i1 %163) #3, !dbg !52 |
|
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %308, ptr addrspace(1) %300, i1 %164) #3, !dbg !52 |
|
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %309, ptr addrspace(1) %301, i1 %165) #3, !dbg !52 |
|
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %310, ptr addrspace(1) %302, i1 %166) #3, !dbg !52 |
|
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %311, ptr addrspace(1) %303, i1 %167) #3, !dbg !52 |
|
; Advance by 2048 columns; same exit test as the first loop (base < 48209). |
%312 = add nuw nsw i32 %150, 2048, !dbg !35 |
|
%313 = icmp ult i32 %150, 48209, !dbg !35 |
|
br i1 %313, label %149, label %314, !dbg !35 |
|
|
|
; --- Exit ------------------------------------------------------------------- |
314: |
|
ret void, !dbg !53 |
|
} |
|
|
|
|
|
; NVVM intrinsic: reads the thread index in x (used by the kernel as %10). |
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 |
|
|
|
|
|
; NVVM intrinsic: butterfly warp shuffle. The kernel calls it with a member |
; mask of -1, the 32-bit value to exchange, a lane mask, and the constant 31. |
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 |
|
|
|
|
|
; NVVM intrinsic: barrier, called twice during the kernel's shared-memory reduction. |
declare void @llvm.nvvm.barrier0() #2 |
|
|
|
; #0: attribute group for the tid.x read (memory(none), speculatable). |
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } |
|
; #1: attribute group for the warp shuffle (convergent). |
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } |
|
; #2: attribute group for the barrier (convergent). |
attributes #2 = { convergent nocallback nounwind } |
|
; #3: attribute group attached to every inline-asm call site in the kernel. |
attributes #3 = { nounwind } |
|
|
|
; Named metadata: module flags, debug compile units, and NVVM annotations. |
!llvm.module.flags = !{!0} |
|
!llvm.dbg.cu = !{!1} |
|
; Each of the two annotations (!3, !4) is listed twice here. |
!nvvm.annotations = !{!3, !4, !4, !3} |
|
|
|
!0 = !{i32 2, !"Debug Info Version", i32 3} |
|
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) |
|
; Source file that the kernel's debug locations refer to. |
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz") |
|
; Marks the function as a kernel entry point. |
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1} |
|
; Sets maxntidx = 256 for the kernel (maximum thread count in x). |
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256} |
|
; Debug-info subprogram for the kernel (declared at line 18 of file !2), with an |
; empty subroutine type list. |
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) |
|
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) |
|
!7 = !{} |
|
; !8..!25: source locations scoped directly to the kernel (!5); these are |
; attached to the entry block and the first loop. |
!8 = !DILocation(line: 24, column: 33, scope: !5) |
|
!9 = !DILocation(line: 21, column: 28, scope: !5) |
|
!10 = !DILocation(line: 21, column: 34, scope: !5) |
|
!11 = !DILocation(line: 26, column: 30, scope: !5) |
|
!12 = !DILocation(line: 26, column: 35, scope: !5) |
|
!13 = !DILocation(line: 27, column: 19, scope: !5) |
|
!14 = !DILocation(line: 29, column: 19, scope: !5) |
|
!15 = !DILocation(line: 36, column: 46, scope: !5) |
|
!16 = !DILocation(line: 38, column: 23, scope: !5) |
|
!17 = !DILocation(line: 39, column: 22, scope: !5) |
|
!18 = !DILocation(line: 41, column: 37, scope: !5) |
|
!19 = !DILocation(line: 32, column: 36, scope: !5) |
|
!20 = !DILocation(line: 42, column: 23, scope: !5) |
|
!21 = !DILocation(line: 33, column: 27, scope: !5) |
|
!22 = !DILocation(line: 34, column: 25, scope: !5) |
|
!23 = !DILocation(line: 36, column: 34, scope: !5) |
|
!24 = !DILocation(line: 36, column: 52, scope: !5) |
|
!25 = !DILocation(line: 45, column: 40, scope: !5) |
|
; !26..!33: locations inside Triton's standard.py (file !28), recorded as |
; inlined at line 46, column 27; these are attached to the reduction code. |
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30) |
|
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0) |
|
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") |
|
!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0) |
|
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31) |
|
!31 = !DILocation(line: 46, column: 27, scope: !27) |
|
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33) |
|
!33 = !DILocation(line: 46, column: 27, scope: !29) |
|
; !34..!53: source locations scoped to the kernel (!5); these are attached to |
; the second loop and the final return. |
!34 = !DILocation(line: 52, column: 27, scope: !5) |
|
!35 = !DILocation(line: 51, column: 36, scope: !5) |
|
!36 = !DILocation(line: 53, column: 25, scope: !5) |
|
!37 = !DILocation(line: 55, column: 41, scope: !5) |
|
!38 = !DILocation(line: 55, column: 35, scope: !5) |
|
!39 = !DILocation(line: 55, column: 53, scope: !5) |
|
!40 = !DILocation(line: 55, column: 105, scope: !5) |
|
!41 = !DILocation(line: 56, column: 35, scope: !5) |
|
!42 = !DILocation(line: 56, column: 53, scope: !5) |
|
!43 = !DILocation(line: 57, column: 35, scope: !5) |
|
!44 = !DILocation(line: 57, column: 53, scope: !5) |
|
!45 = !DILocation(line: 57, column: 105, scope: !5) |
|
!46 = !DILocation(line: 63, column: 24, scope: !5) |
|
!47 = !DILocation(line: 65, column: 23, scope: !5) |
|
!48 = !DILocation(line: 66, column: 24, scope: !5) |
|
!49 = !DILocation(line: 67, column: 24, scope: !5) |
|
!50 = !DILocation(line: 69, column: 24, scope: !5) |
|
!51 = !DILocation(line: 70, column: 29, scope: !5) |
|
!52 = !DILocation(line: 70, column: 54, scope: !5) |
|
!53 = !DILocation(line: 51, column: 4, scope: !5) |
|
|