; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

; -----------------------------------------------------------------------------
; Overview (as read from the IR below; the originating Python source named in
; the debug metadata is not part of this file):
;
;   For x = ctaid.x * 64 + (0..63):
;     out[x] = sum over r in 0..127 of
;                f32(a[(x / 256) * 32768 + r * 256 + (x % 256)])   ; a: bf16
;                  * b[(x / 256) * 32768 + r * 256 + (x % 256)]    ; b: f32
;
;   where a = %0, b = %1, out = %2. The i32 arguments %3 and %4 are not
;   referenced anywhere in the function body.
;
; Work split: each thread owns 4 consecutive columns (4 * (tid & 15)) and one
; of 8 row phases ((tid >> 4) & 7). Partial sums are combined first with a
; warp shuffle, then through @global_smem. The !nvvm.annotations metadata caps
; the launch at 128 threads in x ("maxntidx").
; -----------------------------------------------------------------------------

; Scratch buffer in addrspace(3); accessed below with st.shared / ld.shared.
; Declared with zero length here, so its real size is supplied elsewhere.
@global_smem = external addrspace(3) global [0 x i8]

define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
  ; --- Per-thread indices derived from tid.x --------------------------------
  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
  ; %7 = tid & 31 (low five bits), %8 = tid >> 5.
  %7 = and i32 %6, 31, !dbg !8
  %8 = lshr i32 %6, 5, !dbg !8
  ; %10 = (tid << 2) & 60 = 4 * (tid & 15): first of this thread's 4 columns.
  %9 = shl i32 %6, 2, !dbg !8
  %10 = and i32 %9, 60, !dbg !8
  ; %14 = ((tid >> 5) & 3) * 2 + bit 4 of tid = (tid >> 4) & 7: row phase 0..7.
  %11 = and i32 %8, 3, !dbg !9
  %12 = lshr i32 %7, 4, !dbg !9
  %13 = shl nuw nsw i32 %11, 1, !dbg !9
  %14 = or i32 %13, %12, !dbg !9
  ; --- Block-level base index -----------------------------------------------
  ; %16 = ctaid.x * 64; %17 = %16 | %10 (the low 6 bits of %16 are zero, so the
  ; OR acts as an add).
  %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
  %16 = shl i32 %15, 6, !dbg !11
  %17 = or i32 %16, %10, !dbg !12
  ; Split %17 into quotient (%18) and remainder (%.decomposed) by 256 using a
  ; signed divide, then form %21 = quotient * 32768 + remainder.
  %.frozen = freeze i32 %17
  %18 = sdiv i32 %.frozen, 256, !dbg !13
  %19 = mul i32 %18, 256
  %.decomposed = sub i32 %.frozen, %19
  %20 = shl i32 %18, 15, !dbg !14
  %21 = add i32 %20, %.decomposed
  br label %22, !dbg !15

; --- Accumulation loop --------------------------------------------------------
; %23 takes the values 0, 8, ..., 120 (16 iterations: the exit test compares the
; pre-increment value against 120). %24 is the running <4 x float> sum.
22:                                               ; preds = %5, %22
  %23 = phi i32 [ 0, %5 ], [ %58, %22 ]
  %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %57, %22 ]
  ; Row = %23 | %14 (%23 is a multiple of 8 and %14 < 8, so this is a sum);
  ; element index %27 = %21 + row * 256.
  %25 = or i32 %23, %14, !dbg !16
  %26 = shl i32 %25, 8, !dbg !17
  %27 = add i32 %21, %26, !dbg !18
  %28 = sext i32 %27 to i64, !dbg !19
  ; Load two 32-bit words (four 16-bit elements) from %0 at i16 index %28.
  ; Every predicate operand is the constant true, so the load always executes
  ; and the "@!p mov" fallbacks to 0 never take effect.
  %29 = getelementptr i16, ptr addrspace(1) %0, i64 %28, !dbg !19
  %30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
  %31 = extractvalue { i32, i32 } %30, 0, !dbg !20
  %32 = extractvalue { i32, i32 } %30, 1, !dbg !20
  ; Unpack the low and high 16-bit halves of each loaded word.
  %33 = trunc i32 %31 to i16, !dbg !20
  %extelt.offset = lshr i32 %31, 16, !dbg !20
  %34 = trunc i32 %extelt.offset to i16, !dbg !20
  %35 = trunc i32 %32 to i16, !dbg !20
  %extelt.offset1 = lshr i32 %32, 16, !dbg !20
  %36 = trunc i32 %extelt.offset1 to i16, !dbg !20
  ; Widen each 16-bit value from bf16 to f32 via PTX cvt.f32.bf16.
  %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #3, !dbg !21
  %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #3, !dbg !21
  %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #3, !dbg !21
  %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #3, !dbg !21
  ; Load four 32-bit words from %1 at float index %28 (same element index as
  ; above); predicates are again constant true.
  %41 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !22
  %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !23
  %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !23
  %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !23
  %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !23
  %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !23
  ; Reassemble the raw words as <4 x float>.
  %47 = insertelement <4 x i32> poison, i32 %43, i64 0, !dbg !23
  %48 = insertelement <4 x i32> %47, i32 %44, i64 1, !dbg !23
  %49 = insertelement <4 x i32> %48, i32 %45, i64 2, !dbg !23
  %50 = insertelement <4 x i32> %49, i32 %46, i64 3, !dbg !23
  %51 = bitcast <4 x i32> %50 to <4 x float>, !dbg !23
  ; Pack the converted bf16 values, multiply element-wise, and accumulate.
  %52 = insertelement <4 x float> poison, float %37, i64 0, !dbg !24
  %53 = insertelement <4 x float> %52, float %38, i64 1, !dbg !24
  %54 = insertelement <4 x float> %53, float %39, i64 2, !dbg !24
  %55 = insertelement <4 x float> %54, float %40, i64 3, !dbg !24
  %56 = fmul <4 x float> %55, %51, !dbg !24
  %57 = fadd <4 x float> %24, %56, !dbg !25
  ; Advance by 8 rows; continue while the pre-increment counter is below 120.
  %58 = add nuw nsw i32 %23, 8, !dbg !15
  %59 = icmp ult i32 %23, 120, !dbg !15
  br i1 %59, label %22, label %60, !dbg !15

; --- Cross-thread reduction and store ----------------------------------------
60:                                               ; preds = %22
  ; %61 = tid & 63 and %62 = ctaid.x * 64 + %61: the output element this thread
  ; writes at the very end.
  %61 = and i32 %6, 63, !dbg !8
  %62 = or i32 %16, %61, !dbg !12
  ; %10, %65, %64, %63: the four column indices owned by this thread.
  %63 = or i32 %10, 3, !dbg !26
  %64 = or i32 %10, 2, !dbg !26
  %65 = or i32 %10, 1, !dbg !26
  ; Stage 1: for each of the 4 accumulator elements, add the value held by the
  ; lane whose id differs in bit 4 (butterfly shuffle, lane mask 16).
  %66 = extractelement <4 x float> %57, i64 0, !dbg !26
  %67 = bitcast float %66 to i32, !dbg !26
  %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !26
  %69 = bitcast i32 %68 to float, !dbg !26
  %70 = fadd float %66, %69, !dbg !30
  %71 = extractelement <4 x float> %57, i64 1, !dbg !26
  %72 = bitcast float %71 to i32, !dbg !26
  %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !26
  %74 = bitcast i32 %73 to float, !dbg !26
  %75 = fadd float %71, %74, !dbg !30
  %76 = extractelement <4 x float> %57, i64 2, !dbg !26
  %77 = bitcast float %76 to i32, !dbg !26
  %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !26
  %79 = bitcast i32 %78 to float, !dbg !26
  %80 = fadd float %76, %79, !dbg !30
  %81 = extractelement <4 x float> %57, i64 3, !dbg !26
  %82 = bitcast float %81 to i32, !dbg !26
  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 16, i32 31), !dbg !26
  %84 = bitcast i32 %83 to float, !dbg !26
  %85 = fadd float %81, %84, !dbg !30
  ; Stage 2: threads with (tid & 31) < 16 write their four partial sums to
  ; float slot (column * 4 + %11), where %11 = (tid >> 5) & 3.
  %86 = icmp ult i32 %7, 16, !dbg !26
  %87 = shl nuw nsw i32 %10, 2, !dbg !26
  %88 = or i32 %87, %11, !dbg !26
  %89 = zext nneg i32 %88 to i64, !dbg !26
  %90 = getelementptr float, ptr addrspace(3) @global_smem, i64 %89, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %90, float %70, i1 %86) #3, !dbg !26
  %91 = shl nuw nsw i32 %65, 2, !dbg !26
  %92 = or i32 %91, %11, !dbg !26
  %93 = zext nneg i32 %92 to i64, !dbg !26
  %94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %75, i1 %86) #3, !dbg !26
  %95 = shl nuw nsw i32 %64, 2, !dbg !26
  %96 = or i32 %95, %11, !dbg !26
  %97 = zext nneg i32 %96 to i64, !dbg !26
  %98 = getelementptr float, ptr addrspace(3) @global_smem, i64 %97, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, float %80, i1 %86) #3, !dbg !26
  %99 = shl nuw nsw i32 %63, 2, !dbg !26
  %100 = or i32 %99, %11, !dbg !26
  %101 = zext nneg i32 %100 to i64, !dbg !26
  %102 = getelementptr float, ptr addrspace(3) @global_smem, i64 %101, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %102, float %85, i1 %86) #3, !dbg !26
  ; Barrier between the partial-sum stores above and the loads below.
  tail call void @llvm.nvvm.barrier0(), !dbg !26
  ; Stage 3a: each thread reads float slot [tid] (predicated on tid < 256) and
  ; sums it with the 3 neighbouring lanes via butterfly shuffles with masks 2
  ; and 1, i.e. it adds up the four slots belonging to one column.
  ; NOTE(review): with the 128-thread cap in !4 the tid < 256 predicate should
  ; hold for every thread - confirm against the launch configuration.
  %103 = icmp slt i32 %6, 256, !dbg !26
  %104 = sext i32 %6 to i64, !dbg !26
  %105 = getelementptr float, ptr addrspace(3) @global_smem, i64 %104, !dbg !26
  %106 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %105, i1 %103) #3, !dbg !26
  %107 = bitcast float %106 to i32, !dbg !26
  %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 2, i32 31), !dbg !26
  %109 = bitcast i32 %108 to float, !dbg !26
  %110 = fadd float %106, %109, !dbg !30
  %111 = bitcast float %110 to i32, !dbg !26
  %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 1, i32 31), !dbg !26
  %113 = bitcast i32 %112 to float, !dbg !26
  %114 = fadd float %110, %113, !dbg !30
  ; Only threads with (tid & 3) == 0 write the 4-way sum back, to slot [tid].
  %115 = and i32 %6, 3, !dbg !26
  %116 = icmp eq i32 %115, 0, !dbg !26
  %117 = and i1 %103, %116, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, float %114, i1 %117) #3, !dbg !26
  ; Stage 3b: same 4-way sum for float slot [tid + 128], reusing the
  ; predicates %103 (load) and %117 (store).
  %118 = add i32 %6, 128, !dbg !26
  %119 = sext i32 %118 to i64, !dbg !26
  %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !26
  %121 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %120, i1 %103) #3, !dbg !26
  %122 = bitcast float %121 to i32, !dbg !26
  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !26
  %124 = bitcast i32 %123 to float, !dbg !26
  %125 = fadd float %121, %124, !dbg !30
  %126 = bitcast float %125 to i32, !dbg !26
  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !26
  %128 = bitcast i32 %127 to float, !dbg !26
  %129 = fadd float %125, %128, !dbg !30
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %129, i1 %117) #3, !dbg !26
  ; Barrier before reading the finished per-column totals.
  tail call void @llvm.nvvm.barrier0(), !dbg !26
  ; Stage 4: read the totals for this thread's four columns from float slots
  ; [column * 4] (%87, %91, %95, %99 are the column indices shifted left by 2).
  %130 = zext nneg i32 %87 to i64, !dbg !26
  %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !26
  %132 = load float, ptr addrspace(3) %131, align 4, !dbg !26
  %133 = zext nneg i32 %91 to i64, !dbg !26
  %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !26
  %135 = load float, ptr addrspace(3) %134, align 4, !dbg !26
  %136 = zext nneg i32 %95 to i64, !dbg !26
  %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !26
  %138 = load float, ptr addrspace(3) %137, align 4, !dbg !26
  %139 = zext nneg i32 %99 to i64, !dbg !26
  %140 = getelementptr float, ptr addrspace(3) @global_smem, i64 %139, !dbg !26
  %141 = load float, ptr addrspace(3) %140, align 4, !dbg !26
  ; Barrier so the loads above complete before the buffer is overwritten.
  tail call void @llvm.nvvm.barrier0(), !dbg !34
  ; Stage 5: write the four totals back densely, one per float slot [column].
  ; This appears to re-distribute the results so that each thread can then
  ; fetch a single output element.
  %142 = zext nneg i32 %10 to i64, !dbg !34
  %143 = getelementptr float, ptr addrspace(3) @global_smem, i64 %142, !dbg !34
  %144 = insertelement <1 x float> undef, float %132, i64 0, !dbg !34
  store <1 x float> %144, ptr addrspace(3) %143, align 4, !dbg !34
  %145 = zext nneg i32 %65 to i64, !dbg !34
  %146 = getelementptr float, ptr addrspace(3) @global_smem, i64 %145, !dbg !34
  %147 = insertelement <1 x float> undef, float %135, i64 0, !dbg !34
  store <1 x float> %147, ptr addrspace(3) %146, align 4, !dbg !34
  %148 = zext nneg i32 %64 to i64, !dbg !34
  %149 = getelementptr float, ptr addrspace(3) @global_smem, i64 %148, !dbg !34
  %150 = insertelement <1 x float> undef, float %138, i64 0, !dbg !34
  store <1 x float> %150, ptr addrspace(3) %149, align 4, !dbg !34
  %151 = zext nneg i32 %63 to i64, !dbg !34
  %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !34
  %153 = insertelement <1 x float> undef, float %141, i64 0, !dbg !34
  store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !34
  tail call void @llvm.nvvm.barrier0(), !dbg !34
  ; Stage 6: fetch slot [tid & 63] as raw 32 bits and store it to %2 at float
  ; index ctaid.x * 64 + (tid & 63). The store is predicated on bit 6 of tid
  ; being clear, so only one thread per output element writes.
  %154 = zext nneg i32 %61 to i64, !dbg !34
  %155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !34
  %156 = load i32, ptr addrspace(3) %155, align 4, !dbg !34
  %157 = sext i32 %62 to i64, !dbg !35
  %158 = getelementptr float, ptr addrspace(1) %2, i64 %157, !dbg !35
  %159 = and i32 %6, 64, !dbg !36
  %160 = icmp eq i32 %159, 0, !dbg !36
  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %156, ptr addrspace(1) %158, i1 %160) #3, !dbg !36
  ret void, !dbg !37
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1

; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2

; Attribute groups: #0-#2 belong to the intrinsic declarations above; #3 is
; attached to every inline-asm call site in the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { nounwind }

!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; Lists the "kernel" (!3) and "maxntidx" (!4) annotations; each node is listed
; twice.
!nvvm.annotations = !{!3, !4, !4, !3}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py", directory: "/tmp/torchinductor_root/sj")
; Marks the function as a kernel entry point and caps threads-per-block in x
; at 128.
!3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; !8-!25 and !34-!37 are source locations in the file named by !2.
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 21, column: 28, scope: !5)
!11 = !DILocation(line: 21, column: 33, scope: !5)
!12 = !DILocation(line: 22, column: 23, scope: !5)
!13 = !DILocation(line: 26, column: 20, scope: !5)
!14 = !DILocation(line: 33, column: 57, scope: !5)
!15 = !DILocation(line: 29, column: 36, scope: !5)
!16 = !DILocation(line: 30, column: 27, scope: !5)
!17 = !DILocation(line: 33, column: 44, scope: !5)
!18 = !DILocation(line: 33, column: 51, scope: !5)
!19 = !DILocation(line: 33, column: 34, scope: !5)
!20 = !DILocation(line: 33, column: 63, scope: !5)
!21 = !DILocation(line: 33, column: 115, scope: !5)
!22 = !DILocation(line: 34, column: 34, scope: !5)
!23 = !DILocation(line: 34, column: 63, scope: !5)
!24 = !DILocation(line: 36, column: 22, scope: !5)
!25 = !DILocation(line: 39, column: 38, scope: !5)
; !26-!33 are locations inside Triton's standard.py (!28), recorded as inlined
; at line 40 of the kernel source; they tag the reduction code.
!26 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !29)
!27 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!29 = !DILocation(line: 40, column: 25, scope: !27)
!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !32)
!31 = distinct !DILexicalBlockFile(scope: !27, file: !28, discriminator: 0)
!32 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !33)
!33 = !DILocation(line: 40, column: 25, scope: !31)
!34 = !DILocation(line: 40, column: 28, scope: !5)
!35 = !DILocation(line: 41, column: 25, scope: !5)
!36 = !DILocation(line: 41, column: 36, scope: !5)
!37 = !DILocation(line: 41, column: 4, scope: !5)