; 0-hero's picture
; Add files using upload-large-folder tool
; 4bf7f8a verified
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Scratch buffer in addrspace(3); the kernel accesses it with st.shared /
; ld.shared and plain loads/stores, indexing it as an array of float.
; It is declared external with zero length, so its size is not specified here.
@global_smem = external addrspace(3) global [0 x i8]
; Kernel entry point (tagged "kernel" with maxntidx = 256 in !nvvm.annotations).
; Parameters:
;   %0      - addrspace(1) pointer, read as float with ld.global (input values)
;   %1      - addrspace(1) pointer, read as i64 with ld.global (index values)
;   %2      - addrspace(1) pointer, target of a float atomic add (output)
;   %3, %4  - i32 arguments that are not referenced anywhere in this body
; Outline: every thread loads two 4-float vectors from %0 and adds them
; element-wise; the per-thread sums are reduced across the thread block using
; warp shuffles and @global_smem; finally one total per position is atomically
; added into %2 at a location selected by an i64 loaded from %1.
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
; --- Thread-index decomposition -------------------------------------------
; %7  = tid & 31                 (lane within the warp)
; %10 = (tid & 3) * 4            (one of 0, 4, 8, 12)
; %11 = tid & 15
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
%7 = and i32 %6, 31, !dbg !8
%8 = lshr i32 %6, 5, !dbg !8
%9 = shl i32 %6, 2, !dbg !8
%10 = and i32 %9, 12, !dbg !8
%11 = and i32 %6, 15, !dbg !8
; %12 = (tid >> 5) & 7           (warp index, 3 bits)
; %15 = warp * 8 + lane / 4      (range 0..63)
; %16 = %15 + 64                 (range 64..127; the `or` acts as an add
;                                 because bit 6 of %15 is always clear)
%12 = and i32 %8, 7, !dbg !9
%13 = lshr i32 %7, 2, !dbg !9
%14 = shl nuw nsw i32 %12, 3, !dbg !9
%15 = or i32 %14, %13, !dbg !9
%16 = or i32 %15, 64, !dbg !9
; %17..%19 = %10 + 1, + 2, + 3 (low two bits of %10 are zero).
%17 = or i32 %10, 1, !dbg !10
%18 = or i32 %10, 2, !dbg !10
%19 = or i32 %10, 3, !dbg !10
; --- Block offset ----------------------------------------------------------
; %20 = ctaid.x; each CTA covers 16 consecutive positions starting at %21.
; %22 = ctaid * 16 + %10, %23 = ctaid * 16 + (tid & 15).
%20 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14
%21 = shl i32 %20, 4, !dbg !15
%22 = or i32 %21, %10, !dbg !16
%23 = or i32 %21, %11, !dbg !16
; Bound check for the second index only: %16 must be < 120.
; %15 is at most 63, so the first load needs no check.
%24 = icmp ult i32 %16, 120, !dbg !17
; --- Input addresses -------------------------------------------------------
; Float offset = %22 + (index << 17): consecutive %15 / %16 values are
; 131072 floats apart in %0.
%25 = shl nuw nsw i32 %15, 17, !dbg !18
%26 = shl nuw nsw i32 %16, 17, !dbg !18
%27 = add i32 %22, %25, !dbg !19
%28 = add i32 %22, %26, !dbg !19
%29 = sext i32 %27 to i64, !dbg !20
%30 = getelementptr float, ptr addrspace(1) %0, i64 %29, !dbg !20
%31 = sext i32 %28 to i64, !dbg !20
%32 = getelementptr float, ptr addrspace(1) %0, i64 %31, !dbg !20
; 4 x 32-bit vector load from %30 with the L1::evict_first hint. The predicate
; is the constant true; the trailing "@!p mov" instructions would write the
; fallback value 0 into each result if the predicate were false.
%33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
%34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !21
%35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !21
%36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !21
%37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !21
%38 = bitcast i32 %34 to float, !dbg !21
%39 = bitcast i32 %35 to float, !dbg !21
%40 = bitcast i32 %36 to float, !dbg !21
%41 = bitcast i32 %37 to float, !dbg !21
; Same vector load from %32, predicated on the bound check %24; threads that
; fail the check skip the memory access and receive 0 in all four results.
%42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 %24, i32 0, i1 %24, i32 0, i1 %24, i32 0, i1 %24, i32 0, i1 %24) #3, !dbg !21
%43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !21
%44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !21
%45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !21
%46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !21
%47 = bitcast i32 %43 to float, !dbg !21
%48 = bitcast i32 %44 to float, !dbg !21
%49 = bitcast i32 %45 to float, !dbg !21
%50 = bitcast i32 %46 to float, !dbg !21
; --- Per-thread partial sums -----------------------------------------------
; Each of the eight loaded elements has 0.0 added to it.
%51 = fadd float %38, 0.000000e+00, !dbg !22
%52 = fadd float %39, 0.000000e+00, !dbg !22
%53 = fadd float %40, 0.000000e+00, !dbg !22
%54 = fadd float %41, 0.000000e+00, !dbg !22
%55 = fadd float %47, 0.000000e+00, !dbg !22
%56 = fadd float %48, 0.000000e+00, !dbg !22
%57 = fadd float %49, 0.000000e+00, !dbg !22
%58 = fadd float %50, 0.000000e+00, !dbg !22
; Contributions from the second load are forced to 0.0 where %24 is false.
%59 = select i1 %24, float %55, float 0.000000e+00, !dbg !23
%60 = select i1 %24, float %56, float 0.000000e+00, !dbg !23
%61 = select i1 %24, float %57, float 0.000000e+00, !dbg !23
%62 = select i1 %24, float %58, float 0.000000e+00, !dbg !23
; %63..%66: element-wise sum of the two vectors (positions %10 + 0..3).
%63 = fadd float %51, %59, !dbg !24
%64 = fadd float %52, %60, !dbg !24
%65 = fadd float %53, %61, !dbg !24
%66 = fadd float %54, %62, !dbg !24
; --- Stage 1: reduction inside each warp -----------------------------------
; For each of the four partials, butterfly shuffles with XOR offsets 16, 8, 4
; combine the lanes that share the same (lane & 3). Afterwards every lane
; holds the sum over the 8 lanes of its warp with the same low two lane bits.
; Element 0 (%63 -> %78):
%67 = bitcast float %63 to i32, !dbg !10
%68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !10
%69 = bitcast i32 %68 to float, !dbg !10
%70 = fadd float %63, %69, !dbg !24
%71 = bitcast float %70 to i32, !dbg !10
%72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 8, i32 31), !dbg !10
%73 = bitcast i32 %72 to float, !dbg !10
%74 = fadd float %70, %73, !dbg !24
%75 = bitcast float %74 to i32, !dbg !10
%76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 4, i32 31), !dbg !10
%77 = bitcast i32 %76 to float, !dbg !10
%78 = fadd float %74, %77, !dbg !24
; Element 1 (%64 -> %90):
%79 = bitcast float %64 to i32, !dbg !10
%80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 16, i32 31), !dbg !10
%81 = bitcast i32 %80 to float, !dbg !10
%82 = fadd float %64, %81, !dbg !24
%83 = bitcast float %82 to i32, !dbg !10
%84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 8, i32 31), !dbg !10
%85 = bitcast i32 %84 to float, !dbg !10
%86 = fadd float %82, %85, !dbg !24
%87 = bitcast float %86 to i32, !dbg !10
%88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 4, i32 31), !dbg !10
%89 = bitcast i32 %88 to float, !dbg !10
%90 = fadd float %86, %89, !dbg !24
; Element 2 (%65 -> %102):
%91 = bitcast float %65 to i32, !dbg !10
%92 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %91, i32 16, i32 31), !dbg !10
%93 = bitcast i32 %92 to float, !dbg !10
%94 = fadd float %65, %93, !dbg !24
%95 = bitcast float %94 to i32, !dbg !10
%96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 8, i32 31), !dbg !10
%97 = bitcast i32 %96 to float, !dbg !10
%98 = fadd float %94, %97, !dbg !24
%99 = bitcast float %98 to i32, !dbg !10
%100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 4, i32 31), !dbg !10
%101 = bitcast i32 %100 to float, !dbg !10
%102 = fadd float %98, %101, !dbg !24
; Element 3 (%66 -> %114):
%103 = bitcast float %66 to i32, !dbg !10
%104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 16, i32 31), !dbg !10
%105 = bitcast i32 %104 to float, !dbg !10
%106 = fadd float %66, %105, !dbg !24
%107 = bitcast float %106 to i32, !dbg !10
%108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 8, i32 31), !dbg !10
%109 = bitcast i32 %108 to float, !dbg !10
%110 = fadd float %106, %109, !dbg !24
%111 = bitcast float %110 to i32, !dbg !10
%112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 4, i32 31), !dbg !10
%113 = bitcast i32 %112 to float, !dbg !10
%114 = fadd float %110, %113, !dbg !24
; --- Publish per-warp results to shared memory -----------------------------
; Only lanes 0..3 of each warp store (%115). The float index is
; (%10 + k) * 8 + warp for k = 0..3; for lanes 0..3, %10 = lane * 4, so the
; four lanes together cover positions 0..15, with 8 consecutive slots per
; position (one per 3-bit warp index).
%115 = icmp ult i32 %7, 4, !dbg !10
%116 = shl nuw nsw i32 %10, 3, !dbg !10
%117 = or i32 %116, %12, !dbg !10
%118 = zext nneg i32 %117 to i64, !dbg !10
%119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !10
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %78, i1 %115) #3, !dbg !10
%120 = shl nuw nsw i32 %17, 3, !dbg !10
%121 = or i32 %120, %12, !dbg !10
%122 = zext nneg i32 %121 to i64, !dbg !10
%123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !10
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %90, i1 %115) #3, !dbg !10
%124 = shl nuw nsw i32 %18, 3, !dbg !10
%125 = or i32 %124, %12, !dbg !10
%126 = zext nneg i32 %125 to i64, !dbg !10
%127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !10
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %102, i1 %115) #3, !dbg !10
%128 = shl nuw nsw i32 %19, 3, !dbg !10
%129 = or i32 %128, %12, !dbg !10
%130 = zext nneg i32 %129 to i64, !dbg !10
%131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !10
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %131, float %114, i1 %115) #3, !dbg !10
; Make the per-warp results visible to the whole block.
tail call void @llvm.nvvm.barrier0(), !dbg !10
; --- Stage 2: reduction across warps ---------------------------------------
; Threads with tid < 128 reload one float each from smem[tid]. The load is
; predicated and the asm has no initializing mov, so %135 is not defined by
; this asm for other threads; those threads never store it (%150 includes
; %132).
%132 = icmp slt i32 %6, 128, !dbg !10
%133 = sext i32 %6 to i64, !dbg !10
%134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !10
%135 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %134, i1 %132) #3, !dbg !10
; XOR offsets 4, 2, 1 sum every aligned group of 8 lanes, i.e. the 8 per-warp
; slots that belong to one position (smem index = position * 8 + warp).
%136 = bitcast float %135 to i32, !dbg !10
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !10
%138 = bitcast i32 %137 to float, !dbg !10
%139 = fadd float %135, %138, !dbg !24
%140 = bitcast float %139 to i32, !dbg !10
%141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 2, i32 31), !dbg !10
%142 = bitcast i32 %141 to float, !dbg !10
%143 = fadd float %139, %142, !dbg !24
%144 = bitcast float %143 to i32, !dbg !10
%145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 1, i32 31), !dbg !10
%146 = bitcast i32 %145 to float, !dbg !10
%147 = fadd float %143, %146, !dbg !24
; The first thread of each group of 8 (tid & 7 == 0, tid < 128) writes the
; total back to smem[tid], which is slot position * 8.
%148 = and i32 %6, 7, !dbg !10
%149 = icmp eq i32 %148, 0, !dbg !10
%150 = and i1 %132, %149, !dbg !10
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %134, float %147, i1 %150) #3, !dbg !10
tail call void @llvm.nvvm.barrier0(), !dbg !10
; Every thread reads the totals for its four positions %10 + k from
; smem[(%10 + k) * 8].
%151 = zext nneg i32 %116 to i64, !dbg !10
%152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10
%153 = load float, ptr addrspace(3) %152, align 4, !dbg !10
%154 = zext nneg i32 %120 to i64, !dbg !10
%155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !10
%156 = load float, ptr addrspace(3) %155, align 4, !dbg !10
%157 = zext nneg i32 %124 to i64, !dbg !10
%158 = getelementptr float, ptr addrspace(3) @global_smem, i64 %157, !dbg !10
%159 = load float, ptr addrspace(3) %158, align 4, !dbg !10
%160 = zext nneg i32 %128 to i64, !dbg !10
%161 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !10
%162 = load float, ptr addrspace(3) %161, align 4, !dbg !10
; --- Redistribute totals: 4 positions per thread -> 1 position per thread ---
; Barrier before the shared buffer is overwritten with a different layout.
tail call void @llvm.nvvm.barrier0(), !dbg !28
; Compact layout: smem[%10 + k] = total for position %10 + k. Threads with
; the same (tid & 3) write to the same four addresses; they read their
; values from the same slots above.
%163 = zext nneg i32 %10 to i64, !dbg !28
%164 = getelementptr float, ptr addrspace(3) @global_smem, i64 %163, !dbg !28
%165 = insertelement <1 x float> undef, float %153, i64 0, !dbg !28
store <1 x float> %165, ptr addrspace(3) %164, align 4, !dbg !28
%166 = zext nneg i32 %17 to i64, !dbg !28
%167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !28
%168 = insertelement <1 x float> undef, float %156, i64 0, !dbg !28
store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !28
%169 = zext nneg i32 %18 to i64, !dbg !28
%170 = getelementptr float, ptr addrspace(3) @global_smem, i64 %169, !dbg !28
%171 = insertelement <1 x float> undef, float %159, i64 0, !dbg !28
store <1 x float> %171, ptr addrspace(3) %170, align 4, !dbg !28
%172 = zext nneg i32 %19 to i64, !dbg !28
%173 = getelementptr float, ptr addrspace(3) @global_smem, i64 %172, !dbg !28
%174 = insertelement <1 x float> undef, float %162, i64 0, !dbg !28
store <1 x float> %174, ptr addrspace(3) %173, align 4, !dbg !28
tail call void @llvm.nvvm.barrier0(), !dbg !28
; Each thread picks up the total for position (tid & 15).
%175 = zext nneg i32 %11 to i64, !dbg !28
%176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !28
%177 = load <1 x float>, ptr addrspace(3) %176, align 4, !dbg !28
; --- Output address --------------------------------------------------------
; Split the global position %23 into a signed quotient (%178) and remainder
; (%.decomposed) by 256. The freeze gives both uses one consistent value.
%.frozen = freeze i32 %23
%178 = sdiv i32 %.frozen, 256, !dbg !29
%179 = mul i32 %178, 256
%.decomposed = sub i32 %.frozen, %179
; Load the i64 at %1[%178] (L1::evict_last hint, predicate constant true).
%180 = sext i32 %178 to i64, !dbg !30
%181 = getelementptr i64, ptr addrspace(1) %1, i64 %180, !dbg !30
%182 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %181, i1 true) #3, !dbg !31
; (x >> 54) & 512 is 512 when bit 63 (the sign bit) of x is set and 0
; otherwise, so %185 = x + 512 for negative x and x for non-negative x.
%183 = lshr i64 %182, 54, !dbg !32
%184 = and i64 %183, 512, !dbg !32
%185 = add i64 %184, %182, !dbg !32
; Destination float element: %2[%185 * 256 + remainder].
%186 = shl i64 %185, 8, !dbg !33
%187 = sext i32 %.decomposed to i64, !dbg !34
%188 = getelementptr float, ptr addrspace(1) %2, i64 %186, !dbg !35
%189 = getelementptr float, ptr addrspace(1) %188, i64 %187, !dbg !35
; --- Atomic accumulate -----------------------------------------------------
; %192 = warp * 2 + lane / 16 is zero only for lanes 0..15 of a warp whose
; 3-bit index is 0, so exactly one thread per (tid & 15) value in such a warp
; issues the atomic. The add is an f32 atom.global with gpu scope and acq_rel
; ordering; the returned old value (%194) is unused.
%190 = lshr i32 %7, 4, !dbg !36
%191 = shl nuw nsw i32 %12, 1, !dbg !36
%192 = or i32 %191, %190, !dbg !36
%193 = icmp eq i32 %192, 0, !dbg !36
%194 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %189, <1 x float> %177, i1 %193) #3, !dbg !36
ret void, !dbg !37
}
; NVVM intrinsics called by the kernel: thread-index read, butterfly warp
; shuffle, and the block-wide barrier.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2
; Attribute groups. #0-#2 belong to the declarations above; #3 is attached to
; every inline-asm call site in the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { nounwind }
; Named metadata: module flags, the debug compile unit, and the NVVM
; annotations. Note that !nvvm.annotations lists both !3 and !4 twice.
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
!nvvm.annotations = !{!3, !4, !4, !3}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
; !3 marks the function as a kernel; !4 sets maxntidx to 256.
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; Debug locations. Those scoped directly to !5 refer to the generated .py
; file (!2); !10 and !24 refer to standard.py (!12) and are inlined at line 35
; of the scope chain shown by their inlinedAt nodes.
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13)
!11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0)
!12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!13 = !DILocation(line: 35, column: 25, scope: !11)
!14 = !DILocation(line: 21, column: 28, scope: !5)
!15 = !DILocation(line: 21, column: 33, scope: !5)
!16 = !DILocation(line: 22, column: 23, scope: !5)
!17 = !DILocation(line: 29, column: 25, scope: !5)
!18 = !DILocation(line: 31, column: 47, scope: !5)
!19 = !DILocation(line: 31, column: 40, scope: !5)
!20 = !DILocation(line: 31, column: 34, scope: !5)
!21 = !DILocation(line: 31, column: 53, scope: !5)
!22 = !DILocation(line: 33, column: 23, scope: !5)
!23 = !DILocation(line: 34, column: 38, scope: !5)
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
!25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0)
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
!27 = !DILocation(line: 35, column: 25, scope: !25)
!28 = !DILocation(line: 35, column: 28, scope: !5)
!29 = !DILocation(line: 36, column: 20, scope: !5)
!30 = !DILocation(line: 38, column: 30, scope: !5)
!31 = !DILocation(line: 38, column: 35, scope: !5)
!32 = !DILocation(line: 41, column: 32, scope: !5)
!33 = !DILocation(line: 45, column: 40, scope: !5)
!34 = !DILocation(line: 45, column: 36, scope: !5)
!35 = !DILocation(line: 45, column: 30, scope: !5)
!36 = !DILocation(line: 45, column: 55, scope: !5)
!37 = !DILocation(line: 45, column: 4, scope: !5)