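; NOTE: the next three lines are residue from the hosting page (not part of the
; module); they are kept here as comments so the file parses as LLVM IR.
; 0-hero's picture
; Add files using upload-large-folder tool
; 00602c7 verified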
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Externally defined, zero-length byte array in address space 3. The kernel in
; this module addresses it with st.shared / ld.shared and plain loads/stores,
; using it as scratch space for exchanging partial sums between threads. Its
; actual size is not visible in this module.
@global_smem = external addrspace(3) global [0 x i8]
; Kernel @triton__0d1d2d3de4e: masked block-wide sum of four float values per
; thread, followed by a predicated atomic add of the four totals.
;
; Parameters:
;   %0      addrspace(1) pointer, read with ld.global as 32-bit float data.
;   %1      addrspace(1) pointer, read with ld.global as i64 values.
;   %2      addrspace(1) pointer, float destination of atom.global add.f32.
;   %3, %4  i32 arguments that are not referenced anywhere in this body.
;
; Outline:
;   1. Every thread issues one predicated 4 x 32-bit vector load from %0 at
;      float offset (ctaid.x * 4) + ((tid.x & 127) << 17). Threads with
;      (tid.x & 127) >= 120 receive zeros instead.
;   2. Each of the four values is summed over the 32 lanes with butterfly
;      shuffles, then over the four warp indices through @global_smem.
;   3. Threads with warp index 0 and lane < 4 atomically add total number
;      (tid.x & 3) into %2. The destination offset is built from an i64 that is
;      loaded from %1 and from ctaid.x * 4 + (tid.x & 3) split by 256.
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
; --- Thread / block indices ------------------------------------------------
; %6 = tid.x, %7 = lane (tid.x & 31), %9 = tid.x & 3,
; %10 = warp index ((tid.x >> 5) & 3), %urem = tid.x & 127.
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
%7 = and i32 %6, 31, !dbg !8
%8 = lshr i32 %6, 5, !dbg !8
%9 = and i32 %6, 3, !dbg !8
%10 = and i32 %8, 3, !dbg !9
%urem = and i32 %6, 127, !dbg !9
; %11 = ctaid.x, %12 = ctaid.x * 4, %13 = ctaid.x * 4 + (tid.x & 3)
; (the low two bits of %12 are zero, so the `or` acts as an add).
%11 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
%12 = shl i32 %11, 2, !dbg !11
%13 = or i32 %12, %9, !dbg !12
; %14 is the load mask: true for (tid.x & 127) < 120.
%14 = icmp ult i32 %urem, 120, !dbg !13
; Source address: %0 + (ctaid.x * 4 + ((tid.x & 127) << 17)) floats.
%15 = shl nuw nsw i32 %urem, 17, !dbg !14
%16 = add i32 %12, %15, !dbg !15
%17 = sext i32 %16 to i64, !dbg !16
%18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !16
; --- Masked vector load ------------------------------------------------------
; The four outputs are first set to 0; the v4.b32 load (evict_first hint) only
; executes when %14 is true, and the trailing predicated moves write 0 again
; when %14 is false.
%19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14) #3, !dbg !17
%20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !17
%21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !17
%22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !17
%23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !17
; Reinterpret the four loaded 32-bit words as floats.
%24 = bitcast i32 %20 to float, !dbg !17
%25 = bitcast i32 %21 to float, !dbg !17
%26 = bitcast i32 %22 to float, !dbg !17
%27 = bitcast i32 %23 to float, !dbg !17
; Each value is added to 0.0, then replaced by 0.0 where the mask is false.
%28 = fadd float %24, 0.000000e+00, !dbg !18
%29 = fadd float %25, 0.000000e+00, !dbg !18
%30 = fadd float %26, 0.000000e+00, !dbg !18
%31 = fadd float %27, 0.000000e+00, !dbg !18
%32 = select i1 %14, float %28, float 0.000000e+00, !dbg !19
%33 = select i1 %14, float %29, float 0.000000e+00, !dbg !19
%34 = select i1 %14, float %30, float 0.000000e+00, !dbg !19
%35 = select i1 %14, float %31, float 0.000000e+00, !dbg !19
; --- Stage 1: sum across the 32 lanes ---------------------------------------
; For each of the four values, five butterfly shuffles with xor offsets
; 16, 8, 4, 2, 1 (member mask -1) are each followed by an fadd, so every lane
; ends up holding the sum over all 32 lanes.
; Value 0: %32 -> %55.
%36 = bitcast float %32 to i32, !dbg !20
%37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 16, i32 31), !dbg !20
%38 = bitcast i32 %37 to float, !dbg !20
%39 = fadd float %32, %38, !dbg !24
%40 = bitcast float %39 to i32, !dbg !20
%41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 8, i32 31), !dbg !20
%42 = bitcast i32 %41 to float, !dbg !20
%43 = fadd float %39, %42, !dbg !24
%44 = bitcast float %43 to i32, !dbg !20
%45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !20
%46 = bitcast i32 %45 to float, !dbg !20
%47 = fadd float %43, %46, !dbg !24
%48 = bitcast float %47 to i32, !dbg !20
%49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 2, i32 31), !dbg !20
%50 = bitcast i32 %49 to float, !dbg !20
%51 = fadd float %47, %50, !dbg !24
%52 = bitcast float %51 to i32, !dbg !20
%53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 1, i32 31), !dbg !20
%54 = bitcast i32 %53 to float, !dbg !20
%55 = fadd float %51, %54, !dbg !24
; Value 1: %33 -> %75.
%56 = bitcast float %33 to i32, !dbg !20
%57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20
%58 = bitcast i32 %57 to float, !dbg !20
%59 = fadd float %33, %58, !dbg !24
%60 = bitcast float %59 to i32, !dbg !20
%61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !20
%62 = bitcast i32 %61 to float, !dbg !20
%63 = fadd float %59, %62, !dbg !24
%64 = bitcast float %63 to i32, !dbg !20
%65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 4, i32 31), !dbg !20
%66 = bitcast i32 %65 to float, !dbg !20
%67 = fadd float %63, %66, !dbg !24
%68 = bitcast float %67 to i32, !dbg !20
%69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %68, i32 2, i32 31), !dbg !20
%70 = bitcast i32 %69 to float, !dbg !20
%71 = fadd float %67, %70, !dbg !24
%72 = bitcast float %71 to i32, !dbg !20
%73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !20
%74 = bitcast i32 %73 to float, !dbg !20
%75 = fadd float %71, %74, !dbg !24
; Value 2: %34 -> %95.
%76 = bitcast float %34 to i32, !dbg !20
%77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 16, i32 31), !dbg !20
%78 = bitcast i32 %77 to float, !dbg !20
%79 = fadd float %34, %78, !dbg !24
%80 = bitcast float %79 to i32, !dbg !20
%81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 8, i32 31), !dbg !20
%82 = bitcast i32 %81 to float, !dbg !20
%83 = fadd float %79, %82, !dbg !24
%84 = bitcast float %83 to i32, !dbg !20
%85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 4, i32 31), !dbg !20
%86 = bitcast i32 %85 to float, !dbg !20
%87 = fadd float %83, %86, !dbg !24
%88 = bitcast float %87 to i32, !dbg !20
%89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !20
%90 = bitcast i32 %89 to float, !dbg !20
%91 = fadd float %87, %90, !dbg !24
%92 = bitcast float %91 to i32, !dbg !20
%93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !20
%94 = bitcast i32 %93 to float, !dbg !20
%95 = fadd float %91, %94, !dbg !24
; Value 3: %35 -> %115.
%96 = bitcast float %35 to i32, !dbg !20
%97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !20
%98 = bitcast i32 %97 to float, !dbg !20
%99 = fadd float %35, %98, !dbg !24
%100 = bitcast float %99 to i32, !dbg !20
%101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !20
%102 = bitcast i32 %101 to float, !dbg !20
%103 = fadd float %99, %102, !dbg !24
%104 = bitcast float %103 to i32, !dbg !20
%105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !20
%106 = bitcast i32 %105 to float, !dbg !20
%107 = fadd float %103, %106, !dbg !24
%108 = bitcast float %107 to i32, !dbg !20
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !20
%110 = bitcast i32 %109 to float, !dbg !20
%111 = fadd float %107, %110, !dbg !24
%112 = bitcast float %111 to i32, !dbg !20
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !20
%114 = bitcast i32 %113 to float, !dbg !20
%115 = fadd float %111, %114, !dbg !24
; --- Publish per-warp partial sums -------------------------------------------
; Only lane 0 (%116) stores. Value k of warp index w goes to float slot
; (w | 4*k) of @global_smem; since w <= 3 that is slot 4*k + w, i.e. 0..15.
%116 = icmp eq i32 %7, 0, !dbg !20
%117 = zext nneg i32 %10 to i64, !dbg !20
%118 = getelementptr float, ptr addrspace(3) @global_smem, i64 %117, !dbg !20
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %118, float %55, i1 %116) #3, !dbg !20
%119 = or i32 %10, 4, !dbg !20
%120 = zext nneg i32 %119 to i64, !dbg !20
%121 = getelementptr float, ptr addrspace(3) @global_smem, i64 %120, !dbg !20
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %121, float %75, i1 %116) #3, !dbg !20
%122 = or i32 %10, 8, !dbg !20
%123 = zext nneg i32 %122 to i64, !dbg !20
%124 = getelementptr float, ptr addrspace(3) @global_smem, i64 %123, !dbg !20
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %124, float %95, i1 %116) #3, !dbg !20
%125 = or i32 %10, 12, !dbg !20
%126 = zext nneg i32 %125 to i64, !dbg !20
%127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !20
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %115, i1 %116) #3, !dbg !20
; Barrier between the partial-sum stores above and the loads below.
tail call void @llvm.nvvm.barrier0(), !dbg !20
; --- Stage 2: sum the partials over the four warp indices --------------------
; Threads with tid.x < 16 load slot tid.x. The load is predicated and the asm
; does not initialise $0 otherwise, so %131 is only meaningful when %128 holds.
%128 = icmp slt i32 %6, 16, !dbg !20
%129 = sext i32 %6 to i64, !dbg !20
%130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !20
%131 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %130, i1 %128) #3, !dbg !20
; xor offsets 2 and 1 combine aligned groups of four lanes, i.e. the four
; slots 4*k .. 4*k+3 that belong to value k. Lanes below 16 only exchange data
; with other lanes below 16.
%132 = bitcast float %131 to i32, !dbg !20
%133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !20
%134 = bitcast i32 %133 to float, !dbg !20
%135 = fadd float %131, %134, !dbg !24
%136 = bitcast float %135 to i32, !dbg !20
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !20
%138 = bitcast i32 %137 to float, !dbg !20
%139 = fadd float %135, %138, !dbg !24
; Threads with tid.x < 16 and (tid.x & 3) == 0 write the group total back to
; their own slot, so the totals land in slots 0, 4, 8 and 12.
%140 = icmp eq i32 %9, 0, !dbg !20
%141 = and i1 %128, %140, !dbg !20
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %139, i1 %141) #3, !dbg !20
tail call void @llvm.nvvm.barrier0(), !dbg !20
; Every thread reads the four totals (byte offsets 0/16/32/48 = float slots
; 0/4/8/12).
%142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !20
%143 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), align 4, !dbg !20
%144 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !20
%145 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 48), align 4, !dbg !20
; --- Repack the totals into consecutive slots --------------------------------
; The barrier separates the loads above from the stores below, which overwrite
; float slots 0..3 (byte offsets 0/4/8/12). Every thread performs these
; unpredicated stores with the values it just loaded.
tail call void @llvm.nvvm.barrier0(), !dbg !28
%146 = insertelement <1 x float> undef, float %142, i64 0, !dbg !28
store <1 x float> %146, ptr addrspace(3) @global_smem, align 4, !dbg !28
%147 = insertelement <1 x float> undef, float %143, i64 0, !dbg !28
store <1 x float> %147, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4), align 4, !dbg !28
%148 = insertelement <1 x float> undef, float %144, i64 0, !dbg !28
store <1 x float> %148, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !28
%149 = insertelement <1 x float> undef, float %145, i64 0, !dbg !28
store <1 x float> %149, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 12), align 4, !dbg !28
tail call void @llvm.nvvm.barrier0(), !dbg !28
; Each thread picks up total number (tid.x & 3) from slot (tid.x & 3).
%150 = zext nneg i32 %9 to i64, !dbg !28
%151 = getelementptr float, ptr addrspace(3) @global_smem, i64 %150, !dbg !28
%152 = load <1 x float>, ptr addrspace(3) %151, align 4, !dbg !28
; --- Destination address ------------------------------------------------------
; Split %13 = ctaid.x * 4 + (tid.x & 3) by 256 with signed division:
; %153 = quotient, %.decomposed = %13 - quotient * 256 (the remainder).
%.frozen = freeze i32 %13
%153 = sdiv i32 %.frozen, 256, !dbg !29
%154 = mul i32 %153, 256
%.decomposed = sub i32 %.frozen, %154
; Load the i64 at %1[quotient] (evict_last hint, predicate is constant true).
%155 = sext i32 %153 to i64, !dbg !30
%156 = getelementptr i64, ptr addrspace(1) %1, i64 %155, !dbg !30
%157 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %156, i1 true) #3, !dbg !31
; (%157 >> 54) & 512 moves the sign bit (bit 63) to bit 9, so %160 is the
; loaded value plus 512 when it is negative and the value unchanged otherwise.
%158 = lshr i64 %157, 54, !dbg !32
%159 = and i64 %158, 512, !dbg !32
%160 = add i64 %159, %157, !dbg !32
; Destination = %2 + (%160 * 256 + remainder) floats.
%161 = shl i64 %160, 8, !dbg !33
%162 = sext i32 %.decomposed to i64, !dbg !34
%163 = getelementptr float, ptr addrspace(1) %2, i64 %161, !dbg !35
%164 = getelementptr float, ptr addrspace(1) %163, i64 %162, !dbg !35
; --- Predicated atomic add ----------------------------------------------------
; %167 = (warp index << 3) | (lane >> 2); it is zero only when the warp index
; is 0 and the lane is below 4, so only those threads perform the atomic.
%165 = lshr i32 %7, 2, !dbg !36
%166 = shl nuw nsw i32 %10, 3, !dbg !36
%167 = or i32 %166, %165, !dbg !36
%168 = icmp eq i32 %167, 0, !dbg !36
; gpu-scope, acq_rel float add of %152 at %164. The value returned by the
; atomic (%169) is not used afterwards.
%169 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %164, <1 x float> %152, i1 %168) #3, !dbg !36
ret void, !dbg !37
}
; Intrinsics called by @triton__0d1d2d3de4e, followed by the attribute groups
; they and the inline-asm call sites refer to.

; Reads the PTX special register tid.x; declared memory(none) and speculatable.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Butterfly shuffle on i32; declared convergent. The kernel always passes
; -1 as the first operand and 31 as the last.
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; Barrier intrinsic; declared convergent.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2
; #0..#2 belong to the declarations above; #3 is attached to every inline-asm
; call in the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { nounwind }
; --- Module-level named metadata ---------------------------------------------
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; The annotation list names !3 and !4 twice each.
!nvvm.annotations = !{!3, !4, !4, !3}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
; !3 marks the function as a kernel; !4 sets its "maxntidx" annotation to 128.
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128}
; Debug subprogram for the kernel (attached to the define via !dbg !5).
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; --- Debug locations -----------------------------------------------------------
; !8..!19 and !28..!37 are locations directly in the kernel scope !5.
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 21, column: 28, scope: !5)
!11 = !DILocation(line: 21, column: 33, scope: !5)
!12 = !DILocation(line: 22, column: 23, scope: !5)
!13 = !DILocation(line: 29, column: 25, scope: !5)
!14 = !DILocation(line: 31, column: 47, scope: !5)
!15 = !DILocation(line: 31, column: 40, scope: !5)
!16 = !DILocation(line: 31, column: 34, scope: !5)
!17 = !DILocation(line: 31, column: 53, scope: !5)
!18 = !DILocation(line: 33, column: 23, scope: !5)
!19 = !DILocation(line: 34, column: 38, scope: !5)
; !20 and !24 are locations in file !22 (standard.py) whose inlinedAt chains
; lead back to line 35, column 25; they are attached to the shuffle, shared
; memory and fadd instructions of the reduction.
!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!23 = !DILocation(line: 35, column: 25, scope: !21)
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
!27 = !DILocation(line: 35, column: 25, scope: !25)
!28 = !DILocation(line: 35, column: 28, scope: !5)
!29 = !DILocation(line: 36, column: 20, scope: !5)
!30 = !DILocation(line: 38, column: 30, scope: !5)
!31 = !DILocation(line: 38, column: 35, scope: !5)
!32 = !DILocation(line: 41, column: 32, scope: !5)
!33 = !DILocation(line: 45, column: 40, scope: !5)
!34 = !DILocation(line: 45, column: 36, scope: !5)
!35 = !DILocation(line: 45, column: 30, scope: !5)
!36 = !DILocation(line: 45, column: 55, scope: !5)
!37 = !DILocation(line: 45, column: 4, scope: !5)