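; NOTE: the following six lines are residue from the web page this file was copied from;
; they are not LLVM IR and are kept only as comments so the module parses.
; 0-hero's picture
; Add files using upload-large-folder tool
; 8c1fe04 verified
; raw
; history blame
; 7.12 kB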
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Externally defined, zero-length byte array in address space 3. The kernel below
; addresses it as an array of floats (via st.shared / ld.shared and a plain load)
; and uses it as scratch space for its cross-thread reduction.
@global_smem = external addrspace(3) global [0 x i8]
; Triton-generated kernel: sums a strided column of floats from %0, reduces the
; per-thread partial sums across four threads through @global_smem, then atomically
; adds the total into %2 at a row selected by an i64 index loaded from %1.
;
; Parameters:
;   %0  ptr addrspace(1)  - float input that is read and summed.
;   %1  ptr addrspace(1)  - i64 index table, read at element (x / 256).
;   %2  ptr addrspace(1)  - float output, updated with an atomic add.
;   %3, %4  i32           - not referenced anywhere in the body.
; Returns: nothing; the only externally visible effect is the atomic add into %2.
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
; Split the thread id into a column part %7 = tid & 63 and a row part %9 = (tid >> 6) & 3.
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
%7 = and i32 %6, 63, !dbg !8
%8 = lshr i32 %6, 6, !dbg !9
%9 = and i32 %8, 3, !dbg !9
; x index %12 = (ctaid.x << 6) | %7, i.e. 64 consecutive x values per block.
%10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
%11 = shl i32 %10, 6, !dbg !11
%12 = or i32 %11, %7, !dbg !12
br label %13, !dbg !13
; Accumulation loop: %15 takes the values 0, 4, ..., 116 (the exit test below
; compares the value from before the increment), and %14 carries the running sum.
13: ; preds = %5, %13
%14 = phi float [ 0.000000e+00, %5 ], [ %23, %13 ]
%15 = phi i32 [ 0, %5 ], [ %24, %13 ]
; %15 is a multiple of 4 and %9 is in 0..3, so this `or` acts as an addition.
%16 = or i32 %15, %9, !dbg !14
; Element offset = (row << 17) + x, i.e. consecutive rows are 131072 floats apart.
%17 = shl i32 %16, 17, !dbg !15
%18 = add i32 %17, %12, !dbg !16
%19 = sext i32 %18 to i64, !dbg !17
%20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !17
; 32-bit global load with an evict_first hint; both predicates are the constant
; true, so the load always executes and the fallback value 0 is never selected.
%21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true) #3, !dbg !18
%22 = bitcast i32 %21 to float, !dbg !18
%23 = fadd float %14, %22, !dbg !19
%24 = add nuw nsw i32 %15, 4, !dbg !13
%25 = icmp ult i32 %15, 116, !dbg !13
br i1 %25, label %13, label %26, !dbg !13
26: ; preds = %13
; Store this thread's partial sum at float slot 4 * %7 + %9, which places the four
; partial sums that share the same %7 next to each other.
%27 = shl nuw nsw i32 %7, 2, !dbg !20
%28 = or i32 %27, %9, !dbg !20
%29 = zext nneg i32 %28 to i64, !dbg !20
%30 = getelementptr float, ptr addrspace(3) @global_smem, i64 %29, !dbg !20
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %30, float %23, i1 true) #3, !dbg !20
tail call void @llvm.nvvm.barrier0(), !dbg !20
; Each thread with tid < 256 reloads the slot at index tid.
; NOTE(review): the maxntidx = 256 annotation suggests this predicate is always
; true - confirm against the launch configuration.
%31 = icmp slt i32 %6, 256, !dbg !20
%32 = sext i32 %6 to i64, !dbg !20
%33 = getelementptr float, ptr addrspace(3) @global_smem, i64 %32, !dbg !20
%34 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %33, i1 %31) #3, !dbg !20
; Butterfly shuffles with lane masks 2 and then 1, each followed by an add: every
; lane ends up holding the sum over its aligned group of four lanes.
%35 = bitcast float %34 to i32, !dbg !20
%36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !20
%37 = bitcast i32 %36 to float, !dbg !20
%38 = fadd float %34, %37, !dbg !24
%39 = bitcast float %38 to i32, !dbg !20
%40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !20
%41 = bitcast i32 %40 to float, !dbg !20
%42 = fadd float %38, %41, !dbg !24
; Only the first lane of each group of four (tid & 3 == 0) writes the total back,
; into its own slot.
%43 = and i32 %6, 3, !dbg !20
%44 = icmp eq i32 %43, 0, !dbg !20
%45 = and i1 %31, %44, !dbg !20
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %33, float %42, i1 %45) #3, !dbg !20
tail call void @llvm.nvvm.barrier0(), !dbg !20
; Read the total for this thread's column from slot 4 * %7.
%46 = zext nneg i32 %27 to i64, !dbg !20
%47 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46, !dbg !20
%48 = load float, ptr addrspace(3) %47, align 4, !dbg !20
; Split x into quotient %49 = x / 256 and remainder %.decomposed = x - 256 * %49.
%.frozen = freeze i32 %12
%49 = sdiv i32 %.frozen, 256, !dbg !28
%50 = mul i32 %49, 256
%.decomposed = sub i32 %.frozen, %50
%51 = sext i32 %49 to i64, !dbg !29
%52 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !29
; 64-bit global load of the index with an evict_last hint; the predicate is constant true.
%53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %52, i1 true) #3, !dbg !30
; Shifting right by 54 moves the sign bit (bit 63) to bit 9, so %55 is 512 when the
; loaded index is negative and 0 otherwise; %56 is the index with 512 added if negative.
%54 = lshr i64 %53, 54, !dbg !31
%55 = and i64 %54, 512, !dbg !31
%56 = add i64 %55, %53, !dbg !31
; Destination element = %2 + (index << 8) + remainder, i.e. rows of 256 floats.
%57 = shl i64 %56, 8, !dbg !32
%58 = sext i32 %.decomposed to i64, !dbg !33
%59 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !34
%60 = getelementptr float, ptr addrspace(1) %59, i64 %58, !dbg !34
; Predicate the atomic on %9 == 0 so that only one of the four threads sharing a
; column adds the total; the value returned by the atomic (%63) is not used.
%61 = icmp eq i32 %9, 0, !dbg !35
%62 = insertelement <1 x float> undef, float %48, i64 0, !dbg !35
%63 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %60, <1 x float> %62, i1 %61) #3, !dbg !35
ret void, !dbg !36
}
; NVVM intrinsics used by the kernel: thread-id read, block-wide barrier, and
; butterfly warp shuffle.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #1
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
; Attribute groups #0-#2 belong to the declarations above; #3 is attached to the
; inline-asm call sites inside the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind }
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #3 = { nounwind }
; Named module metadata: module flags, the debug compile unit, and the NVVM
; annotations (the list names !3 and !4 twice each).
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
!nvvm.annotations = !{!3, !4, !4, !3}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
; !3 marks the function as a kernel; !4 sets maxntidx (x-dimension thread limit) to 256.
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; Source locations inside the generated Python file (!2), scoped to the kernel (!5).
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 21, column: 28, scope: !5)
!11 = !DILocation(line: 21, column: 33, scope: !5)
!12 = !DILocation(line: 22, column: 23, scope: !5)
!13 = !DILocation(line: 27, column: 36, scope: !5)
!14 = !DILocation(line: 28, column: 27, scope: !5)
!15 = !DILocation(line: 31, column: 47, scope: !5)
!16 = !DILocation(line: 31, column: 40, scope: !5)
!17 = !DILocation(line: 31, column: 34, scope: !5)
!18 = !DILocation(line: 31, column: 53, scope: !5)
!19 = !DILocation(line: 34, column: 38, scope: !5)
; Locations in triton/language/standard.py (!22), recorded as inlined at line 35,
; column 25 of the kernel source; these are attached to the reduction code.
!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!23 = !DILocation(line: 35, column: 25, scope: !21)
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
!27 = !DILocation(line: 35, column: 25, scope: !25)
; Remaining kernel-source locations: index lookup, address computation, atomic add, return.
!28 = !DILocation(line: 36, column: 20, scope: !5)
!29 = !DILocation(line: 38, column: 30, scope: !5)
!30 = !DILocation(line: 38, column: 35, scope: !5)
!31 = !DILocation(line: 41, column: 32, scope: !5)
!32 = !DILocation(line: 45, column: 40, scope: !5)
!33 = !DILocation(line: 45, column: 36, scope: !5)
!34 = !DILocation(line: 45, column: 30, scope: !5)
!35 = !DILocation(line: 45, column: 55, scope: !5)
!36 = !DILocation(line: 45, column: 4, scope: !5)