; 0-hero's picture
; Add files using upload-large-folder tool
; 4bf7f8a verified
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Kernel @triton__0d1d2d3d4d5d6d7de8 (compiler-generated LLVM IR with inline PTX).
;
; Work decomposition, as computed below:
;   row  = (ctaid.x << 6) | ((tid.x >> 2) & 63)   ; 64 rows per block
;   lane = tid.x & 3                              ; 4 threads cooperate on one row
;   Each row spans 50257 elements; a thread visits columns lane, lane+4, ...
;
; Computation per row:
;   scale = (%1[row] == -1) ? 0.0 : (*%2 / *%3)
;   total = sum over col < 50257 of scale * %0[row*50257 + col]
;   %6[row*50257 + col] = bf16( f32(%4[idx]) + (scale * %0[idx] - total * ex2(f32(%5[idx]) * log2e)) )
;
; Parameters (roles inferred only from how the body uses them):
;   %0  - read as 32-bit floats, indexed by row*50257 + col
;   %1  - read as i64, indexed by row; compared against -1
;   %2  - one 32-bit value loaded and reinterpreted as float (numerator)
;   %3  - one 32-bit value loaded and reinterpreted as float (denominator)
;   %4  - read as 16-bit values converted with cvt.f32.bf16
;   %5  - read as 16-bit values converted with cvt.f32.bf16
;   %6  - written as 16-bit values produced by cvt.rn.bf16.f32
;   %7, %8 - not referenced anywhere in this body
;
; NOTE(review): the per-row loads in the entry block use a constant-true
; predicate, so there is no row bounds check here; presumably the launch grid
; is sized so every row is valid - confirm against the launcher.
; NOTE(review): the arithmetic has the shape of a fused loss/log-softmax
; backward pass - confirm against the originating Python source.
define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
; --- Entry block (implicit label %9): index setup and per-row scalars ---
; %12 = row within the block (0..63), %13 = lane within the row group (0..3).
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
%11 = lshr i32 %10, 2, !dbg !8
%12 = and i32 %11, 63, !dbg !8
%13 = and i32 %10, 3, !dbg !9
; Block index is read directly from the %ctaid.x special register via inline PTX.
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !10
%15 = sext i32 %14 to i64, !dbg !11
; %16 = ctaid.x * 64; the low 6 bits are zero, so the `or` below acts as an add.
%16 = shl nsw i64 %15, 6, !dbg !12
%17 = zext nneg i32 %12 to i64
; %18 = global row index.
%18 = or i64 %16, %17, !dbg !13
%19 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14
; Load the i64 at %1[row]; the predicate operand is constant true (unmasked).
%20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #2, !dbg !15
; Load the two scalar operands from %2 and %3 as raw 32-bit words, then
; reinterpret them as floats.
%21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #2, !dbg !16
%22 = bitcast i32 %21 to float, !dbg !16
%23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #2, !dbg !17
%24 = bitcast i32 %23 to float, !dbg !17
; %25 = element offset of the start of this row (row stride is 50257).
%25 = mul nsw i64 %18, 50257, !dbg !18
; %.not is true when the loaded i64 equals -1.
%.not = icmp eq i64 %20, -1, !dbg !19
; %26 = %22 / %24 using PTX div.full.f32.
%26 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %22, float %24) #2, !dbg !20
; %27 = per-row scale: forced to 0.0 when %1[row] == -1, otherwise the quotient.
%27 = select i1 %.not, float 0.000000e+00, float %26, !dbg !21
; %28 = pointer to the first float of this row in %0.
%28 = getelementptr float, ptr addrspace(1) %0, i64 %25
br label %29, !dbg !22
; --- Loop 1: per-thread partial sum of scale * %0[row, col] ---
; %30 = running sum, %31 = column base (0, 4, 8, ...).
29: ; preds = %9, %29
%30 = phi float [ 0.000000e+00, %9 ], [ %40, %29 ]
%31 = phi i32 [ 0, %9 ], [ %41, %29 ]
; %31 is a multiple of 4 and %13 < 4, so `or` yields column = base + lane.
%32 = or i32 %31, %13, !dbg !23
%33 = zext nneg i32 %32 to i64, !dbg !23
; %34 = column-in-range mask.
%34 = icmp ult i32 %32, 50257, !dbg !24
%35 = getelementptr float, ptr addrspace(1) %28, i64 %33, !dbg !25
; Predicated load: reads %0[row, col] when %34 holds, otherwise yields 0.
%36 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %35, i1 %34, i32 0, i1 %34) #2, !dbg !26
%37 = bitcast i32 %36 to float, !dbg !26
%38 = fmul float %27, %37, !dbg !27
; Out-of-range columns contribute -0.0, which leaves the running sum unchanged.
%39 = select i1 %34, float %38, float -0.000000e+00, !dbg !28
%40 = fadd float %30, %39, !dbg !28
%41 = add nuw nsw i32 %31, 4, !dbg !22
; The exit test uses the pre-increment base, so the final iteration runs with
; %31 = 50256 (only lane 0 is in range there).
%42 = icmp ult i32 %31, 50253, !dbg !22
br i1 %42, label %29, label %43, !dbg !22
; --- Cross-lane reduction of the partial sums ---
; Two butterfly shuffles (XOR offsets 2 then 1, member mask -1) add the values
; held by lanes whose IDs differ only in the low two bits; afterwards %51 holds
; the combined sum in each of those lanes. Assuming a 1-D thread block, these
; are the 4 threads that share a row - TODO confirm launch configuration.
43: ; preds = %29
%44 = bitcast float %40 to i32, !dbg !29
%45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 2, i32 31), !dbg !29
%46 = bitcast i32 %45 to float, !dbg !29
%47 = fadd float %40, %46, !dbg !33
%48 = bitcast float %47 to i32, !dbg !29
%49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 1, i32 31), !dbg !29
%50 = bitcast i32 %49 to float, !dbg !29
%51 = fadd float %47, %50, !dbg !33
br label %52, !dbg !37
; --- Loop 2: elementwise output using the row sum %51 ---
; Same column iteration scheme as loop 1 (%53 = column base).
52: ; preds = %43, %52
%53 = phi i32 [ 0, %43 ], [ %75, %52 ]
%54 = or i32 %53, %13, !dbg !38
%55 = zext nneg i32 %54 to i64, !dbg !38
; %56 = column-in-range mask; it predicates every load and the store below.
%56 = icmp ult i32 %54, 50257, !dbg !39
; %57 = flat element index row*50257 + col, shared by %0, %4, %5 and %6.
%57 = add nsw i64 %25, %55, !dbg !40
%58 = getelementptr i16, ptr addrspace(1) %4, i64 %57, !dbg !41
; Predicated 16-bit load from %4 (0 when masked), then bf16 -> f32.
%59 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %58, i1 %56, i16 0, i1 %56) #2, !dbg !42
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #2, !dbg !43
%61 = getelementptr float, ptr addrspace(1) %0, i64 %57, !dbg !44
; Predicated 32-bit load from %0 (0 when masked); the same elements were
; already read in loop 1.
%62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %61, i1 %56, i32 0, i1 %56) #2, !dbg !45
%63 = bitcast i32 %62 to float, !dbg !45
%64 = getelementptr i16, ptr addrspace(1) %5, i64 %57, !dbg !46
; Predicated 16-bit load from %5 (0 when masked), then bf16 -> f32.
%65 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %64, i1 %56, i16 0, i1 %56) #2, !dbg !47
%66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %65) #2, !dbg !48
; %67 = scale * %0[idx].
%67 = fmul float %27, %63, !dbg !49
; Multiply by 0x3FF7154760000000 (about 1.442695, i.e. log2(e) at float
; precision) and apply ex2.approx: an approximate exp(%66).
%68 = fmul float %66, 0x3FF7154760000000, !dbg !50
%69 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %68) #2, !dbg !50
; %72 = f32(%4[idx]) + (scale * %0[idx] - row_sum * exp(f32(%5[idx]))).
%70 = fmul float %51, %69, !dbg !51
%71 = fsub float %67, %70, !dbg !52
%72 = fadd float %60, %71, !dbg !53
%73 = getelementptr i16, ptr addrspace(1) %6, i64 %57, !dbg !54
; Round-to-nearest conversion to bf16, then a store predicated on %56.
%74 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %72) #2, !dbg !55
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %74, ptr addrspace(1) %73, i1 %56) #2, !dbg !55
%75 = add nuw nsw i32 %53, 4, !dbg !37
; Same exit condition as loop 1: the last iteration runs with %53 = 50256.
%76 = icmp ult i32 %53, 50253, !dbg !37
br i1 %76, label %52, label %77, !dbg !37
77: ; preds = %52
ret void, !dbg !56
}
; NVVM intrinsic used by the kernel to read the tid.x special register.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; NVVM butterfly shuffle intrinsic; the kernel calls it twice to combine
; per-thread partial sums. It is marked convergent.
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; Attribute groups: #0 and #1 belong to the two declarations above; #2 is the
; group attached to every inline-asm call site in the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { nounwind }
; Module-level named metadata: module flags, the debug compile unit, and the
; NVVM annotations for the kernel.
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; NOTE(review): !3 and !4 each appear twice in this list; the duplicates carry
; no additional information.
!nvvm.annotations = !{!3, !4, !4, !3}
!0 = !{i32 2, !"Debug Info Version", i32 3}
; Compile unit: producer "triton", full debug emission, source file !2.
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
; !3 tags the function as a "kernel"; !4 sets "maxntidx" to 256, which matches
; the 64-row x 4-lane thread indexing inside the kernel.
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
; Debug subprogram for the kernel, defined at line 18 of file !2.
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; Debug locations referenced by the !dbg attachments in the kernel. Entries
; scoped to !5 give line/column positions in the kernel's source file (!2).
; Entries !29-!36 describe code inlined from standard.py (!31), with inlinedAt
; chains that lead back to line 46, column 27.
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 21, column: 28, scope: !5)
!11 = !DILocation(line: 21, column: 34, scope: !5)
!12 = !DILocation(line: 21, column: 46, scope: !5)
!13 = !DILocation(line: 22, column: 23, scope: !5)
!14 = !DILocation(line: 26, column: 30, scope: !5)
!15 = !DILocation(line: 26, column: 35, scope: !5)
!16 = !DILocation(line: 27, column: 19, scope: !5)
!17 = !DILocation(line: 29, column: 19, scope: !5)
!18 = !DILocation(line: 36, column: 46, scope: !5)
!19 = !DILocation(line: 38, column: 23, scope: !5)
!20 = !DILocation(line: 39, column: 22, scope: !5)
!21 = !DILocation(line: 41, column: 37, scope: !5)
!22 = !DILocation(line: 32, column: 36, scope: !5)
!23 = !DILocation(line: 33, column: 27, scope: !5)
!24 = !DILocation(line: 34, column: 25, scope: !5)
!25 = !DILocation(line: 36, column: 34, scope: !5)
!26 = !DILocation(line: 36, column: 52, scope: !5)
!27 = !DILocation(line: 42, column: 23, scope: !5)
!28 = !DILocation(line: 45, column: 40, scope: !5)
!29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
!30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!32 = !DILocation(line: 46, column: 27, scope: !30)
!33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35)
!34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
!35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36)
!36 = !DILocation(line: 46, column: 27, scope: !34)
!37 = !DILocation(line: 51, column: 36, scope: !5)
!38 = !DILocation(line: 52, column: 27, scope: !5)
!39 = !DILocation(line: 53, column: 25, scope: !5)
!40 = !DILocation(line: 55, column: 41, scope: !5)
!41 = !DILocation(line: 55, column: 35, scope: !5)
!42 = !DILocation(line: 55, column: 53, scope: !5)
!43 = !DILocation(line: 55, column: 105, scope: !5)
!44 = !DILocation(line: 56, column: 35, scope: !5)
!45 = !DILocation(line: 56, column: 53, scope: !5)
!46 = !DILocation(line: 57, column: 35, scope: !5)
!47 = !DILocation(line: 57, column: 53, scope: !5)
!48 = !DILocation(line: 57, column: 105, scope: !5)
!49 = !DILocation(line: 63, column: 24, scope: !5)
!50 = !DILocation(line: 65, column: 23, scope: !5)
!51 = !DILocation(line: 66, column: 24, scope: !5)
!52 = !DILocation(line: 67, column: 24, scope: !5)
!53 = !DILocation(line: 69, column: 24, scope: !5)
!54 = !DILocation(line: 70, column: 29, scope: !5)
!55 = !DILocation(line: 70, column: 54, scope: !5)
!56 = !DILocation(line: 51, column: 4, scope: !5)