; 0-hero's picture
; Add files using upload-large-folder tool
; d742687 verified
; raw
; history blame
; 16.6 kB
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Zero-length byte array in address space 3, defined outside this module.
; The kernel below indexes it as an array of floats and accesses it with
; st.shared / ld.shared as scratch for combining per-warp partial sums.
@global_smem = external addrspace(3) global [0 x i8]
; NUL-terminated key string "__CUDA_FTZ" passed to @__nvvm_reflect to choose
; between the flush-to-zero and the default rsqrt intrinsic.
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
; Kernel entry point (listed as "kernel" with maxntidx = 64 in !nvvm.annotations).
;
; What the body computes, per element i = ctaid.x * 256 + column:
;   x   = float(%0[i]) + bf16_to_f32(%1[i])
;   out = bf16( (x - mean) * rsqrt(var + eps) * %2[column] )
; and stores out to %3[i]. mean is sum(x) / 256 and var is
; sum((x - mean)^2) / 256, where the sums combine the partial results of two
; warps. Each thread handles 4 consecutive columns.
; NOTE(review): this has the shape of a fused add + layer-norm (scale only, no
; bias) over rows of 256 elements — confirm against the generating Python file.
; NOTE(review): the "256 per CTA" reading assumes a launch with exactly 64
; threads per CTA; only two shared-memory slots are ever combined.
;
; Parameters:
;   %0      global pointer, read as float (vector load of 4 x b32)
;   %1      global pointer, read as 16-bit values and converted via cvt.f32.bf16
;   %2      global pointer, read as float, indexed by column only (no CTA offset)
;   %3      global pointer, written as packed bf16 pairs
;   %4, %5  i32 arguments that are not referenced anywhere in the body
define void @triton__0d1d2d3d4de5de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5) local_unnamed_addr !dbg !7 {
; ---- Index computation -------------------------------------------------
; %7  = tid.x
; %8  = tid & 31          (lane within a group of 32 threads)
; %10 = (tid >> 5) & 1    (which of the two 32-thread groups)
; %11 = (tid << 2) & 252  (this thread's first column: 0, 4, ..., 252)
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
%8 = and i32 %7, 31, !dbg !10
%9 = lshr i32 %7, 5, !dbg !10
%10 = and i32 %9, 1, !dbg !10
%urem = shl i32 %7, 2, !dbg !10
%11 = and i32 %urem, 252, !dbg !10
; %12 = ctaid.x, read through inline PTX; %14 = ctaid.x * 256 + column.
; The `or` acts as an add here because %13 has its low 8 bits clear and
; %11 is at most 252.
%12 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
%13 = shl i32 %12, 8, !dbg !12
%14 = or i32 %13, %11, !dbg !13
%15 = sext i32 %14 to i64, !dbg !14
; ---- Load 4 floats from %0 ---------------------------------------------
; Predicated vector load; every predicate operand is the constant `true`, so
; the load always executes and the `@!p mov` fallbacks (which would write 0)
; never take effect.
%16 = getelementptr float, ptr addrspace(1) %0, i64 %15, !dbg !14
%17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %16, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
%18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !15
%19 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !15
%20 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !15
%21 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !15
; Reinterpret the four loaded 32-bit words as floats.
%22 = bitcast i32 %18 to float, !dbg !15
%23 = bitcast i32 %19 to float, !dbg !15
%24 = bitcast i32 %20 to float, !dbg !15
%25 = bitcast i32 %21 to float, !dbg !15
; ---- Load 4 16-bit values from %1 --------------------------------------
; Two 32-bit words are loaded (always, predicates are `true`), each holding
; two 16-bit elements.
%26 = getelementptr i16, ptr addrspace(1) %1, i64 %15, !dbg !16
%27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
%28 = extractvalue { i32, i32 } %27, 0, !dbg !17
%29 = extractvalue { i32, i32 } %27, 1, !dbg !17
; Split each word into its low half (trunc) and high half (lshr 16, trunc).
%30 = trunc i32 %28 to i16, !dbg !17
%extelt.offset = lshr i32 %28, 16, !dbg !17
%31 = trunc i32 %extelt.offset to i16, !dbg !17
%32 = trunc i32 %29 to i16, !dbg !17
%extelt.offset1 = lshr i32 %29, 16, !dbg !17
%33 = trunc i32 %extelt.offset1 to i16, !dbg !17
; Widen the four bf16 bit patterns to f32.
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
%37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
; ---- Load 4 floats from %2 at the column offset ------------------------
; Indexed by %11 only, so every CTA reads the same 256 values; the load
; carries the L1::evict_last hint. The result %40 is unpacked after the
; rsqrt further down.
%38 = zext nneg i32 %11 to i64, !dbg !19
%39 = getelementptr float, ptr addrspace(1) %2, i64 %38, !dbg !19
%40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
; ---- x = bf16 value + float value (four elements per thread) -----------
%41 = fadd float %34, %22, !dbg !21
%42 = fadd float %35, %23, !dbg !21
%43 = fadd float %36, %24, !dbg !21
%44 = fadd float %37, %25, !dbg !21
; ---- Reduction 1: sum of x ---------------------------------------------
; Per-thread partial sum of the four x values.
%45 = fadd float %41, %42, !dbg !22
%46 = fadd float %45, %43, !dbg !22
%47 = fadd float %46, %44, !dbg !22
; Butterfly shuffle-and-add with lane offsets 16, 8, 4, 2, 1 (mask -1,
; clamp 31). Values travel through the i32 intrinsic via bitcasts.
%48 = bitcast float %47 to i32, !dbg !28
%49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 16, i32 31), !dbg !28
%50 = bitcast i32 %49 to float, !dbg !28
%51 = fadd float %47, %50, !dbg !22
%52 = bitcast float %51 to i32, !dbg !28
%53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 8, i32 31), !dbg !28
%54 = bitcast i32 %53 to float, !dbg !28
%55 = fadd float %51, %54, !dbg !22
%56 = bitcast float %55 to i32, !dbg !28
%57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !28
%58 = bitcast i32 %57 to float, !dbg !28
%59 = fadd float %55, %58, !dbg !22
%60 = bitcast float %59 to i32, !dbg !28
%61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !28
%62 = bitcast i32 %61 to float, !dbg !28
%63 = fadd float %59, %62, !dbg !22
%64 = bitcast float %63 to i32, !dbg !28
%65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 1, i32 31), !dbg !28
%66 = bitcast i32 %65 to float, !dbg !28
%67 = fadd float %63, %66, !dbg !22
; Lane 0 of each 32-thread group (%68) stores the group's sum into the
; float slot of @global_smem selected by the group index %10.
%68 = icmp eq i32 %8, 0, !dbg !28
%69 = zext nneg i32 %10 to i64, !dbg !28
%70 = getelementptr float, ptr addrspace(3) @global_smem, i64 %69, !dbg !28
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %67, i1 %68) #6, !dbg !28
tail call void @llvm.nvvm.barrier0(), !dbg !28
; Threads with tid < 2 (%71) read back slot [tid]. For other threads the
; predicated load does not execute, so %74 is not written by the asm; their
; result is never stored because the store below is predicated as well.
%71 = icmp slt i32 %7, 2, !dbg !28
%72 = sext i32 %7 to i64, !dbg !28
%73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !28
%74 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !28
; One more butterfly step (offset 1) adds the two slots together.
%75 = bitcast float %74 to i32, !dbg !28
%76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 1, i32 31), !dbg !28
%77 = bitcast i32 %76 to float, !dbg !28
%78 = fadd float %74, %77, !dbg !22
; %81 = (tid < 2) && (tid is even): for non-negative tid only tid 0 passes,
; so the combined total is written to slot [0].
%79 = and i32 %7, 1, !dbg !28
%80 = icmp eq i32 %79, 0, !dbg !28
%81 = and i1 %71, %80, !dbg !28
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %78, i1 %81) #6, !dbg !28
tail call void @llvm.nvvm.barrier0(), !dbg !28
; Every thread reads the total from slot [0]; mean = (total + 0.0) / 256.
%82 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !28
%83 = fadd float %82, 0.000000e+00, !dbg !30
%84 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %83, float 2.560000e+02) #6, !dbg !34
; ---- Centered values and their squares ---------------------------------
%85 = fsub float %41, %84, !dbg !35
%86 = fsub float %42, %84, !dbg !35
%87 = fsub float %43, %84, !dbg !35
%88 = fsub float %44, %84, !dbg !35
%89 = fmul float %85, %85, !dbg !36
%90 = fmul float %86, %86, !dbg !36
%91 = fmul float %87, %87, !dbg !36
%92 = fmul float %88, %88, !dbg !36
; Barrier before the shared-memory slots are reused by the second reduction.
; NOTE(review): presumably this ensures all threads have finished reading
; slot [0] above before it is overwritten — confirm.
tail call void @llvm.nvvm.barrier0(), !dbg !37
; ---- Reduction 2: sum of squared deviations ----------------------------
; Same scheme as reduction 1: per-thread sum, butterfly shuffles with
; offsets 16..1, per-group store, barrier, combine two slots, barrier.
%93 = fadd float %89, %90, !dbg !39
%94 = fadd float %91, %93, !dbg !39
%95 = fadd float %92, %94, !dbg !39
%96 = bitcast float %95 to i32, !dbg !37
%97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !37
%98 = bitcast i32 %97 to float, !dbg !37
%99 = fadd float %95, %98, !dbg !39
%100 = bitcast float %99 to i32, !dbg !37
%101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !37
%102 = bitcast i32 %101 to float, !dbg !37
%103 = fadd float %99, %102, !dbg !39
%104 = bitcast float %103 to i32, !dbg !37
%105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !37
%106 = bitcast i32 %105 to float, !dbg !37
%107 = fadd float %103, %106, !dbg !39
%108 = bitcast float %107 to i32, !dbg !37
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !37
%110 = bitcast i32 %109 to float, !dbg !37
%111 = fadd float %107, %110, !dbg !39
%112 = bitcast float %111 to i32, !dbg !37
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !37
%114 = bitcast i32 %113 to float, !dbg !37
%115 = fadd float %111, %114, !dbg !39
; Reuses the slot pointers (%70, %73) and predicates (%68, %71, %81)
; computed for reduction 1.
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %115, i1 %68) #6, !dbg !37
tail call void @llvm.nvvm.barrier0(), !dbg !37
%116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !37
%117 = bitcast float %116 to i32, !dbg !37
%118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37
%119 = bitcast i32 %118 to float, !dbg !37
%120 = fadd float %116, %119, !dbg !39
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %120, i1 %81) #6, !dbg !37
tail call void @llvm.nvvm.barrier0(), !dbg !37
; var = (total + 0.0) / 256, then add the constant 0x3EE4F8B580000000
; (about 1.0e-5) before taking the reciprocal square root.
%121 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
%122 = fadd float %121, 0.000000e+00, !dbg !42
%123 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %122, float 2.560000e+02) #6, !dbg !44
%124 = fadd float %123, 0x3EE4F8B580000000, !dbg !45
; ---- rsqrt(var + eps) --------------------------------------------------
; Same control flow as @__nv_rsqrtf defined later in this module (the merge
; label keeps its name): __nvvm_reflect("__CUDA_FTZ") != 0 selects the
; flush-to-zero approximation, 0 selects the default one.
%125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46
%.not.i = icmp eq i32 %125, 0, !dbg !46
br i1 %.not.i, label %128, label %126, !dbg !46
126: ; preds = %6
%127 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %124), !dbg !46
br label %__nv_rsqrtf.exit, !dbg !46
128: ; preds = %6
%129 = tail call float @llvm.nvvm.rsqrt.approx.f(float %124), !dbg !46
br label %__nv_rsqrtf.exit, !dbg !46
__nv_rsqrtf.exit: ; preds = %126, %128
%.0.i = phi float [ %127, %126 ], [ %129, %128 ], !dbg !46
; ---- Unpack the four floats loaded from %2 earlier (%40) ---------------
; Extracted in reverse order: %137 is element 0, %131 is element 3.
%130 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !20
%131 = bitcast i32 %130 to float, !dbg !20
%132 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !20
%133 = bitcast i32 %132 to float, !dbg !20
%134 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !20
%135 = bitcast i32 %134 to float, !dbg !20
%136 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !20
%137 = bitcast i32 %136 to float, !dbg !20
; ---- Normalize and scale -----------------------------------------------
; (x - mean) * rsqrt(var + eps), then multiply by the matching %2 element.
%138 = fmul float %85, %.0.i, !dbg !47
%139 = fmul float %86, %.0.i, !dbg !47
%140 = fmul float %87, %.0.i, !dbg !47
%141 = fmul float %88, %.0.i, !dbg !47
%142 = fmul float %138, %137, !dbg !48
%143 = fmul float %139, %135, !dbg !48
%144 = fmul float %140, %133, !dbg !48
%145 = fmul float %141, %131, !dbg !48
; ---- Convert to bf16, pack, and store to %3 ----------------------------
%146 = getelementptr i16, ptr addrspace(1) %3, i64 %15, !dbg !49
%147 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %142) #6, !dbg !50
%148 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %143) #6, !dbg !50
%149 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %144) #6, !dbg !50
%150 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %145) #6, !dbg !50
; Pack elements (0,1) and (2,3) into two 32-bit words via <2 x i16> vectors.
%151 = insertelement <2 x i16> undef, i16 %147, i64 0, !dbg !50
%152 = insertelement <2 x i16> %151, i16 %148, i64 1, !dbg !50
%153 = bitcast <2 x i16> %152 to i32, !dbg !50
%154 = insertelement <2 x i16> undef, i16 %149, i64 0, !dbg !50
%155 = insertelement <2 x i16> %154, i16 %150, i64 1, !dbg !50
%156 = bitcast <2 x i16> %155 to i32, !dbg !50
; Store both words at once; the predicate is the constant `true`.
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %153, i32 %156, ptr addrspace(1) %146, i1 true) #6, !dbg !50
ret void, !dbg !51
}
; Intrinsic for the PTX special register tid.x; the kernel reads it once at
; entry to derive lane, group and column indices.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Butterfly shuffle intrinsic. The kernel always calls it with mask -1 and a
; last argument of 31, passing the value as the second argument and the lane
; offset (16, 8, 4, 2 or 1) as the third.
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
; Barrier intrinsic; the kernel calls it around every shared-memory exchange.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: alwaysinline nounwind
; Approximate reciprocal square root of %x.
; Queries @__nvvm_reflect with the "__CUDA_FTZ" key (@.str): a non-zero answer
; selects the flush-to-zero intrinsic, zero selects the default intrinsic.
; Returns whichever intrinsic result was computed.
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
entry:
  %ftz_mode = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
  %use_ftz = icmp ne i32 %ftz_mode, 0
  br i1 %use_ftz, label %with_ftz, label %without_ftz

with_ftz:                                         ; preds = %entry
  %rsqrt_ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
  br label %done

without_ftz:                                      ; preds = %entry
  %rsqrt_default = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
  br label %done

done:                                             ; preds = %without_ftz, %with_ftz
  %result = phi float [ %rsqrt_ftz, %with_ftz ], [ %rsqrt_default, %without_ftz ]
  ret float %result
}
; Query function called with the "__CUDA_FTZ" key string; both the kernel and
; @__nv_rsqrtf branch on whether its result is zero.
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
; Reciprocal-square-root approximation, flush-to-zero variant.
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
; Reciprocal-square-root approximation, default variant.
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
; Attribute groups referenced above:
;   #0  @llvm.nvvm.read.ptx.sreg.tid.x
;   #1  @llvm.nvvm.shfl.sync.bfly.i32
;   #2  @llvm.nvvm.barrier0
;   #3  @__nv_rsqrtf (alwaysinline)
;   #4  @__nvvm_reflect
;   #5  the two rsqrt intrinsics
;   #6  call-site attributes on the inline-asm and __nvvm_reflect calls
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
attributes #6 = { nounwind }
; ---- Named module-level metadata ----------------------------------------
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
; NOTE(review): !4 and !5 are each listed twice here; the duplicates look
; redundant — confirm they are harmless for the consumer of this module.
!nvvm.annotations = !{!4, !5, !5, !4}
!llvm.ident = !{!6}
; ---- Module flags --------------------------------------------------------
!0 = !{i32 2, !"Debug Info Version", i32 3}
; NOTE(review): "nvvm-reflect-ftz" = 1 presumably makes the
; __nvvm_reflect("__CUDA_FTZ") query resolve to non-zero, i.e. selects the
; .ftz rsqrt path — confirm against the NVPTX documentation.
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
; ---- Compile unit and source file ---------------------------------------
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!3 = !DIFile(filename: "ce5cemaf763zop7tgmdl7oghweh4i2o3g632qnkrhju2cthbxnfd.py", directory: "/tmp/torchinductor_root/e5")
; ---- Kernel annotations: entry-point marker and maxntidx = 64 -----------
!4 = !{ptr @triton__0d1d2d3d4de5de, !"kernel", i32 1}
!5 = !{ptr @triton__0d1d2d3d4de5de, !"maxntidx", i32 64}
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
; ---- Debug info for the kernel ------------------------------------------
; !7 is the kernel's subprogram; the !DILocation nodes below map IR
; instructions to line/column positions in the .py file (!3), in
; standard.py (!24) and in triton_helpers.py (!32).
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4de5de", linkageName: "triton__0d1d2d3d4de5de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
!9 = !{}
!10 = !DILocation(line: 26, column: 26, scope: !7)
!11 = !DILocation(line: 23, column: 28, scope: !7)
!12 = !DILocation(line: 30, column: 40, scope: !7)
!13 = !DILocation(line: 30, column: 36, scope: !7)
!14 = !DILocation(line: 30, column: 30, scope: !7)
!15 = !DILocation(line: 30, column: 46, scope: !7)
!16 = !DILocation(line: 31, column: 30, scope: !7)
!17 = !DILocation(line: 31, column: 46, scope: !7)
!18 = !DILocation(line: 31, column: 67, scope: !7)
!19 = !DILocation(line: 32, column: 31, scope: !7)
!20 = !DILocation(line: 32, column: 36, scope: !7)
!21 = !DILocation(line: 34, column: 18, scope: !7)
; Locations inlined from standard.py, attached to the first reduction
; (inlined at line 39 of the .py file).
!22 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !26)
!23 = distinct !DILexicalBlockFile(scope: !25, file: !24, discriminator: 0)
!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!25 = distinct !DILexicalBlockFile(scope: !7, file: !24, discriminator: 0)
!26 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !27)
!27 = !DILocation(line: 39, column: 58, scope: !23)
!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
!29 = !DILocation(line: 39, column: 58, scope: !25)
; Location inlined from triton_helpers.py, attached to the `+ 0.0` add.
!30 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !33)
!31 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
!33 = !DILocation(line: 39, column: 45, scope: !31)
!34 = !DILocation(line: 42, column: 20, scope: !7)
!35 = !DILocation(line: 43, column: 19, scope: !7)
!36 = !DILocation(line: 44, column: 20, scope: !7)
; Same inlined locations for the second reduction (inlined at line 47).
!37 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !38)
!38 = !DILocation(line: 47, column: 59, scope: !25)
!39 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !40)
!40 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !41)
!41 = !DILocation(line: 47, column: 59, scope: !23)
!42 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !43)
!43 = !DILocation(line: 47, column: 45, scope: !31)
!44 = !DILocation(line: 50, column: 20, scope: !7)
!45 = !DILocation(line: 52, column: 20, scope: !7)
!46 = !DILocation(line: 53, column: 26, scope: !7)
!47 = !DILocation(line: 54, column: 20, scope: !7)
!48 = !DILocation(line: 55, column: 20, scope: !7)
!49 = !DILocation(line: 57, column: 25, scope: !7)
!50 = !DILocation(line: 57, column: 48, scope: !7)
!51 = !DILocation(line: 57, column: 4, scope: !7)