|
|
|
; Textual LLVM IR module. The inline PTX and llvm.nvvm.* intrinsic used below indicate an NVPTX target. |
; NOTE(review): every line of this dump ends in " |" (blank lines are a lone "|"), which is not LLVM IR |
; syntax; presumably a table-extraction artifact. Strip it before passing the file to opt/llc. TODO confirm. |
source_filename = "LLVMDialectModule" |
|
|
|
; Kernel @triton__0d1d2d3d4de (listed as "kernel" with maxntidx 128 in !nvvm.annotations). |
; |
;   %0, %1, %2 : addrspace(1) pointers, read as 16-bit elements that are converted with cvt.f32.bf16 |
;   %3         : addrspace(1) pointer, written as 16-bit elements produced by cvt.rn.bf16.f32 |
;   %4         : i32 argument that is not referenced anywhere in the body |
; |
; Each thread processes 8 consecutive 16-bit elements. With |
;   x = ctaid.x * 1024 + ((tid.x * 8) & 1016)      (%11) |
;   r = (x / 256) % 3                              (%13) |
;   j = 256 * (x / 768) + x % 256                  (%17) |
; it loads elements j..j+7 from each of %0, %1 and %2, keeps the %0 values only when r == 2, |
; the %1 values only when r == 1 and the %2 values only when r == 0 (otherwise 0.0), adds the |
; three terms in f32, rounds to bf16 and stores the result to elements x..x+7 of %3. |
; r and j are computed once from x and shared by all 8 lanes. All loads and the store use a |
; predicate that is the constant true, i.e. there is no bounds masking in this function. |
define void @triton__0d1d2d3d4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4) local_unnamed_addr !dbg !5 { |
|
; ---- Index computation ---- |
; %8 = (tid.x * 8) & 1016: per-thread element offset inside the block, a multiple of 8 in [0, 1016]. |
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 |
|
%7 = shl i32 %6, 3, !dbg !8 |
|
%8 = and i32 %7, 1016, !dbg !8 |
|
; %9 = block index read from the %ctaid.x special register; %10 = %9 * 1024. |
%9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 |
|
%10 = shl i32 %9, 10, !dbg !10 |
|
; %11 = x. The OR behaves as an add because %10 has its low 10 bits clear and %8 < 1024. |
%11 = or i32 %10, %8, !dbg !11 |
|
; Frozen copy of x used for the quotient/remainder decomposition below. |
%.frozen = freeze i32 %11 |
|
; %12 = x / 256, %13 = r = (x / 256) % 3 (signed division and remainder). |
%12 = sdiv i32 %.frozen, 256, !dbg !12 |
|
%13 = srem i32 %12, 3, !dbg !13 |
|
; %.decomposed = x - 256 * (x / 256), i.e. x % 256 expressed through the quotient %12. |
%14 = mul i32 %12, 256 |
|
%.decomposed = sub i32 %.frozen, %14 |
|
; %17 = j = 256 * (x / 768) + x % 256, the element index shared by the three loads. |
%15 = sdiv i32 %11, 768, !dbg !14 |
|
%16 = shl nsw i32 %15, 8, !dbg !15 |
|
%17 = add nsw i32 %16, %.decomposed, !dbg !16 |
|
; Sign-extend j to 64 bits and form the address of element j of %0 (2-byte stride). |
%18 = sext i32 %17 to i64, !dbg !17 |
|
%19 = getelementptr i16, ptr addrspace(1) %0, i64 %18, !dbg !17 |
|
; ---- Load 8 elements from %0 ---- |
; Inline PTX: zero the four result registers, then a predicated ld.global.v4.b32 (16 bytes) |
; with the L1::evict_last qualifier. Operand $5 (the predicate) is the constant true here. |
%20 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %19, i1 true) #1, !dbg !18 |
|
%21 = extractvalue { i32, i32, i32, i32 } %20, 0, !dbg !18 |
|
%22 = extractvalue { i32, i32, i32, i32 } %20, 1, !dbg !18 |
|
%23 = extractvalue { i32, i32, i32, i32 } %20, 2, !dbg !18 |
|
%24 = extractvalue { i32, i32, i32, i32 } %20, 3, !dbg !18 |
|
; Split each 32-bit word into two i16 values: low 16 bits via trunc, high 16 bits via lshr 16 + trunc. |
%25 = trunc i32 %21 to i16, !dbg !18 |
|
%extelt.offset = lshr i32 %21, 16, !dbg !18 |
|
%26 = trunc i32 %extelt.offset to i16, !dbg !18 |
|
%27 = trunc i32 %22 to i16, !dbg !18 |
|
%extelt.offset1 = lshr i32 %22, 16, !dbg !18 |
|
%28 = trunc i32 %extelt.offset1 to i16, !dbg !18 |
|
%29 = trunc i32 %23 to i16, !dbg !18 |
|
%extelt.offset2 = lshr i32 %23, 16, !dbg !18 |
|
%30 = trunc i32 %extelt.offset2 to i16, !dbg !18 |
|
%31 = trunc i32 %24 to i16, !dbg !18 |
|
%extelt.offset3 = lshr i32 %24, 16, !dbg !18 |
|
%32 = trunc i32 %extelt.offset3 to i16, !dbg !18 |
|
; Widen the 8 bf16 bit patterns to f32 (%33..%40). |
%33 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #1, !dbg !19 |
|
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #1, !dbg !19 |
|
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #1, !dbg !19 |
|
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %28) #1, !dbg !19 |
|
%37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %29) #1, !dbg !19 |
|
%38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #1, !dbg !19 |
|
%39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #1, !dbg !19 |
|
%40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #1, !dbg !19 |
|
; ---- Load 8 elements from %1 at the same element index j, then unpack and widen (%55..%62) ---- |
%41 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !20 |
|
%42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %41, i1 true) #1, !dbg !21 |
|
%43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !21 |
|
%44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !21 |
|
%45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !21 |
|
%46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !21 |
|
%47 = trunc i32 %43 to i16, !dbg !21 |
|
%extelt.offset4 = lshr i32 %43, 16, !dbg !21 |
|
%48 = trunc i32 %extelt.offset4 to i16, !dbg !21 |
|
%49 = trunc i32 %44 to i16, !dbg !21 |
|
%extelt.offset5 = lshr i32 %44, 16, !dbg !21 |
|
%50 = trunc i32 %extelt.offset5 to i16, !dbg !21 |
|
%51 = trunc i32 %45 to i16, !dbg !21 |
|
%extelt.offset6 = lshr i32 %45, 16, !dbg !21 |
|
%52 = trunc i32 %extelt.offset6 to i16, !dbg !21 |
|
%53 = trunc i32 %46 to i16, !dbg !21 |
|
%extelt.offset7 = lshr i32 %46, 16, !dbg !21 |
|
%54 = trunc i32 %extelt.offset7 to i16, !dbg !21 |
|
%55 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #1, !dbg !22 |
|
%56 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #1, !dbg !22 |
|
%57 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #1, !dbg !22 |
|
%58 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %50) #1, !dbg !22 |
|
%59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #1, !dbg !22 |
|
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %52) #1, !dbg !22 |
|
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %53) #1, !dbg !22 |
|
%62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %54) #1, !dbg !22 |
|
; ---- Load 8 elements from %2 at the same element index j, then unpack and widen (%77..%84) ---- |
%63 = getelementptr i16, ptr addrspace(1) %2, i64 %18, !dbg !23 |
|
%64 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %63, i1 true) #1, !dbg !24 |
|
%65 = extractvalue { i32, i32, i32, i32 } %64, 0, !dbg !24 |
|
%66 = extractvalue { i32, i32, i32, i32 } %64, 1, !dbg !24 |
|
%67 = extractvalue { i32, i32, i32, i32 } %64, 2, !dbg !24 |
|
%68 = extractvalue { i32, i32, i32, i32 } %64, 3, !dbg !24 |
|
%69 = trunc i32 %65 to i16, !dbg !24 |
|
%extelt.offset8 = lshr i32 %65, 16, !dbg !24 |
|
%70 = trunc i32 %extelt.offset8 to i16, !dbg !24 |
|
%71 = trunc i32 %66 to i16, !dbg !24 |
|
%extelt.offset9 = lshr i32 %66, 16, !dbg !24 |
|
%72 = trunc i32 %extelt.offset9 to i16, !dbg !24 |
|
%73 = trunc i32 %67 to i16, !dbg !24 |
|
%extelt.offset10 = lshr i32 %67, 16, !dbg !24 |
|
%74 = trunc i32 %extelt.offset10 to i16, !dbg !24 |
|
%75 = trunc i32 %68 to i16, !dbg !24 |
|
%extelt.offset11 = lshr i32 %68, 16, !dbg !24 |
|
%76 = trunc i32 %extelt.offset11 to i16, !dbg !24 |
|
%77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #1, !dbg !25 |
|
%78 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #1, !dbg !25 |
|
%79 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #1, !dbg !25 |
|
%80 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #1, !dbg !25 |
|
%81 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #1, !dbg !25 |
|
%82 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #1, !dbg !25 |
|
%83 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %75) #1, !dbg !25 |
|
%84 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %76) #1, !dbg !25 |
|
; ---- Masking and accumulation ---- |
; Term 1: values loaded from %0 where r == 2, otherwise 0.0. |
%85 = icmp eq i32 %13, 2, !dbg !26 |
|
%86 = select i1 %85, float %33, float 0.000000e+00, !dbg !27 |
|
%87 = select i1 %85, float %34, float 0.000000e+00, !dbg !27 |
|
%88 = select i1 %85, float %35, float 0.000000e+00, !dbg !27 |
|
%89 = select i1 %85, float %36, float 0.000000e+00, !dbg !27 |
|
%90 = select i1 %85, float %37, float 0.000000e+00, !dbg !27 |
|
%91 = select i1 %85, float %38, float 0.000000e+00, !dbg !27 |
|
%92 = select i1 %85, float %39, float 0.000000e+00, !dbg !27 |
|
%93 = select i1 %85, float %40, float 0.000000e+00, !dbg !27 |
|
; Term 2: values loaded from %1 where r == 1, otherwise 0.0. |
%94 = icmp eq i32 %13, 1, !dbg !28 |
|
%95 = select i1 %94, float %55, float 0.000000e+00, !dbg !29 |
|
%96 = select i1 %94, float %56, float 0.000000e+00, !dbg !29 |
|
%97 = select i1 %94, float %57, float 0.000000e+00, !dbg !29 |
|
%98 = select i1 %94, float %58, float 0.000000e+00, !dbg !29 |
|
%99 = select i1 %94, float %59, float 0.000000e+00, !dbg !29 |
|
%100 = select i1 %94, float %60, float 0.000000e+00, !dbg !29 |
|
%101 = select i1 %94, float %61, float 0.000000e+00, !dbg !29 |
|
%102 = select i1 %94, float %62, float 0.000000e+00, !dbg !29 |
|
; Partial sum: term 1 + term 2, lane by lane. |
%103 = fadd float %86, %95, !dbg !30 |
|
%104 = fadd float %87, %96, !dbg !30 |
|
%105 = fadd float %88, %97, !dbg !30 |
|
%106 = fadd float %89, %98, !dbg !30 |
|
%107 = fadd float %90, %99, !dbg !30 |
|
%108 = fadd float %91, %100, !dbg !30 |
|
%109 = fadd float %92, %101, !dbg !30 |
|
%110 = fadd float %93, %102, !dbg !30 |
|
; Term 3: values loaded from %2 where r == 0, otherwise 0.0. |
%111 = icmp eq i32 %13, 0, !dbg !31 |
|
%112 = select i1 %111, float %77, float 0.000000e+00, !dbg !32 |
|
%113 = select i1 %111, float %78, float 0.000000e+00, !dbg !32 |
|
%114 = select i1 %111, float %79, float 0.000000e+00, !dbg !32 |
|
%115 = select i1 %111, float %80, float 0.000000e+00, !dbg !32 |
|
%116 = select i1 %111, float %81, float 0.000000e+00, !dbg !32 |
|
%117 = select i1 %111, float %82, float 0.000000e+00, !dbg !32 |
|
%118 = select i1 %111, float %83, float 0.000000e+00, !dbg !32 |
|
%119 = select i1 %111, float %84, float 0.000000e+00, !dbg !32 |
|
; Final f32 result per lane: partial sum + term 3. |
%120 = fadd float %103, %112, !dbg !33 |
|
%121 = fadd float %104, %113, !dbg !33 |
|
%122 = fadd float %105, %114, !dbg !33 |
|
%123 = fadd float %106, %115, !dbg !33 |
|
%124 = fadd float %107, %116, !dbg !33 |
|
%125 = fadd float %108, %117, !dbg !33 |
|
%126 = fadd float %109, %118, !dbg !33 |
|
%127 = fadd float %110, %119, !dbg !33 |
|
; ---- Store ---- |
; The output address uses x (%11) directly, not the remapped index j used for the loads. |
%128 = sext i32 %11 to i64, !dbg !34 |
|
%129 = getelementptr i16, ptr addrspace(1) %3, i64 %128, !dbg !34 |
|
; Convert each f32 result to bf16 with cvt.rn (round-to-nearest-even per the PTX ISA). |
%130 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %120) #1, !dbg !35 |
|
%131 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %121) #1, !dbg !35 |
|
%132 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %122) #1, !dbg !35 |
|
%133 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %123) #1, !dbg !35 |
|
%134 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %124) #1, !dbg !35 |
|
%135 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %125) #1, !dbg !35 |
|
%136 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %126) #1, !dbg !35 |
|
%137 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %127) #1, !dbg !35 |
|
; Pack consecutive pairs of i16 results into <2 x i16> and reinterpret each pair as one i32 word. |
; The undef base vector is fully overwritten: both elements are inserted before the bitcast. |
%138 = insertelement <2 x i16> undef, i16 %130, i64 0, !dbg !35 |
|
%139 = insertelement <2 x i16> %138, i16 %131, i64 1, !dbg !35 |
|
%140 = bitcast <2 x i16> %139 to i32, !dbg !35 |
|
%141 = insertelement <2 x i16> undef, i16 %132, i64 0, !dbg !35 |
|
%142 = insertelement <2 x i16> %141, i16 %133, i64 1, !dbg !35 |
|
%143 = bitcast <2 x i16> %142 to i32, !dbg !35 |
|
%144 = insertelement <2 x i16> undef, i16 %134, i64 0, !dbg !35 |
|
%145 = insertelement <2 x i16> %144, i16 %135, i64 1, !dbg !35 |
|
%146 = bitcast <2 x i16> %145 to i32, !dbg !35 |
|
%147 = insertelement <2 x i16> undef, i16 %136, i64 0, !dbg !35 |
|
%148 = insertelement <2 x i16> %147, i16 %137, i64 1, !dbg !35 |
|
%149 = bitcast <2 x i16> %148 to i32, !dbg !35 |
|
; Predicated st.global.v4.b32 of the four packed words (16 bytes) to %129; the predicate is constant true. |
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %140, i32 %143, i32 %146, i32 %149, ptr addrspace(1) %129, i1 true) #1, !dbg !35 |
|
ret void, !dbg !36 |
|
} |
|
|
|
|
|
; NVVM intrinsic returning the thread index (x component); called once at the top of the kernel. |
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 |
|
|
|
; #0: attribute group attached to the intrinsic declaration; memory(none) + speculatable + willreturn |
; describe a call with no memory access that may be hoisted or duplicated freely. |
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } |
|
; #1: attribute group attached to the inline-asm call sites in the kernel body. |
attributes #1 = { nounwind } |
|
|
|
; ---- Named metadata ---- |
!llvm.module.flags = !{!0} |
|
!llvm.dbg.cu = !{!1} |
|
; Both annotation nodes (!3 and !4) are listed twice in this operand list. |
!nvvm.annotations = !{!3, !4, !4, !3} |
|
|
|
; Module flag: debug-info version 3 (behavior code 2). |
!0 = !{i32 2, !"Debug Info Version", i32 3} |
|
; Compile unit and the source file that the DILocation line/column pairs below refer to. |
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) |
|
!2 = !DIFile(filename: "c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py", directory: "/tmp/torchinductor_root/63") |
|
; NVVM annotations: !3 marks the function as a kernel, !4 sets maxntidx = 128 for it. |
!3 = !{ptr @triton__0d1d2d3d4de, !"kernel", i32 1} |
|
!4 = !{ptr @triton__0d1d2d3d4de, !"maxntidx", i32 128} |
|
; Subprogram describing the kernel; its subroutine type !6 has an empty type list (!7). |
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4de", linkageName: "triton__0d1d2d3d4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) |
|
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) |
|
!7 = !{} |
|
; ---- Source locations (!8..!36), all scoped to subprogram !5 ---- |
; Each node is referenced through !dbg attachments on instructions in the kernel body. |
!8 = !DILocation(line: 21, column: 36, scope: !5) |
|
!9 = !DILocation(line: 20, column: 28, scope: !5) |
|
!10 = !DILocation(line: 20, column: 33, scope: !5) |
|
!11 = !DILocation(line: 21, column: 23, scope: !5) |
|
!12 = !DILocation(line: 23, column: 20, scope: !5) |
|
!13 = !DILocation(line: 23, column: 27, scope: !5) |
|
!14 = !DILocation(line: 25, column: 20, scope: !5) |
|
!15 = !DILocation(line: 27, column: 40, scope: !5) |
|
!16 = !DILocation(line: 27, column: 36, scope: !5) |
|
!17 = !DILocation(line: 27, column: 30, scope: !5) |
|
!18 = !DILocation(line: 27, column: 46, scope: !5) |
|
!19 = !DILocation(line: 27, column: 85, scope: !5) |
|
!20 = !DILocation(line: 28, column: 30, scope: !5) |
|
!21 = !DILocation(line: 28, column: 46, scope: !5) |
|
!22 = !DILocation(line: 28, column: 85, scope: !5) |
|
!23 = !DILocation(line: 29, column: 31, scope: !5) |
|
!24 = !DILocation(line: 29, column: 47, scope: !5) |
|
!25 = !DILocation(line: 29, column: 86, scope: !5) |
|
!26 = !DILocation(line: 32, column: 19, scope: !5) |
|
!27 = !DILocation(line: 34, column: 32, scope: !5) |
|
!28 = !DILocation(line: 36, column: 19, scope: !5) |
|
!29 = !DILocation(line: 37, column: 32, scope: !5) |
|
!30 = !DILocation(line: 38, column: 19, scope: !5) |
|
!31 = !DILocation(line: 40, column: 20, scope: !5) |
|
!32 = !DILocation(line: 41, column: 35, scope: !5) |
|
!33 = !DILocation(line: 42, column: 20, scope: !5) |
|
!34 = !DILocation(line: 43, column: 25, scope: !5) |
|
!35 = !DILocation(line: 43, column: 37, scope: !5) |
|
!36 = !DILocation(line: 43, column: 4, scope: !5) |
|
|