|
|
|
; Module-level source name recorded by the producer of this IR. |
source_filename = "LLVMDialectModule" |
|
|
|
; Kernel @triton__0d1d2de: loads 32-bit floats from %0, converts each one to |
; a 16-bit bf16 value with PTX cvt.rn.bf16.f32, and stores the results to %1. |
; Each thread processes 8 consecutive elements starting at element index %9. |
;   %0 - source pointer (addrspace(1)), indexed as float |
;   %1 - destination pointer (addrspace(1)), indexed as i16 |
;   %2 - i32 argument; it is not referenced anywhere in this body |
; Both loads and the store are predicated on %11 (%9 < 12865792), so threads |
; whose index is past that bound touch no memory. |
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { |
|
; --- Per-thread offset: (tid.x << 3) & 1016, i.e. a multiple of 8 in 0..1016 --- |
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 |
|
%5 = shl i32 %4, 3, !dbg !8 |
|
%6 = and i32 %5, 1016, !dbg !8 |
|
; --- Block base: ctaid.x << 10, read through inline PTX --- |
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 |
|
%8 = shl i32 %7, 10, !dbg !10 |
|
; %8 has its low 10 bits clear and %6 < 1024, so this OR acts as an add. |
%9 = or i32 %8, %6, !dbg !11 |
|
; %9 is a multiple of 8, so OR-ing in 4 yields %9 + 4 (start of the second group of 4). |
%10 = or i32 %9, 4, !dbg !11 |
|
; Bounds predicate (signed compare). Only %9 is tested; because %9 is a multiple |
; of 8 and 12865792 is too, %9 < bound implies %9 + 7 < bound as well. |
%11 = icmp slt i32 %9, 12865792, !dbg !12 |
|
; --- Source addresses: %13 = &%0[%9], %15 = &%0[%9 + 4] (float elements) --- |
%12 = sext i32 %9 to i64, !dbg !13 |
|
%13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13 |
|
%14 = sext i32 %10 to i64, !dbg !13 |
|
%15 = getelementptr float, ptr addrspace(1) %0, i64 %14, !dbg !13 |
|
; --- First predicated 4 x 32-bit vector load from %13 --- |
; The four outputs are zeroed first, so they read as 0 when the predicate %11 is false. |
%16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %13, i1 %11) #1, !dbg !14 |
|
%17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !14 |
|
%18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !14 |
|
%19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !14 |
|
%20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !14 |
|
; Reinterpret the loaded 32-bit words as floats (bit pattern unchanged). |
%21 = bitcast i32 %17 to float, !dbg !14 |
|
%22 = bitcast i32 %18 to float, !dbg !14 |
|
%23 = bitcast i32 %19 to float, !dbg !14 |
|
%24 = bitcast i32 %20 to float, !dbg !14 |
|
; --- Second predicated 4 x 32-bit vector load from %15, same predicate %11 --- |
%25 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 %11) #1, !dbg !14 |
|
%26 = extractvalue { i32, i32, i32, i32 } %25, 0, !dbg !14 |
|
%27 = extractvalue { i32, i32, i32, i32 } %25, 1, !dbg !14 |
|
%28 = extractvalue { i32, i32, i32, i32 } %25, 2, !dbg !14 |
|
%29 = extractvalue { i32, i32, i32, i32 } %25, 3, !dbg !14 |
|
%30 = bitcast i32 %26 to float, !dbg !14 |
|
%31 = bitcast i32 %27 to float, !dbg !14 |
|
%32 = bitcast i32 %28 to float, !dbg !14 |
|
%33 = bitcast i32 %29 to float, !dbg !14 |
|
; --- Destination address: &%1[%9] in 16-bit elements (same element index as the loads) --- |
%34 = getelementptr i16, ptr addrspace(1) %1, i64 %12, !dbg !15 |
|
; --- Convert each of the 8 floats to bf16 via inline PTX (.rn rounding modifier); --- |
; --- each result is returned in a 16-bit register ("=h" constraint).              --- |
%35 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %21) #1, !dbg !16 |
|
%36 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %22) #1, !dbg !16 |
|
%37 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %23) #1, !dbg !16 |
|
%38 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %24) #1, !dbg !16 |
|
%39 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %30) #1, !dbg !16 |
|
%40 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %31) #1, !dbg !16 |
|
%41 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %32) #1, !dbg !16 |
|
%42 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %33) #1, !dbg !16 |
|
; --- Pack adjacent 16-bit results pairwise into 32-bit words for the vector store --- |
; Elements 0,1 -> %45 |
%43 = insertelement <2 x i16> undef, i16 %35, i64 0, !dbg !16 |
|
%44 = insertelement <2 x i16> %43, i16 %36, i64 1, !dbg !16 |
|
%45 = bitcast <2 x i16> %44 to i32, !dbg !16 |
|
; Elements 2,3 -> %48 |
%46 = insertelement <2 x i16> undef, i16 %37, i64 0, !dbg !16 |
|
%47 = insertelement <2 x i16> %46, i16 %38, i64 1, !dbg !16 |
|
%48 = bitcast <2 x i16> %47 to i32, !dbg !16 |
|
; Elements 4,5 -> %51 |
%49 = insertelement <2 x i16> undef, i16 %39, i64 0, !dbg !16 |
|
%50 = insertelement <2 x i16> %49, i16 %40, i64 1, !dbg !16 |
|
%51 = bitcast <2 x i16> %50 to i32, !dbg !16 |
|
; Elements 6,7 -> %54 |
%52 = insertelement <2 x i16> undef, i16 %41, i64 0, !dbg !16 |
|
%53 = insertelement <2 x i16> %52, i16 %42, i64 1, !dbg !16 |
|
%54 = bitcast <2 x i16> %53 to i32, !dbg !16 |
|
; --- Predicated 4 x 32-bit vector store: writes all 8 packed 16-bit values to %34 --- |
; Skipped entirely when %11 is false. |
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %45, i32 %48, i32 %51, i32 %54, ptr addrspace(1) %34, i1 %11) #1, !dbg !16 |
|
ret void, !dbg !17 |
|
} |
|
|
|
|
|
; Intrinsic used by the kernel to read the thread index in x; carries attribute group #0. |
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 |
|
|
|
; #0: attributes of the tid.x intrinsic declaration (no memory access, speculatable). |
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } |
|
; #1: attached to every inline-asm call site in the kernel. |
attributes #1 = { nounwind } |
|
|
|
; --- Named metadata --- |
!llvm.module.flags = !{!0} |
|
!llvm.dbg.cu = !{!1} |
|
; NOTE(review): !3 and !4 are each listed twice here; presumably harmless |
; duplication from the generator - confirm before deduplicating. |
!nvvm.annotations = !{!3, !4, !4, !3} |
|
|
|
; Module flag: debug info version 3. |
!0 = !{i32 2, !"Debug Info Version", i32 3} |
|
; Compile unit and source file that the debug locations below refer to. |
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) |
|
!2 = !DIFile(filename: "c3dqs5x45k2yonlaarvhjaaf2n4okr2444cmsi5lflqvzppalavv.py", directory: "/tmp/torchinductor_root/3d") |
|
; NVVM annotations: mark @triton__0d1d2de as a kernel and set maxntidx to 128. |
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1} |
|
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128} |
|
; Debug subprogram for the kernel (declared at line 18 of !2); its type list !7 is empty. |
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) |
|
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) |
|
!7 = !{} |
|
; --- Source locations (lines 20-26 of !2) attached to the kernel's instructions --- |
; !8: tid.x read and offset math |
!8 = !DILocation(line: 21, column: 36, scope: !5) |
|
; !9 / !10: ctaid.x read and its shift by 10 |
!9 = !DILocation(line: 20, column: 28, scope: !5) |
|
!10 = !DILocation(line: 20, column: 33, scope: !5) |
|
; !11: combined element index (%9, %10) |
!11 = !DILocation(line: 21, column: 23, scope: !5) |
|
; !12: bounds predicate |
!12 = !DILocation(line: 22, column: 21, scope: !5) |
|
; !13 / !14: source address computation and the predicated loads |
!13 = !DILocation(line: 24, column: 30, scope: !5) |
|
!14 = !DILocation(line: 24, column: 35, scope: !5) |
|
; !15 / !16: destination address, bf16 conversion, packing and the store |
!15 = !DILocation(line: 26, column: 25, scope: !5) |
|
!16 = !DILocation(line: 26, column: 36, scope: !5) |
|
; !17: function return |
!17 = !DILocation(line: 26, column: 4, scope: !5) |
|
|