File size: 5,177 Bytes
485133c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

; Kernel @triton__0d1d2de: every thread loads 8 consecutive 32-bit floats from
; %0, converts each one to bf16 (cvt.rn.bf16.f32), and stores the 8 resulting
; 16-bit values to %1 at the same element index.
;   %0 - source pointer in addrspace(1), indexed as float
;   %1 - destination pointer in addrspace(1), indexed as i16
;   %2 - i32 that is never read in this body (presumably the element count
;        of the original kernel -- TODO confirm against the generating source)
; Every load/store predicate below is the constant `i1 true`, so this body
; performs no bounds check on the computed index.
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
  ; Per-thread offset inside the block: (tid.x << 3) masked to bits 3..9,
  ; i.e. a multiple of 8 in the range 0..1016.
  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
  %5 = shl i32 %4, 3, !dbg !8
  %6 = and i32 %5, 1016, !dbg !8
  ; Block base: ctaid.x * 1024 (read through inline PTX).
  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
  %8 = shl i32 %7, 10, !dbg !10
  ; The low 10 bits of %8 are zero and %6 < 1024, so `or` acts as an add here.
  %9 = or i32 %8, %6, !dbg !11
  ; %9 is a multiple of 8 (bit 2 clear), so `or 4` yields %9 + 4: the start of
  ; the second group of four elements handled by this thread.
  %10 = or i32 %9, 4, !dbg !11
  ; Addresses of the two 4-float groups in the source buffer.
  %11 = sext i32 %9 to i64, !dbg !12
  %12 = getelementptr float, ptr addrspace(1) %0, i64 %11, !dbg !12
  %13 = sext i32 %10 to i64, !dbg !12
  %14 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !12
  ; First vector load: the four outputs are zeroed, then a predicated
  ; ld.global.v4.b32 reads four 32-bit words from %12 (predicate is true).
  %15 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %12, i1 true) #1, !dbg !13
  %16 = extractvalue { i32, i32, i32, i32 } %15, 0, !dbg !13
  %17 = extractvalue { i32, i32, i32, i32 } %15, 1, !dbg !13
  %18 = extractvalue { i32, i32, i32, i32 } %15, 2, !dbg !13
  %19 = extractvalue { i32, i32, i32, i32 } %15, 3, !dbg !13
  ; Reinterpret the loaded 32-bit words as floats (no value conversion).
  %20 = bitcast i32 %16 to float, !dbg !13
  %21 = bitcast i32 %17 to float, !dbg !13
  %22 = bitcast i32 %18 to float, !dbg !13
  %23 = bitcast i32 %19 to float, !dbg !13
  ; Second vector load: same sequence for the four words starting at %14.
  %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %14, i1 true) #1, !dbg !13
  %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !13
  %26 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !13
  %27 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !13
  %28 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !13
  %29 = bitcast i32 %25 to float, !dbg !13
  %30 = bitcast i32 %26 to float, !dbg !13
  %31 = bitcast i32 %27 to float, !dbg !13
  %32 = bitcast i32 %28 to float, !dbg !13
  ; Destination address: same element index %11, but with a 16-bit element type.
  %33 = getelementptr i16, ptr addrspace(1) %1, i64 %11, !dbg !14
  ; Convert each of the 8 floats to bf16 via inline PTX; each result is
  ; carried as an i16 bit pattern.
  %34 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %20) #1, !dbg !15
  %35 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %21) #1, !dbg !15
  %36 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %22) #1, !dbg !15
  %37 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %23) #1, !dbg !15
  %38 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %29) #1, !dbg !15
  %39 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %30) #1, !dbg !15
  %40 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %31) #1, !dbg !15
  %41 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %32) #1, !dbg !15
  ; Pack adjacent pairs of 16-bit results into <2 x i16> and reinterpret each
  ; pair as one i32. Both lanes are written before the bitcast, so the `undef`
  ; seed never reaches the stored value.
  %42 = insertelement <2 x i16> undef, i16 %34, i64 0, !dbg !15
  %43 = insertelement <2 x i16> %42, i16 %35, i64 1, !dbg !15
  %44 = bitcast <2 x i16> %43 to i32, !dbg !15
  %45 = insertelement <2 x i16> undef, i16 %36, i64 0, !dbg !15
  %46 = insertelement <2 x i16> %45, i16 %37, i64 1, !dbg !15
  %47 = bitcast <2 x i16> %46 to i32, !dbg !15
  %48 = insertelement <2 x i16> undef, i16 %38, i64 0, !dbg !15
  %49 = insertelement <2 x i16> %48, i16 %39, i64 1, !dbg !15
  %50 = bitcast <2 x i16> %49 to i32, !dbg !15
  %51 = insertelement <2 x i16> undef, i16 %40, i64 0, !dbg !15
  %52 = insertelement <2 x i16> %51, i16 %41, i64 1, !dbg !15
  %53 = bitcast <2 x i16> %52 to i32, !dbg !15
  ; One predicated st.global.v4.b32 writes the four packed words (8 x 16 bits)
  ; to %33; the predicate is the constant true.
  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %44, i32 %47, i32 %50, i32 %53, ptr addrspace(1) %33, i1 true) #1, !dbg !15
  ret void, !dbg !16
}

; NVVM intrinsic used by @triton__0d1d2de to read the thread index (tid.x).
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; #0: attribute group attached to the tid.x intrinsic declaration above.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #1: attribute group attached to every inline-asm call site in the kernel.
attributes #1 = { nounwind }

; Named metadata: module flags, the debug compile unit, and NVVM annotations.
; Note that !nvvm.annotations references !3 and !4 twice each.
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
!nvvm.annotations = !{!3, !4, !4, !3}

; !0: debug-info version flag. !1/!2: compile unit and the source file it
; refers to (producer string is "triton").
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "czjxjqxojsyyr4zmce6q6twysnucw6p4l5ujgp6ts2ecrm3ue3ex.py", directory: "/tmp/torchinductor_root/zj")
; !3 tags @triton__0d1d2de as a kernel; !4 sets its "maxntidx" annotation to 128.
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
; !5: subprogram record for the kernel (declared at line 18 of !2), with an
; empty subroutine type list (!6/!7).
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; !8-!16: source locations (lines 20-26 of !2) referenced by the kernel's
; !dbg attachments.
!8 = !DILocation(line: 21, column: 36, scope: !5)
!9 = !DILocation(line: 20, column: 28, scope: !5)
!10 = !DILocation(line: 20, column: 33, scope: !5)
!11 = !DILocation(line: 21, column: 23, scope: !5)
!12 = !DILocation(line: 24, column: 30, scope: !5)
!13 = !DILocation(line: 24, column: 35, scope: !5)
!14 = !DILocation(line: 26, column: 25, scope: !5)
!15 = !DILocation(line: 26, column: 36, scope: !5)
!16 = !DILocation(line: 26, column: 4, scope: !5)