; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

; Externally defined, zero-length byte array in address space 3. The kernel
; below indexes it as an array of 32-bit floats (float indices 0..255 are
; touched) to exchange partial sums between threads.
@global_smem = external addrspace(3) global [0 x i8]

; Kernel @triton__0d1d2d3de4de
;
; Parameters:
;   %0      ptr addrspace(1); read as 16-bit elements that are converted with
;           the PTX instruction cvt.f32.bf16 (i.e. treated as bf16).
;   %1      ptr addrspace(1); read as 32-bit float elements.
;   %2      ptr addrspace(1); receives one 32-bit result per stored element.
;   %3, %4  i32; not referenced anywhere in this body.
;
; Outline:
;   1. Derive per-thread indices from tid.x and ctaid.x.
;   2. Loop 16 times; each iteration loads four adjacent elements from %0 and
;      four from %1 at the same element offset, multiplies them element-wise
;      and adds the products to a <4 x float> accumulator.
;   3. Combine the accumulators of different threads with warp shuffles and
;      @global_smem.
;   4. Threads with bit 6 of tid.x clear each store one 32-bit result to %2.
define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
  ; --- Index decomposition -------------------------------------------------
  ; %6  = tid.x
  ; %7  = tid & 31               (lane within a group of 32 threads)
  ; %10 = (tid << 2) & 60        (= 4 * (tid & 15); a multiple of 4 in 0..60)
  ; %11 = (tid >> 5) & 3         (index of the 32-thread group, 0..3)
  ; %12 = (tid & 31) >> 4        (0 for lanes 0..15, 1 for lanes 16..31)
  ; %14 = (%11 << 1) | %12       (value in 0..7)
  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
  %7 = and i32 %6, 31, !dbg !8
  %8 = lshr i32 %6, 5, !dbg !8
  %9 = shl i32 %6, 2, !dbg !8
  %10 = and i32 %9, 60, !dbg !8
  %11 = and i32 %8, 3, !dbg !9
  %12 = lshr i32 %7, 4, !dbg !9
  %13 = shl nuw nsw i32 %11, 1, !dbg !9
  %14 = or i32 %13, %12, !dbg !9
  ; %15 = ctaid.x, read through inline PTX; %16 = ctaid.x * 64;
  ; %17 = %16 | %10.
  %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
  %16 = shl i32 %15, 6, !dbg !11
  %17 = or i32 %16, %10, !dbg !12
  ; Split %17 into quotient (%18) and remainder (%.decomposed) by 256. The
  ; remainder is formed as x - (x / 256) * 256, and the freeze gives both
  ; computations the same concrete input value.
  ; %21 = quotient * 32768 + remainder: the loop-invariant part of the
  ; element offset used for both input pointers.
  %.frozen = freeze i32 %17
  %18 = sdiv i32 %.frozen, 256, !dbg !13
  %19 = mul i32 %18, 256
  %.decomposed = sub i32 %.frozen, %19
  %20 = shl i32 %18, 15, !dbg !14
  %21 = add i32 %20, %.decomposed
  br label %22, !dbg !15

  ; --- Accumulation loop ---------------------------------------------------
  ; %23 takes the values 0, 8, ..., 120 (16 iterations): the back-edge test at
  ; the bottom compares the pre-increment value against 120.
  ; %24 carries the four running sums, starting from zero.
22:                                               ; preds = %5, %22
  %23 = phi i32 [ 0, %5 ], [ %58, %22 ]
  %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %57, %22 ]
  ; Element offset for this iteration: %27 = %21 + ((%23 | %14) << 8).
  %25 = or i32 %23, %14, !dbg !16
  %26 = shl i32 %25, 8, !dbg !17
  %27 = add i32 %21, %26, !dbg !18
  %28 = sext i32 %27 to i64, !dbg !19
  %29 = getelementptr i16, ptr addrspace(1) %0, i64 %28, !dbg !19
  ; Inline PTX: zero both outputs, then perform a predicated v2.b32 global
  ; load (8 bytes = four 16-bit elements) with the L1::evict_first qualifier.
  ; Every predicate operand passed here is the constant true, so the load is
  ; unconditional and the trailing "@!p mov" fallbacks never execute.
  %30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
  ; Split each 32-bit word into its low and high 16-bit halves (low first).
  %31 = extractvalue { i32, i32 } %30, 0, !dbg !20
  %32 = extractvalue { i32, i32 } %30, 1, !dbg !20
  %33 = trunc i32 %31 to i16, !dbg !20
  %extelt.offset = lshr i32 %31, 16, !dbg !20
  %34 = trunc i32 %extelt.offset to i16, !dbg !20
  %35 = trunc i32 %32 to i16, !dbg !20
  %extelt.offset1 = lshr i32 %32, 16, !dbg !20
  %36 = trunc i32 %extelt.offset1 to i16, !dbg !20
  ; Convert the four 16-bit values to f32 with cvt.f32.bf16.
  %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #3, !dbg !21
  %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #3, !dbg !21
  %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #3, !dbg !21
  %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #3, !dbg !21
  ; Second operand: same element offset (%28) applied to %1 with float
  ; stride, loaded as one v4.b32 (16 bytes). Predicates are again all true.
  %41 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !22
  %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !23
  ; Repack the four loaded words as <4 x float> (bit-for-bit reinterpretation).
  %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !23
  %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !23
  %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !23
  %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !23
  %47 = insertelement <4 x i32> poison, i32 %43, i64 0, !dbg !23
  %48 = insertelement <4 x i32> %47, i32 %44, i64 1, !dbg !23
  %49 = insertelement <4 x i32> %48, i32 %45, i64 2, !dbg !23
  %50 = insertelement <4 x i32> %49, i32 %46, i64 3, !dbg !23
  %51 = bitcast <4 x i32> %50 to <4 x float>, !dbg !23
  ; accumulator += converted(%0 values) * (%1 values), element-wise.
  %52 = insertelement <4 x float> poison, float %37, i64 0, !dbg !24
  %53 = insertelement <4 x float> %52, float %38, i64 1, !dbg !24
  %54 = insertelement <4 x float> %53, float %39, i64 2, !dbg !24
  %55 = insertelement <4 x float> %54, float %40, i64 3, !dbg !24
  %56 = fmul <4 x float> %55, %51, !dbg !24
  %57 = fadd <4 x float> %24, %56, !dbg !25
  ; Advance by 8 and loop while the pre-increment counter is below 120.
  %58 = add nuw nsw i32 %23, 8, !dbg !15
  %59 = icmp ult i32 %23, 120, !dbg !15
  br i1 %59, label %22, label %60, !dbg !15

  ; --- Cross-thread combination of the four sums ---------------------------
60:                                               ; preds = %22
  ; %61 = tid & 63 and %62 = (ctaid.x * 64) | %61 are used for the final
  ; store; %65, %64, %63 are %10 + 1, %10 + 2, %10 + 3 (%10 has its low two
  ; bits clear, so "or" acts as addition).
  %61 = and i32 %6, 63, !dbg !8
  %62 = or i32 %16, %61, !dbg !12
  %63 = or i32 %10, 3, !dbg !26
  %64 = or i32 %10, 2, !dbg !26
  %65 = or i32 %10, 1, !dbg !26
  ; Step 1: for each of the four sums, add the value obtained from a
  ; butterfly shuffle with offset 16 (the partner lane differs in bit 4, i.e.
  ; the thread with the other value of %12).
  %66 = extractelement <4 x float> %57, i64 0, !dbg !26
  %67 = bitcast float %66 to i32, !dbg !26
  %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !26
  %69 = bitcast i32 %68 to float, !dbg !26
  %70 = fadd float %66, %69, !dbg !30
  %71 = extractelement <4 x float> %57, i64 1, !dbg !26
  %72 = bitcast float %71 to i32, !dbg !26
  %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !26
  %74 = bitcast i32 %73 to float, !dbg !26
  %75 = fadd float %71, %74, !dbg !30
  %76 = extractelement <4 x float> %57, i64 2, !dbg !26
  %77 = bitcast float %76 to i32, !dbg !26
  %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !26
  %79 = bitcast i32 %78 to float, !dbg !26
  %80 = fadd float %76, %79, !dbg !30
  %81 = extractelement <4 x float> %57, i64 3, !dbg !26
  %82 = bitcast float %81 to i32, !dbg !26
  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 16, i32 31), !dbg !26
  %84 = bitcast i32 %83 to float, !dbg !26
  %85 = fadd float %81, %84, !dbg !30
  ; Step 2: lanes 0..15 of each 32-thread group (predicate %86) store their
  ; four sums to @global_smem at float index ((%10 + k) << 2) | %11 for
  ; k = 0..3, i.e. four consecutive slots per column, one per group %11.
  %86 = icmp ult i32 %7, 16, !dbg !26
  %87 = shl nuw nsw i32 %10, 2, !dbg !26
  %88 = or i32 %87, %11, !dbg !26
  %89 = zext nneg i32 %88 to i64, !dbg !26
  %90 = getelementptr float, ptr addrspace(3) @global_smem, i64 %89, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %90, float %70, i1 %86) #3, !dbg !26
  %91 = shl nuw nsw i32 %65, 2, !dbg !26
  %92 = or i32 %91, %11, !dbg !26
  %93 = zext nneg i32 %92 to i64, !dbg !26
  %94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %75, i1 %86) #3, !dbg !26
  %95 = shl nuw nsw i32 %64, 2, !dbg !26
  %96 = or i32 %95, %11, !dbg !26
  %97 = zext nneg i32 %96 to i64, !dbg !26
  %98 = getelementptr float, ptr addrspace(3) @global_smem, i64 %97, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, float %80, i1 %86) #3, !dbg !26
  %99 = shl nuw nsw i32 %63, 2, !dbg !26
  %100 = or i32 %99, %11, !dbg !26
  %101 = zext nneg i32 %100 to i64, !dbg !26
  %102 = getelementptr float, ptr addrspace(3) @global_smem, i64 %101, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %102, float %85, i1 %86) #3, !dbg !26
  ; Barrier before other threads read the slots written above.
  tail call void @llvm.nvvm.barrier0(), !dbg !26
  ; Step 3a: each thread loads the float at index tid, then adds the values
  ; of its butterfly partners at offsets 2 and 1, which sums each aligned
  ; group of four consecutive slots. The thread with (tid & 3) == 0 writes
  ; the group total back to its own slot (predicate %117).
  ; NOTE(review): %103 (tid < 256) looks always-true given the maxntidx = 128
  ; annotation on this kernel - confirm against the launch configuration.
  %103 = icmp slt i32 %6, 256, !dbg !26
  %104 = sext i32 %6 to i64, !dbg !26
  %105 = getelementptr float, ptr addrspace(3) @global_smem, i64 %104, !dbg !26
  %106 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %105, i1 %103) #3, !dbg !26
  %107 = bitcast float %106 to i32, !dbg !26
  %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 2, i32 31), !dbg !26
  %109 = bitcast i32 %108 to float, !dbg !26
  %110 = fadd float %106, %109, !dbg !30
  %111 = bitcast float %110 to i32, !dbg !26
  %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 1, i32 31), !dbg !26
  %113 = bitcast i32 %112 to float, !dbg !26
  %114 = fadd float %110, %113, !dbg !30
  %115 = and i32 %6, 3, !dbg !26
  %116 = icmp eq i32 %115, 0, !dbg !26
  %117 = and i1 %103, %116, !dbg !26
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, float %114, i1 %117) #3, !dbg !26
  ; Step 3b: the same load / shuffle-add / write-back sequence for float
  ; index tid + 128, using the same predicates as step 3a.
  %118 = add i32 %6, 128, !dbg !26
  %119 = sext i32 %118 to i64, !dbg !26
  %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !26
  %121 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %120, i1 %103) #3, !dbg !26
  %122 = bitcast float %121 to i32, !dbg !26
  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !26
  %124 = bitcast i32 %123 to float, !dbg !26
  %125 = fadd float %121, %124, !dbg !30
  %126 = bitcast float %125 to i32, !dbg !26
  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !26
  %128 = bitcast i32 %127 to float, !dbg !26
  %129 = fadd float %125, %128, !dbg !30
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %129, i1 %117) #3, !dbg !26
  tail call void @llvm.nvvm.barrier0(), !dbg !26
  ; Step 4: read back the group totals for this thread's four columns from
  ; float indices (%10 + k) << 2, k = 0..3 (the slots written by the
  ; (tid & 3) == 0 threads in step 3).
  %130 = zext nneg i32 %87 to i64, !dbg !26
  %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !26
  %132 = load float, ptr addrspace(3) %131, align 4, !dbg !26
  %133 = zext nneg i32 %91 to i64, !dbg !26
  %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !26
  %135 = load float, ptr addrspace(3) %134, align 4, !dbg !26
  %136 = zext nneg i32 %95 to i64, !dbg !26
  %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !26
  %138 = load float, ptr addrspace(3) %137, align 4, !dbg !26
  %139 = zext nneg i32 %99 to i64, !dbg !26
  %140 = getelementptr float, ptr addrspace(3) @global_smem, i64 %139, !dbg !26
  %141 = load float, ptr addrspace(3) %140, align 4, !dbg !26
  ; --- Redistribute results so each thread holds one value -----------------
  ; After a barrier (so the reads above finish before slots are reused), each
  ; thread writes its four totals to float indices %10 + k; after another
  ; barrier each thread reads the single slot at index tid & 63.
  tail call void @llvm.nvvm.barrier0(), !dbg !34
  %142 = zext nneg i32 %10 to i64, !dbg !34
  %143 = getelementptr float, ptr addrspace(3) @global_smem, i64 %142, !dbg !34
  %144 = insertelement <1 x float> undef, float %132, i64 0, !dbg !34
  store <1 x float> %144, ptr addrspace(3) %143, align 4, !dbg !34
  %145 = zext nneg i32 %65 to i64, !dbg !34
  %146 = getelementptr float, ptr addrspace(3) @global_smem, i64 %145, !dbg !34
  %147 = insertelement <1 x float> undef, float %135, i64 0, !dbg !34
  store <1 x float> %147, ptr addrspace(3) %146, align 4, !dbg !34
  %148 = zext nneg i32 %64 to i64, !dbg !34
  %149 = getelementptr float, ptr addrspace(3) @global_smem, i64 %148, !dbg !34
  %150 = insertelement <1 x float> undef, float %138, i64 0, !dbg !34
  store <1 x float> %150, ptr addrspace(3) %149, align 4, !dbg !34
  %151 = zext nneg i32 %63 to i64, !dbg !34
  %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !34
  %153 = insertelement <1 x float> undef, float %141, i64 0, !dbg !34
  store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !34
  tail call void @llvm.nvvm.barrier0(), !dbg !34
  %154 = zext nneg i32 %61 to i64, !dbg !34
  %155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !34
  %156 = load i32, ptr addrspace(3) %155, align 4, !dbg !34
  ; --- Final store ---------------------------------------------------------
  ; Write the 32-bit value to %2 at float index (ctaid.x * 64) | (tid & 63).
  ; The store is predicated on (tid & 64) == 0, so threads with bit 6 of
  ; tid.x set do not write.
  %157 = sext i32 %62 to i64, !dbg !35
  %158 = getelementptr float, ptr addrspace(1) %2, i64 %157, !dbg !35
  %159 = and i32 %6, 64, !dbg !36
  %160 = icmp eq i32 %159, 0, !dbg !36
  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %156, ptr addrspace(1) %158, i1 %160) #3, !dbg !36
  ret void, !dbg !37
}

; Reads the tid.x special register; called once at the top of the kernel.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Butterfly warp shuffle. Every call in this file passes i32 -1 as the first
; operand and i32 31 as the last; the second operand is the value to exchange
; and the third is the butterfly offset (16, 2 or 1).
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1

; Barrier intrinsic; the kernel calls it between phases that write and then
; read @global_smem.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2

; Attribute groups. #0, #1 and #2 are attached to the three intrinsic
; declarations (tid.x read, butterfly shuffle, barrier respectively); #3 is
; attached to every inline-asm call site in the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
attributes #3 = { nounwind }

; Named metadata. !nvvm.annotations lists !3 and !4 twice each; both copies
; refer to the same nodes.
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
!nvvm.annotations = !{!3, !4, !4, !3}

; !0: debug-info version module flag.
!0 = !{i32 2, !"Debug Info Version", i32 3}
; !1 / !2: compile unit (producer "triton") and the file it was generated from.
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py", directory: "/tmp/torchinductor_root/sj")
; !3 marks @triton__0d1d2d3de4de as a kernel; !4 sets its "maxntidx"
; annotation to 128.
!3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
; !5: debug subprogram for the kernel (declared at line 18 of file !2), with
; an empty subroutine type list (!6 / !7).
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; Source locations referenced by the !dbg attachments in the kernel.
; !8-!25 and !34-!37 are lines 21-41 in the kernel's own subprogram scope (!5).
; !26-!33 describe locations in "standard.py" (file !28, lines 243 and 233)
; that are recorded as inlined at line 40, column 25.
!8 = !DILocation(line: 22, column: 44, scope: !5)
!9 = !DILocation(line: 24, column: 33, scope: !5)
!10 = !DILocation(line: 21, column: 28, scope: !5)
!11 = !DILocation(line: 21, column: 33, scope: !5)
!12 = !DILocation(line: 22, column: 23, scope: !5)
!13 = !DILocation(line: 26, column: 20, scope: !5)
!14 = !DILocation(line: 33, column: 57, scope: !5)
!15 = !DILocation(line: 29, column: 36, scope: !5)
!16 = !DILocation(line: 30, column: 27, scope: !5)
!17 = !DILocation(line: 33, column: 44, scope: !5)
!18 = !DILocation(line: 33, column: 51, scope: !5)
!19 = !DILocation(line: 33, column: 34, scope: !5)
!20 = !DILocation(line: 33, column: 63, scope: !5)
!21 = !DILocation(line: 33, column: 115, scope: !5)
!22 = !DILocation(line: 34, column: 34, scope: !5)
!23 = !DILocation(line: 34, column: 63, scope: !5)
!24 = !DILocation(line: 36, column: 22, scope: !5)
!25 = !DILocation(line: 39, column: 38, scope: !5)
!26 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !29)
!27 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!29 = !DILocation(line: 40, column: 25, scope: !27)
!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !32)
!31 = distinct !DILexicalBlockFile(scope: !27, file: !28, discriminator: 0)
!32 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !33)
!33 = !DILocation(line: 40, column: 25, scope: !31)
!34 = !DILocation(line: 40, column: 28, scope: !5)
!35 = !DILocation(line: 41, column: 25, scope: !5)
!36 = !DILocation(line: 41, column: 36, scope: !5)
!37 = !DILocation(line: 41, column: 4, scope: !5)