; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

; Externally defined, zero-length byte array in addrspace(3). The kernel below
; indexes it as an array of i16 and uses it as a staging buffer.
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]

; Kernel overview (derived from the instructions below):
;   1. Each thread loads eight 16-bit values from %0 with one predicated
;      128-bit load (values are zero when the predicate is false).
;   2. The eight values are written to @global_smem at i16 indices 8*t .. 8*t+7,
;      where t = tid.x & 127.
;   3. After a barrier, each thread reads back indices 4*t .. 4*t+3 and
;      512+4*t .. 512+4*t+3, converts each value from bf16 to f32, and writes
;      two predicated 4 x f32 vectors to %1.
;
; Arguments:
;   %0 - input pointer (addrspace(1)), indexed in i16 elements.
;   %1 - output pointer (addrspace(1)), indexed in float elements.
;   %2 - i32 argument that no instruction in this function references; the
;        bound used by the predicates is the literal 12865792.
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
  ; --- Per-thread offsets ---------------------------------------------------
  ; %5 = tid.x & 127; %6 = %5 * 8 (load-side offset); %7 = %5 * 4 and
  ; %8 = %7 | 512 (store-side offsets). %7 is at most 508, so the `or` with 512
  ; is equivalent to an add.
  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
  %5 = and i32 %4, 127, !dbg !8
  %6 = shl nuw nsw i32 %5, 3, !dbg !8
  %7 = shl nuw nsw i32 %5, 2, !dbg !8
  %8 = or i32 %7, 512, !dbg !8

  ; --- Block base and global element indices ---------------------------------
  ; %9 reads %ctaid.x via inline PTX; %10 = %9 * 1024. The per-thread offsets
  ; are all below 1024, so `or` with %10 acts as an add.
  %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
  %10 = shl i32 %9, 10, !dbg !10
  %11 = or i32 %10, %6, !dbg !11
  %12 = or i32 %10, %7, !dbg !11
  %13 = or i32 %10, %8, !dbg !11

  ; Bounds predicates: %14 guards the load, %15 and %16 guard the two stores.
  %14 = icmp slt i32 %11, 12865792, !dbg !12
  %15 = icmp slt i32 %12, 12865792, !dbg !12
  %16 = icmp slt i32 %13, 12865792, !dbg !12

  ; --- Predicated 128-bit load from %0 ----------------------------------------
  ; The asm first zeroes all four 32-bit outputs, then performs
  ; ld.global.v4.b32 only when predicate $5 (%14) is true, so a masked-off
  ; thread yields zeros.
  %17 = sext i32 %11 to i64, !dbg !13
  %18 = getelementptr i16, ptr addrspace(1) %0, i64 %17, !dbg !13
  %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %18, i1 %14) #2, !dbg !14
  %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !14
  %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !14
  %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !14
  %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !14

  ; Split each 32-bit word into its low half (trunc) and high half
  ; (lshr 16, then trunc), giving eight i16 values %24 .. %31.
  %24 = trunc i32 %20 to i16, !dbg !14
  %extelt.offset = lshr i32 %20, 16, !dbg !14
  %25 = trunc i32 %extelt.offset to i16, !dbg !14
  %26 = trunc i32 %21 to i16, !dbg !14
  %extelt.offset1 = lshr i32 %21, 16, !dbg !14
  %27 = trunc i32 %extelt.offset1 to i16, !dbg !14
  %28 = trunc i32 %22 to i16, !dbg !14
  %extelt.offset2 = lshr i32 %22, 16, !dbg !14
  %29 = trunc i32 %extelt.offset2 to i16, !dbg !14
  %30 = trunc i32 %23 to i16, !dbg !14
  %extelt.offset3 = lshr i32 %23, 16, !dbg !14
  %31 = trunc i32 %extelt.offset3 to i16, !dbg !14

  ; --- Write the eight values to @global_smem ---------------------------------
  ; Destination i16 indices are %6 | k for k = 0 .. 7. %6 is a multiple of 8,
  ; so these are eight consecutive slots. Each value is stored as a
  ; single-element <1 x i16> vector.
  %32 = zext nneg i32 %6 to i64, !dbg !15
  %33 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %32, !dbg !15
  %34 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !15
  store <1 x i16> %34, ptr addrspace(3) %33, align 2, !dbg !15
  %35 = or i32 %6, 1, !dbg !15
  %36 = zext nneg i32 %35 to i64, !dbg !15
  %37 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %36, !dbg !15
  %38 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !15
  store <1 x i16> %38, ptr addrspace(3) %37, align 2, !dbg !15
  %39 = or i32 %6, 2, !dbg !15
  %40 = zext nneg i32 %39 to i64, !dbg !15
  %41 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %40, !dbg !15
  %42 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !15
  store <1 x i16> %42, ptr addrspace(3) %41, align 2, !dbg !15
  %43 = or i32 %6, 3, !dbg !15
  %44 = zext nneg i32 %43 to i64, !dbg !15
  %45 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %44, !dbg !15
  %46 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !15
  store <1 x i16> %46, ptr addrspace(3) %45, align 2, !dbg !15
  %47 = or i32 %6, 4, !dbg !15
  %48 = zext nneg i32 %47 to i64, !dbg !15
  %49 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %48, !dbg !15
  %50 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !15
  store <1 x i16> %50, ptr addrspace(3) %49, align 2, !dbg !15
  %51 = or i32 %6, 5, !dbg !15
  %52 = zext nneg i32 %51 to i64, !dbg !15
  %53 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %52, !dbg !15
  %54 = insertelement <1 x i16> undef, i16 %29, i64 0, !dbg !15
  store <1 x i16> %54, ptr addrspace(3) %53, align 2, !dbg !15
  %55 = or i32 %6, 6, !dbg !15
  %56 = zext nneg i32 %55 to i64, !dbg !15
  %57 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %56, !dbg !15
  %58 = insertelement <1 x i16> undef, i16 %30, i64 0, !dbg !15
  store <1 x i16> %58, ptr addrspace(3) %57, align 2, !dbg !15
  %59 = or i32 %6, 7, !dbg !15
  %60 = zext nneg i32 %59 to i64, !dbg !15
  %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !15
  %62 = insertelement <1 x i16> undef, i16 %31, i64 0, !dbg !15
  store <1 x i16> %62, ptr addrspace(3) %61, align 2, !dbg !15

  ; The barrier separates the writes above from the reads below, which touch
  ; slots written by other threads.
  tail call void @llvm.nvvm.barrier0(), !dbg !15

  ; --- Read back in the store-side arrangement ---------------------------------
  ; First group: i16 indices %7 | k for k = 0 .. 3 (%7 is a multiple of 4).
  %63 = zext nneg i32 %7 to i64, !dbg !15
  %64 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %63, !dbg !15
  %65 = load i16, ptr addrspace(3) %64, align 2, !dbg !15
  %66 = or i32 %7, 1, !dbg !15
  %67 = zext nneg i32 %66 to i64, !dbg !15
  %68 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %67, !dbg !15
  %69 = load i16, ptr addrspace(3) %68, align 2, !dbg !15
  %70 = or i32 %7, 2, !dbg !15
  %71 = zext nneg i32 %70 to i64, !dbg !15
  %72 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %71, !dbg !15
  %73 = load i16, ptr addrspace(3) %72, align 2, !dbg !15
  %74 = or i32 %7, 3, !dbg !15
  %75 = zext nneg i32 %74 to i64, !dbg !15
  %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !15
  %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !15
  ; Second group: i16 indices %8 (= %7 | 512), %7 | 513, %7 | 514, %7 | 515.
  %78 = zext nneg i32 %8 to i64, !dbg !15
  %79 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %78, !dbg !15
  %80 = load i16, ptr addrspace(3) %79, align 2, !dbg !15
  %81 = or i32 %7, 513, !dbg !15
  %82 = zext nneg i32 %81 to i64, !dbg !15
  %83 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %82, !dbg !15
  %84 = load i16, ptr addrspace(3) %83, align 2, !dbg !15
  %85 = or i32 %7, 514, !dbg !15
  %86 = zext nneg i32 %85 to i64, !dbg !15
  %87 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %86, !dbg !15
  %88 = load i16, ptr addrspace(3) %87, align 2, !dbg !15
  %89 = or i32 %7, 515, !dbg !15
  %90 = zext nneg i32 %89 to i64, !dbg !15
  %91 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %90, !dbg !15
  %92 = load i16, ptr addrspace(3) %91, align 2, !dbg !15

  ; --- bf16 -> f32 conversion ---------------------------------------------------
  ; Each 16-bit value is passed through the PTX instruction cvt.f32.bf16
  ; ("h" = 16-bit register input, "=r" = 32-bit register output).
  %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %65) #2, !dbg !15
  %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #2, !dbg !15
  %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #2, !dbg !15
  %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !15
  %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %80) #2, !dbg !15
  %98 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #2, !dbg !15
  %99 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %88) #2, !dbg !15
  %100 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %92) #2, !dbg !15

  ; --- Predicated 128-bit stores to %1 -------------------------------------------
  ; Output addresses are %1 + %12 and %1 + %13, in float elements.
  %101 = sext i32 %12 to i64, !dbg !16
  %102 = getelementptr float, ptr addrspace(1) %1, i64 %101, !dbg !16
  %103 = sext i32 %13 to i64, !dbg !16
  %104 = getelementptr float, ptr addrspace(1) %1, i64 %103, !dbg !16
  ; The floats are bitcast to i32 because the asm takes "r" (32-bit integer
  ; register) operands; the bit patterns are unchanged. The store executes
  ; only when predicate $5 (%15) is true.
  %105 = bitcast float %93 to i32, !dbg !17
  %106 = bitcast float %94 to i32, !dbg !17
  %107 = bitcast float %95 to i32, !dbg !17
  %108 = bitcast float %96 to i32, !dbg !17
  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %105, i32 %106, i32 %107, i32 %108, ptr addrspace(1) %102, i1 %15) #2, !dbg !17
  ; Second vector, guarded by %16.
  %109 = bitcast float %97 to i32, !dbg !17
  %110 = bitcast float %98 to i32, !dbg !17
  %111 = bitcast float %99 to i32, !dbg !17
  %112 = bitcast float %100 to i32, !dbg !17
  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %109, i32 %110, i32 %111, i32 %112, ptr addrspace(1) %104, i1 %16) #2, !dbg !17
  ret void, !dbg !18
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #1

; Attribute groups: #0 is used by the tid.x intrinsic, #1 by the barrier
; intrinsic, and #2 by every inline-asm call site in the kernel.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind }
attributes #2 = { nounwind }

!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; !3 tags the function as a kernel and !4 sets maxntidx to 128; the list names
; each of the two nodes twice.
!nvvm.annotations = !{!3, !4, !4, !3}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; Source locations in the file named by !2, referenced by the !dbg attachments
; above.
!8 = !DILocation(line: 21, column: 36, scope: !5)
!9 = !DILocation(line: 20, column: 28, scope: !5)
!10 = !DILocation(line: 20, column: 33, scope: !5)
!11 = !DILocation(line: 21, column: 23, scope: !5)
!12 = !DILocation(line: 22, column: 21, scope: !5)
!13 = !DILocation(line: 24, column: 30, scope: !5)
!14 = !DILocation(line: 24, column: 35, scope: !5)
!15 = !DILocation(line: 24, column: 45, scope: !5)
!16 = !DILocation(line: 26, column: 25, scope: !5)
!17 = !DILocation(line: 26, column: 36, scope: !5)
!18 = !DILocation(line: 26, column: 4, scope: !5)