; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

; Externally defined, zero-length byte array in address space 3 (the shared
; address space on NVPTX). The kernel below uses it as a staging buffer; its
; real size is not declared in this module.
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]

; Kernel: reads 16-bit values through %0, converts each one from bf16 to f32,
; and writes the 32-bit results through %1.
;
;   %0  address-space-1 pointer, indexed as i16 (input)
;   %1  address-space-1 pointer, indexed as float (output)
;   %2  i32 argument that is not referenced anywhere in the body
;
; Per block (ctaid.x) the kernel covers 1024 consecutive elements. Each thread
; loads 8 contiguous input elements, stages them in @global_smem, and after a
; barrier picks up two groups of 4 elements (at 4*t and 512 + 4*t) so that
; every output store writes 4 consecutive floats.
; There are no bounds checks: both the load and the stores are predicated on
; the constant `i1 true`.
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
  ; --- Index computation -------------------------------------------------
  ; %5 = tid.x masked to 0..127; %6 = %5 * 8 (load offset);
  ; %7 = %5 * 4 and %8 = %7 | 512 (the two store offsets inside the block).
  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
  %5 = and i32 %4, 127, !dbg !8
  %6 = shl nuw nsw i32 %5, 3, !dbg !8
  %7 = shl nuw nsw i32 %5, 2, !dbg !8
  %8 = or i32 %7, 512, !dbg !8
  ; %9 = ctaid.x read through inline PTX; %10 = %9 * 1024 is the block base.
  %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
  %10 = shl i32 %9, 10, !dbg !10
  ; The low 10 bits of %10 are zero and %6/%7/%8 are all below 1024, so each
  ; `or` here yields the same value as an add.
  %11 = or i32 %10, %6, !dbg !11
  %12 = or i32 %10, %7, !dbg !11
  %13 = or i32 %10, %8, !dbg !11

  ; --- Input load --------------------------------------------------------
  ; One ld.global.v4.b32 (16 bytes = 8 x i16) starting at %0[%11]. The four
  ; destination registers are zeroed first; the load predicate ($5) is the
  ; constant true, so the load always executes.
  %14 = sext i32 %11 to i64, !dbg !12
  %15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12
  %16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13
  %17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13
  %18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13
  %19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13
  %20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13
  ; Split every 32-bit word into its low (trunc) and high (lshr 16 + trunc)
  ; 16-bit halves, giving the 8 individual elements %21..%28.
  %21 = trunc i32 %17 to i16, !dbg !13
  %extelt.offset = lshr i32 %17, 16, !dbg !13
  %22 = trunc i32 %extelt.offset to i16, !dbg !13
  %23 = trunc i32 %18 to i16, !dbg !13
  %extelt.offset1 = lshr i32 %18, 16, !dbg !13
  %24 = trunc i32 %extelt.offset1 to i16, !dbg !13
  %25 = trunc i32 %19 to i16, !dbg !13
  %extelt.offset2 = lshr i32 %19, 16, !dbg !13
  %26 = trunc i32 %extelt.offset2 to i16, !dbg !13
  %27 = trunc i32 %20 to i16, !dbg !13
  %extelt.offset3 = lshr i32 %20, 16, !dbg !13
  %28 = trunc i32 %extelt.offset3 to i16, !dbg !13

  ; --- Stage the 8 elements in @global_smem ------------------------------
  ; Element k of this thread goes to i16 index %6 + k (k = 0..7). %6 is a
  ; multiple of 8, so `or i32 %6, k` equals %6 + k.
  %29 = zext nneg i32 %6 to i64, !dbg !14
  %30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14
  %31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14
  store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14
  %32 = or i32 %6, 1, !dbg !14
  %33 = zext nneg i32 %32 to i64, !dbg !14
  %34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14
  %35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14
  store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14
  %36 = or i32 %6, 2, !dbg !14
  %37 = zext nneg i32 %36 to i64, !dbg !14
  %38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14
  %39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14
  store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14
  %40 = or i32 %6, 3, !dbg !14
  %41 = zext nneg i32 %40 to i64, !dbg !14
  %42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14
  %43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14
  store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14
  %44 = or i32 %6, 4, !dbg !14
  %45 = zext nneg i32 %44 to i64, !dbg !14
  %46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14
  %47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14
  store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14
  %48 = or i32 %6, 5, !dbg !14
  %49 = zext nneg i32 %48 to i64, !dbg !14
  %50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14
  %51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14
  store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14
  %52 = or i32 %6, 6, !dbg !14
  %53 = zext nneg i32 %52 to i64, !dbg !14
  %54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14
  %55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14
  store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14
  %56 = or i32 %6, 7, !dbg !14
  %57 = zext nneg i32 %56 to i64, !dbg !14
  %58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14
  %59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14
  store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14

  ; The loads below read slots written by other threads, so all staging
  ; stores must be complete before any thread continues.
  tail call void @llvm.nvvm.barrier0(), !dbg !14

  ; --- Read back in the output layout ------------------------------------
  ; First group: i16 indices %7 + 0..3 (%7 is a multiple of 4, so `or` == add).
  %60 = zext nneg i32 %7 to i64, !dbg !14
  %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14
  %62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14
  %63 = or i32 %7, 1, !dbg !14
  %64 = zext nneg i32 %63 to i64, !dbg !14
  %65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14
  %66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14
  %67 = or i32 %7, 2, !dbg !14
  %68 = zext nneg i32 %67 to i64, !dbg !14
  %69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14
  %70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14
  %71 = or i32 %7, 3, !dbg !14
  %72 = zext nneg i32 %71 to i64, !dbg !14
  %73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14
  %74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14
  ; Second group: i16 indices %7 + 512..515 (%7 <= 508, so bit 9 is clear and
  ; `or` with 512..515 equals an add).
  %75 = zext nneg i32 %8 to i64, !dbg !14
  %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14
  %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14
  %78 = or i32 %7, 513, !dbg !14
  %79 = zext nneg i32 %78 to i64, !dbg !14
  %80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14
  %81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14
  %82 = or i32 %7, 514, !dbg !14
  %83 = zext nneg i32 %82 to i64, !dbg !14
  %84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14
  %85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14
  %86 = or i32 %7, 515, !dbg !14
  %87 = zext nneg i32 %86 to i64, !dbg !14
  %88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14
  %89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14

  ; --- bf16 -> f32 conversion --------------------------------------------
  ; One inline PTX `cvt.f32.bf16` per element; the i16 operand carries the raw
  ; bf16 bit pattern ("h" = 16-bit register constraint).
  %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14
  %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14
  %92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14
  %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14
  %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14
  %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14
  %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14
  %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14

  ; --- Output stores -----------------------------------------------------
  ; Two st.global.v4.b32, each writing 4 floats: one at %1[%12] and one at
  ; %1[%13] (512 elements further on). Floats are bitcast to i32 only to match
  ; the "r" operand constraints; both store predicates are the constant true.
  %98 = sext i32 %12 to i64, !dbg !15
  %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15
  %100 = sext i32 %13 to i64, !dbg !15
  %101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15
  %102 = bitcast float %90 to i32, !dbg !16
  %103 = bitcast float %91 to i32, !dbg !16
  %104 = bitcast float %92 to i32, !dbg !16
  %105 = bitcast float %93 to i32, !dbg !16
  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16
  %106 = bitcast float %94 to i32, !dbg !16
  %107 = bitcast float %95 to i32, !dbg !16
  %108 = bitcast float %96 to i32, !dbg !16
  %109 = bitcast float %97 to i32, !dbg !16
  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16
  ret void, !dbg !17
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #1

; #0: tid.x intrinsic, #1: barrier intrinsic, #2: every inline-asm call site.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind }
attributes #2 = { nounwind }

!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; !3 marks the function as a kernel, !4 sets maxntidx to 128; the list names
; each of them twice.
!nvvm.annotations = !{!3, !4, !4, !3}

; Debug info: every !DILocation below is a line/column in the file named by !2.
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py", directory: "/tmp/torchinductor_root/ot")
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
!8 = !DILocation(line: 21, column: 36, scope: !5)
!9 = !DILocation(line: 20, column: 28, scope: !5)
!10 = !DILocation(line: 20, column: 33, scope: !5)
!11 = !DILocation(line: 21, column: 23, scope: !5)
!12 = !DILocation(line: 24, column: 30, scope: !5)
!13 = !DILocation(line: 24, column: 35, scope: !5)
!14 = !DILocation(line: 24, column: 44, scope: !5)
!15 = !DILocation(line: 26, column: 25, scope: !5)
!16 = !DILocation(line: 26, column: 36, scope: !5)
!17 = !DILocation(line: 26, column: 4, scope: !5)