; ModuleID = 'LLVMDialectModule'
;
; LLVM IR for a single GPU kernel (producer: "triton", see !1). The !dbg
; locations whose scope is !5 point into the generated Python file named in !2;
; the inlined ones point into standard.py (!26) and triton_helpers.py (!34).
source_filename = "LLVMDialectModule"

; Zero-length, externally defined byte array in addrspace(3). The kernel uses
; its first floats as scratch for st.shared/ld.shared during the reductions.
@global_smem = external addrspace(3) global [0 x i8]

; Kernel entry point (marked "kernel" in !3, "maxntidx" 64 in !4).
;
; Arguments, as used by the body below:
;   %0  float buffer, read (v4 load) and written back (v4 store) at the same
;       per-thread address.
;   %1  16-bit elements, loaded and widened with cvt.f32.bf16.
;   %2  float buffer, indexed by the in-block offset only (no ctaid term).
;   %3  float buffer, indexed by the full element index.
;   %4  float buffer, one element per block (indexed by ctaid.x).
;   %5  16-bit output, written with cvt.rn.bf16.f32 results.
;   %6, %7  i32 values that are not referenced anywhere in the body.
;
; Per element, with a = f32(%1[i]) * %2[j], b = %3[i], s = %4[ctaid.x]:
;   out = (s / 256) * (a * 256 - SUM(a) - b * SUM(a * b)) + %0[i]
; where each SUM is a sum across the threads of the block (warp shuffles plus
; a shared-memory step). `out` is stored to %0[i] as f32 and to %5[i] as bf16.
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !5 {
  ; ---- Thread / block indexing -------------------------------------------
  ; %9  = tid.x
  ; %10 = tid.x & 31         (lane within the warp)
  ; %12 = (tid.x >> 5) & 1   (warp index, restricted to 0 or 1)
  ; %13 = (tid.x << 2) & 252 (in-block element offset, a multiple of 4)
  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
  %10 = and i32 %9, 31, !dbg !8
  %11 = lshr i32 %9, 5, !dbg !8
  %12 = and i32 %11, 1, !dbg !8
  %urem = shl i32 %9, 2, !dbg !8
  %13 = and i32 %urem, 252, !dbg !8
  ; %14 = ctaid.x; %16 = ctaid.x * 256 + in-block offset (the low 8 bits of
  ; %15 are zero, so `or` acts as an add here).
  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
  %15 = shl i32 %14, 8, !dbg !10
  %16 = or i32 %15, %13, !dbg !11
  %17 = sext i32 %16 to i64, !dbg !12

  ; ---- Load 4 x 16-bit values from %1 and widen to f32 -------------------
  ; The asm zero-initialises both outputs, then does a predicated v2.b32 load.
  ; Every predicate operand passed here is the constant `i1 true`.
  %18 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !12
  %19 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
  %20 = extractvalue { i32, i32 } %19, 0, !dbg !13
  %21 = extractvalue { i32, i32 } %19, 1, !dbg !13
  ; Split each 32-bit word into its low and high 16-bit halves.
  %22 = trunc i32 %20 to i16, !dbg !13
  %extelt.offset = lshr i32 %20, 16, !dbg !13
  %23 = trunc i32 %extelt.offset to i16, !dbg !13
  %24 = trunc i32 %21 to i16, !dbg !13
  %extelt.offset1 = lshr i32 %21, 16, !dbg !13
  %25 = trunc i32 %extelt.offset1 to i16, !dbg !13
  %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #3, !dbg !14
  %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
  %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
  %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14

  ; ---- Load 4 floats from %2 (in-block offset only, L1::evict_last) ------
  %30 = zext nneg i32 %13 to i64, !dbg !15
  %31 = getelementptr float, ptr addrspace(1) %2, i64 %30, !dbg !15
  %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %31, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
  %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !16
  %34 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !16
  %35 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !16
  %36 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !16
  %37 = bitcast i32 %33 to float, !dbg !16
  %38 = bitcast i32 %34 to float, !dbg !16
  %39 = bitcast i32 %35 to float, !dbg !16
  %40 = bitcast i32 %36 to float, !dbg !16

  ; ---- Load 4 floats from %3 (full element index) ------------------------
  %41 = getelementptr float, ptr addrspace(1) %3, i64 %17, !dbg !17
  %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
  %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !18
  %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !18
  %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !18
  %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !18
  %47 = bitcast i32 %43 to float, !dbg !18
  %48 = bitcast i32 %44 to float, !dbg !18
  %49 = bitcast i32 %45 to float, !dbg !18
  %50 = bitcast i32 %46 to float, !dbg !18

  ; ---- Load 4 floats from %0 (same address is stored to further down) ----
  %51 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !19
  %52 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %51, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
  %53 = extractvalue { i32, i32, i32, i32 } %52, 0, !dbg !20
  %54 = extractvalue { i32, i32, i32, i32 } %52, 1, !dbg !20
  %55 = extractvalue { i32, i32, i32, i32 } %52, 2, !dbg !20
  %56 = extractvalue { i32, i32, i32, i32 } %52, 3, !dbg !20
  %57 = bitcast i32 %53 to float, !dbg !20
  %58 = bitcast i32 %54 to float, !dbg !20
  %59 = bitcast i32 %55 to float, !dbg !20
  %60 = bitcast i32 %56 to float, !dbg !20

  ; ---- Load the per-block scalar from %4[ctaid.x] ------------------------
  ; The same address %62 is loaded four times, once per element slot.
  ; NOTE(review): only %64 reaches the final result (through %148). %66, %68
  ; and %70 are used solely by %149-%151, which have no users in this function.
  %61 = sext i32 %14 to i64, !dbg !21
  %62 = getelementptr float, ptr addrspace(1) %4, i64 %61, !dbg !21
  %63 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
  %64 = bitcast i32 %63 to float, !dbg !22
  %65 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
  %66 = bitcast i32 %65 to float, !dbg !22
  %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
  %68 = bitcast i32 %67 to float, !dbg !22
  %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
  %70 = bitcast i32 %69 to float, !dbg !22

  ; ---- a = f32(%1[i]) * %2[j] for the four elements ----------------------
  %71 = fmul float %26, %37, !dbg !23
  %72 = fmul float %27, %38, !dbg !23
  %73 = fmul float %28, %39, !dbg !23
  %74 = fmul float %29, %40, !dbg !23

  ; ---- First reduction: SUM(a) -------------------------------------------
  ; Per-thread partial sum of the four products.
  %75 = fadd float %71, %72, !dbg !24
  %76 = fadd float %73, %75, !dbg !24
  %77 = fadd float %74, %76, !dbg !24
  ; Butterfly shuffle-and-add with lane offsets 16, 8, 4, 2, 1 (member mask -1,
  ; clamp 31). Floats are bitcast to i32 because the intrinsic is the i32 form.
  %78 = bitcast float %77 to i32, !dbg !30
  %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !30
  %80 = bitcast i32 %79 to float, !dbg !30
  %81 = fadd float %77, %80, !dbg !24
  %82 = bitcast float %81 to i32, !dbg !30
  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !30
  %84 = bitcast i32 %83 to float, !dbg !30
  %85 = fadd float %81, %84, !dbg !24
  %86 = bitcast float %85 to i32, !dbg !30
  %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !30
  %88 = bitcast i32 %87 to float, !dbg !30
  %89 = fadd float %85, %88, !dbg !24
  %90 = bitcast float %89 to i32, !dbg !30
  %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !30
  %92 = bitcast i32 %91 to float, !dbg !30
  %93 = fadd float %89, %92, !dbg !24
  %94 = bitcast float %93 to i32, !dbg !30
  %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !30
  %96 = bitcast i32 %95 to float, !dbg !30
  %97 = fadd float %93, %96, !dbg !24
  ; Lane 0 (%98) stores the shuffled sum %97 to float slot %12 of @global_smem.
  %98 = icmp eq i32 %10, 0, !dbg !30
  %99 = zext nneg i32 %12 to i64, !dbg !30
  %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !30
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %97, i1 %98) #3, !dbg !30
  tail call void @llvm.nvvm.barrier0(), !dbg !30
  ; Threads with tid.x < 2 (%101) load float slot tid.x; the load is predicated,
  ; so the asm does not write %104 for any other thread.
  %101 = icmp slt i32 %9, 2, !dbg !30
  %102 = sext i32 %9 to i64, !dbg !30
  %103 = getelementptr float, ptr addrspace(3) @global_smem, i64 %102, !dbg !30
  %104 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !30
  ; One more butterfly step (offset 1) adds the two slots together.
  %105 = bitcast float %104 to i32, !dbg !30
  %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !30
  %107 = bitcast i32 %106 to float, !dbg !30
  %108 = fadd float %104, %107, !dbg !24
  ; %111 = (tid.x < 2) && (tid.x even): the store below is predicated on it and
  ; targets slot tid.x, so the combined sum lands in slot 0.
  %109 = and i32 %9, 1, !dbg !30
  %110 = icmp eq i32 %109, 0, !dbg !30
  %111 = and i1 %101, %110, !dbg !30
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %108, i1 %111) #3, !dbg !30
  tail call void @llvm.nvvm.barrier0(), !dbg !30
  ; Every thread reads slot 0. %113 = SUM(a) + 0.0.
  %112 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !30
  %113 = fadd float %112, 0.000000e+00, !dbg !32

  ; ---- Second reduction: SUM(a * b), with b = %3[i] ----------------------
  %114 = fmul float %71, %47, !dbg !36
  %115 = fmul float %72, %48, !dbg !36
  %116 = fmul float %73, %49, !dbg !36
  %117 = fmul float %74, %50, !dbg !36
  ; Barrier between the read of slot 0 above and the st.shared below that
  ; reuses the same slots (presumably to order the two; confirm before moving).
  tail call void @llvm.nvvm.barrier0(), !dbg !37
  %118 = fadd float %114, %115, !dbg !39
  %119 = fadd float %116, %118, !dbg !39
  %120 = fadd float %117, %119, !dbg !39
  ; Same butterfly sequence as the first reduction (offsets 16, 8, 4, 2, 1).
  %121 = bitcast float %120 to i32, !dbg !37
  %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !37
  %123 = bitcast i32 %122 to float, !dbg !37
  %124 = fadd float %120, %123, !dbg !39
  %125 = bitcast float %124 to i32, !dbg !37
  %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 8, i32 31), !dbg !37
  %127 = bitcast i32 %126 to float, !dbg !37
  %128 = fadd float %124, %127, !dbg !39
  %129 = bitcast float %128 to i32, !dbg !37
  %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 4, i32 31), !dbg !37
  %131 = bitcast i32 %130 to float, !dbg !37
  %132 = fadd float %128, %131, !dbg !39
  %133 = bitcast float %132 to i32, !dbg !37
  %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 2, i32 31), !dbg !37
  %135 = bitcast i32 %134 to float, !dbg !37
  %136 = fadd float %132, %135, !dbg !39
  %137 = bitcast float %136 to i32, !dbg !37
  %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !37
  %139 = bitcast i32 %138 to float, !dbg !37
  %140 = fadd float %136, %139, !dbg !39
  ; Same store / barrier / load / combine / store / barrier sequence as above,
  ; reusing the addresses %100 and %103 and the predicates %98, %101 and %111.
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %140, i1 %98) #3, !dbg !37
  tail call void @llvm.nvvm.barrier0(), !dbg !37
  %141 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !37
  %142 = bitcast float %141 to i32, !dbg !37
  %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 1, i32 31), !dbg !37
  %144 = bitcast i32 %143 to float, !dbg !37
  %145 = fadd float %141, %144, !dbg !39
  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %145, i1 %111) #3, !dbg !37
  tail call void @llvm.nvvm.barrier0(), !dbg !37
  ; %147 = SUM(a * b) + 0.0.
  %146 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
  %147 = fadd float %146, 0.000000e+00, !dbg !42

  ; ---- Element-wise epilogue ---------------------------------------------
  ; scale = s / 256 via div.full.f32. Only %148 is consumed below.
  %148 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %64, float 2.560000e+02) #3, !dbg !44
  %149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %66, float 2.560000e+02) #3, !dbg !44
  %150 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %68, float 2.560000e+02) #3, !dbg !44
  %151 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %70, float 2.560000e+02) #3, !dbg !44
  ; a * 256
  %152 = fmul float %71, 2.560000e+02, !dbg !45
  %153 = fmul float %72, 2.560000e+02, !dbg !45
  %154 = fmul float %73, 2.560000e+02, !dbg !45
  %155 = fmul float %74, 2.560000e+02, !dbg !45
  ; ... - SUM(a)
  %156 = fsub float %152, %113, !dbg !46
  %157 = fsub float %153, %113, !dbg !46
  %158 = fsub float %154, %113, !dbg !46
  %159 = fsub float %155, %113, !dbg !46
  ; SUM(a * b) * b
  %160 = fmul float %147, %47, !dbg !47
  %161 = fmul float %147, %48, !dbg !47
  %162 = fmul float %147, %49, !dbg !47
  %163 = fmul float %147, %50, !dbg !47
  ; (a * 256 - SUM(a)) - SUM(a * b) * b
  %164 = fsub float %156, %160, !dbg !48
  %165 = fsub float %157, %161, !dbg !48
  %166 = fsub float %158, %162, !dbg !48
  %167 = fsub float %159, %163, !dbg !48
  ; ... * scale
  %168 = fmul float %148, %164, !dbg !49
  %169 = fmul float %148, %165, !dbg !49
  %170 = fmul float %148, %166, !dbg !49
  %171 = fmul float %148, %167, !dbg !49
  ; ... + value previously loaded from %0
  %172 = fadd float %168, %57, !dbg !50
  %173 = fadd float %169, %58, !dbg !50
  %174 = fadd float %170, %59, !dbg !50
  %175 = fadd float %171, %60, !dbg !50

  ; ---- Store the f32 results back to %0 (address %51) --------------------
  %176 = bitcast float %172 to i32, !dbg !51
  %177 = bitcast float %173 to i32, !dbg !51
  %178 = bitcast float %174 to i32, !dbg !51
  %179 = bitcast float %175 to i32, !dbg !51
  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %176, i32 %177, i32 %178, i32 %179, ptr addrspace(1) %51, i1 true) #3, !dbg !51

  ; ---- Convert to bf16 (round-to-nearest) and store to %5 ----------------
  %180 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !52
  %181 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %172) #3, !dbg !53
  %182 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %173) #3, !dbg !53
  %183 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %174) #3, !dbg !53
  %184 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %175) #3, !dbg !53
  ; Pack pairs of 16-bit results into 32-bit words for the v2.b32 store. Both
  ; lanes of each <2 x i16> are overwritten, so the `undef` base never survives.
  %185 = insertelement <2 x i16> undef, i16 %181, i64 0, !dbg !53
  %186 = insertelement <2 x i16> %185, i16 %182, i64 1, !dbg !53
  %187 = bitcast <2 x i16> %186 to i32, !dbg !53
  %188 = insertelement <2 x i16> undef, i16 %183, i64 0, !dbg !53
  %189 = insertelement <2 x i16> %188, i16 %184, i64 1, !dbg !53
  %190 = bitcast <2 x i16> %189 to i32, !dbg !53
  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %187, i32 %190, ptr addrspace(1) %180, i1 true) #3, !dbg !53
  ret void, !dbg !54
}

; Reads the PTX special register tid.x.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Butterfly warp shuffle on an i32 payload; called above as
; (member mask, value, lane offset, clamp).
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1

; Barrier intrinsic used between the shared-memory stores and loads above.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #2

attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #2 = { convergent nocallback nounwind }
; #3 is attached to every inline-asm call site in the kernel.
attributes #3 = { nounwind }

!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; NOTE(review): !3 and !4 are each listed twice here.
!nvvm.annotations = !{!3, !4, !4, !3}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!2 = !DIFile(filename: "crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py", directory: "/tmp/torchinductor_root/rn")
; Kernel annotations: entry-point marker and "maxntidx" = 64.
!3 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; Source locations. Scope !5 is the kernel itself; !25/!27 are in standard.py
; (!26) and !33 is in triton_helpers.py (!34), each inlined at a kernel line.
!8 = !DILocation(line: 26, column: 26, scope: !5)
!9 = !DILocation(line: 23, column: 28, scope: !5)
!10 = !DILocation(line: 30, column: 40, scope: !5)
!11 = !DILocation(line: 30, column: 36, scope: !5)
!12 = !DILocation(line: 30, column: 30, scope: !5)
!13 = !DILocation(line: 30, column: 46, scope: !5)
!14 = !DILocation(line: 30, column: 67, scope: !5)
!15 = !DILocation(line: 31, column: 30, scope: !5)
!16 = !DILocation(line: 31, column: 35, scope: !5)
!17 = !DILocation(line: 32, column: 30, scope: !5)
!18 = !DILocation(line: 32, column: 46, scope: !5)
!19 = !DILocation(line: 33, column: 35, scope: !5)
!20 = !DILocation(line: 33, column: 51, scope: !5)
!21 = !DILocation(line: 34, column: 31, scope: !5)
!22 = !DILocation(line: 34, column: 36, scope: !5)
!23 = !DILocation(line: 36, column: 18, scope: !5)
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !28)
!25 = distinct !DILexicalBlockFile(scope: !27, file: !26, discriminator: 0)
!26 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
!27 = distinct !DILexicalBlockFile(scope: !5, file: !26, discriminator: 0)
!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
!29 = !DILocation(line: 39, column: 57, scope: !25)
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
!31 = !DILocation(line: 39, column: 57, scope: !27)
!32 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !35)
!33 = distinct !DILexicalBlockFile(scope: !5, file: !34, discriminator: 0)
!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
!35 = !DILocation(line: 39, column: 44, scope: !33)
!36 = !DILocation(line: 40, column: 18, scope: !5)
!37 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !38)
!38 = !DILocation(line: 43, column: 59, scope: !27)
!39 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !40)
!40 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !41)
!41 = !DILocation(line: 43, column: 59, scope: !25)
!42 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !43)
!43 = !DILocation(line: 43, column: 45, scope: !33)
!44 = !DILocation(line: 45, column: 20, scope: !5)
!45 = !DILocation(line: 46, column: 19, scope: !5)
!46 = !DILocation(line: 47, column: 20, scope: !5)
!47 = !DILocation(line: 48, column: 19, scope: !5)
!48 = !DILocation(line: 49, column: 20, scope: !5)
!49 = !DILocation(line: 50, column: 20, scope: !5)
!50 = !DILocation(line: 51, column: 20, scope: !5)
!51 = !DILocation(line: 53, column: 51, scope: !5)
!52 = !DILocation(line: 54, column: 25, scope: !5)
!53 = !DILocation(line: 54, column: 48, scope: !5)
!54 = !DILocation(line: 54, column: 4, scope: !5)