diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git 
a/build/torch24-cxx11-cu118-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. - try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so 
b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 749c8ab7f0013f6742f824035512f6ab106098f9..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c1fc3d94e9a7d4b7c0cf13dd3a9633efef2ed265222b17b22d32282818b7bd1 -size 84165576 diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..e62b3fae918c3875e929ccd2454dfed4c789a4d5 --- /dev/null +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6330aa66b63067a8c9c031419773dc47e8853a717ef20b03c57df76660188831 +size 84165640 diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..277a7442e4587af84f3d834a442a703b02572322 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_wtjc356yopxde +ops = torch.ops._moe_wtjc356yopxde def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_wtjc356yopxde::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + 
+from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except 
ImportError as e: - # Fallback for local development. - try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 96366cea06aa3fbd657651cf78a2cb8698925a61..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd5492f9d9216ee88cfc40f373b19207c8e5f04ba8c55c58aec3ecc9f9ad3239 -size 84364440 diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..2101d07ed8a1aabff3ad73ca6252cc698f422bf2 --- /dev/null +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:b0ca4f733821a564c525a36bb13e35ae960dc1e20f6472b177f67b9b165597ff +size 84364504 diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..3a1bc84783bcd053d6474360806eb23081de895d 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_fidhfyl4jgbje +ops = torch.ops._moe_fidhfyl4jgbje def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_fidhfyl4jgbje::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + 
size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index c816881379ec38d7a730448f541cad9d01d964ba..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c1dd7f6fb98ad1ed39a402e1e42f3231645949dcc5cef28739f4e093883e0184 -size 84063064 diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..a5ae5afb924ad8d777a912234c843c59effef3d6 --- /dev/null +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0864e745883f687c46c9ce743f1e2887113734c57268b9bc0e290185be28cf65 +size 84063128 diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..2438914a584da6996cb28867d034bdc2bce6d85b 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_sg5gu4g3brle6 +ops = torch.ops._moe_sg5gu4g3brle6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_sg5gu4g3brle6::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index d16040f32bd65235ff086cd1651afc886107d76d..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1994e01d53c190da58a4a864b648421b515e2171abd320184164507e1aa4f1fe -size 84157816 diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..26f200bcf4d9f945d1c55a64cdac19a5f1c0f427 --- /dev/null +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e83b7db92da1ee38a3a4e5a453d4279024e6af95efcf0ad4b34e275029e44729 +size 84157912 diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..96ff0854463fbc98464606eca0d575b3fbad23e6 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_v3wdnwni3a5ce +ops = torch.ops._moe_v3wdnwni3a5ce def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_v3wdnwni3a5ce::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index a7e492cf6a813e48fc6edbdf38e6ed79b0e0a6c4..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5bd45d6fb85953a97cf3b6ceecee61b3298a3b6d1b46708ca6618689f63d6aa9 -size 84360896 diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..1fc65a47efca9e162e237d0eb070b50aa7374028 --- /dev/null +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0d1b063e4c52f5d744025e000fd79c5f41cdf56a32883c2d269b9c59f586c9e4 +size 84360992 diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..0405f57430cc58455e5c76beb87ceec9af3b19be 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_hrq7opevcb4ug +ops = torch.ops._moe_hrq7opevcb4ug def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_hrq7opevcb4ug::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 9edacbfbf47724f264d163efa8699e866fadc548..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:973886e7a4e11ba2161ffe3034cdc52323321f712463b8378dbb6fc4c420b934 -size 84059552 diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..a88d47ebcbc5522200d9a0b69cf9edd5ea99627c --- /dev/null +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d820072815ae83e3e838eb14f01356a09eeb9a5367851d5f6038d7346fd64564 +size 84059584 diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..5e40c76afde362dcd0401eadb7f9c61cdacb3c1c 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_p3swbnotpexcc +ops = torch.ops._moe_p3swbnotpexcc def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_p3swbnotpexcc::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index efdbe18abfeb7225a15b17cdc7c3c94c821352a9..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d9e6d3dc978ae8aee87335a292d4ee55278658dabc3319829f3d4a7722de303c -size 84165608 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..4eea5cfbef621fbeca58815ddcc542fba180a64f --- /dev/null +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e5defb7114c1ba9cfdb740230057cb0c5cb21efe628840771db32494a89b5aa7 +size 84165672 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..5bd9a9581e86fee5f992b4355e04fc0eb148de1d 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_nskz7v224zllw +ops = torch.ops._moe_nskz7v224zllw def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_nskz7v224zllw::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index dbfb190d598af93eb0f164652159a2f8b2517505..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:35112cbe69729f9843c91eda4acc549df354d09f9b3fbfaf704820cefc5ffd86 -size 84364440 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..5f165e4440c502173857a71eb64f85bb861c9df6 --- /dev/null +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8094d225249868d1f1c0abbfe8db3a486a99bd1f0928705e7dd5a998f125d8bf +size 84364504 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..8df4ba928a7302037d3fdf29eb3aef0360610cc2 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_t32bhzwhzero6 +ops = torch.ops._moe_t32bhzwhzero6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_t32bhzwhzero6::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index adf24bedf870eed4989a31fdd628c816e2383ecb..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:554ef8777913b7c73fd3d8aeeb08e441dc189d26765676a56f5d704f05e4846e -size 84063096 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..3838db93c75ca2488554de6993944421f63e50d9 --- /dev/null +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:23f0aec499051a34ed7ba7ac4e58d7d84c5501b8beb1794d6ae8c13f54b08b9e +size 84063160 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..e67d8f0575a8eb529b25cbb908fcf9fd9badcddc 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_pgljmg5ek5k4e +ops = torch.ops._moe_pgljmg5ek5k4e def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_pgljmg5ek5k4e::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 0efa2f346fa7d739514c4da79fd488ac5c17a8fa..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:96216ac120dbf99500906eaa6beeaf30c03e07044c0e394e6a83be25a4e184ce -size 84157824 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..55b89014e1499c090462b343a113029d3a965a3c --- /dev/null +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b1eef7e6a15aca930caa813a845147beeec16159c8cce89891c40d080a6f3062 +size 84157880 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..f4081f5f2ab5660696b4e3fb2fd24f7b9efa4a9e 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_dtibz76vuxaaq +ops = torch.ops._moe_dtibz76vuxaaq def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_dtibz76vuxaaq::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 93c7ec969117432c163862a4536d07f1d4ffd584..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:570d15a2c3120695fada586323820f6b3913e514d3d495680fe5cb716445a851 -size 84360896 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..987c04fc1787c3fb4c9459ff25a436075b7dd0b7 --- /dev/null +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:704adc83ab06534f1af22b829003765b42c118df3790569b346ef36e7be570de +size 84360960 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..45d8b48ee178ee36ad526495ece27b491b8a2947 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_plblvprmwqffy +ops = torch.ops._moe_plblvprmwqffy def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_plblvprmwqffy::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 5df775039c5dbea7f14c9c6507bbdf7da46c46f2..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a1cc0068fc943693e8c39c2b43b147584ee43a3046629583a95bfb4244fdf2a -size 84059520 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..d7b008bfd1f3eef97c4c870812388339b2baf82d --- /dev/null +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:259f926d53dc10e91ef41311f61bcea93fbdbda94758fdca164b37256f9c69de +size 84059616 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..42eb05054c3c7990ce1ea4ea25d709eb0f41211a 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_k6bmwmtgkqymw +ops = torch.ops._moe_k6bmwmtgkqymw def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_k6bmwmtgkqymw::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
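
Across every build variant in the diff above, the surface previously exposed through moe._custom_ops now lives directly in each variant's moe/__init__.py, the compiled extension is renamed from _moe_0_0_1 to a per-build hashed module (_moe_p3swbnotpexcc, _moe_nskz7v224zllw, ...), and fused_marlin_moe.py / fused_moe.py switch from `import moe._custom_ops as ops` to `import moe as ops`. The sketch below shows how a caller might exercise the relocated wrappers; it is an illustrative assumption rather than part of the packaged code — the tensor shapes, sizes, dtype, and device are made up for the example, and it presumes a CUDA device plus one of the prebuilt wheels being importable as `moe`.

import torch
import moe  # replaces the old `import moe._custom_ops as ops` pattern

# Hypothetical sizes, chosen only for illustration.
num_tokens, hidden, topk = 8, 256, 2

# silu_and_mul (defined in __init__.py above) is assumed here to take the
# concatenated gate/up projections: x of shape (..., 2 * d), out of shape (..., d).
x = torch.randn(num_tokens, 2 * hidden, dtype=torch.float16, device="cuda")
out = torch.empty(num_tokens, hidden, dtype=torch.float16, device="cuda")
moe.silu_and_mul(out, x)

# moe_sum is assumed to reduce per-expert outputs per token:
# (num_tokens, topk, hidden) summed into (num_tokens, hidden).
expert_out = torch.randn(num_tokens, topk, hidden, dtype=torch.float16, device="cuda")
summed = torch.empty(num_tokens, hidden, dtype=torch.float16, device="cuda")
moe.moe_sum(expert_out, summed)

The same pattern applies to the other re-exported entry points (moe_align_block_size, topk_softmax, fused_moe, fused_marlin_moe, and the *_marlin_moe_repack helpers): they are now attributes of the moe package itself, which is why the in-tree modules can simply alias the package as `ops`.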