diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_w3lspmuramohg.abi3.so b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_w3lspmuramohg.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..3bb05eb2badd497715263c93571cb5c65f9bd9c1 --- /dev/null +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_w3lspmuramohg.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2faeea044dbfd59eaf429d039ae368ed0c3e500817ac1acaefb3720ceca1f5ea +size 84165672 diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so deleted file mode 100755 index e62b3fae918c3875e929ccd2454dfed4c789a4d5..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6330aa66b63067a8c9c031419773dc47e8853a717ef20b03c57df76660188831 -size 84165640 diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py index 277a7442e4587af84f3d834a442a703b02572322..3e252de8060a15717082e254a192b08e74eb7a74 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_wtjc356yopxde -ops = torch.ops._moe_wtjc356yopxde +from . import _moe_w3lspmuramohg +ops = torch.ops._moe_w3lspmuramohg def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_wtjc356yopxde::{op_name}" \ No newline at end of file + return f"_moe_w3lspmuramohg::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so deleted file mode 100755 index 2101d07ed8a1aabff3ad73ca6252cc698f422bf2..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0ca4f733821a564c525a36bb13e35ae960dc1e20f6472b177f67b9b165597ff -size 84364504 diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_xztwj3vfii47s.abi3.so b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_xztwj3vfii47s.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..aa064ee509da1cd2ff0319fae5b07211a8219d24 --- /dev/null +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_xztwj3vfii47s.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5d4bd811ee24dd293d42959e6d23d66dddcc186b2ede701ebcbf6d66705fe1 +size 84364504 diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py index 3a1bc84783bcd053d6474360806eb23081de895d..2844878a66bb4bd304bb63e45f6d9ff1ec7f5778 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_fidhfyl4jgbje -ops = torch.ops._moe_fidhfyl4jgbje +from . import _moe_xztwj3vfii47s +ops = torch.ops._moe_xztwj3vfii47s def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_fidhfyl4jgbje::{op_name}" \ No newline at end of file + return f"_moe_xztwj3vfii47s::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so deleted file mode 100755 index a5ae5afb924ad8d777a912234c843c59effef3d6..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0864e745883f687c46c9ce743f1e2887113734c57268b9bc0e290185be28cf65 -size 84063128 diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_zjfwjryvbxcss.abi3.so b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_zjfwjryvbxcss.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..10be8c15d732ad4b4e285d0495d410f033129abd --- /dev/null +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_zjfwjryvbxcss.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e33340a0b05f5776c1e5ef66e371b2c198dc00c03c810e2c4ef20923d7a417 +size 84063160 diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py index 2438914a584da6996cb28867d034bdc2bce6d85b..7af1a9c422e1e2d4f7a7887e0b5943b49ff4346f 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_sg5gu4g3brle6 -ops = torch.ops._moe_sg5gu4g3brle6 +from . import _moe_zjfwjryvbxcss +ops = torch.ops._moe_zjfwjryvbxcss def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_sg5gu4g3brle6::{op_name}" \ No newline at end of file + return f"_moe_zjfwjryvbxcss::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so deleted file mode 100755 index 26f200bcf4d9f945d1c55a64cdac19a5f1c0f427..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e83b7db92da1ee38a3a4e5a453d4279024e6af95efcf0ad4b34e275029e44729 -size 84157912 diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_vjujc4o4hplak.abi3.so b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_vjujc4o4hplak.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..f3dfad1d59add1e3437accb2fbfe03054ab5945c --- /dev/null +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_vjujc4o4hplak.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aea1e40159b3d8ca879344b36d6c3229d764baf9553b1bef2a04460f1f03f31 +size 84157888 diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py index 96ff0854463fbc98464606eca0d575b3fbad23e6..dfbbaa9a5ad33f466c5a053001de9b44957d4ae0 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_v3wdnwni3a5ce -ops = torch.ops._moe_v3wdnwni3a5ce +from . import _moe_vjujc4o4hplak +ops = torch.ops._moe_vjujc4o4hplak def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_v3wdnwni3a5ce::{op_name}" \ No newline at end of file + return f"_moe_vjujc4o4hplak::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_bjua6v5mj6njy.abi3.so b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_bjua6v5mj6njy.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..d15f2a1a06d83d3036c5d2530eaf124f9883547d --- /dev/null +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_bjua6v5mj6njy.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71767ce941c8fb0e823c11cdebb01bfd77f2250df2873b862473803072276bf4 +size 84360960 diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so deleted file mode 100755 index 1fc65a47efca9e162e237d0eb070b50aa7374028..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d1b063e4c52f5d744025e000fd79c5f41cdf56a32883c2d269b9c59f586c9e4 -size 84360992 diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py index 0405f57430cc58455e5c76beb87ceec9af3b19be..ce415661e46e1cd67885c909a6041b7e4d8458db 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_hrq7opevcb4ug -ops = torch.ops._moe_hrq7opevcb4ug +from . import _moe_bjua6v5mj6njy +ops = torch.ops._moe_bjua6v5mj6njy def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_hrq7opevcb4ug::{op_name}" \ No newline at end of file + return f"_moe_bjua6v5mj6njy::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_ajhcvhc2njy6q.abi3.so b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_ajhcvhc2njy6q.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..2b52246752b1234718d047b179901beb6665e115 --- /dev/null +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_ajhcvhc2njy6q.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38256704ec3f4ad93da175dff5054670c8e9db26b5573579d80331af6f271373 +size 84059616 diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so deleted file mode 100755 index a88d47ebcbc5522200d9a0b69cf9edd5ea99627c..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d820072815ae83e3e838eb14f01356a09eeb9a5367851d5f6038d7346fd64564 -size 84059584 diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py index 5e40c76afde362dcd0401eadb7f9c61cdacb3c1c..b64a326716b6413536981462f2fb67402d887948 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_p3swbnotpexcc -ops = torch.ops._moe_p3swbnotpexcc +from . import _moe_ajhcvhc2njy6q +ops = torch.ops._moe_ajhcvhc2njy6q def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_p3swbnotpexcc::{op_name}" \ No newline at end of file + return f"_moe_ajhcvhc2njy6q::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so deleted file mode 100755 index 4eea5cfbef621fbeca58815ddcc542fba180a64f..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5defb7114c1ba9cfdb740230057cb0c5cb21efe628840771db32494a89b5aa7 -size 84165672 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_wbafjrt24mw7y.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_wbafjrt24mw7y.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..624ae4d510e43ca1180bc8459151f04b474d81a2 --- /dev/null +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_wbafjrt24mw7y.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb03ab835bafe70c299a49cec39abf27f5b5d78715b16eed3527a683181df529 +size 84165672 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py index 5bd9a9581e86fee5f992b4355e04fc0eb148de1d..3c63395ada1d0726891986eb9bad52ef938c16e9 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_nskz7v224zllw -ops = torch.ops._moe_nskz7v224zllw +from . import _moe_wbafjrt24mw7y +ops = torch.ops._moe_wbafjrt24mw7y def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_nskz7v224zllw::{op_name}" \ No newline at end of file + return f"_moe_wbafjrt24mw7y::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_ezuwtpw27xv6u.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_ezuwtpw27xv6u.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..09c6a19ad273ed7cb2f0fbbbacea606a4e31a2de --- /dev/null +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_ezuwtpw27xv6u.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:378a8a453186ae62a92342077a988271cd7a02f46fbe303b4505d4484f1bfaef +size 84364536 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so deleted file mode 100755 index 5f165e4440c502173857a71eb64f85bb861c9df6..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8094d225249868d1f1c0abbfe8db3a486a99bd1f0928705e7dd5a998f125d8bf -size 84364504 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py index 8df4ba928a7302037d3fdf29eb3aef0360610cc2..5fa8a278f2d590343a821e67c73856ef7b041dc0 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_t32bhzwhzero6 -ops = torch.ops._moe_t32bhzwhzero6 +from . import _moe_ezuwtpw27xv6u +ops = torch.ops._moe_ezuwtpw27xv6u def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_t32bhzwhzero6::{op_name}" \ No newline at end of file + return f"_moe_ezuwtpw27xv6u::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_b3lelvb3xhtk2.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_b3lelvb3xhtk2.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..b4e5aeace7f574a6943c04cfa6909eb90d901801 --- /dev/null +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_b3lelvb3xhtk2.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ae1204c5e2f4c7692676e0ef703dbab4f20a9f14652c75dee41b8d56560db19 +size 84063128 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so deleted file mode 100755 index 3838db93c75ca2488554de6993944421f63e50d9..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23f0aec499051a34ed7ba7ac4e58d7d84c5501b8beb1794d6ae8c13f54b08b9e -size 84063160 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py index e67d8f0575a8eb529b25cbb908fcf9fd9badcddc..35ff3f2596227d70369578217f63970f033d3d90 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_pgljmg5ek5k4e -ops = torch.ops._moe_pgljmg5ek5k4e +from . import _moe_b3lelvb3xhtk2 +ops = torch.ops._moe_b3lelvb3xhtk2 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_pgljmg5ek5k4e::{op_name}" \ No newline at end of file + return f"_moe_b3lelvb3xhtk2::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so deleted file mode 100755 index 55b89014e1499c090462b343a113029d3a965a3c..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1eef7e6a15aca930caa813a845147beeec16159c8cce89891c40d080a6f3062 -size 84157880 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_mqt4gjnisx6je.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_mqt4gjnisx6je.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..05e613750fdcfdedc95bba9406de1e574a16033d --- /dev/null +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_mqt4gjnisx6je.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b8ebfaa74892fb13f34924a63e188b9593cc3290831bf31e0f78ae99c9526b0 +size 84157856 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py index f4081f5f2ab5660696b4e3fb2fd24f7b9efa4a9e..bbc56fb61cf5ea61644707b67783c875096d06ef 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_dtibz76vuxaaq -ops = torch.ops._moe_dtibz76vuxaaq +from . import _moe_mqt4gjnisx6je +ops = torch.ops._moe_mqt4gjnisx6je def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_dtibz76vuxaaq::{op_name}" \ No newline at end of file + return f"_moe_mqt4gjnisx6je::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so deleted file mode 100755 index 987c04fc1787c3fb4c9459ff25a436075b7dd0b7..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:704adc83ab06534f1af22b829003765b42c118df3790569b346ef36e7be570de -size 84360960 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_xdjcc2jnxatjk.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_xdjcc2jnxatjk.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..0733f1b85f27350e3897040ee33574ad9ed06671 --- /dev/null +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_xdjcc2jnxatjk.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3891e7e17cd7509440cc41e9fb9fe767e29914454eeb44019993cbc4c47397d9 +size 84360960 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py index 45d8b48ee178ee36ad526495ece27b491b8a2947..698864af56e2ca956eeb9623d75d4fcab1342acf 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_plblvprmwqffy -ops = torch.ops._moe_plblvprmwqffy +from . import _moe_xdjcc2jnxatjk +ops = torch.ops._moe_xdjcc2jnxatjk def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_plblvprmwqffy::{op_name}" \ No newline at end of file + return f"_moe_xdjcc2jnxatjk::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py @@ -1,19 +1,5 @@ -from typing import TYPE_CHECKING - import torch -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - from ._ops import add_op_namespace_prefix, ops from .fused_marlin_moe import fused_marlin_moe from .fused_moe import fused_moe, fused_topk, grouped_topk @@ -91,39 +77,6 @@ def topk_softmax( ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - __all__ = [ "gptq_marlin_moe_repack", "awq_marlin_moe_repack", diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2wvvf3dwfnuuk.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2wvvf3dwfnuuk.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..da3a5c9bec521db1e94b8b2c2e209f1d3f9dd924 --- /dev/null +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2wvvf3dwfnuuk.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daa2623a195e4ac85e3fe6403f3d827fedb3119c802ef2e6e2c06caf603755de +size 84059584 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so deleted file mode 100755 index d7b008bfd1f3eef97c4c870812388339b2baf82d..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:259f926d53dc10e91ef41311f61bcea93fbdbda94758fdca164b37256f9c69de -size 84059616 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py index 42eb05054c3c7990ce1ea4ea25d709eb0f41211a..af27c5e61c6f340d4327217b058d440afd340af7 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_k6bmwmtgkqymw -ops = torch.ops._moe_k6bmwmtgkqymw +from . import _moe_2wvvf3dwfnuuk +ops = torch.ops._moe_2wvvf3dwfnuuk def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_k6bmwmtgkqymw::{op_name}" \ No newline at end of file + return f"_moe_2wvvf3dwfnuuk::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -1,13 +1,25 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch +from ._ops import add_op_namespace_prefix, ops from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config -from .scalar_type import scalar_types -import moe as ops +from .scalar_type import ScalarType, scalar_types + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake def get_scalar_type(num_bits: int, has_zp: bool): @@ -116,7 +128,7 @@ def single_marlin_moe( scalar_type = get_scalar_type(num_bits, has_zero_point) - intermediate_cache = ops.ops.marlin_gemm_moe( + intermediate_cache = ops.marlin_gemm_moe( hidden_states, w, sorted_token_ids, @@ -287,7 +299,7 @@ def fused_marlin_moe( dtype=hidden_states.dtype, ) - intermediate_cache1 = ops.ops.marlin_gemm_moe( + intermediate_cache1 = ops.marlin_gemm_moe( hidden_states, w1, sorted_token_ids, @@ -312,7 +324,7 @@ def fused_marlin_moe( ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - intermediate_cache3 = ops.ops.marlin_gemm_moe( + intermediate_cache3 = ops.marlin_gemm_moe( intermediate_cache2, w2, sorted_token_ids, @@ -336,3 +348,31 @@ def fused_marlin_moe( ) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py @@ -9,9 +9,9 @@ import torch import triton import triton.language as tl -from .platforms import current_platform +from ._ops import ops from .fp8 import scaled_fp8_quant -import moe as ops +from .platforms import current_platform VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))