Try to avoid fake op registration issues
- ext-torch/moe/__init__.py +0 -47
- ext-torch/moe/fused_marlin_moe.py +46 -6
- ext-torch/moe/fused_moe.py +2 -2
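For context, the change below relies on torch.library's "fake" (meta) op registration: a fake implementation that only computes output shapes/dtypes so tracing and compilation can run without the real kernel. The following is a minimal sketch of that pattern, not part of the diff: the op name "mylib::scale" is purely illustrative, and it assumes a recent PyTorch (torch.library.custom_op requires 2.4+; the import fallback mirrors the compatibility shim used in the diff).

import torch

try:
    from torch.library import register_fake  # PyTorch >= 2.4
except ImportError:
    from torch.library import impl_abstract as register_fake  # older PyTorch

# A toy custom op in a throwaway namespace ("mylib::scale" is hypothetical).
@torch.library.custom_op("mylib::scale", mutates_args=())
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    return x * factor

# The fake implementation only describes output metadata (shape/dtype/device);
# it must not touch real data, mirroring marlin_gemm_moe_fake in the diff below.
@register_fake("mylib::scale")
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    return torch.empty_like(x)

# With the fake registered, the op can be traced without running the kernel,
# e.g. on meta tensors:
out = torch.ops.mylib.scale(torch.empty(4, 8, device="meta"), 2.0)
print(out.shape)  # torch.Size([4, 8])

Registering the fake next to the op that uses it (and only when the op actually exists) is what the diff moves toward.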
ext-torch/moe/__init__.py
CHANGED
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
ext-torch/moe/fused_marlin_moe.py
CHANGED
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
ext-torch/moe/fused_moe.py
CHANGED
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 