diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py
+++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_w3lspmuramohg.abi3.so b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_w3lspmuramohg.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..3bb05eb2badd497715263c93571cb5c65f9bd9c1
--- /dev/null
+++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_w3lspmuramohg.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2faeea044dbfd59eaf429d039ae368ed0c3e500817ac1acaefb3720ceca1f5ea
+size 84165672
diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so
deleted file mode 100755
index e62b3fae918c3875e929ccd2454dfed4c789a4d5..0000000000000000000000000000000000000000
--- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6330aa66b63067a8c9c031419773dc47e8853a717ef20b03c57df76660188831
-size 84165640
diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py
index 277a7442e4587af84f3d834a442a703b02572322..3e252de8060a15717082e254a192b08e74eb7a74 100644
--- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py
+++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_wtjc356yopxde
-ops = torch.ops._moe_wtjc356yopxde
+from . import _moe_w3lspmuramohg
+ops = torch.ops._moe_w3lspmuramohg
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_wtjc356yopxde::{op_name}"
\ No newline at end of file
+    return f"_moe_w3lspmuramohg::{op_name}"
\ No newline at end of file
diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py
+++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py
+++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so
deleted file mode 100755
index 2101d07ed8a1aabff3ad73ca6252cc698f422bf2..0000000000000000000000000000000000000000
--- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b0ca4f733821a564c525a36bb13e35ae960dc1e20f6472b177f67b9b165597ff
-size 84364504
diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_xztwj3vfii47s.abi3.so b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_xztwj3vfii47s.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..aa064ee509da1cd2ff0319fae5b07211a8219d24
--- /dev/null
+++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_xztwj3vfii47s.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5d4bd811ee24dd293d42959e6d23d66dddcc186b2ede701ebcbf6d66705fe1
+size 84364504
diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py
index 3a1bc84783bcd053d6474360806eb23081de895d..2844878a66bb4bd304bb63e45f6d9ff1ec7f5778 100644
--- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py
+++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_fidhfyl4jgbje
-ops = torch.ops._moe_fidhfyl4jgbje
+from . import _moe_xztwj3vfii47s
+ops = torch.ops._moe_xztwj3vfii47s
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_fidhfyl4jgbje::{op_name}"
\ No newline at end of file
+    return f"_moe_xztwj3vfii47s::{op_name}"
\ No newline at end of file
diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py
+++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py
+++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so
deleted file mode 100755
index a5ae5afb924ad8d777a912234c843c59effef3d6..0000000000000000000000000000000000000000
--- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0864e745883f687c46c9ce743f1e2887113734c57268b9bc0e290185be28cf65
-size 84063128
diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_zjfwjryvbxcss.abi3.so b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_zjfwjryvbxcss.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..10be8c15d732ad4b4e285d0495d410f033129abd
--- /dev/null
+++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_zjfwjryvbxcss.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8e33340a0b05f5776c1e5ef66e371b2c198dc00c03c810e2c4ef20923d7a417
+size 84063160
diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py
index 2438914a584da6996cb28867d034bdc2bce6d85b..7af1a9c422e1e2d4f7a7887e0b5943b49ff4346f 100644
--- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py
+++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_sg5gu4g3brle6
-ops = torch.ops._moe_sg5gu4g3brle6
+from . import _moe_zjfwjryvbxcss
+ops = torch.ops._moe_zjfwjryvbxcss
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_sg5gu4g3brle6::{op_name}"
\ No newline at end of file
+    return f"_moe_zjfwjryvbxcss::{op_name}"
\ No newline at end of file
diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py
+++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py
+++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so
deleted file mode 100755
index 26f200bcf4d9f945d1c55a64cdac19a5f1c0f427..0000000000000000000000000000000000000000
--- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e83b7db92da1ee38a3a4e5a453d4279024e6af95efcf0ad4b34e275029e44729
-size 84157912
diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_vjujc4o4hplak.abi3.so b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_vjujc4o4hplak.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..f3dfad1d59add1e3437accb2fbfe03054ab5945c
--- /dev/null
+++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_vjujc4o4hplak.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0aea1e40159b3d8ca879344b36d6c3229d764baf9553b1bef2a04460f1f03f31
+size 84157888
diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py
index 96ff0854463fbc98464606eca0d575b3fbad23e6..dfbbaa9a5ad33f466c5a053001de9b44957d4ae0 100644
--- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py
+++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_v3wdnwni3a5ce
-ops = torch.ops._moe_v3wdnwni3a5ce
+from . import _moe_vjujc4o4hplak
+ops = torch.ops._moe_vjujc4o4hplak
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_v3wdnwni3a5ce::{op_name}"
\ No newline at end of file
+    return f"_moe_vjujc4o4hplak::{op_name}"
\ No newline at end of file
diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py
+++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py
+++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_bjua6v5mj6njy.abi3.so b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_bjua6v5mj6njy.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..d15f2a1a06d83d3036c5d2530eaf124f9883547d
--- /dev/null
+++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_bjua6v5mj6njy.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71767ce941c8fb0e823c11cdebb01bfd77f2250df2873b862473803072276bf4
+size 84360960
diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so
deleted file mode 100755
index 1fc65a47efca9e162e237d0eb070b50aa7374028..0000000000000000000000000000000000000000
--- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0d1b063e4c52f5d744025e000fd79c5f41cdf56a32883c2d269b9c59f586c9e4
-size 84360992
diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py
index 0405f57430cc58455e5c76beb87ceec9af3b19be..ce415661e46e1cd67885c909a6041b7e4d8458db 100644
--- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py
+++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_hrq7opevcb4ug
-ops = torch.ops._moe_hrq7opevcb4ug
+from . import _moe_bjua6v5mj6njy
+ops = torch.ops._moe_bjua6v5mj6njy
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_hrq7opevcb4ug::{op_name}"
\ No newline at end of file
+    return f"_moe_bjua6v5mj6njy::{op_name}"
\ No newline at end of file
diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py
+++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py
+++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_ajhcvhc2njy6q.abi3.so b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_ajhcvhc2njy6q.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..2b52246752b1234718d047b179901beb6665e115
--- /dev/null
+++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_ajhcvhc2njy6q.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38256704ec3f4ad93da175dff5054670c8e9db26b5573579d80331af6f271373
+size 84059616
diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so
deleted file mode 100755
index a88d47ebcbc5522200d9a0b69cf9edd5ea99627c..0000000000000000000000000000000000000000
--- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d820072815ae83e3e838eb14f01356a09eeb9a5367851d5f6038d7346fd64564
-size 84059584
diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py
index 5e40c76afde362dcd0401eadb7f9c61cdacb3c1c..b64a326716b6413536981462f2fb67402d887948 100644
--- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py
+++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_p3swbnotpexcc
-ops = torch.ops._moe_p3swbnotpexcc
+from . import _moe_ajhcvhc2njy6q
+ops = torch.ops._moe_ajhcvhc2njy6q
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_p3swbnotpexcc::{op_name}"
\ No newline at end of file
+    return f"_moe_ajhcvhc2njy6q::{op_name}"
\ No newline at end of file
diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py
+++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so
deleted file mode 100755
index 4eea5cfbef621fbeca58815ddcc542fba180a64f..0000000000000000000000000000000000000000
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e5defb7114c1ba9cfdb740230057cb0c5cb21efe628840771db32494a89b5aa7
-size 84165672
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_wbafjrt24mw7y.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_wbafjrt24mw7y.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..624ae4d510e43ca1180bc8459151f04b474d81a2
--- /dev/null
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_wbafjrt24mw7y.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb03ab835bafe70c299a49cec39abf27f5b5d78715b16eed3527a683181df529
+size 84165672
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
index 5bd9a9581e86fee5f992b4355e04fc0eb148de1d..3c63395ada1d0726891986eb9bad52ef938c16e9 100644
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_nskz7v224zllw
-ops = torch.ops._moe_nskz7v224zllw
+from . import _moe_wbafjrt24mw7y
+ops = torch.ops._moe_wbafjrt24mw7y
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_nskz7v224zllw::{op_name}"
\ No newline at end of file
+    return f"_moe_wbafjrt24mw7y::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_ezuwtpw27xv6u.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_ezuwtpw27xv6u.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..09c6a19ad273ed7cb2f0fbbbacea606a4e31a2de
--- /dev/null
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_ezuwtpw27xv6u.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:378a8a453186ae62a92342077a988271cd7a02f46fbe303b4505d4484f1bfaef
+size 84364536
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so
deleted file mode 100755
index 5f165e4440c502173857a71eb64f85bb861c9df6..0000000000000000000000000000000000000000
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8094d225249868d1f1c0abbfe8db3a486a99bd1f0928705e7dd5a998f125d8bf
-size 84364504
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
index 8df4ba928a7302037d3fdf29eb3aef0360610cc2..5fa8a278f2d590343a821e67c73856ef7b041dc0 100644
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_t32bhzwhzero6
-ops = torch.ops._moe_t32bhzwhzero6
+from . import _moe_ezuwtpw27xv6u
+ops = torch.ops._moe_ezuwtpw27xv6u
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_t32bhzwhzero6::{op_name}"
\ No newline at end of file
+    return f"_moe_ezuwtpw27xv6u::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_b3lelvb3xhtk2.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_b3lelvb3xhtk2.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..b4e5aeace7f574a6943c04cfa6909eb90d901801
--- /dev/null
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_b3lelvb3xhtk2.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ae1204c5e2f4c7692676e0ef703dbab4f20a9f14652c75dee41b8d56560db19
+size 84063128
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so
deleted file mode 100755
index 3838db93c75ca2488554de6993944421f63e50d9..0000000000000000000000000000000000000000
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:23f0aec499051a34ed7ba7ac4e58d7d84c5501b8beb1794d6ae8c13f54b08b9e
-size 84063160
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
index e67d8f0575a8eb529b25cbb908fcf9fd9badcddc..35ff3f2596227d70369578217f63970f033d3d90 100644
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_pgljmg5ek5k4e
-ops = torch.ops._moe_pgljmg5ek5k4e
+from . import _moe_b3lelvb3xhtk2
+ops = torch.ops._moe_b3lelvb3xhtk2
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_pgljmg5ek5k4e::{op_name}"
\ No newline at end of file
+    return f"_moe_b3lelvb3xhtk2::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so
deleted file mode 100755
index 55b89014e1499c090462b343a113029d3a965a3c..0000000000000000000000000000000000000000
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b1eef7e6a15aca930caa813a845147beeec16159c8cce89891c40d080a6f3062
-size 84157880
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_mqt4gjnisx6je.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_mqt4gjnisx6je.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..05e613750fdcfdedc95bba9406de1e574a16033d
--- /dev/null
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_mqt4gjnisx6je.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b8ebfaa74892fb13f34924a63e188b9593cc3290831bf31e0f78ae99c9526b0
+size 84157856
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
index f4081f5f2ab5660696b4e3fb2fd24f7b9efa4a9e..bbc56fb61cf5ea61644707b67783c875096d06ef 100644
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_dtibz76vuxaaq
-ops = torch.ops._moe_dtibz76vuxaaq
+from . import _moe_mqt4gjnisx6je
+ops = torch.ops._moe_mqt4gjnisx6je
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_dtibz76vuxaaq::{op_name}"
\ No newline at end of file
+    return f"_moe_mqt4gjnisx6je::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so
deleted file mode 100755
index 987c04fc1787c3fb4c9459ff25a436075b7dd0b7..0000000000000000000000000000000000000000
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:704adc83ab06534f1af22b829003765b42c118df3790569b346ef36e7be570de
-size 84360960
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_xdjcc2jnxatjk.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_xdjcc2jnxatjk.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..0733f1b85f27350e3897040ee33574ad9ed06671
--- /dev/null
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_xdjcc2jnxatjk.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3891e7e17cd7509440cc41e9fb9fe767e29914454eeb44019993cbc4c47397d9
+size 84360960
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
index 45d8b48ee178ee36ad526495ece27b491b8a2947..698864af56e2ca956eeb9623d75d4fcab1342acf 100644
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_plblvprmwqffy
-ops = torch.ops._moe_plblvprmwqffy
+from . import _moe_xdjcc2jnxatjk
+ops = torch.ops._moe_xdjcc2jnxatjk
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_plblvprmwqffy::{op_name}"
\ No newline at end of file
+    return f"_moe_xdjcc2jnxatjk::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
 
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py
index bec6944a0d71eb9cc367fa7070fa8210ce3a7380..167092b5a614fe006958c8c6c59c7de7e52d3eab 100644
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py
@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
-
 import torch
 
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-
-    def register_fake(fn):
-        return lambda name: fn
-
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
-
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
 
-if hasattr(ops, "marlin_gemm_moe"):
-
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-
-
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
-
-
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2wvvf3dwfnuuk.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2wvvf3dwfnuuk.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..da3a5c9bec521db1e94b8b2c2e209f1d3f9dd924
--- /dev/null
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2wvvf3dwfnuuk.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daa2623a195e4ac85e3fe6403f3d827fedb3119c802ef2e6e2c06caf603755de
+size 84059584
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so
deleted file mode 100755
index d7b008bfd1f3eef97c4c870812388339b2baf82d..0000000000000000000000000000000000000000
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:259f926d53dc10e91ef41311f61bcea93fbdbda94758fdca164b37256f9c69de
-size 84059616
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
index 42eb05054c3c7990ce1ea4ea25d709eb0f41211a..af27c5e61c6f340d4327217b058d440afd340af7 100644
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_k6bmwmtgkqymw
-ops = torch.ops._moe_k6bmwmtgkqymw
+from . import _moe_2wvvf3dwfnuuk
+ops = torch.ops._moe_2wvvf3dwfnuuk
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_k6bmwmtgkqymw::{op_name}"
\ No newline at end of file
+    return f"_moe_2wvvf3dwfnuuk::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py
index 471381f9885c2fe74c9655c5ad8cec763bef4825..6655bf13b910a7fcd64102143c2d630fb8f7f224 100644
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py
@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 
 import functools
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
+from .scalar_type import ScalarType, scalar_types
+
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
 
     scalar_type = get_scalar_type(num_bits, has_zero_point)
 
-    intermediate_cache = ops.ops.marlin_gemm_moe(
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
 
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
 
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
 
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+
+
+if hasattr(ops, "marlin_gemm_moe"):
+
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
index 43c4859181d3290fe41c545746932bbaa067f590..49a09b7eca6bac8b0907ce11395ae5198989d531 100644
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
 
-from .platforms import current_platform
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
-import moe as ops
+from .platforms import current_platform
 
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))