diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git 
a/build/torch24-cxx11-cu118-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. - try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so 
b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 749c8ab7f0013f6742f824035512f6ab106098f9..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c1fc3d94e9a7d4b7c0cf13dd3a9633efef2ed265222b17b22d32282818b7bd1 -size 84165576 diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..e62b3fae918c3875e929ccd2454dfed4c789a4d5 --- /dev/null +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6330aa66b63067a8c9c031419773dc47e8853a717ef20b03c57df76660188831 +size 84165640 diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..277a7442e4587af84f3d834a442a703b02572322 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_wtjc356yopxde +ops = torch.ops._moe_wtjc356yopxde def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_wtjc356yopxde::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + 
+from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except 
ImportError as e: - # Fallback for local development. - try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 96366cea06aa3fbd657651cf78a2cb8698925a61..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd5492f9d9216ee88cfc40f373b19207c8e5f04ba8c55c58aec3ecc9f9ad3239 -size 84364440 diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..2101d07ed8a1aabff3ad73ca6252cc698f422bf2 --- /dev/null +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:b0ca4f733821a564c525a36bb13e35ae960dc1e20f6472b177f67b9b165597ff +size 84364504 diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..3a1bc84783bcd053d6474360806eb23081de895d 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_fidhfyl4jgbje +ops = torch.ops._moe_fidhfyl4jgbje def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_fidhfyl4jgbje::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + 
size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index c816881379ec38d7a730448f541cad9d01d964ba..0000000000000000000000000000000000000000 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c1dd7f6fb98ad1ed39a402e1e42f3231645949dcc5cef28739f4e093883e0184 -size 84063064 diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..a5ae5afb924ad8d777a912234c843c59effef3d6 --- /dev/null +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0864e745883f687c46c9ce743f1e2887113734c57268b9bc0e290185be28cf65 +size 84063128 diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..2438914a584da6996cb28867d034bdc2bce6d85b 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_sg5gu4g3brle6 +ops = torch.ops._moe_sg5gu4g3brle6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_sg5gu4g3brle6::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index d16040f32bd65235ff086cd1651afc886107d76d..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1994e01d53c190da58a4a864b648421b515e2171abd320184164507e1aa4f1fe -size 84157816 diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..26f200bcf4d9f945d1c55a64cdac19a5f1c0f427 --- /dev/null +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e83b7db92da1ee38a3a4e5a453d4279024e6af95efcf0ad4b34e275029e44729 +size 84157912 diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..96ff0854463fbc98464606eca0d575b3fbad23e6 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_v3wdnwni3a5ce +ops = torch.ops._moe_v3wdnwni3a5ce def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_v3wdnwni3a5ce::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index a7e492cf6a813e48fc6edbdf38e6ed79b0e0a6c4..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5bd45d6fb85953a97cf3b6ceecee61b3298a3b6d1b46708ca6618689f63d6aa9 -size 84360896 diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..1fc65a47efca9e162e237d0eb070b50aa7374028 --- /dev/null +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0d1b063e4c52f5d744025e000fd79c5f41cdf56a32883c2d269b9c59f586c9e4 +size 84360992 diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..0405f57430cc58455e5c76beb87ceec9af3b19be 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_hrq7opevcb4ug +ops = torch.ops._moe_hrq7opevcb4ug def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_hrq7opevcb4ug::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_custom_ops.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 9edacbfbf47724f264d163efa8699e866fadc548..0000000000000000000000000000000000000000 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:973886e7a4e11ba2161ffe3034cdc52323321f712463b8378dbb6fc4c420b934 -size 84059552 diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..a88d47ebcbc5522200d9a0b69cf9edd5ea99627c --- /dev/null +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/_moe_p3swbnotpexcc.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d820072815ae83e3e838eb14f01356a09eeb9a5367851d5f6038d7346fd64564 +size 84059584 diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..5e40c76afde362dcd0401eadb7f9c61cdacb3c1c 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_p3swbnotpexcc +ops = torch.ops._moe_p3swbnotpexcc def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_p3swbnotpexcc::{op_name}" \ No newline at end of file diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index efdbe18abfeb7225a15b17cdc7c3c94c821352a9..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d9e6d3dc978ae8aee87335a292d4ee55278658dabc3319829f3d4a7722de303c -size 84165608 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..4eea5cfbef621fbeca58815ddcc542fba180a64f --- /dev/null +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e5defb7114c1ba9cfdb740230057cb0c5cb21efe628840771db32494a89b5aa7 +size 84165672 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..5bd9a9581e86fee5f992b4355e04fc0eb148de1d 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_nskz7v224zllw +ops = torch.ops._moe_nskz7v224zllw def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_nskz7v224zllw::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index dbfb190d598af93eb0f164652159a2f8b2517505..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:35112cbe69729f9843c91eda4acc549df354d09f9b3fbfaf704820cefc5ffd86 -size 84364440 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..5f165e4440c502173857a71eb64f85bb861c9df6 --- /dev/null +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8094d225249868d1f1c0abbfe8db3a486a99bd1f0928705e7dd5a998f125d8bf +size 84364504 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..8df4ba928a7302037d3fdf29eb3aef0360610cc2 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_t32bhzwhzero6 +ops = torch.ops._moe_t32bhzwhzero6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_t32bhzwhzero6::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index adf24bedf870eed4989a31fdd628c816e2383ecb..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:554ef8777913b7c73fd3d8aeeb08e441dc189d26765676a56f5d704f05e4846e -size 84063096 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..3838db93c75ca2488554de6993944421f63e50d9 --- /dev/null +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:23f0aec499051a34ed7ba7ac4e58d7d84c5501b8beb1794d6ae8c13f54b08b9e +size 84063160 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..e67d8f0575a8eb529b25cbb908fcf9fd9badcddc 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_pgljmg5ek5k4e +ops = torch.ops._moe_pgljmg5ek5k4e def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_pgljmg5ek5k4e::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 0efa2f346fa7d739514c4da79fd488ac5c17a8fa..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:96216ac120dbf99500906eaa6beeaf30c03e07044c0e394e6a83be25a4e184ce -size 84157824 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..55b89014e1499c090462b343a113029d3a965a3c --- /dev/null +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b1eef7e6a15aca930caa813a845147beeec16159c8cce89891c40d080a6f3062 +size 84157880 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..f4081f5f2ab5660696b4e3fb2fd24f7b9efa4a9e 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_dtibz76vuxaaq +ops = torch.ops._moe_dtibz76vuxaaq def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_dtibz76vuxaaq::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 93c7ec969117432c163862a4536d07f1d4ffd584..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:570d15a2c3120695fada586323820f6b3913e514d3d495680fe5cb716445a851 -size 84360896 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..987c04fc1787c3fb4c9459ff25a436075b7dd0b7 --- /dev/null +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:704adc83ab06534f1af22b829003765b42c118df3790569b346ef36e7be570de +size 84360960 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..45d8b48ee178ee36ad526495ece27b491b8a2947 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_plblvprmwqffy +ops = torch.ops._moe_plblvprmwqffy def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_plblvprmwqffy::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")) diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py index 0e3b4850e664a15271d7bfee04ffc6bdab3a6083..bec6944a0d71eb9cc367fa7070fa8210ce3a7380 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py @@ -1 +1,135 @@ -import moe._custom_ops as ops +from typing import TYPE_CHECKING + +import torch + +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn + +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +from ._ops import add_op_namespace_prefix, ops +from .fused_marlin_moe import fused_marlin_moe +from .fused_moe import fused_moe, fused_topk, grouped_topk +from .scalar_type import ScalarType, scalar_types + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.gptq_marlin_repack( + b_q_weight[e], perm[e], size_k, size_n, num_bits + ) + return output + + +def awq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> 
torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) + return output + + +def moe_sum(input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_token_ids, + experts_ids, + num_tokens_post_pad, + ) + + +def topk_softmax( + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indicies: torch.Tensor, + gating_output: float, +) -> None: + ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) + + +if hasattr(ops, "marlin_gemm_moe"): + + @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) + def marlin_gemm_moe_fake( + a: torch.Tensor, + b_q_weights: torch.Tensor, + sorted_ids: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + b_scales: torch.Tensor, + b_zero_points: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + workspace: torch.Tensor, + b_q_type: ScalarType, + size_m: torch.SymInt, + size_n: torch.SymInt, + size_k: torch.SymInt, + is_k_full: bool, + num_experts: int, + topk: int, + moe_block_size: int, + replicate_input: bool, + apply_weights: bool, + ) -> torch.Tensor: + return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +__all__ = [ + "gptq_marlin_moe_repack", + "awq_marlin_moe_repack", + "fused_marlin_moe", + "moe_sum", + "moe_align_block_size", + "topk_softmax", + "fused_moe", +] diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_custom_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/_custom_ops.py deleted file mode 100644 index 5020813c678a4b923393df5b77345ecc0df43077..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_custom_ops.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import TYPE_CHECKING - -import torch - -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING: - - def register_fake(fn): - return lambda name: fn - -else: - try: - from torch.library import register_fake - except ImportError: - from torch.library import impl_abstract as register_fake - -try: - from ._ops import ops, add_op_namespace_prefix -except ImportError as e: - # Fallback for local development. 
- try: - import _moe - - ops = torch._moe - - def add_op_namespace_prefix(op_name: str): - return f"_quantization::{op_name}" - - except ImportError: - raise e - -from .scalar_type import ScalarType - -def gptq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.gptq_marlin_repack( - b_q_weight[e], perm[e], size_k, size_n, num_bits - ) - return output - - -def awq_marlin_moe_repack( - b_q_weight: torch.Tensor, - perm: torch.Tensor, - size_k: int, - size_n: int, - num_bits: int, -) -> torch.Tensor: - num_experts = b_q_weight.shape[0] - assert size_k % 16 == 0 - output = torch.empty( - (num_experts, size_k // 16, size_n * (num_bits // 2)), - device=b_q_weight.device, - dtype=b_q_weight.dtype, - ) - for e in range(num_experts): - output[e] = ops.awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits) - return output - - -def moe_sum(input: torch.Tensor, output: torch.Tensor): - ops.moe_sum(input, output) - - -def moe_align_block_size( - topk_ids: torch.Tensor, - num_experts: int, - block_size: int, - sorted_token_ids: torch.Tensor, - experts_ids: torch.Tensor, - num_tokens_post_pad: torch.Tensor, -) -> None: - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_token_ids, - experts_ids, - num_tokens_post_pad, - ) - - -def topk_softmax( - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, - gating_output: float, -) -> None: - ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) - -if hasattr(ops, "marlin_gemm_moe"): - - @register_fake(add_op_namespace_prefix("marlin_gemm_moe")) - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type: ScalarType, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so deleted file mode 100755 index 5df775039c5dbea7f14c9c6507bbdf7da46c46f2..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_0_0_1.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a1cc0068fc943693e8c39c2b43b147584ee43a3046629583a95bfb4244fdf2a -size 84059520 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..d7b008bfd1f3eef97c4c870812388339b2baf82d --- /dev/null +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:259f926d53dc10e91ef41311f61bcea93fbdbda94758fdca164b37256f9c69de +size 84059616 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py index 19ec5f669cd3e4bd8b10b7776865ccf931cda507..42eb05054c3c7990ce1ea4ea25d709eb0f41211a 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_0_0_1 -ops = torch.ops._moe_0_0_1 +from . import _moe_k6bmwmtgkqymw +ops = torch.ops._moe_k6bmwmtgkqymw def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_0_0_1::{op_name}" \ No newline at end of file + return f"_moe_k6bmwmtgkqymw::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py index e663f5c63d11a44297a2ee224e057ab8760a414a..471381f9885c2fe74c9655c5ad8cec763bef4825 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py @@ -7,7 +7,7 @@ import torch from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config from .scalar_type import scalar_types -import moe._custom_ops as ops +import moe as ops def get_scalar_type(num_bits: int, has_zp: bool): diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py index d4486f56dfebededb7fdfe7bbd92611af1327100..43c4859181d3290fe41c545746932bbaa067f590 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py @@ -11,7 +11,7 @@ import triton.language as tl from .platforms import current_platform from .fp8 import scaled_fp8_quant -import moe._custom_ops as ops +import moe as ops VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))
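
Across every build variant in the diff above, the surface previously exposed through moe._custom_ops now lives directly in each variant's moe/__init__.py, the compiled extension is renamed from _moe_0_0_1 to a per-build hashed module (_moe_p3swbnotpexcc, _moe_nskz7v224zllw, ...), and fused_marlin_moe.py / fused_moe.py switch from `import moe._custom_ops as ops` to `import moe as ops`. The sketch below shows how a caller might exercise the relocated wrappers; it is an illustrative assumption rather than part of the packaged code — the tensor shapes, sizes, dtype, and device are made up for the example, and it presumes a CUDA device plus one of the prebuilt wheels being importable as `moe`.

import torch
import moe  # replaces the old `import moe._custom_ops as ops` pattern

# Hypothetical sizes, chosen only for illustration.
num_tokens, hidden, topk = 8, 256, 2

# silu_and_mul (defined in __init__.py above) is assumed here to take the
# concatenated gate/up projections: x of shape (..., 2 * d), out of shape (..., d).
x = torch.randn(num_tokens, 2 * hidden, dtype=torch.float16, device="cuda")
out = torch.empty(num_tokens, hidden, dtype=torch.float16, device="cuda")
moe.silu_and_mul(out, x)

# moe_sum is assumed to reduce per-expert outputs per token:
# (num_tokens, topk, hidden) summed into (num_tokens, hidden).
expert_out = torch.randn(num_tokens, topk, hidden, dtype=torch.float16, device="cuda")
summed = torch.empty(num_tokens, hidden, dtype=torch.float16, device="cuda")
moe.moe_sum(expert_out, summed)

The same pattern applies to the other re-exported entry points (moe_align_block_size, topk_softmax, fused_moe, fused_marlin_moe, and the *_marlin_moe_repack helpers): they are now attributes of the moe package itself, which is why the in-tree modules can simply alias the package as `ops`.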