diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index c60e6b378c3b699358f740b7ad05c4870d4db777..0000000000000000000000000000000000000000
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:03298359c23e496df84a18978298d3372423f7733fb8185b2f6a535d25a64a7e
-size 87060424
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..7c371683233895c74e7be505b7fa13cb52c7d861
--- /dev/null
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49dc6c1d936b3dc6c483a4ef5d581c5d2f08f50f6ea2ffcdbfecdf0b719c3410
+size 87056328
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py
+++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index 1829f8d717b0aed68b06d5f07eb101071f3736be..0000000000000000000000000000000000000000
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:459eff1ecfb7a0b08c7733cf7d87073d44c1cc2c123db5437fc2d96699968b24
-size 87254968
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..44b45f80e1fff4f72ae1a8318e3c7fa1440efe48
--- /dev/null
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12bb26a0a9a47039bbcbf2c5fda7c068211cb711827b0e0e0d98b2fe99ed3b54
+size 87254968
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py
+++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index cecb9000fc97030d72c4d32875fc8cf4a4fe51d6..0000000000000000000000000000000000000000
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ab56f79975737060a41592e8be9f56d92e599b04d3cb9a95c8f515afb341b6a2
-size 86965608
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..15f52860904d7759f3c532f931d3def0a504f5ba
--- /dev/null
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca9a24c28dab4109a13549ee7ce379b36d950930b8bd106669188262863f3795
+size 86965608
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py
+++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index 5c77d404553f8ffea7fce8009647f3c26d3aaf84..0000000000000000000000000000000000000000
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:af44b6b8253f8b388158eda88cd12a59bec61aede5702ca684da10096e2708cb
-size 87052528
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..0ecea2f27281e77ac8b685e21404c5ad075a040d
--- /dev/null
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d65d3a08c44b65a44d2c58566aa7e26e85d0d949be71096e09f7ad73d0b5e040
+size 87048408
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py
+++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index 0bb5ca85510b1a367cea4e1c9f1f767e2b354854..0000000000000000000000000000000000000000
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f616a904eda2f4a23a23556ef522449f9d1b111ea5a9c215d7d04c3ccb9345fe
-size 87243240
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..7da2836d223b6c56fd07c5e5d0eccc09893412d9
--- /dev/null
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2d4157287a3e7979780f23a709eba01e787186bc32a5e56c0620b5429e9cfd3
+size 87243240
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py
+++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index 2a8ff2bc8c8c77b61527e0df43a8d5a87a7295b2..0000000000000000000000000000000000000000
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:820b62662956741ae78d7c51fb9fc978ff2e86c7dc1efa1335b0701e0e28749a
-size 86957976
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..8f03a058c0494bccaddcb5e9ea658d303b639f89
--- /dev/null
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:122544181246b179a772eb07c9e01c8df6b3025c20b333c566d0e84bfd7bea2d
+size 86953880
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py
+++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index cd7473e24b188652c6609a5cec6d9d556864b822..0000000000000000000000000000000000000000
--- a/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c9916c6b52d9381c94b09b78f4f6756895e51563c0049f462363937f40c0114f
-size 87060352
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..eef4a05311f07c1349de657b53513342a3d16ef0
--- /dev/null
+++ b/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49e17eb28438bddf98e314893cf262b807d64ee03850b46abe4d0bf6151f62b6
+size 87060352
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py
+++ b/build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py
+++ b/build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py b/build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py
+++ b/build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index f225d4ab3efee6db9b3887b90f2ea64f475275ec..0000000000000000000000000000000000000000
--- a/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9a9acc9198a56410e1d6bddec3a4529fb14b12843f6589b4477bc4ee795f7278
-size 86961568
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..f2533d1e0a4e4edf5ff67ae89329ea46932d4fd7
--- /dev/null
+++ b/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f804164f561c9b46f3b997a6d13552ca4d704c43484b5cd8d14682b4450ed472
+size 86965664
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py
+++ b/build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py
+++ b/build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py b/build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py
+++ b/build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index 51538bd3bec7d15bc3e8ed09f73b24643c62b397..0000000000000000000000000000000000000000
--- a/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:eb26fad3cfe2db1cc88637e020d6d8ddbc54df3e7e8edd64ba9370cd96177587
-size 87428864
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..cc28f14fad9ce4bb9946f1d572a2f71187c86bc7
--- /dev/null
+++ b/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1102bf615b2d2f7c320ac73eed63b982e969683ac72c958080dddb87166fa595
+size 87432960
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py b/build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py
+++ b/build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py
+++ b/build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py b/build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py
+++ b/build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index 1fd6dfe8012f1b85f3ea0ca984eefb51d810eb32..0000000000000000000000000000000000000000
--- a/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5e0d9837b95dde6e7286a15e312db47fbc302850ca6ea5610fb306ba42bfbab7
-size 87048408
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..298cdd4335279a97dfc815152e97893c28e54522
--- /dev/null
+++ b/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e739bb546d3d1730fa7696fbd767fd588286dec369f1b7551edd1ec481df96f
+size 87044288
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py
+++ b/build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py
+++ b/build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py b/build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py
+++ b/build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index 858dc59bf2b280b88036c862af18280b045ea498..0000000000000000000000000000000000000000
--- a/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f7abba352c56f674e909a701b965b1508babdf69e9a6dda54fac6f11088d0ac2
-size 86953856
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..e33b207012e8d3eb530377a0bd6eea3f69c253c2
--- /dev/null
+++ b/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcb950d2e7196ad22cad926749b7e0e06e5454f0a732755b72f0b8dd456529c6
+size 86953856
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py
+++ b/build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py
+++ b/build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py b/build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py
+++ b/build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so
deleted file mode 100755
index 56e867522fa7bb2f5ba45e50c91311f35e64861f..0000000000000000000000000000000000000000
--- a/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6568844b9365cdbe7fbdddfda668e49f7c780028988c5bae2d48a72eeba1650b
-size 87417064
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..5c8405b3a46d0ce7541f3a34f9bbe8b05f7af100
--- /dev/null
+++ b/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe5c605f1da902aebc1d7ce0355b649fcfcc44aed0023fdc87974f3d56273897
+size 87417064
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py b/build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py
index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644
--- a/build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py
+++ b/build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
\ No newline at end of file
+    return f"_moe_2218ad7::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py
index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644
--- a/build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py
+++ b/build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/layers.py b/build/torch26-cxx98-cu126-x86_64-linux/moe/layers.py
index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644
--- a/build/torch26-cxx98-cu126-x86_64-linux/moe/layers.py
+++ b/build/torch26-cxx98-cu126-x86_64-linux/moe/layers.py
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)