Handle FP8
Files changed:
- torch-ext/moe/fused_moe.py  +24 -0
- torch-ext/moe/layers.py     +10 -0
torch-ext/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs. If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
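Note on the helpers above: `cdiv` uses the `-(a // -b)` trick, where negating before and after floor division yields ceiling division (e.g. `-(300 // -128) == 3`). That is exactly the relationship the blocked branch asserts: `per_token_group_quant_fp8` is expected to emit one scale per `block_k`-sized group along the last dimension. The following is a minimal pure-PyTorch sketch of that per-group scheme; the reference function and its padding behaviour are illustrative assumptions, not the kernel this extension actually calls.

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max


def per_group_quant_fp8_reference(A: torch.Tensor, group_size: int):
    """Illustrative per-token-group FP8 quantization (assumed semantics)."""
    k = A.shape[-1]
    num_groups = -(k // -group_size)  # cdiv(k, group_size)
    pad = num_groups * group_size - k
    A_pad = torch.nn.functional.pad(A, (0, pad))

    # One scale per (token, group): max-abs scaling into the FP8 range.
    grouped = A_pad.reshape(*A.shape[:-1], num_groups, group_size)
    amax = grouped.abs().amax(dim=-1).clamp(min=1e-12).float()
    scale = amax / FP8_MAX  # shape (*leading_dims, num_groups)

    q = (grouped / scale.unsqueeze(-1)).to(torch.float8_e4m3fn)
    q = q.reshape(*A.shape[:-1], num_groups * group_size)[..., :k]
    return q, scale  # scale.shape[-1] == cdiv(k, group_size), as asserted above


# Example: hidden size 300 with block_k = 128 gives 3 scales per token.
x = torch.randn(4, 300)
xq, xs = per_group_quant_fp8_reference(x, 128)
assert xq.shape == x.shape and xs.shape == (4, 3)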
torch-ext/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
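On the layers.py side, the FP8 path is keyed purely on attribute presence: if the experts module carries a gate_up_proj_scale (and, by the same convention, down_proj_scale), the scales are forwarded to fused_moe and use_fp8_w8a8 is enabled; otherwise the call is unchanged. The stand-in module below is a hypothetical sketch of such an experts container; everything except the two *_scale attribute names the check relies on is an illustrative assumption.

import torch
from torch import nn


class FakeFp8Experts(nn.Module):
    """Hypothetical FP8 experts container; only the *_scale attribute
    names matter for the hasattr() check in Llama4TextMoe.forward."""

    def __init__(self, num_experts: int = 4, hidden: int = 64, intermediate: int = 128):
        super().__init__()
        fp8 = torch.float8_e4m3fn
        self.register_buffer(
            "gate_up_proj", torch.zeros(num_experts, hidden, 2 * intermediate, dtype=fp8)
        )
        self.register_buffer("gate_up_proj_scale", torch.ones(num_experts, 1, 1))
        self.register_buffer(
            "down_proj", torch.zeros(num_experts, intermediate, hidden, dtype=fp8)
        )
        self.register_buffer("down_proj_scale", torch.ones(num_experts, 1, 1))


experts = FakeFp8Experts()

# Same detection the new forward() code performs before calling fused_moe:
use_fp8_w8a8 = hasattr(experts, "gate_up_proj_scale")
extra_kwargs = {}
if use_fp8_w8a8:
    extra_kwargs["w1_scale"] = experts.gate_up_proj_scale
    extra_kwargs["w2_scale"] = experts.down_proj_scale
assert use_fp8_w8a8 and set(extra_kwargs) == {"w1_scale", "w2_scale"}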