diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index c60e6b378c3b699358f740b7ad05c4870d4db777..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:03298359c23e496df84a18978298d3372423f7733fb8185b2f6a535d25a64a7e -size 87060424 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..7c371683233895c74e7be505b7fa13cb52c7d861 --- /dev/null +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49dc6c1d936b3dc6c483a4ef5d581c5d2f08f50f6ea2ffcdbfecdf0b719c3410 +size 87056328 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py b/build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index 1829f8d717b0aed68b06d5f07eb101071f3736be..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:459eff1ecfb7a0b08c7733cf7d87073d44c1cc2c123db5437fc2d96699968b24 -size 87254968 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..44b45f80e1fff4f72ae1a8318e3c7fa1440efe48 --- /dev/null +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12bb26a0a9a47039bbcbf2c5fda7c068211cb711827b0e0e0d98b2fe99ed3b54 +size 87254968 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py b/build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index cecb9000fc97030d72c4d32875fc8cf4a4fe51d6..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab56f79975737060a41592e8be9f56d92e599b04d3cb9a95c8f515afb341b6a2 -size 86965608 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..15f52860904d7759f3c532f931d3def0a504f5ba --- /dev/null +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9a24c28dab4109a13549ee7ce379b36d950930b8bd106669188262863f3795 +size 86965608 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py b/build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index 5c77d404553f8ffea7fce8009647f3c26d3aaf84..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af44b6b8253f8b388158eda88cd12a59bec61aede5702ca684da10096e2708cb -size 87052528 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..0ecea2f27281e77ac8b685e21404c5ad075a040d --- /dev/null +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d65d3a08c44b65a44d2c58566aa7e26e85d0d949be71096e09f7ad73d0b5e040 +size 87048408 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py b/build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index 0bb5ca85510b1a367cea4e1c9f1f767e2b354854..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f616a904eda2f4a23a23556ef522449f9d1b111ea5a9c215d7d04c3ccb9345fe -size 87243240 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..7da2836d223b6c56fd07c5e5d0eccc09893412d9 --- /dev/null +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d4157287a3e7979780f23a709eba01e787186bc32a5e56c0620b5429e9cfd3 +size 87243240 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py b/build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index 2a8ff2bc8c8c77b61527e0df43a8d5a87a7295b2..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:820b62662956741ae78d7c51fb9fc978ff2e86c7dc1efa1335b0701e0e28749a -size 86957976 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..8f03a058c0494bccaddcb5e9ea658d303b639f89 --- /dev/null +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:122544181246b179a772eb07c9e01c8df6b3025c20b333c566d0e84bfd7bea2d +size 86953880 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py b/build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index cd7473e24b188652c6609a5cec6d9d556864b822..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9916c6b52d9381c94b09b78f4f6756895e51563c0049f462363937f40c0114f -size 87060352 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..eef4a05311f07c1349de657b53513342a3d16ef0 --- /dev/null +++ b/build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49e17eb28438bddf98e314893cf262b807d64ee03850b46abe4d0bf6151f62b6 +size 87060352 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py b/build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py b/build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index f225d4ab3efee6db9b3887b90f2ea64f475275ec..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a9acc9198a56410e1d6bddec3a4529fb14b12843f6589b4477bc4ee795f7278 -size 86961568 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..f2533d1e0a4e4edf5ff67ae89329ea46932d4fd7 --- /dev/null +++ b/build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f804164f561c9b46f3b997a6d13552ca4d704c43484b5cd8d14682b4450ed472 +size 86965664 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py b/build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py b/build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index 51538bd3bec7d15bc3e8ed09f73b24643c62b397..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb26fad3cfe2db1cc88637e020d6d8ddbc54df3e7e8edd64ba9370cd96177587 -size 87428864 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..cc28f14fad9ce4bb9946f1d572a2f71187c86bc7 --- /dev/null +++ b/build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1102bf615b2d2f7c320ac73eed63b982e969683ac72c958080dddb87166fa595 +size 87432960 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py b/build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py b/build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index 1fd6dfe8012f1b85f3ea0ca984eefb51d810eb32..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5e0d9837b95dde6e7286a15e312db47fbc302850ca6ea5610fb306ba42bfbab7 -size 87048408 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..298cdd4335279a97dfc815152e97893c28e54522 --- /dev/null +++ b/build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e739bb546d3d1730fa7696fbd767fd588286dec369f1b7551edd1ec481df96f +size 87044288 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py b/build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py b/build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index 858dc59bf2b280b88036c862af18280b045ea498..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7abba352c56f674e909a701b965b1508babdf69e9a6dda54fac6f11088d0ac2 -size 86953856 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..e33b207012e8d3eb530377a0bd6eea3f69c253c2 --- /dev/null +++ b/build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcb950d2e7196ad22cad926749b7e0e06e5454f0a732755b72f0b8dd456529c6 +size 86953856 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py b/build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py b/build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states) diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so deleted file mode 100755 index 56e867522fa7bb2f5ba45e50c91311f35e64861f..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6568844b9365cdbe7fbdddfda668e49f7c780028988c5bae2d48a72eeba1650b -size 87417064 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..5c8405b3a46d0ce7541f3a34f9bbe8b05f7af100 --- /dev/null +++ b/build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5c605f1da902aebc1d7ce0355b649fcfcc44aed0023fdc87974f3d56273897 +size 87417064 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py b/build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py index 8c59f5aef1516f8f35dd775f9619ea797daf2e52..a27b7d812f497aa41d8429369ea3f1de496eb0d6 100644 --- a/build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _moe_21a4db0 -ops = torch.ops._moe_21a4db0 +from . import _moe_2218ad7 +ops = torch.ops._moe_2218ad7 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_moe_21a4db0::{op_name}" \ No newline at end of file + return f"_moe_2218ad7::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py b/build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py index b0d3625037e9509140e8fefae153357606da4325..f8f0586c2a58d1fb68f7d2eaed47a48f165d1e8d 100644 --- a/build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py @@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool( ) +def cdiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + block_shape: Optional[List[int]], +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + if block_shape is None: + A, A_scale = scaled_fp8_quant(A, A_scale) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + return A, A_scale + + @triton.jit def write_zeros_to_output( c_ptr, diff --git a/build/torch26-cxx98-cu126-x86_64-linux/moe/layers.py b/build/torch26-cxx98-cu126-x86_64-linux/moe/layers.py index 0cf388ff61bd10cee921cd6f34bdc84a1b5025f6..38b1d6fd3a9b8f2eb425b09889e372e20c5aecb6 100644 --- a/build/torch26-cxx98-cu126-x86_64-linux/moe/layers.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/moe/layers.py @@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module): _fix_llama4_experts(hidden_states, self.experts) router_logits = self.router(hidden_states) + + extra_kwargs = {} + use_fp8_w8a8 = False + if hasattr(self.experts, "gate_up_proj_scale"): + use_fp8_w8a8 = True + extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale + extra_kwargs["w2_scale"] = self.experts.down_proj_scale + out = fused_moe( hidden_states, w1=self.experts.gate_up_proj, @@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module): renormalize=False, custom_routing_function=_llama4_topk, apply_router_weight_on_input=True, + use_fp8_w8a8=use_fp8_w8a8, + **extra_kwargs ) out += self.shared_expert(hidden_states)