```python
"""Utility to dump NVIDIA GPU information."""
import subprocess

def nvidia_dump():
    """Dump NVIDIA GPU information."""
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=True)
        print("NVIDIA GPU Information:")
        print(result.stdout)
    except FileNotFoundError:
        print("nvidia-smi not found. Are you running on a machine with NVIDIA GPUs?")
    except subprocess.CalledProcessError as e:
        print(f"Error running nvidia-smi: {e}")

nvidia_dump()
```
```python
"""Simple utilities for running the models."""
import torch

def to_dtype(dtype_str: str):
    """Convert string to torch dtype."""
    if dtype_str == "float16":
        return torch.float16
    if dtype_str == "bfloat16":
        return torch.bfloat16
    return torch.float32

def tensor_stats(t: torch.Tensor) -> str:
    """Generate stats string for a tensor."""
    return (f"shape={tuple(t.shape)}, "
            f"dtype={t.dtype}, "
            f"device={t.device}, "
            f"mean={t.mean().item():.6f}, "
            f"std={t.std().item():.6f}")

def set_seed(seed: int):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

"""Reusable benchmarking utilities for performance testing."""
import time
import numpy as np
from contextlib import contextmanager
from typing import Callable, Dict, Tuple, Any, Optional
import torch
import json

def precise_timing(func: Callable[[], Any], warmup: int = 5, iters: int = 20,
                   input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
    """High-precision timing with warmup and optional per-iteration input generation."""
    # Warmup
    for i in range(warmup):
        if input_generator:
            inputs = input_generator(i)
            func(inputs)
        else:
            func()

    if torch.cuda.is_available():
        torch.cuda.synchronize()

    start = time.perf_counter()
    result = None
    for i in range(iters):
        if input_generator:
            inputs = input_generator(i + warmup)  # Continue seed sequence after warmup
            result = func(inputs)
        else:
            result = func()

    if torch.cuda.is_available():
        torch.cuda.synchronize()

    end = time.perf_counter()
    avg_time = (end - start) / iters
    return result, avg_time

def memory_usage() -> Dict[str, float]:
    """Get current memory usage in GB."""
    if not torch.cuda.is_available():
        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}

    return {
        "allocated": torch.cuda.memory_allocated() / 1024**3,
        "cached": torch.cuda.memory_reserved() / 1024**3,
        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3,
    }

@contextmanager
def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
                  tokens: Optional[int] = None, save_json: Optional[str] = None,
                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
    """Context manager for benchmarking with comprehensive metrics and optional input generation."""

    def run_benchmark(model_func, *args, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        mem_before = memory_usage()

        # Create input generator if input_shape is provided
        input_generator = None
        if input_shape is not None:
            def create_input(iteration: int):
                # Use a deterministic but different seed for each iteration
                iteration_seed = input_seed_base + iteration * 123  # Spread out seeds
                torch.manual_seed(iteration_seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(iteration_seed)
                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
            input_generator = create_input

        if input_generator:
            result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
        else:
            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)

        mem_after = memory_usage()

        # Calculate metrics
        metrics = {
            "avg_time_ms": avg_time * 1000,
            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
            "memory_allocated_gb": mem_after["allocated"],
            "memory_cached_gb": mem_after["cached"],
            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
            "device": str(device) if device else "cpu",
            "dtype": str(dtype) if dtype else "float32",
            "tokens": tokens,
            "warmup_iters": warmup,
            "timing_iters": iters,
        }

        # Print results
        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
        if tokens:
            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")

        # Save to JSON if requested
        if save_json:
            with open(save_json, 'w') as f:
                json.dump(metrics, f, indent=2)

        return result

    yield run_benchmark
```
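A minimal usage sketch of these helpers, assuming they are in scope in the same session; the `nn.Linear` toy model, shapes, and iteration counts are illustrative and not part of the benchmark:

```python
# Illustrative only: exercise precise_timing and bench_context with a toy model.
import torch
from torch import nn

toy = nn.Linear(1152, 1152)
x = torch.randn(8, 512, 1152)

# Time a fixed forward pass.
_, avg_time = precise_timing(lambda: toy(x), warmup=2, iters=5)
print(f"avg forward time: {avg_time * 1000:.3f} ms")

# Or let the context generate a fresh seeded input each iteration and report metrics.
with bench_context(warmup=2, iters=5, device=torch.device("cpu"), dtype=torch.float32,
                   tokens=8 * 512, input_shape=(8, 512, 1152)) as bench:
    _ = bench(toy)
```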
```python
"""Configuration for MoE benchmarks."""
import torch

# Model configuration
NUM_EXPERTS = 128
HIDDEN_SIZE = 1152
TOP_K = 4

# Benchmark configuration
BATCH_SIZE = 8
SEQ_LEN = 512
DTYPE = "bfloat16"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seeds for reproducibility
WEIGHT_SEED = 999
EXPERT_SEED = 777
INPUT_SEED = 123
GENERAL_SEED = 42

print("Configuration:")
print(f"  Experts: {NUM_EXPERTS}")
print(f"  Hidden size: {HIDDEN_SIZE}")
print(f"  Top-k: {TOP_K}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Sequence length: {SEQ_LEN}")
print(f"  Device: {DEVICE}")
print(f"  Dtype: {DTYPE}")
```
```python
"""Generate and save shared weights for consistent comparison."""
import torch
import numpy as np
from pathlib import Path

# Model configuration
NUM_EXPERTS = 128
HIDDEN_SIZE = 1152
INTERMEDIATE_SIZE = 3072  # kept for reference; the experts below use HIDDEN_SIZE as the expert dim
TOP_K = 4

# Input configuration
BATCH_SIZE = 1
SEQ_LEN = 100
DTYPE = "float32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seeds for reproducibility
WEIGHT_SEED = 999
EXPERT_SEED = 777
INPUT_SEED = 123
GENERAL_SEED = 42

def set_seed(seed: int):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

# Generate shared weights for all implementations
print("Generating shared weights...")

# Router weights
set_seed(WEIGHT_SEED)
router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
torch.nn.init.kaiming_uniform_(router_weight)
router_bias = torch.zeros(NUM_EXPERTS)

# Expert weights - combined gate/up projection of shape (experts, hidden, 2 * hidden)
set_seed(EXPERT_SEED)
gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)

# Save weights
torch.save(router_weight, 'router_weight.pt')
torch.save(router_bias, 'router_bias.pt')
torch.save(gate_up_proj, 'gate_up_proj.pt')
torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
torch.save(down_proj, 'down_proj.pt')
torch.save(down_proj_bias, 'down_proj_bias.pt')

print("Saved weights:")
print(f"  Router: {tuple(router_weight.shape)}")
print(f"  Gate/Up proj: {tuple(gate_up_proj.shape)}")
print(f"  Down proj: {tuple(down_proj.shape)}")
print(f"  Hidden size: {HIDDEN_SIZE}")
```
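As an optional sanity check (a sketch, not part of the original script), the saved files can be reloaded immediately and compared against the in-memory tensors; the sums printed here are the same values the downstream cells log for comparison:

```python
# Illustrative sanity check: saved weights should round-trip exactly through torch.save/torch.load.
for name, ref in [
    ("router_weight.pt", router_weight),
    ("gate_up_proj.pt", gate_up_proj),
    ("down_proj.pt", down_proj),
]:
    reloaded = torch.load(name)
    assert torch.equal(reloaded, ref), f"mismatch in {name}"
    print(f"  {name}: sum={reloaded.sum().item():.6f}")
```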
Artifacts:
- down_proj.pt
- down_proj_bias.pt
- gate_up_proj.pt
- gate_up_proj_bias.pt
- router_weight.pt
- router_bias.pt

GPT-OSS Implementation
This section benchmarks the GPT-OSS MoE implementation in non-training mode.
```python
import torch
from torch import nn
from torch.nn import functional as F
from utils import to_dtype, tensor_stats, set_seed, bench_context
from config import (
    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
)
from pathlib import Path
import os

# Discover the upstream artifact directory from env
data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')

# List all the files in the directory
print(f"Loading weights from: {data_dir}")
print(f"Files in directory: {list(Path(data_dir).glob('*'))}")

router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')

print("Loaded shared weights from artifacts")
print(f"Router weight sum: {router_weight.sum().item():.6f}")
print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
print(f"Down sum: {down_proj.sum().item():.6f}")

class GptOssRouter(nn.Module):
    def __init__(self, router_weight, router_bias):
        super().__init__()
        self.top_k = TOP_K
        self.num_experts = NUM_EXPERTS
        self.hidden_dim = HIDDEN_SIZE
        self.weight = nn.Parameter(router_weight.clone())
        self.bias = nn.Parameter(router_bias.clone())

    def forward(self, hidden_states):
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_logits = F.linear(hidden_states, self.weight, self.bias)
        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
        return router_scores, router_indices

class GptOssExperts(nn.Module):
    def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
        super().__init__()
        self.num_experts = NUM_EXPERTS
        self.hidden_size = HIDDEN_SIZE
        self.expert_dim = self.hidden_size
        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
        self.down_proj = nn.Parameter(down_proj.clone())
        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
        self.alpha = 1.702
        self.limit = 7.0

    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
        batch_size = hidden_states.shape[0]
        hidden_states = hidden_states.reshape(-1, self.hidden_size)
        num_experts = routing_weights.shape[1]

        if hidden_states.device.type == "cpu" or self.training:
            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
            with torch.no_grad():
                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
                expert_mask = expert_mask.permute(2, 1, 0)
                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()

            for expert_idx in expert_hit[:]:
                expert_idx = expert_idx[0]
                with torch.no_grad():
                    _, token_idx = torch.where(expert_mask[expert_idx])
                current_state = hidden_states[token_idx]
                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
                gate, up = gate_up[..., ::2], gate_up[..., 1::2]
                gate = gate.clamp(min=None, max=self.limit)
                up = up.clamp(min=-self.limit, max=self.limit)
                glu = gate * torch.sigmoid(gate * self.alpha)
                gated_output = (up + 1) * glu
                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
                weighted_output = out * routing_weights[token_idx, expert_idx, None]
                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
            next_states = next_states.view(batch_size, -1, self.hidden_size)
        else:
            hidden_states = hidden_states.repeat(num_experts, 1)
            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
            gate = gate.clamp(min=None, max=self.limit)
            up = up.clamp(min=-self.limit, max=self.limit)
            glu = gate * torch.sigmoid(gate * self.alpha)
            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
            next_states = next_states + self.down_proj_bias[..., None, :]
            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
            next_states = next_states.sum(dim=0)
        return next_states

class GptOssMoEMLP(nn.Module):
    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
        super().__init__()
        self.router = GptOssRouter(router_weight, router_bias)
        self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)

    def forward(self, hidden_states):
        router_scores, router_indices = self.router(hidden_states)
        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
        return routed_out, router_scores

# Run the model
set_seed(GENERAL_SEED)

device = torch.device(DEVICE)
dtype = to_dtype(DTYPE)

print("\n=== GPT-OSS Implementation ===")
# Initialize model with loaded weights
model = GptOssMoEMLP(
    router_weight.to(device, dtype=dtype),
    router_bias.to(device, dtype=dtype),
    gate_up_proj.to(device, dtype=dtype),
    gate_up_proj_bias.to(device, dtype=dtype),
    down_proj.to(device, dtype=dtype),
    down_proj_bias.to(device, dtype=dtype)
).to(device=device, dtype=dtype)

print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")

# Benchmark the model using different input tensors on each iteration
tokens = BATCH_SIZE * SEQ_LEN
input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
                   save_json="gptoss_results.json", input_shape=input_shape, input_seed_base=INPUT_SEED) as bench:
    output, stats = bench(model)
    print(f"\nOutput sum: {output[0].sum().item():.6f}")
```
Artifacts:
- gptoss_results.json

MegaBlocks Implementation
This section benchmarks the MegaBlocks MoE implementation.
```python
import torch
from torch import nn
from torch.nn import functional as F
from kernels import get_kernel, get_local_kernel
from utils import to_dtype, tensor_stats, set_seed, bench_context
from config import (
    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
)
from pathlib import Path
from collections import namedtuple
import os

# Discover the upstream artifact directory from env
data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')

print(f"Loading weights from: {data_dir}")

router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')

print("Loaded shared weights from artifacts")
print(f"Router weight sum: {router_weight.sum().item():.6f}")
print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
print(f"Down sum: {down_proj.sum().item():.6f}")

def build_megablocks_model(device: torch.device, dtype: torch.dtype):
    # Download optimized kernels from the Hugging Face hub
    megablocks = get_kernel("kernels-community/megablocks")

    # megablocks = get_local_kernel(
    #     Path("/home/ubuntu/Projects/megablocks-moe/build"), "megablocks")

    model = megablocks.layers.MegaBlocksMoeMLP()

    # Create attribute container for expert weights
    model.experts = namedtuple(
        "Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"]
    )

    # Use loaded router weights for consistency
    model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device, dtype=dtype)
    with torch.no_grad():
        model.router.weight.copy_(router_weight.to(dtype))
        model.router.bias.copy_(router_bias.to(dtype))

    # Attach loaded expert weights to the experts container
    e = model.experts
    e.alpha = 1.702
    e.capacity_factor = 4
    e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device, dtype=dtype))
    e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device, dtype=dtype))
    e.down_proj = torch.nn.Parameter(down_proj.clone().to(device, dtype=dtype))
    e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device, dtype=dtype))
    e.hidden_size = HIDDEN_SIZE

    # Log weight statistics for comparison
    print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
    print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
    print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")

    return model

# Create a wrapper to match the interface of other implementations
class MegaBlocksMoEWrapper(nn.Module):
    def __init__(self, megablocks_model):
        super().__init__()
        self.model = megablocks_model

    def forward(self, hidden_states):
        # MegaBlocks expects input in the format (batch, seq_len, hidden_dim)
        output, dummy_routing_weights = self.model(hidden_states)
        # Return output and routing weights for consistency with other implementations
        return output, dummy_routing_weights

# Run the model
set_seed(GENERAL_SEED)

device = torch.device(DEVICE)
dtype = to_dtype(DTYPE)

print("\n=== MegaBlocks Implementation ===")
# Build MegaBlocks model with loaded weights
megablocks_model = build_megablocks_model(device, dtype)
model = MegaBlocksMoEWrapper(megablocks_model).to(device=device, dtype=dtype)

# Benchmark the model using different input tensors on each iteration
tokens = BATCH_SIZE * SEQ_LEN
input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
                   save_json="megablocks_results.json", input_shape=input_shape, input_seed_base=INPUT_SEED) as bench:
    output, stats = bench(model)
    print(f"\nOutput sum: {output[0].sum().item():.6f}")
```
Artifacts:
- megablocks_results.json

Performance Comparison
This section loads the benchmark results and creates visualizations comparing the two implementations.
```python
import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import os

# Get result directories from environment variables
gptoss_dir = os.environ.get('UVNOTE_INPUT_GPTOSS_RUN', '.')
megablocks_dir = os.environ.get('UVNOTE_INPUT_MEGABLOCKS_RUN', '.')

print("Loading benchmark results from:")
print(f"  GPT-OSS dir: {gptoss_dir}")
print(f"  MegaBlocks dir: {megablocks_dir}")

# Load benchmark results
gptoss_file = Path(gptoss_dir) / 'gptoss_results.json'
megablocks_file = Path(megablocks_dir) / 'megablocks_results.json'

print("Loading results from:")
print(f"  GPT-OSS: {gptoss_file}")
print(f"  MegaBlocks: {megablocks_file}")

if not gptoss_file.exists():
    print(f"Warning: {gptoss_file} not found")
if not megablocks_file.exists():
    print(f"Warning: {megablocks_file} not found")

with open(gptoss_file, 'r') as f:
    gptoss_results = json.load(f)

with open(megablocks_file, 'r') as f:
    megablocks_results = json.load(f)

print(f"GPT-OSS results keys: {list(gptoss_results.keys())}")
print(f"MegaBlocks results keys: {list(megablocks_results.keys())}")

# Helper function to extract metrics from either old or new JSON format
def get_metric(results, metric_name, default=0):
    """Extract metric from results, handling both old and new JSON formats."""
    # New format (with stats dict)
    if 'stats' in results:
        return results['stats'].get(metric_name, default)
    # Old format (direct keys)
    elif metric_name in results:
        return results[metric_name]
    else:
        return default

# Create comparison plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Performance comparison
implementations = ['GPT-OSS', 'MegaBlocks']

# Extract timing metrics (handle both avg_ms and avg_time_ms)
gpt_time = get_metric(gptoss_results, 'avg_ms', get_metric(gptoss_results, 'avg_time_ms', 0))
mega_time = get_metric(megablocks_results, 'avg_ms', get_metric(megablocks_results, 'avg_time_ms', 0))
times = [gpt_time, mega_time]

# Extract throughput metrics
gpt_throughput = get_metric(gptoss_results, 'tokens_per_s', get_metric(gptoss_results, 'throughput_tokens_per_sec', 0))
mega_throughput = get_metric(megablocks_results, 'tokens_per_s', get_metric(megablocks_results, 'throughput_tokens_per_sec', 0))
throughputs = [gpt_throughput, mega_throughput]

# Extract memory metrics
gpt_memory = get_metric(gptoss_results, 'memory_allocated_gb', 0)
mega_memory = get_metric(megablocks_results, 'memory_allocated_gb', 0)
memory_usage = [gpt_memory, mega_memory]

gpt_mem_inc = get_metric(gptoss_results, 'memory_increase_gb', 0)
mega_mem_inc = get_metric(megablocks_results, 'memory_increase_gb', 0)
memory_increase = [gpt_mem_inc, mega_mem_inc]

print("Extracted metrics:")
print(f"  Times (ms): {times}")
print(f"  Throughputs: {throughputs}")
print(f"  Memory usage (GB): {memory_usage}")
print(f"  Memory increase (GB): {memory_increase}")

colors = ['#2E8B57', '#4169E1']

# Latency comparison
bars1 = ax1.bar(implementations, times, color=colors)
ax1.set_ylabel('Average Time (ms)')
ax1.set_title('Latency Comparison')
ax1.grid(True, alpha=0.3)

# Add values on bars
for bar, time in zip(bars1, times):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
             f'{time:.2f}ms', ha='center', va='bottom')

# Throughput comparison
bars2 = ax2.bar(implementations, throughputs, color=colors)
ax2.set_ylabel('Tokens per Second')
ax2.set_title('Throughput Comparison')
ax2.grid(True, alpha=0.3)

# Add values on bars
for bar, throughput in zip(bars2, throughputs):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
             f'{throughput:.0f}', ha='center', va='bottom')

# Memory usage comparison
bars3 = ax3.bar(implementations, memory_usage, color=colors)
ax3.set_ylabel('Memory Allocated (GB)')
ax3.set_title('Memory Usage Comparison')
ax3.grid(True, alpha=0.3)

# Add values on bars
for bar, mem in zip(bars3, memory_usage):
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
             f'{mem:.2f}GB', ha='center', va='bottom')

# Memory increase comparison
bars4 = ax4.bar(implementations, memory_increase, color=colors)
ax4.set_ylabel('Memory Increase (GB)')
ax4.set_title('Memory Increase Comparison')
ax4.grid(True, alpha=0.3)

# Add values on bars
for bar, mem_inc in zip(bars4, memory_increase):
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
             f'{mem_inc:.3f}GB', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('small_moe_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# Print summary table
print("\n" + "="*60)
print("PERFORMANCE COMPARISON SUMMARY")
print("="*60)
print(f"{'Metric':<25} {'GPT-OSS':<15} {'MegaBlocks':<15} {'Winner':<10}")
print("-" * 60)

# Determine winners
latency_winner = "GPT-OSS" if times[0] < times[1] else "MegaBlocks"
throughput_winner = "GPT-OSS" if throughputs[0] > throughputs[1] else "MegaBlocks"
memory_winner = "GPT-OSS" if memory_usage[0] < memory_usage[1] else "MegaBlocks"
mem_inc_winner = "GPT-OSS" if memory_increase[0] < memory_increase[1] else "MegaBlocks"

print(f"{'Latency (ms)':<25} {times[0]:<15.2f} {times[1]:<15.2f} {latency_winner:<10}")
print(f"{'Throughput (tok/s)':<25} {throughputs[0]:<15.0f} {throughputs[1]:<15.0f} {throughput_winner:<10}")
print(f"{'Memory Usage (GB)':<25} {memory_usage[0]:<15.3f} {memory_usage[1]:<15.3f} {memory_winner:<10}")
print(f"{'Memory Increase (GB)':<25} {memory_increase[0]:<15.3f} {memory_increase[1]:<15.3f} {mem_inc_winner:<10}")

# Speed ratio
speed_ratio = times[1] / times[0] if times[0] < times[1] else times[0] / times[1]
faster_impl = latency_winner
print(f"\n{faster_impl} is {speed_ratio:.2f}x faster")

# Throughput ratio
throughput_ratio = max(throughputs) / min(throughputs)
higher_throughput = throughput_winner
print(f"{higher_throughput} has {throughput_ratio:.2f}x higher throughput")

print("="*60)
```
Conclusion
This focused benchmark compares the GPT-OSS (non-training mode) and MegaBlocks MoE implementations on the same hardware with identical weights and inputs. The comparison focuses on:
- Latency: Average forward pass time
- Throughput: Tokens processed per second (a worked sketch of this calculation follows the list)
- Memory Usage: GPU memory consumption
- Memory Efficiency: Memory increase during execution
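The throughput figure is derived directly from the measured latency and the fixed token count per forward pass. A worked sketch using the benchmark's configuration (the 5 ms latency is illustrative, not a measured result):

```python
# Worked example of the throughput metric computed in bench_context.
BATCH_SIZE, SEQ_LEN = 8, 512
tokens = BATCH_SIZE * SEQ_LEN        # 4096 tokens per forward pass
avg_time_s = 5.0 / 1000              # e.g. a measured average latency of 5 ms
throughput = tokens / avg_time_s     # 819,200 tokens/sec
print(f"{throughput:.0f} tokens/sec")
```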
Both implementations use:
- 128 experts with top-4 routing
- 1152 hidden dimensions
- Batch size of 8, sequence length of 512
- bfloat16 precision
- Identical pre-generated weights for a fair comparison (a hypothetical cross-check sketch follows this list)
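Because both models are built from the same saved weights, a direct numerical cross-check is also possible. The sketch below is hypothetical: the notebook runs the two implementations in separate cells, so `gptoss_model` and `megablocks_wrapper` would need to be instantiated together in one process (config values and `to_dtype` come from the cells above).

```python
# Hypothetical cross-check: feed one seeded input to both implementations and
# compare outputs, which should agree up to bfloat16 tolerance if the MoE math matches.
torch.manual_seed(INPUT_SEED)
x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=DEVICE, dtype=to_dtype(DTYPE)) * 0.1

gptoss_out, _ = gptoss_model(x)            # GptOssMoEMLP instance from the GPT-OSS cell
megablocks_out, _ = megablocks_wrapper(x)  # MegaBlocksMoEWrapper instance from the MegaBlocks cell

max_diff = (gptoss_out - megablocks_out).abs().max().item()
print(f"max abs difference: {max_diff:.6f}")
```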
The results show the performance characteristics of each approach, helping identify the optimal implementation for different use cases.