Yan Bai committed
Commit 55e1701 · 1 Parent(s): 9eb3690
.gitignore ADDED
@@ -0,0 +1 @@
1
+ __pycache__/
Dockerfile ADDED
@@ -0,0 +1,23 @@
1
+ FROM whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3
2
+
3
+ # Install extra dependencies (pip skips any that the base image already provides)
4
+ RUN pip install --no-cache-dir \
5
+ fastapi \
6
+ uvicorn[standard] \
7
+ mbridge \
8
+ termcolor \
9
+ ipdb
10
+ # Add Megatron-LM core_v0.12.2
11
+ RUN git clone -b core_v0.12.2 --depth 1 https://github.com/NVIDIA/Megatron-LM.git /opt/Megatron-LM
12
+
13
+ # Copy the code into the working directory
14
+ WORKDIR /app
15
+ COPY . /app
16
+
17
+ # HF Spaces injects the serving port via $PORT by default
18
+ ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH
19
+ ENV PORT=7860
20
+ EXPOSE 7860
21
+
22
+ # Start the FastAPI service
23
+ CMD ["sh", "-c", "uvicorn app:app --host 0.0.0.0 --port $PORT"]
__init__.py ADDED
@@ -0,0 +1 @@
1
+
app.py ADDED
@@ -0,0 +1 @@
1
+ from webui.main import app
estimate.py ADDED
@@ -0,0 +1,499 @@
1
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2
+ """Memory estimator for GPT/MoE models, adapted from Megatron-LM's pretrain_gpt.py."""
3
+ import warnings
4
+
5
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
6
+ warnings.filterwarnings("ignore", category=FutureWarning)
7
+ warnings.filterwarnings("ignore")
8
+ import os
9
+ import torch
10
+ from functools import partial
11
+ from contextlib import nullcontext
12
+ import inspect
13
+
14
+ from typing import Union
15
+ from megatron.training import get_args
16
+ from megatron.training import print_rank_0
17
+ from megatron.training import get_timers
18
+ from megatron.training import get_tokenizer
19
+ from megatron.core import mpu
20
+ from megatron.core.enums import ModelType
21
+ from megatron.core.datasets.blended_megatron_dataset_builder import (
22
+ BlendedMegatronDatasetBuilder,
23
+ )
24
+ from megatron.core.datasets.utils import get_blend_from_list
25
+ from megatron.core.datasets.gpt_dataset import GPTDatasetConfig
26
+ from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset
27
+ import megatron.legacy.model
28
+ from megatron.training import pretrain
29
+ from megatron.core.utils import StragglerDetector
30
+ from megatron.core.transformer.spec_utils import import_module
31
+ from megatron.training.utils import (
32
+ get_batch_on_this_cp_rank,
33
+ get_batch_on_this_tp_rank,
34
+ )
35
+ from megatron.training.arguments import core_transformer_config_from_args
36
+ from megatron.training.yaml_arguments import core_transformer_config_from_yaml
37
+ from megatron.core.models.gpt.gpt_layer_specs import (
38
+ get_gpt_layer_local_spec,
39
+ get_gpt_layer_with_transformer_engine_spec,
40
+ )
41
+ from megatron.training.initialize import initialize_megatron
42
+ from moe_mem_estimator.gpt_model import GPTModel
43
+ from moe_mem_estimator.base import (
44
+ is_pipeline_first_stage,
45
+ is_pipeline_last_stage,
46
+ set_global_config,
47
+ set_pipeline_model_parallel_rank,
48
+ )
49
+ from moe_mem_estimator.layers import MLASelfAttention, MoELayer
50
+
51
+
52
+ def _calculate_rank_memory(config, args, input_shape, pp_rank=0, pp_size=1):
53
+ """
54
+ Calculates the memory for a single pipeline parallel rank, containing the detailed logic.
55
+ """
56
+ # Build the model for the current rank
57
+ set_global_config(config)
58
+ pre_process = (pp_rank == 0)
59
+ post_process = (pp_rank == pp_size - 1)
60
+
61
+ use_te = True
62
+ if hasattr(config, 'spec') and config.spec is not None:
63
+ transformer_layer_spec = import_module(config.spec)
64
+ else:
65
+ if use_te:
66
+ transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
67
+ config.num_moe_experts, config.moe_grouped_gemm, config.qk_layernorm,
68
+ config.multi_latent_attention, config.fp8
69
+ )
70
+ else:
71
+ transformer_layer_spec = get_gpt_layer_local_spec(
72
+ config.num_moe_experts, config.moe_grouped_gemm, config.qk_layernorm,
73
+ config.multi_latent_attention
74
+ )
75
+
76
+ model = GPTModel(
77
+ config=config,
78
+ transformer_layer_spec=transformer_layer_spec,
79
+ vocab_size=args.padded_vocab_size,
80
+ max_sequence_length=args.max_position_embeddings,
81
+ pre_process=pre_process,
82
+ post_process=post_process,
83
+ fp16_lm_cross_entropy=getattr(config, 'fp16_lm_cross_entropy', False),
84
+ parallel_output=True,
85
+ share_embeddings_and_output_weights=args.tie_word_embeddings,
86
+ position_embedding_type="rope",
87
+ rotary_percent=getattr(args, 'rotary_percent', 1.0),
88
+ rotary_base=getattr(args, 'rotary_base', 10000),
89
+ rope_scaling=getattr(config, 'use_rope_scaling', False),
90
+ )
91
+
92
+ # --- Start of detailed memory calculation logic ---
93
+ num_parameter_this_shard = model.num_parameter()
94
+ num_activation = model.num_activation(input_shape)
95
+ output_shape = model.mock_forward(input_shape)
96
+
97
+ num_parameter_this_shard_sparse = sum(
98
+ layer.mlp.num_parameter() for layer in model.decoder.layers.modules
99
+ if isinstance(layer.mlp, MoELayer)
100
+ )
101
+ num_activation_this_shard_mlp = sum(
102
+ m.mlp.num_activation() for m in model.decoder.layers.modules
103
+ )
104
+
105
+ num_microbatch_this_pp_rank = pp_size - pp_rank
106
+ if config.num_layers_per_virtual_pipeline_stage is not None:
107
+ layers_this_pprank = len(model.decoder.layers.modules)
108
+ vpp_size = layers_this_pprank // config.num_layers_per_virtual_pipeline_stage
109
+ if vpp_size > 0:
110
+ num_microbatch_this_pp_rank = (pp_size * (vpp_size - 1) + (pp_size - pp_rank) * 2 - 1) / vpp_size
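+ # The expression above approximates the number of in-flight microbatches per
+ # virtual stage under the interleaved 1F1B schedule; it is used only for
+ # peak-activation accounting, not as an exact scheduler trace.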
111
+
112
+ # Activation Recomputation
113
+ # The base activation number is for one microbatch. With pipeline parallelism,
114
+ # the total activation is multiplied by the number of microbatches in flight.
115
+ # Recomputation reduces this by re-calculating activations during the backward pass
116
+ # instead of storing them.
117
+
118
+ # This is the activation memory without any recomputation.
119
+ num_activation = (num_activation - model.num_act_post) * num_microbatch_this_pp_rank + model.num_act_post
120
+
121
+ if config.recompute_granularity == "full":
122
+ # This logic is transplanted from the more detailed `report_memory_usage_one_pp_rank`
123
+ recompute_num_layers = config.recompute_num_layers
124
+ num_layers = model.num_layers
125
+ # Activations of a model with recompute enabled.
126
+ # The activation of a layer is an input to the next layer.
127
+ # So, the total activation is the sum of the activations of all layers,
128
+ # plus the activation of the embedding layer.
129
+ # The activation of a layer is stored only if it is not recomputed.
130
+ common_act = (
131
+ model.num_act_pre
132
+ + model.num_act_between_layers * num_layers * num_microbatch_this_pp_rank
133
+ )
134
+ if config.recompute_method == "block":
135
+ num_layers_with_loss = num_layers - recompute_num_layers
136
+ if num_layers_with_loss == 0:
137
+ peak1 = common_act + model.num_act_post
138
+ peak2 = common_act + model.num_act_per_layer
139
+ recomputed_activation = max(peak1, peak2)
140
+ else:
141
+ recomputed_activation = (
142
+ common_act
143
+ + model.num_act_post
144
+ + model.num_act_per_layer
145
+ * num_layers_with_loss
146
+ * num_microbatch_this_pp_rank
147
+ )
148
+ elif config.recompute_method == "uniform":
149
+ peak1 = common_act + model.num_act_post
150
+ peak2 = (
151
+ common_act
152
+ + model.num_act_per_layer
153
+ * recompute_num_layers
154
+ * num_microbatch_this_pp_rank
155
+ )
156
+ recomputed_activation = max(peak1, peak2)
157
+
158
+ if isinstance(model.decoder.layers.modules[0].self_attention, MLASelfAttention):
159
+ recomputed_activation += model.decoder.layers.modules[0].self_attention.core_attention.num_activation()
160
+
161
+ num_activation = recomputed_activation
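+ # In the branches above, peak1 is the moment the last stage holds the
+ # logits/loss activations on top of the retained per-layer inputs, while
+ # peak2 is the moment a checkpointed layer is re-run during backward; the
+ # larger of the two is taken as the activation peak.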
162
+
163
+ elif config.recompute_granularity == "selective":
164
+ # Selective recomputation is the default in Megatron-LM and is handled
165
+ # by Transformer Engine. The base `num_activation` calculation from `GPTModel`
166
+ # already reflects this. We just need to scale it by the number of in-flight microbatches.
167
+ # This is already the case, so we do nothing here.
168
+ pass
169
+
170
+
171
+ # Context Parallelism
172
+ if config.context_parallel_size > 1:
173
+ num_activation = (num_activation - num_activation_this_shard_mlp) / config.context_parallel_size + num_activation_this_shard_mlp
174
+
175
+ # Calculate bytes per parameter for optimizer states
176
+ if args.use_distributed_optimizer:
177
+ base_optim_bytes = 6 # bf16/fp16 weight (2 B) + fp32 gradient (4 B), kept on every rank
178
+ world_optim_bytes = 12 # fp32 master weight + Adam momentum + variance, sharded across DP ranks
179
+ else:
180
+ base_optim_bytes = 18 # All states on each GPU
181
+ world_optim_bytes = 0
182
+
183
+ num_bytes_per_parameter = base_optim_bytes + (world_optim_bytes / (args.data_parallel_size * config.context_parallel_size))
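+ # Illustrative example (assumed values, not read from the config): with the
+ # distributed optimizer, data_parallel_size=8 and context_parallel_size=1 give
+ # 6 + 12 / 8 = 7.5 bytes per dense parameter.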
184
+
185
+ # Handle MoE optimizer state sharding if applicable
186
+ if num_parameter_this_shard_sparse > 0 and config.expert_model_parallel_size > 1:
187
+ moe_dp_size = args.data_parallel_size * config.tensor_model_parallel_size // (config.expert_model_parallel_size * args.expert_tensor_parallel_size)
188
+ num_bytes_per_parameter_moe = base_optim_bytes + (world_optim_bytes / moe_dp_size)
189
+
190
+ weight_and_optimizer_memory = (
191
+ (num_parameter_this_shard - num_parameter_this_shard_sparse) * num_bytes_per_parameter +
192
+ num_parameter_this_shard_sparse * num_bytes_per_parameter_moe
193
+ ) / NUM_BYTES_IN_GIGABYTE
194
+ else:
195
+ weight_and_optimizer_memory = (num_parameter_this_shard * num_bytes_per_parameter) / NUM_BYTES_IN_GIGABYTE
196
+
197
+ activation_memory = num_activation * 2 / NUM_BYTES_IN_GIGABYTE # 2 bytes per element (fp16/bf16 activations)
198
+ total_memory = weight_and_optimizer_memory + activation_memory
199
+
200
+ report = {
201
+ "pp_rank": pp_rank,
202
+ "parameters_b": num_parameter_this_shard / 1e9,
203
+ "activation_b": num_activation / 1e9, # activation element count, in billions
204
+ "weight_optimizer_gb": round(weight_and_optimizer_memory, 2),
205
+ "activation_gb": round(activation_memory, 2),
206
+ "total_gb": round(total_memory, 2),
207
+ "details": model.dump(),
208
+ "model_breakdown": str(model)
209
+ }
210
+ print(model)
211
+
212
+ return report, output_shape
213
+
214
+
215
+ def estimate_from_config(config, args):
216
+ """
217
+ Estimate memory usage from a given config and args, instead of global state.
218
+ This version iterates over pipeline parallel ranks for accurate estimation.
219
+ """
220
+ reports = []
221
+ input_shape = [args.micro_batch_size, args.seq_length]
222
+ pp_size = config.pipeline_model_parallel_size
223
+
224
+ if pp_size > 1:
225
+ for pp_rank in range(pp_size):
226
+ set_pipeline_model_parallel_rank(pp_rank)
227
+ report_for_rank, new_input_shape = _calculate_rank_memory(config, args, input_shape, pp_rank, pp_size)
228
+ reports.append(report_for_rank)
229
+ input_shape = new_input_shape # Pass output shape to the next stage
230
+ else:
231
+ report_for_rank, _ = _calculate_rank_memory(config, args, input_shape, 0, 1)
232
+ reports.append(report_for_rank)
233
+
234
+ return reports
235
+
236
+
237
+ def model_provider() -> GPTModel:
238
+ args = get_args()
239
+ use_te = args.transformer_impl == "transformer_engine"
240
+
241
+ # Experimental loading arguments from yaml
242
+ if args.yaml_cfg is not None:
243
+ config = core_transformer_config_from_yaml(args, "language_model")
244
+ else:
245
+ config = core_transformer_config_from_args(args)
246
+ assert not args.use_legacy_models
247
+
248
+ if args.spec is not None:
249
+ transformer_layer_spec = import_module(args.spec)
250
+ else:
251
+ if use_te:
252
+ transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
253
+ args.num_experts,
254
+ args.moe_grouped_gemm,
255
+ args.qk_layernorm,
256
+ args.multi_latent_attention,
257
+ args.fp8,
258
+ )
259
+ else:
260
+ transformer_layer_spec = get_gpt_layer_local_spec(
261
+ args.num_experts,
262
+ args.moe_grouped_gemm,
263
+ args.qk_layernorm,
264
+ args.multi_latent_attention,
265
+ )
266
+ set_global_config(config)
267
+ pre_process = is_pipeline_first_stage()
268
+ post_process = is_pipeline_last_stage()
269
+ # TODO fp8
270
+ model = GPTModel(
271
+ config=config,
272
+ transformer_layer_spec=transformer_layer_spec,
273
+ vocab_size=args.padded_vocab_size,
274
+ max_sequence_length=args.max_position_embeddings,
275
+ pre_process=pre_process,
276
+ post_process=post_process,
277
+ fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
278
+ parallel_output=True,
279
+ share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
280
+ position_embedding_type=args.position_embedding_type,
281
+ rotary_percent=args.rotary_percent,
282
+ rotary_base=args.rotary_base,
283
+ rope_scaling=args.use_rope_scaling,
284
+ )
285
+
286
+ return model
287
+
288
+
289
+ NUM_BYTES_IN_MEGABYTE = 1024 * 1024
290
+ NUM_BYTES_IN_GIGABYTE = 1024 * 1024 * 1024
291
+
292
+ def report_memory_usage():
293
+ args = get_args()
294
+ if args.yaml_cfg is not None:
295
+ config = core_transformer_config_from_yaml(args, "language_model")
296
+ else:
297
+ config = core_transformer_config_from_args(args)
298
+
299
+ input_shape = [args.micro_batch_size, args.seq_length]
300
+
301
+ if config.pipeline_model_parallel_size > 1:
302
+ for pp_rank in range(config.pipeline_model_parallel_size):
303
+ set_pipeline_model_parallel_rank(pp_rank)
304
+ print(f"\n----------[Pipeline_Parallelism_Rank={pp_rank}]----------")
305
+ input_shape = report_memory_usage_one_pp_rank(
306
+ input_shape, pp_rank, config.pipeline_model_parallel_size
307
+ )
308
+ else:
309
+ report_memory_usage_one_pp_rank(input_shape)
310
+
311
+
312
+ def report_memory_usage_one_pp_rank(
313
+ input_shape: list[int], pp_rank=0, pp_size=1
314
+ ) -> list[int]:
315
+ args = get_args()
316
+
317
+ print(f"{input_shape=}")
318
+ model: GPTModel = model_provider()
319
+ num_parameter_this_shard = model.num_parameter()
320
+ num_activation = model.num_activation(input_shape)
321
+ output_shape = model.mock_forward(input_shape)
322
+
323
+ num_parameter_this_shard_sparse = 0
324
+ for layer in model.decoder.layers.modules:
325
+ if isinstance(layer.mlp, MoELayer):
326
+ num_parameter_this_shard_sparse += layer.mlp.num_parameter()
327
+ if (
328
+ "shared_experts" in layer.mlp.__dir__()
329
+ and layer.mlp.shared_experts is not None
330
+ ):
331
+ num_parameter_this_shard_sparse -= (
332
+ layer.mlp.shared_experts.num_parameter()
333
+ )
334
+ num_activation_this_shard_mlp = sum(
335
+ [m.mlp.num_activation() for m in model.decoder.layers.modules]
336
+ )
337
+ num_microbatch_this_pp_rank = pp_size - pp_rank
338
+ # vpp
339
+ if args.num_layers_per_virtual_pipeline_stage is not None:
340
+ layers_this_pprank = model.decoder.layers.modules.__len__()
341
+ vpp_size = layers_this_pprank // args.num_layers_per_virtual_pipeline_stage
342
+ num_microbatch_this_pp_rank = (
343
+ pp_size * (vpp_size - 1) + (pp_size - pp_rank) * 2 - 1
344
+ ) / vpp_size
345
+
365
+ model.__repr__()
366
+ print(model)
367
+ print(
368
+ f"Number of parameters in every GPU in billions: "
369
+ f"{num_parameter_this_shard / 10**9: .2f} where mlp part is {num_parameter_this_shard_sparse / 10**9: .2f}"
370
+ )
371
+ # recompute
372
+ if args.recompute_granularity == "full":
373
+ recompute_num_layers = args.recompute_num_layers
374
+ num_layers = model.num_layers
375
+ common_act = (
376
+ model.num_act_pre
377
+ + model.num_act_between_layers * num_layers * num_microbatch_this_pp_rank
378
+ ) # recompute with pipeline parallel
379
+ info = (
380
+ "With this recompute setting, activation memory peaks when "
381
+ )
382
+ if args.recompute_method == "block":
383
+ num_layers_with_loss = num_layers - recompute_num_layers
384
+ if num_layers_with_loss == 0:
385
+ peak1 = common_act + model.num_act_post
386
+ peak2 = common_act + model.num_act_per_layer
387
+ if peak1 > peak2:
388
+ info += "calculating loss"
389
+ else:
390
+ info += "back-propagating loss"
391
+ num_activation = max(peak1, peak2)
392
+ else:
393
+ info += (
394
+ f"calculating loss with {num_layers_with_loss} non-recompute layers"
395
+ )
396
+ num_activation = (
397
+ common_act
398
+ + model.num_act_post
399
+ + model.num_act_per_layer
400
+ * num_layers_with_loss
401
+ * num_microbatch_this_pp_rank
402
+ )
403
+ elif args.recompute_method == "uniform":
404
+ peak1 = common_act + model.num_act_post
405
+ peak2 = (
406
+ common_act
407
+ + model.num_act_per_layer
408
+ * recompute_num_layers
409
+ * num_microbatch_this_pp_rank
410
+ )
411
+ if peak1 > peak2:
412
+ info += "calculating loss"
413
+ else:
414
+ info += f"back-propagating loss, recomputing every {recompute_num_layers} layers"
415
+ num_activation = max(peak1, peak2)
416
+ if isinstance(
417
+ model.decoder.layers.modules[0].self_attention, MLASelfAttention
418
+ ): # MLA recompute reaches its peak during backward
419
+ num_activation += model.decoder.layers.modules[
420
+ 0
421
+ ].self_attention.core_attention.num_activation()
422
+ print(info)
423
+
424
+ else:
425
+ num_activation = (
426
+ num_activation - model.num_act_post
427
+ ) * num_microbatch_this_pp_rank + model.num_act_post
428
+
429
+ # CP
430
+ num_activation = (
431
+ num_activation - num_activation_this_shard_mlp
432
+ ) / args.context_parallel_size + num_activation_this_shard_mlp
433
+ if pp_size == 1:
434
+ print(
435
+ f"Number of activation in every GPU in billions: "
436
+ f"{num_activation / 10**9: .2f} where mlp part is {num_activation_this_shard_mlp / 10**9: .2f}"
437
+ )
438
+ else:
439
+ print(
440
+ f"Number of activation per microbatch in every GPU in billions: "
441
+ f"{num_activation / 10**9: .2f} where mlp part is {num_activation_this_shard_mlp / 10**9: .2f}"
442
+ f", {num_microbatch_this_pp_rank=}"
443
+ )
444
+ num_bytes_per_parameter = (
445
+ 18
446
+ if not args.use_distributed_optimizer
447
+ else 6 + (12 / args.data_parallel_size / args.context_parallel_size)
448
+ )
449
+ if args.expert_model_parallel_size * args.expert_tensor_parallel_size > 1:
450
+ num_bytes_per_parameter_dense = num_bytes_per_parameter
451
+ num_bytes_per_parameter_moe = (
452
+ 18
453
+ if not args.use_distributed_optimizer
454
+ else 6
455
+ + (
456
+ 12
457
+ / (
458
+ args.data_parallel_size
459
+ * args.context_parallel_size
460
+ * args.tensor_model_parallel_size
461
+ / args.expert_model_parallel_size
462
+ / args.expert_tensor_parallel_size
463
+ )
464
+ )
465
+ )
466
+ print(f"{num_bytes_per_parameter_dense=} {num_bytes_per_parameter_moe=}")
467
+
468
+ weight_and_optimizer_memory = (
469
+ (num_parameter_this_shard - num_parameter_this_shard_sparse)
470
+ * num_bytes_per_parameter_dense
471
+ + num_parameter_this_shard_sparse * num_bytes_per_parameter_moe
472
+ ) / NUM_BYTES_IN_GIGABYTE
473
+ else:
474
+ print(f"{num_bytes_per_parameter=}")
475
+ weight_and_optimizer_memory = (
476
+ num_parameter_this_shard * num_bytes_per_parameter / NUM_BYTES_IN_GIGABYTE
477
+ )
478
+
479
+ activation_memory = num_activation * 2 / NUM_BYTES_IN_GIGABYTE # only support fp16
480
+ total_memory = weight_and_optimizer_memory + activation_memory
481
+ print(
482
+ f"Theoretical memory footprints: weight and optimizer={weight_and_optimizer_memory/1024:.2f} GB, "
483
+ f"activation={activation_memory/1024:.2f} GB, total={total_memory/1024:.2f} GB\n"
484
+ )
485
+
486
+ # import ipdb
487
+
488
+ # ipdb.set_trace()
489
+ return output_shape
491
+
492
+
493
+ if __name__ == "__main__":
494
+ initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
495
+
496
+ import ipdb
497
+
498
+ with ipdb.launch_ipdb_on_exception():
499
+ report_memory_usage()
moe_mem_estimator/__init__.py ADDED
File without changes
moe_mem_estimator/base.py ADDED
@@ -0,0 +1,211 @@
1
+ from abc import ABC
2
+
3
+ from megatron.core.transformer.transformer_config import TransformerConfig
4
+ from torch.nn.modules.module import _addindent
5
+ from termcolor import colored
6
+
7
+
8
+ def prehook_save_input_shape(func):
9
+ def wrapper(self, *input_shapes, **kw_input_shapes):
10
+ if len(input_shapes) + len(kw_input_shapes) == 0:
11
+ if "_input_shape" in self.__dict__:
12
+ return func(self, *self._input_shape, **self._kw_input_shapes)
13
+ else:
14
+ return 0
15
+ self._input_shape = input_shapes
16
+ self._kw_input_shapes = kw_input_shapes
17
+ return func(self, *self._input_shape, **self._kw_input_shapes)
18
+
19
+ return wrapper
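+ # The wrapper above caches the most recent positional/keyword input shapes on
+ # the module, so a later num_activation() call with no arguments reuses the
+ # cached shapes, and returns 0 if the module has never been given an input.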
20
+
21
+
22
+ class MetaBase(type):
23
+ def __new__(cls, name, bases, attrs):
24
+ if "num_activation" in attrs:
25
+ attrs["num_activation"] = prehook_save_input_shape(attrs["num_activation"])
26
+
27
+ return super().__new__(cls, name, bases, attrs)
28
+
29
+
30
+ class MemEstimator(metaclass=MetaBase):
31
+ def __init__(self, *args, **kwargs):
32
+ self._modules = {}
33
+ pass
34
+
35
+ def __repr__(self):
36
+ # We treat the extra repr like the sub-module, one item per line
37
+ extra_lines = []
38
+ # extra_repr = self.extra_repr()
39
+ # # empty string will be split into list ['']
40
+ # if extra_repr:
41
+ # extra_lines = extra_repr.split("\n")
42
+ child_lines = []
43
+ for key, module in self._modules.items():
44
+ mod_str = repr(module)
45
+ mod_str = _addindent(mod_str, 2)
46
+ child_lines.append("(" + key + "): " + mod_str)
47
+ lines = extra_lines + child_lines
48
+
49
+ stat = (
50
+ "\t/* n_params="
51
+ + colored(f"{self.num_parameter()/1024/1024:.2f}M", "red")
52
+ + "\tn_act="
53
+ + colored(f"{self.num_activation()/1024/1024:.2f}M", "green")
54
+ + " */"
55
+ )
56
+ main_str = self._get_name() + stat + " ("
57
+ if lines:
58
+ # simple one-liner info, which most builtin Modules will use
59
+ if len(extra_lines) == 1 and not child_lines:
60
+ main_str += extra_lines[0]
61
+ else:
62
+ main_str += "\n " + "\n ".join(lines) + "\n"
63
+
64
+ main_str += ")"
65
+ return main_str
66
+ return f"{self.__class__.__name__} n_param={self.num_parameter()}"
67
+
68
+ def dump(self):
69
+ ret = {}
70
+ ret['name'] = self._get_name()
71
+ ret['n_params'] = self.num_parameter()
72
+ ret['n_act'] = self.num_activation()
73
+ modules = {}
74
+ for key, module in self._modules.items():
75
+ modules[key] = module.dump()
76
+ if len(modules)>0:
77
+ ret['modules'] = modules
78
+ return ret
79
+
80
+
81
+ def _get_name(self):
82
+ return self.__class__.__name__
83
+
84
+ def num_parameter(self):
85
+ """
86
+ Calculate the number of model parameters.
87
+ """
88
+ raise NotImplementedError
89
+
90
+ def num_activation(self, input_shape: list[int]):
91
+ """
92
+ Calculate the number of activation elements for the given input_shape.
93
+ Args:
94
+ input shape
95
+ """
96
+ raise NotImplementedError
97
+
98
+ def mock_forward(self, input_shape: list[int]):
99
+ """
100
+ Mock the forward.
101
+ Args:
102
+ input shape
103
+ return:
104
+ output shape
105
+ """
106
+ raise NotImplementedError
107
+
108
+ def __setattr__(self, name: str, value) -> None:
109
+ if isinstance(value, MemEstimator):
110
+ modules = self.__dict__.get("_modules")
111
+ modules[name] = value
112
+ else:
113
+ pass
114
+ return super().__setattr__(name, value)
115
+
116
+ def __delattr__(self, name):
117
+ modules = self.__dict__.get("_modules")
118
+ if name in modules:
119
+ del modules[name]
120
+ return super().__delattr__(name)
121
+
122
+
123
+ _global_config: TransformerConfig = None
124
+
125
+
126
+ def set_global_config(cfg):
127
+ global _global_config
128
+ _global_config = cfg
129
+
130
+
131
+ def get_tensor_model_parallel_world_size():
132
+ global _global_config
133
+ return _global_config.tensor_model_parallel_size
134
+
135
+
136
+ def get_tensor_model_parallel_rank():
137
+ return 0
138
+
139
+
140
+ def get_expert_tensor_parallel_world_size():
141
+ global _global_config
142
+ return _global_config.expert_tensor_parallel_size
143
+
144
+
145
+ def get_expert_tensor_parallel_rank():
146
+ return 0
147
+
148
+
149
+ _pp_rank = 0
150
+
151
+
152
+ def set_pipeline_model_parallel_rank(rank):
153
+ global _pp_rank
154
+ _pp_rank = rank
155
+
156
+
157
+ def get_pipeline_model_parallel_rank():
158
+ global _pp_rank
159
+ return _pp_rank
160
+
161
+
162
+ def get_virtual_pipeline_model_parallel_rank():
163
+ return 0
164
+
165
+
166
+ def get_pipeline_model_parallel_world_size():
167
+ global _global_config
168
+ return _global_config.pipeline_model_parallel_size
169
+
170
+
171
+ def get_expert_model_parallel_rank():
172
+ return 0
173
+
174
+
175
+ def get_expert_model_parallel_world_size():
176
+ global _global_config
177
+ return _global_config.expert_model_parallel_size
178
+
179
+
180
+ def get_virtual_pipeline_model_parallel_world_size():
181
+ global _global_config
182
+ return _global_config.virtual_pipeline_model_parallel_size
183
+
184
+
185
+ def is_pipeline_first_stage(ignore_virtual=False):
186
+ """Return True if in the first pipeline model-parallel stage, False otherwise."""
187
+ if not ignore_virtual:
188
+ if (
189
+ get_virtual_pipeline_model_parallel_world_size() is not None
190
+ and get_virtual_pipeline_model_parallel_rank() != 0
191
+ ):
192
+ return False
193
+ return get_pipeline_model_parallel_rank() == 0
194
+
195
+
196
+ def is_pipeline_last_stage(ignore_virtual=False):
197
+ """Return True if in the last pipeline-model-parallel stage, False otherwise."""
198
+ return get_pipeline_model_parallel_rank() == (
199
+ get_pipeline_model_parallel_world_size() - 1
200
+ )
201
+
202
+
203
+ def cum_mul(l: list):
204
+ try:
205
+ ret = 1
206
+ for one in l:
207
+ ret *= one
208
+ return ret
209
+ except:
210
+ return 0
moe_mem_estimator/gpt_model.py ADDED
@@ -0,0 +1,151 @@
1
+ from .base import (
2
+ MemEstimator,
3
+ set_global_config,
4
+ get_tensor_model_parallel_world_size,
5
+ get_tensor_model_parallel_rank,
6
+ cum_mul,
7
+ )
8
+
9
+ from megatron.core.transformer.spec_utils import ModuleSpec
10
+ from typing import Dict, Literal, Optional, Union
11
+ from megatron.core.transformer.transformer_config import TransformerConfig
12
+ from megatron.core.model_parallel_config import ModelParallelConfig
13
+ from megatron.core.tensor_parallel.utils import VocabUtility
14
+ from megatron.core.transformer.transformer_block import (
15
+ TransformerBlockSubmodules,
16
+ _get_block_submodules,
17
+ )
18
+ from megatron.core.transformer.enums import ModelType
19
+ from .layers import LanguageModelEmbedding, TransformerBlock, ColumnParallelLinear
20
+
21
+
22
+ class GPTModel(MemEstimator):
23
+ def __init__(
24
+ self,
25
+ config: TransformerConfig,
26
+ transformer_layer_spec: ModuleSpec,
27
+ vocab_size: int,
28
+ max_sequence_length: int,
29
+ pre_process: bool = True,
30
+ post_process: bool = True,
31
+ fp16_lm_cross_entropy: bool = False,
32
+ parallel_output: bool = True,
33
+ share_embeddings_and_output_weights: bool = False,
34
+ position_embedding_type: Literal[
35
+ "learned_absolute", "rope", "none"
36
+ ] = "learned_absolute",
37
+ rotary_percent: float = 1.0,
38
+ rotary_base: int = 10000,
39
+ rope_scaling: bool = False,
40
+ seq_len_interpolation_factor: Optional[float] = None,
41
+ ):
42
+ super().__init__()
43
+
44
+ self.config = config
45
+ config.use_cpu_initialization = True
46
+
47
+ self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
48
+ self.vocab_size = vocab_size
49
+ self.max_sequence_length = max_sequence_length
50
+ self.pre_process = pre_process
51
+ self.post_process = post_process
52
+ self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
53
+ self.parallel_output = parallel_output
54
+ self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
55
+ self.position_embedding_type = position_embedding_type
56
+
57
+ # megatron core pipelining currently depends on model type
58
+ # TODO: remove this dependency ?
59
+ self.model_type = ModelType.encoder_or_decoder
60
+
61
+ # These 4 attributes are needed for TensorRT-LLM export.
62
+ self.max_position_embeddings = max_sequence_length
63
+ self.rotary_percent = rotary_percent
64
+ self.rotary_base = rotary_base
65
+ self.rotary_scaling = rope_scaling
66
+
67
+ if self.pre_process:
68
+ self.embedding = LanguageModelEmbedding(
69
+ config=self.config,
70
+ vocab_size=self.vocab_size,
71
+ max_sequence_length=self.max_sequence_length,
72
+ position_embedding_type=position_embedding_type,
73
+ )
74
+
75
+ # remove RotaryEmbedding
76
+
77
+ # Transformer.
78
+ self.decoder = TransformerBlock(
79
+ config=self.config,
80
+ spec=transformer_layer_spec,
81
+ pre_process=self.pre_process,
82
+ post_process=self.post_process,
83
+ )
84
+
85
+ # Output
86
+ if post_process:
87
+ if self.config.defer_embedding_wgrad_compute:
88
+ self.embedding_activation_buffer = []
89
+ self.grad_output_buffer = []
90
+ else:
91
+ self.embedding_activation_buffer = None
92
+ self.grad_output_buffer = None
93
+
94
+ self.output_layer = ColumnParallelLinear(
95
+ config.hidden_size,
96
+ self.vocab_size,
97
+ config=config,
98
+ init_method=config.init_method,
99
+ bias=False,
100
+ skip_bias_add=False,
101
+ gather_output=not self.parallel_output,
102
+ skip_weight_param_allocation=self.pre_process
103
+ and self.share_embeddings_and_output_weights,
104
+ embedding_activation_buffer=self.embedding_activation_buffer,
105
+ grad_output_buffer=self.grad_output_buffer,
106
+ )
107
+
108
+ def num_parameter(self):
109
+ ret = 0
110
+ if self.pre_process:
111
+ ret += self.embedding.num_parameter()
112
+ ret += self.decoder.num_parameter()
113
+ if self.post_process:
114
+ ret += self.output_layer.num_parameter()
115
+ return ret
116
+
117
+ def num_activation(self, input_shape: list[int]):
118
+ self._inited = True
119
+ ret = 0
120
+
121
+ self.num_act_pre = 0
122
+ self.num_act_post = 0
123
+ self.num_act_per_layer = 0
124
+ self.num_act_between_layers = 0
125
+ self.num_layers = len(self.decoder.layers.modules)
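+ # Bookkeeping consumed by the recompute logic in estimate.py: activations
+ # created before the decoder (num_act_pre, embedding), after it (num_act_post,
+ # logits + fp32 softmax), per transformer layer (num_act_per_layer), and the
+ # always-kept hidden-state tensor passed between layers (num_act_between_layers).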
126
+
127
+ if self.pre_process:
128
+ self.num_act_pre = self.embedding.num_activation(input_shape)
129
+ ret += self.num_act_pre
130
+ input_shape = self.embedding.mock_forward(input_shape)
131
+ ret += self.decoder.num_activation(input_shape)
132
+ self.num_act_per_layer = self.decoder.layers.modules[0].num_activation()
133
+ input_shape = self.decoder.mock_forward(input_shape)
134
+ self.num_act_between_layers = cum_mul(input_shape)
135
+
136
+ if self.post_process:
137
+ self.num_act_post = self.output_layer.num_activation(input_shape)
138
+ softmax_activation = (
139
+ self.output_layer.num_activation(input_shape) * 2
140
+ ) # softmax/cross-entropy runs in fp32, so it stores twice the fp16 logits
141
+ self.num_act_post += softmax_activation
142
+ ret += self.num_act_post
143
+ return ret
144
+
145
+ def mock_forward(self, input_shape: list[int]):
146
+ if self.pre_process:
147
+ input_shape = self.embedding.mock_forward(input_shape)
148
+ input_shape = self.decoder.mock_forward(input_shape)
149
+ if self.post_process:
150
+ input_shape = self.output_layer.mock_forward(input_shape)
151
+ return input_shape
moe_mem_estimator/layers.py ADDED
@@ -0,0 +1,1813 @@
1
+ from .base import (
2
+ MemEstimator,
3
+ set_global_config,
4
+ get_tensor_model_parallel_world_size,
5
+ get_tensor_model_parallel_rank,
6
+ cum_mul,
7
+ get_expert_tensor_parallel_world_size,
8
+ get_expert_tensor_parallel_rank,
9
+ get_pipeline_model_parallel_world_size,
10
+ get_pipeline_model_parallel_rank,
11
+ get_expert_model_parallel_rank,
12
+ get_expert_model_parallel_world_size,
13
+ is_pipeline_first_stage,
14
+ is_pipeline_last_stage,
15
+ _addindent,
16
+ colored,
17
+ )
18
+
19
+ from megatron.core.transformer.spec_utils import ModuleSpec
20
+ from typing import Dict, Literal, Optional, Union
21
+ from megatron.core.transformer.transformer_config import (
22
+ TransformerConfig,
23
+ MLATransformerConfig,
24
+ )
25
+ from megatron.core.model_parallel_config import ModelParallelConfig
26
+ from megatron.core.tensor_parallel.utils import VocabUtility
27
+ from megatron.core.transformer.transformer_block import (
28
+ TransformerBlockSubmodules,
29
+ )
30
+ from megatron.core.models.common.embeddings import (
31
+ _yarn_get_mscale,
32
+ apply_rotary_pos_emb,
33
+ )
34
+ from megatron.core.extensions.transformer_engine import (
35
+ _get_extra_te_kwargs,
36
+ get_expert_parallel_rng_tracker_name,
37
+ condition_init_method,
38
+ )
39
+ from megatron.core.transformer.enums import AttnMaskType
40
+ from megatron.core.transformer.mlp import MLPSubmodules
41
+ from megatron.core.utils import divide
42
+ from megatron.core.transformer.spec_utils import import_module
43
+ from megatron.core.transformer import transformer_layer
44
+ import types, math
45
+ import warnings
46
+ from copy import deepcopy
47
+
48
+
49
+ class LanguageModelEmbedding(MemEstimator):
50
+ def __init__(
51
+ self,
52
+ config: TransformerConfig,
53
+ vocab_size: int,
54
+ max_sequence_length: int,
55
+ position_embedding_type: Literal[
56
+ "learned_absolute", "rope", "none"
57
+ ] = "learned_absolute",
58
+ num_tokentypes: int = 0,
59
+ ):
60
+ super().__init__()
61
+
62
+ self.config: TransformerConfig = config
63
+ self.vocab_size: int = vocab_size
64
+ self.max_sequence_length: int = max_sequence_length
65
+ self.add_position_embedding: bool = (
66
+ position_embedding_type == "learned_absolute"
67
+ )
68
+ self.num_tokentypes = num_tokentypes
69
+ self.reduce_scatter_embeddings = (
70
+ (not self.add_position_embedding)
71
+ and self.num_tokentypes <= 0
72
+ and self.config.sequence_parallel
73
+ )
74
+ # Word embeddings (parallel).
75
+ self.word_embeddings = VocabParallelEmbedding(
76
+ num_embeddings=self.vocab_size,
77
+ embedding_dim=self.config.hidden_size,
78
+ init_method=self.config.init_method,
79
+ reduce_scatter_embeddings=self.reduce_scatter_embeddings,
80
+ config=self.config,
81
+ )
82
+
83
+ # TODO if self.add_position_embedding:
84
+
85
+ # TODO if self.num_tokentypes > 0:
86
+
87
+ self.embedding_dropout = Dropout(self.config.hidden_dropout)
88
+
89
+ def num_parameter(self):
90
+ ret = self.word_embeddings.num_parameter()
91
+ ret += self.embedding_dropout.num_parameter()
92
+ return ret
93
+
94
+ def num_activation(self, input_shape: list[int]):
95
+ ret = self.word_embeddings.num_activation(input_shape)
96
+ input_shape = self.word_embeddings.mock_forward(input_shape)
97
+ ret += self.embedding_dropout.num_activation(input_shape)
98
+ return ret
99
+
100
+ def mock_forward(self, input_shape: list[int]):
101
+ input_shape = self.word_embeddings.mock_forward(input_shape)
102
+ return input_shape
103
+
104
+
105
+ class VocabParallelEmbedding(MemEstimator):
106
+ def __init__(
107
+ self,
108
+ num_embeddings: int,
109
+ embedding_dim: int,
110
+ *,
111
+ init_method,
112
+ reduce_scatter_embeddings: bool = False,
113
+ config: ModelParallelConfig,
114
+ ):
115
+ super().__init__()
116
+ # Keep the input dimensions.
117
+ self.num_embeddings = num_embeddings
118
+ self.embedding_dim = embedding_dim
119
+ self.reduce_scatter_embeddings = reduce_scatter_embeddings
120
+ self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
121
+ # Divide the weight matrix along the vocabulary dimension.
122
+ (self.vocab_start_index, self.vocab_end_index) = (
123
+ VocabUtility.vocab_range_from_global_vocab_size(
124
+ self.num_embeddings,
125
+ get_tensor_model_parallel_rank(),
126
+ self.tensor_model_parallel_size,
127
+ )
128
+ )
129
+ self.num_embeddings_per_partition = (
130
+ self.vocab_end_index - self.vocab_start_index
131
+ )
132
+ self.deterministic_mode = config.deterministic_mode
133
+ self.weight = (self.num_embeddings_per_partition, self.embedding_dim)
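+ # The estimator never allocates tensors: `weight` is just the shape of this
+ # rank's vocabulary shard, so num_parameter() is vocab_shard * hidden_size and
+ # num_activation() counts the embedded elements for a given input shape.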
134
+
135
+ def num_parameter(self):
136
+ return self.weight[0] * self.weight[1]
137
+
138
+ def num_activation(self, input_shape: list[int]):
139
+ return cum_mul(input_shape) * self.weight[1]
140
+
141
+ def mock_forward(self, input_shape: list[int]):
142
+ return input_shape + [self.weight[1]]
143
+
144
+
145
+ class Dropout(MemEstimator):
146
+ def __init__(self, p=0, *args, **kwargs):
147
+ super().__init__()
148
+ self.p = p
149
+
150
+ def num_parameter(self):
151
+ return 0
152
+
153
+ def num_activation(self, input_shape: list[int]):
154
+ if self.p == 0:
155
+ return 0
156
+ return cum_mul(input_shape[:])
157
+
158
+ def mock_forward(self, input_shape: list[int]):
159
+ return input_shape
160
+
161
+
162
+ class ColumnParallelLinear(MemEstimator):
163
+ def __init__(
164
+ self,
165
+ input_size,
166
+ output_size,
167
+ *,
168
+ config: ModelParallelConfig,
169
+ init_method,
170
+ bias=True,
171
+ gather_output=False,
172
+ stride=1,
173
+ keep_master_weight_for_test=False,
174
+ skip_bias_add=False,
175
+ skip_weight_param_allocation: bool = False,
176
+ embedding_activation_buffer=None,
177
+ grad_output_buffer=None,
178
+ is_expert: bool = False,
179
+ tp_comm_buffer_name: str = None, # Not used
180
+ disable_grad_reduce: bool = False,
181
+ is_mla: bool = False,
182
+ ):
183
+ super().__init__()
184
+
185
+ if is_mla and config.sequence_parallel:
186
+ tp_size = get_tensor_model_parallel_world_size()
187
+ output_size = divide(output_size, tp_size)
188
+ parallel_mode = None
189
+ tp_size = 1
190
+ tp_group = None
191
+ # Keep input parameters
192
+ self.input_size = input_size
193
+ self.output_size = output_size
194
+ self.gather_output = gather_output
195
+ # Divide the weight matrix along the last dimension.
196
+ self.skip_bias_add = skip_bias_add
197
+ self.is_expert = is_expert
198
+ self.expert_parallel = config.expert_model_parallel_size > 1
199
+ self.embedding_activation_buffer = embedding_activation_buffer
200
+ self.grad_output_buffer = grad_output_buffer
201
+ self.config = config
202
+ self.disable_grad_reduce = disable_grad_reduce
203
+
204
+ if is_expert:
205
+ world_size = get_expert_tensor_parallel_world_size()
206
+ rank = get_expert_tensor_parallel_rank()
207
+ else:
208
+ world_size = get_tensor_model_parallel_world_size()
209
+ rank = get_tensor_model_parallel_rank()
210
+
211
+ self.output_size_per_partition = divide(output_size, world_size)
212
+
213
+ # Parameters.
214
+ # Note: torch.nn.functional.linear performs XA^T + b and as a result
215
+ # we allocate the transpose.
216
+ # Initialize weight.
217
+ if not skip_weight_param_allocation:
218
+ self.weight = (self.output_size_per_partition, self.input_size)
219
+ else:
220
+ self.weight = (self.output_size_per_partition, self.input_size)
221
+
222
+
223
+ if bias:
224
+ self.bias = [self.output_size_per_partition]
225
+ else:
226
+ self.bias = None
227
+
228
+ self.sequence_parallel = config.sequence_parallel
229
+ if self.sequence_parallel and world_size <= 1:
230
+ warnings.warn(
231
+ "`sequence_parallel` is set to `True`, but tensor model parallel size "
232
+ f"is {world_size}. Disabling sequence parallel."
233
+ )
234
+ self.sequence_parallel = False
235
+
236
+ self.allreduce_dgrad = (
237
+ world_size > 1
238
+ and not self.sequence_parallel
239
+ and not self.disable_grad_reduce
240
+ )
241
+ self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
242
+
243
+ def num_parameter(self):
244
+ ret = cum_mul(self.weight)
245
+ if self.bias is not None:
246
+ ret += self.bias[0]
247
+ return ret
248
+
249
+ def num_activation(self, input_shape: list[int]):
250
+ return cum_mul(input_shape[:-1]) * self.weight[0]
251
+
252
+ def mock_forward(self, input_shape: list[int]):
253
+ assert self.weight[-1] == input_shape[-1]
254
+ return input_shape[:-1] + [self.weight[0]]
255
+
256
+
257
+ class RowParallelLinear(MemEstimator):
258
+ def __init__(
259
+ self,
260
+ input_size: int,
261
+ output_size: int,
262
+ *,
263
+ config: ModelParallelConfig,
264
+ init_method,
265
+ bias: bool,
266
+ input_is_parallel: bool,
267
+ skip_bias_add: bool,
268
+ stride: int = 1,
269
+ keep_master_weight_for_test: bool = False,
270
+ is_expert: bool = False,
271
+ tp_comm_buffer_name: str = None, # Not used
272
+ ):
273
+ super().__init__()
274
+
275
+ # Keep input parameters
276
+ self.input_size = input_size
277
+ self.output_size = output_size
278
+ self.input_is_parallel = input_is_parallel
279
+ self.skip_bias_add = skip_bias_add
280
+ self.config = config
281
+ self.is_expert = is_expert
282
+ self.expert_parallel = config.expert_model_parallel_size > 1
283
+ self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
284
+ self.sequence_parallel = config.sequence_parallel
285
+ if self.sequence_parallel and not self.input_is_parallel:
286
+ raise RuntimeError(
287
+ "To enable `sequence_parallel`, `input_is_parallel` must be `True`"
288
+ )
289
+
290
+ # Divide the weight matrix along the last dimension.
291
+ if self.is_expert:
292
+ world_size = get_expert_tensor_parallel_world_size()
293
+ rank = get_expert_tensor_parallel_rank()
294
+ else:
295
+ world_size = get_tensor_model_parallel_world_size()
296
+ rank = get_tensor_model_parallel_rank()
297
+
298
+ self.input_size_per_partition = divide(input_size, world_size)
299
+
300
+ self.weight = (self.output_size, self.input_size_per_partition)
301
+ if bias:
302
+ self.bias = [self.output_size]
303
+ else:
304
+ self.bias = None
305
+
306
+ def num_parameter(self):
307
+ ret = cum_mul(self.weight)
308
+ if self.bias is not None:
309
+ ret += self.bias[0]
310
+ return ret
311
+
312
+ def num_activation(self, input_shape: list[int]):
313
+ return cum_mul(input_shape[:-1]) * self.weight[1]
314
+
315
+ def mock_forward(self, input_shape: list[int]):
316
+ assert self.weight[0] == input_shape[-1]
317
+ return input_shape[:-1] + [self.weight[1]]
318
+
319
+
320
+ class RMSNorm(MemEstimator):
321
+ def __init__(self, hidden_size: int, *args, **kwargs):
322
+ super().__init__()
323
+ self.weight = hidden_size
324
+
325
+ def num_parameter(self):
326
+ return self.weight
327
+
328
+ def num_activation(self, input_shape: list[int]):
329
+ return cum_mul(input_shape[:])
330
+
331
+ def mock_forward(self, input_shape: list[int]):
332
+ return input_shape
333
+
334
+
335
+ class GetBiasDropoutAdd(MemEstimator):
336
+ def __init__(self, *args, **kwargs):
337
+ super().__init__()
338
+
339
+ def num_parameter(self):
340
+ return 0
341
+
342
+ def num_activation(self, input_shape: list[int]):
343
+ return cum_mul(input_shape[:])
344
+
345
+ def mock_forward(self, input_shape: list[int]):
346
+ return input_shape
347
+
348
+
349
+ get_bias_dropout_add = GetBiasDropoutAdd()
350
+
351
+
352
+ class MLP(MemEstimator):
353
+
354
+ def __init__(
355
+ self,
356
+ config: TransformerConfig,
357
+ submodules,
358
+ is_expert: bool = False,
359
+ input_size: int = None,
360
+ ):
361
+ super().__init__()
362
+
363
+ self.config: TransformerConfig = config
364
+
365
+ self.input_size = input_size if input_size != None else self.config.hidden_size
366
+
367
+ # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
368
+ ffn_hidden_size = self.config.ffn_hidden_size
369
+ if self.config.gated_linear_unit:
370
+ ffn_hidden_size *= 2
371
+
372
+ self.linear_fc1 = build_module(
373
+ submodules.linear_fc1,
374
+ self.input_size,
375
+ ffn_hidden_size,
376
+ config=self.config,
377
+ init_method=self.config.init_method,
378
+ gather_output=False,
379
+ bias=self.config.add_bias_linear,
380
+ skip_bias_add=True,
381
+ is_expert=is_expert,
382
+ tp_comm_buffer_name="fc1",
383
+ )
384
+
385
+ self.activation_func = self.config.activation_func
386
+
387
+ self.linear_fc2 = build_module(
388
+ submodules.linear_fc2,
389
+ self.config.ffn_hidden_size,
390
+ self.config.hidden_size,
391
+ config=self.config,
392
+ init_method=self.config.output_layer_init_method,
393
+ bias=self.config.add_bias_linear,
394
+ input_is_parallel=True,
395
+ skip_bias_add=True,
396
+ is_expert=is_expert,
397
+ tp_comm_buffer_name="fc2",
398
+ )
399
+
400
+ def num_parameter(self):
401
+ return self.linear_fc1.num_parameter() + self.linear_fc2.num_parameter()
402
+
403
+ def num_activation(self, input_shape: list[int]):
404
+ result = 0
405
+ result += self.linear_fc1.num_activation(input_shape)
406
+ intermediate_shape = self.linear_fc1.mock_forward(input_shape)
407
+ result += cum_mul(intermediate_shape) / 2 # activation output is half the gated fc1 width
408
+ self.linear_fc2.num_activation(intermediate_shape)
409
+
410
+ return result
411
+
412
+ def mock_forward(self, input_shape: list[int]):
413
+ intermediate_shape = self.linear_fc1.mock_forward(input_shape)
414
+ output_shape = self.linear_fc2.mock_forward(intermediate_shape)
415
+ return output_shape
416
+
417
+
418
+ class ModuleList(MemEstimator):
419
+ def __init__(self, modules: list[MemEstimator] = None):
420
+ super().__init__()
421
+ if modules is None:
422
+ modules = []
423
+ self.modules = modules
424
+
425
+ def __repr__(self):
426
+ """Return a custom repr for ModuleList that compresses repeated module representations."""
427
+ list_of_reprs = [repr(item) for item in self.modules]
428
+ if len(list_of_reprs) == 0:
429
+ return self._get_name() + "()"
430
+
431
+ start_end_indices = [[0, 0]]
432
+ repeated_blocks = [list_of_reprs[0]]
433
+ for i, r in enumerate(list_of_reprs[1:], 1):
434
+ if r == repeated_blocks[-1]:
435
+ start_end_indices[-1][1] += 1
436
+ continue
437
+
438
+ start_end_indices.append([i, i])
439
+ repeated_blocks.append(r)
440
+
441
+ lines = []
442
+ stat = (
443
+ "\t/* n_params="
444
+ + colored(f"{self.num_parameter()/1024/1024:.2f}M", "red")
445
+ + "\tn_act="
446
+ + colored(f"{self.num_activation()/1024/1024:.2f}M", "green")
447
+ + " */"
448
+ )
449
+ main_str = self._get_name() + stat + " ("
450
+ for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
451
+ local_repr = f"({start_id}): {b}" # default repr
452
+
453
+ if start_id != end_id:
454
+ n = end_id - start_id + 1
455
+ local_repr = f"({start_id}-{end_id}): {n} x {b}"
456
+
457
+ local_repr = _addindent(local_repr, 2)
458
+ lines.append(local_repr)
459
+
460
+ main_str += "\n " + "\n ".join(lines) + "\n"
461
+ main_str += ")"
462
+ return main_str
463
+
464
+ def dump(self):
465
+ list_of_reprs = [repr(item) for item in self.modules]
466
+ if len(list_of_reprs) == 0:
467
+ return self._get_name() + "()"
468
+ list_of_dumps = [item.dump() for item in self.modules]
469
+
470
+ start_end_indices = [[0, 0]]
471
+ repeated_blocks = [list_of_reprs[0]]
472
+ repeated_blocks_dump = [list_of_dumps[0]]
473
+ for i, r in enumerate(list_of_reprs[1:], 1):
474
+ if r == repeated_blocks[-1]:
475
+ start_end_indices[-1][1] += 1
476
+ continue
477
+
478
+ start_end_indices.append([i, i])
479
+ repeated_blocks.append(r)
480
+ repeated_blocks_dump.append(list_of_dumps[i])
481
+ modules = {}
482
+ for (start_id, end_id), b in zip(start_end_indices, repeated_blocks_dump):
483
+ key = f"({start_id})"
484
+ if start_id != end_id:
485
+ n = end_id - start_id + 1
486
+ key = f"({start_id}-{end_id}) {n} layers"
487
+ modules[key] = b
488
+
489
+ ret = {}
490
+ ret["name"] = self._get_name()
491
+ ret["n_params"] = self.num_parameter()
492
+ ret["n_act"] = self.num_activation()
493
+ if len(modules) > 0:
494
+ ret["modules"] = modules
495
+ return ret
496
+
497
+ def append(self, m: MemEstimator):
498
+ self.modules.append(m)
499
+
500
+ def __len__(
501
+ self,
502
+ ):
503
+ return self.modules.__len__()
504
+
505
+ def num_parameter(self):
506
+ return sum([x.num_parameter() for x in self.modules])
507
+
508
+ def num_activation(self, input_shape: list[int]):
509
+ result = 0
510
+ for m in self.modules:
511
+ result += m.num_activation(input_shape)
512
+ input_shape = m.mock_forward(input_shape)
513
+
514
+ return result
515
+
516
+ def mock_forward(self, input_shape: list[int]):
517
+ for m in self.modules:
518
+ result += m.num_activation(input_shape)
519
+ input_shape = m.mock_forward(input_shape)
520
+ return input_shape
521
+
522
+
523
+ class SequentialMLP(MemEstimator):
524
+ def __init__(self, num_local_experts, config: TransformerConfig, submodules):
525
+ super().__init__()
526
+ self.config = config
527
+ self.add_bias = config.add_bias_linear
528
+ self.moe_extended_tp = config.moe_extended_tp
529
+ self.num_local_experts = num_local_experts
530
+ self.local_experts = ModuleList()
531
+ for _ in range(self.num_local_experts):
532
+ expert = MLP(self.config, submodules, is_expert=True)
533
+ self.local_experts.append(expert)
534
+
535
+ def num_parameter(self):
536
+ return self.local_experts.num_parameter()
537
+
538
+ def num_activation(self, input_shape: list[int], tokens_per_expert=None):
539
+ # assume all the inputs are routed equally
540
+ all_tokens = input_shape[1]
541
+ result = 0
542
+ for m in self.local_experts.modules:
543
+ result += m.num_activation(
544
+ input_shape[:1]
545
+ + [all_tokens // self.num_local_experts]
546
+ + input_shape[2:]
547
+ )
548
+ return result
549
+
550
+ def mock_forward(self, input_shape: list[int], tokens_per_expert=None):
551
+ # assume all the inputs are routed to the first expert
552
+ input_shape = self.local_experts.modules[0].mock_forward(input_shape)
553
+ return input_shape
554
+
555
+
556
+ class TEGroupedMLP(MemEstimator):
557
+ """An efficient implementation of the Experts layer using TE's GroupedLinear.
558
+
559
+ Executes multiple experts in parallel to maximize computational efficiency.
560
+ """
561
+
562
+ def __init__(self, num_local_experts, config: TransformerConfig, submodules):
563
+ super().__init__()
564
+ self.config = config
565
+ self.moe_extended_tp = config.moe_extended_tp
566
+ self.num_local_experts = num_local_experts
567
+ self.input_size = self.config.hidden_size
568
+
569
+ # Double the output width with gated linear unit, see https://arxiv.org/pdf/2002.05202.pdf
570
+ ffn_hidden_size = self.config.moe_ffn_hidden_size
571
+ if self.config.gated_linear_unit:
572
+ ffn_hidden_size *= 2
573
+
574
+ self.linear_fc1 = build_module(
575
+ submodules.linear_fc1,
576
+ self.num_local_experts,
577
+ self.input_size,
578
+ ffn_hidden_size,
579
+ config=self.config,
580
+ init_method=self.config.init_method,
581
+ bias=self.config.add_bias_linear,
582
+ skip_bias_add=True,
583
+ is_expert=True,
584
+ tp_comm_buffer_name="fc1",
585
+ )
586
+
587
+ self.activation_func = self.config.activation_func
588
+
589
+ self.linear_fc2 = build_module(
590
+ submodules.linear_fc2,
591
+ self.num_local_experts,
592
+ self.config.moe_ffn_hidden_size,
593
+ self.config.hidden_size,
594
+ config=self.config,
595
+ init_method=self.config.output_layer_init_method,
596
+ bias=self.config.add_bias_linear,
597
+ skip_bias_add=True,
598
+ is_expert=True,
599
+ tp_comm_buffer_name="fc2",
600
+ )
601
+ # TODO if self.config.fp8:
602
+
603
+ def num_parameter(self):
604
+ ret = self.linear_fc1.num_parameter()
605
+ ret += self.linear_fc2.num_parameter()
606
+ return ret
607
+
608
+ def num_activation(self, input_shape: list[int], tokens_per_expert=None):
609
+ ret = 0
610
+ ret += self.linear_fc1.num_activation(input_shape)
611
+ input_shape = self.linear_fc1.mock_forward(input_shape)
612
+
613
+ # activation
614
+ ret += cum_mul(input_shape) / 2 # swiglu or gelu
615
+ input_shape = deepcopy(input_shape)
616
+ input_shape[-1] //= 2
617
+
618
+ ret += self.linear_fc2.num_activation(input_shape)
619
+ return ret
620
+
621
+ def mock_forward(self, input_shape: list[int], tokens_per_expert=None):
622
+ # the grouped MLP projects back to hidden_size, so the output shape matches the input
624
+ return input_shape
625
+
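+
+ # Rough parameter count implied by the two grouped linears above, ignoring any
+ # tensor-parallel split (hypothetical sizes): with num_local_experts = 8,
+ # hidden_size = 2048, moe_ffn_hidden_size = 768 and gated_linear_unit = True,
+ #   linear_fc1: 8 * 2048 * (768 * 2) = 25,165,824 elements
+ #   linear_fc2: 8 * 768 * 2048       = 12,582,912 elements
+ # which is the sum returned by num_parameter().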
626
+
627
+ class TEGroupedLinear(MemEstimator):
628
+ def __init__(
629
+ self,
630
+ num_gemms: int,
631
+ input_size: int,
632
+ output_size: int,
633
+ *,
634
+ parallel_mode: str,
635
+ config: ModelParallelConfig,
636
+ init_method,
637
+ bias: bool,
638
+ skip_bias_add: bool,
639
+ is_expert: bool = False,
640
+ tp_comm_buffer_name: str = None,
641
+ ):
642
+ super().__init__()
643
+ self.config = config
644
+
645
+ # TE returns a zero length Tensor when bias=False and
646
+ # return_bias=True, but we prefer None. So in that case we
647
+ # tell TE to not return the bias, and return None
648
+ # ourselves. This way our forward always returns two values
649
+ # and we don't have to deal with the zero length Tensor.
650
+ self.te_return_bias = skip_bias_add and bias
651
+ self.is_first_microbatch = True
652
+ self.disable_parameter_transpose_cache = (
653
+ self.config.disable_parameter_transpose_cache
654
+ )
655
+
656
+ extra_kwargs = _get_extra_te_kwargs(config)
657
+ extra_kwargs["ub_name"] = tp_comm_buffer_name
658
+
659
+ self.expert_parallel = self.config.expert_model_parallel_size > 1
660
+ if self.expert_parallel:
661
+ extra_kwargs["rng_tracker_name"] = get_expert_parallel_rng_tracker_name()
662
+
663
+ # For MoE models, the comms between TP and EP group is explicitly handled by
664
+ # MoE token dispatcher. So we disable comms by making TE agnostic of model parallel.
665
+ self.explicit_expert_comm = is_expert and (
666
+ config.tensor_model_parallel_size > 1 or self.expert_parallel
667
+ )
668
+ if is_expert:
669
+ tp_size = get_expert_tensor_parallel_world_size()
670
+ else:
671
+ tp_size = get_tensor_model_parallel_world_size()
672
+ if self.explicit_expert_comm:
673
+ if parallel_mode == "column":
674
+ output_size = divide(output_size, tp_size)
675
+ elif parallel_mode == "row":
676
+ input_size = divide(input_size, tp_size)
677
+ parallel_mode = None
678
+ tp_size = 1
679
+ assert not bias, "bias is not considered for now"
680
+
681
+ self.num_gemms = num_gemms
682
+ self.input_size = input_size
683
+ self.output_size = output_size
684
+
685
+ def num_parameter(self):
686
+ ret = self.num_gemms * self.input_size * self.output_size
687
+ return ret
688
+
689
+ def num_activation(self, input_shape: list[int], tokens_per_expert=None):
690
+ ret = cum_mul(self.mock_forward(input_shape))
691
+ return ret
692
+
693
+ def mock_forward(self, input_shape: list[int], tokens_per_expert=None):
694
+ return input_shape[:-1] + [self.output_size]
695
+
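+
+ # Partitioning sketch (hypothetical sizes): when explicit_expert_comm is set with a
+ # tensor-parallel size of 2, a "column" grouped linear of 2048 -> 1536 keeps
+ # input_size = 2048 and stores output_size = 1536 // 2 = 768 per rank, while a
+ # "row" grouped linear of 768 -> 2048 stores input_size = 768 // 2 = 384 per rank,
+ # so num_parameter() counts the per-rank weight shard.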
696
+
697
+ class TEColumnParallelGroupedLinear(TEGroupedLinear):
698
+ def __init__(
699
+ self,
700
+ num_gemms: int,
701
+ input_size: int,
702
+ output_size: int,
703
+ *,
704
+ config: ModelParallelConfig,
705
+ init_method,
706
+ bias: bool,
707
+ skip_bias_add: bool,
708
+ is_expert: bool,
709
+ tp_comm_buffer_name: str = None,
710
+ ):
711
+ super().__init__(
712
+ num_gemms=num_gemms,
713
+ input_size=input_size,
714
+ output_size=output_size,
715
+ parallel_mode="column",
716
+ config=config,
717
+ init_method=condition_init_method(config, init_method),
718
+ bias=bias,
719
+ skip_bias_add=skip_bias_add,
720
+ is_expert=is_expert,
721
+ tp_comm_buffer_name=tp_comm_buffer_name,
722
+ )
723
+
724
+
725
+ class TERowParallelGroupedLinear(TEGroupedLinear):
726
+ def __init__(
727
+ self,
728
+ num_gemms: int,
729
+ input_size: int,
730
+ output_size: int,
731
+ *,
732
+ config: ModelParallelConfig,
733
+ init_method,
734
+ bias: bool,
735
+ skip_bias_add: bool,
736
+ is_expert: bool,
737
+ tp_comm_buffer_name: str = None,
738
+ ):
739
+
740
+ super().__init__(
741
+ num_gemms=num_gemms,
742
+ input_size=input_size,
743
+ output_size=output_size,
744
+ parallel_mode="row",
745
+ config=config,
746
+ init_method=condition_init_method(config, init_method),
747
+ bias=bias,
748
+ skip_bias_add=skip_bias_add,
749
+ is_expert=is_expert,
750
+ tp_comm_buffer_name=tp_comm_buffer_name,
751
+ )
752
+
753
+
754
+ class SharedExpertMLP(MLP):
755
+ """
756
+ MLP layer for Shared Experts.
757
+ """
758
+
759
+ def __init__(self, config: TransformerConfig, spec: ModuleSpec):
760
+ config = deepcopy(config)
761
+ assert (
762
+ config.add_bias_linear == False
763
+ ), "bias is not supported in the shared experts, please set '--disable-bias-linear' instead."
765
+
766
+ config.ffn_hidden_size = config.moe_shared_expert_intermediate_size
767
+ super().__init__(config=config, submodules=spec.submodules)
768
+
769
+ self.use_shared_expert_gate = spec.params.get("gate", False)
770
+ if self.use_shared_expert_gate:
771
+ assert False, "use_shared_expert_gate is not implemented"
772
+ # self.gate_weight = torch.nn.Parameter(torch.empty((1, self.config.hidden_size)))
773
+ # if config.perform_initialization:
774
+ # if get_cuda_rng_tracker().is_initialized():
775
+ # with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()):
776
+ # config.init_method(self.gate_weight)
777
+ # else:
778
+ # config.init_method(self.gate_weight)
779
+ # self.gate_weight.data = self.gate_weight.data.to(dtype=config.params_dtype)
780
+ # setattr(self.gate_weight, 'sequence_parallel', self.config.sequence_parallel)
781
+ else:
782
+ self.gate_weight = None
783
+
784
+
785
+ class TransformerBlock(MemEstimator):
786
+ """Transformer class."""
787
+
788
+ def __init__(
789
+ self,
790
+ config: TransformerConfig,
791
+ spec: Union[TransformerBlockSubmodules, ModuleSpec],
792
+ post_layer_norm: bool = True,
793
+ pre_process: bool = True,
794
+ post_process: bool = True,
795
+ ):
796
+ super().__init__()
797
+ self.config = config
798
+
799
+ self.submodules = _get_block_submodules(config, spec)
800
+ self.post_layer_norm = post_layer_norm
801
+ self.pre_process = pre_process
802
+ self.post_process = post_process
803
+ self.cuda_graphs = {}
804
+ self.current_microbatch = -1
805
+ self.input_tensor = None
806
+ self.checkpoint_core_attention = (
807
+ self.config.recompute_granularity == "selective"
808
+ )
809
+
810
+ self._build_layers()
811
+ self.num_layers_per_pipeline_rank = len(self.layers)
812
+ self.tp_only_amax_red = config.tp_only_amax_red
813
+
814
+ def _build_layers(self):
815
+ def build_layer(layer_spec, layer_number):
816
+ return build_module(
817
+ layer_spec, config=self.config, layer_number=layer_number
818
+ )
819
+
820
+ # offset is implicit in TransformerLayer
821
+ self.layers = ModuleList(
822
+ [
823
+ build_layer(layer_spec, i + 1)
824
+ for i, layer_spec in enumerate(self.submodules.layer_specs)
825
+ ]
826
+ )
827
+
828
+ if self.submodules.layer_norm and self.post_process and self.post_layer_norm:
829
+ self.final_layernorm = build_module(
830
+ self.submodules.layer_norm,
831
+ config=self.config,
832
+ hidden_size=self.config.hidden_size,
833
+ eps=self.config.layernorm_epsilon,
834
+ )
835
+ else:
836
+ self.final_layernorm = None # Either this or nn.Identity
837
+
838
+ def num_parameter(self):
839
+ ret = self.layers.num_parameter()
840
+ if self.final_layernorm is not None:
841
+ ret += self.final_layernorm.num_parameter()
842
+
843
+ return ret
844
+
845
+ def num_activation(self, input_shape: list[int]):
846
+ result = self.layers.num_activation(input_shape)
847
+ if self.final_layernorm is not None:
848
+ result += self.final_layernorm.num_activation(input_shape)
849
+ return result
850
+
851
+ def mock_forward(self, input_shape: list[int]):
852
+ return input_shape
853
+
854
+
855
+ class TopKRouter(MemEstimator):
856
+
857
+ def __init__(self, config: TransformerConfig) -> None:
858
+ super().__init__()
859
+ self.config = config
860
+ self.topk = self.config.moe_router_topk
861
+ self.routing_type = self.config.moe_router_load_balancing_type
862
+ self.input_jitter = None
863
+
864
+ def num_parameter(self):
865
+ return 0
866
+
867
+ def num_activation(self, input_shape: list[int]):
868
+ result = cum_mul(input_shape) * 2 # sinkhorn and sinkhorn activation
869
+ return result
870
+
871
+ def mock_forward(self, input_shape: list[int]):
872
+ return input_shape[:-1] + [self.topk]
873
+
874
+
875
+ class MoELayer(MemEstimator):
876
+
877
+ def __init__(
878
+ self, config: TransformerConfig, submodules=None, layer_number: int = None
879
+ ):
880
+ super().__init__()
881
+ self.config = config
882
+ self.submodules = submodules
883
+ self.moe_layer_recompute = config.moe_layer_recompute
884
+
885
+ self.expert_parallel_size = get_expert_model_parallel_world_size()
886
+ assert (
887
+ self.expert_parallel_size > 0
888
+ ), "Expected a positive expert parallel size"
889
+
890
+ assert self.config.num_moe_experts % self.expert_parallel_size == 0
891
+ self.num_local_experts = (
892
+ self.config.num_moe_experts // self.expert_parallel_size
893
+ )
894
+ local_expert_indices_offset = (
895
+ get_expert_model_parallel_rank() * self.num_local_experts
896
+ )
897
+
898
+ self.router = TopKRouter(config=self.config)
899
+ self.use_shared_expert = (
900
+ self.config.moe_shared_expert_intermediate_size is not None
901
+ )
902
+ self.shared_expert_overlap = self.config.moe_shared_expert_overlap
903
+
904
+ self.local_expert_indices = [
905
+ local_expert_indices_offset + i for i in range(self.num_local_experts)
906
+ ]
907
+ assert all(
908
+ map(lambda x: x < self.config.num_moe_experts, self.local_expert_indices)
909
+ )
910
+
911
+ self.experts = None
912
+ self.shared_experts = None
913
+ self.token_dispatcher = None
914
+ self.layer_number = layer_number
915
+ # Initialize experts
916
+ self.experts = build_module(
917
+ self.submodules.experts, self.num_local_experts, self.config
918
+ )
919
+
920
+ # Initialize shared experts
921
+ if self.use_shared_expert:
922
+ self.shared_experts = SharedExpertMLP(
923
+ self.config, self.submodules.shared_experts
924
+ )
925
+ # if self.shared_expert_overlap:
926
+ # self.token_dispatcher.set_shared_experts(self.shared_experts)
927
+
928
+ def num_parameter(self):
929
+ ret = self.experts.num_parameter() + self.router.num_parameter()
930
+ if self.use_shared_expert:
931
+ ret += self.shared_experts.num_parameter()
932
+ return ret
933
+
934
+ def num_activation(self, input_shape: list[int]):
935
+ result = self.router.num_activation(input_shape)
936
+ result += cum_mul(input_shape) * self.router.topk # token dispatcher
937
+ moe_input_shape_average = deepcopy(input_shape)
938
+ moe_input_shape_average[1] = int(moe_input_shape_average[1] * self.router.topk)
939
+
940
+ result += self.experts.num_activation(moe_input_shape_average)
941
+ if self.use_shared_expert:
942
+ result += self.shared_experts.num_activation(input_shape)
943
+
944
+ if self.config.moe_layer_recompute:
945
+ result = cum_mul(input_shape) * 2
946
+ return result
947
+
948
+ def mock_forward(self, input_shape: list[int]):
949
+ return input_shape
950
+
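+
+ # Accounting sketch for num_activation above (hypothetical sizes): with
+ # input_shape = [4096, 1, 2048] and moe_router_topk = 8 the estimate adds
+ #   router:          2 * 4096 * 1 * 2048 elements
+ #   token dispatch:  8 * 4096 * 1 * 2048 elements
+ #   experts:         experts.num_activation([4096, 8, 2048])  (dim 1 scaled by topk)
+ # plus the shared experts when moe_shared_expert_intermediate_size is set; with
+ # moe_layer_recompute the whole layer is instead charged 2 * 4096 * 1 * 2048.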
951
+
952
+ class IdentityOp(MemEstimator):
953
+ def num_parameter(self):
954
+ return 0
955
+
956
+ def num_activation(self, input_shape: list[int]):
957
+ return 0
958
+
959
+ def mock_forward(self, input_shape: list[int]):
960
+ return input_shape
961
+
962
+
963
+ IdentityFuncOp = IdentityOp
964
+ TERowParallelLinear = RowParallelLinear
965
+ TEColumnParallelLinear = ColumnParallelLinear
966
+ TELayerNormColumnParallelLinear = ColumnParallelLinear
967
+
968
+
969
+ class TEDotProductAttention(MemEstimator):
970
+ def __init__(self, config: TransformerConfig, *args, **kwargs):
971
+ super().__init__()
972
+ self.config = config
973
+
974
+ def num_parameter(self):
975
+ return 0
976
+
977
+ def num_activation(
978
+ self, q_shape: list[int], k_shape: list[int], v_shape: list[int]
979
+ ):
980
+ bs, seqs, heads, dim = q_shape
981
+ if self.config.multi_latent_attention and False:
982
+ result = bs * seqs * seqs * heads
983
+ else:
984
+ bs, seqs, heads, dim = k_shape
985
+ result = (
986
+ bs * seqs * dim * heads * 2 # * self.config.tensor_model_parallel_size
987
+ ) # flash attention
988
+ if self.config.context_parallel_size > 1:
989
+ result *= 2
990
+ return result
991
+
992
+ def mock_forward(
993
+ self,
994
+ hidden_size: int,
995
+ q_shape: list[int],
996
+ k_shape: list[int],
997
+ v_shape: list[int],
998
+ ):
999
+ seqs, bs, heads, dim = q_shape
1000
+ return [seqs, bs, hidden_size]
1001
+
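+
+ # Worked example (hypothetical sizes): with k_shape = [4096, 1, 8, 128] the
+ # flash-attention estimate above is 4096 * 1 * 8 * 128 * 2 = 8,388,608 elements,
+ # doubled again when context_parallel_size > 1.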
1002
+
1003
+ class TransformerLayer(MemEstimator):
1004
+ def __init__(
1005
+ self,
1006
+ config: TransformerConfig,
1007
+ submodules,
1008
+ layer_number: int = 1,
1009
+ hidden_dropout: float = None,
1010
+ ):
1011
+ super().__init__()
1012
+ self.config = config
1013
+
1014
+ if config.enable_cuda_graph and self.training:
1015
+ assert (
1016
+ not config.cpu_offloading and config.recompute_granularity is None
1017
+ ), "Cudagraphs not supported"
1018
+ self.cudagraph_manager = CudaGraphManager()
1019
+
1020
+ self.submodules_config = submodules
1021
+ self.layer_number = layer_number + get_transformer_layer_offset(self.config)
1022
+ self.hidden_dropout = (
1023
+ config.hidden_dropout if hidden_dropout is None else hidden_dropout
1024
+ )
1025
+
1026
+ # [Module 1: Input Layernorm] Optional Layernorm on the input data
1027
+ # TODO: add pytorch only layernorm
1028
+ self.input_layernorm = build_module(
1029
+ submodules.input_layernorm,
1030
+ config=self.config,
1031
+ hidden_size=self.config.hidden_size,
1032
+ eps=self.config.layernorm_epsilon,
1033
+ )
1034
+
1035
+ # [Module 2: SelfAttention]
1036
+ self.self_attention = build_module(
1037
+ submodules.self_attention, config=self.config, layer_number=layer_number
1038
+ )
1039
+
1040
+ # [Module 3: BiasDropoutFusion]
1041
+ self.self_attn_bda = build_module(submodules.self_attn_bda)
1042
+
1043
+ # [Module 4: Post SelfAttention] Optional Layernorm after self-attn
1044
+ self.pre_cross_attn_layernorm = build_module(
1045
+ submodules.pre_cross_attn_layernorm,
1046
+ config=self.config,
1047
+ hidden_size=self.config.hidden_size,
1048
+ eps=self.config.layernorm_epsilon,
1049
+ )
1050
+
1051
+ # [Module 5: CrossAttention]
1052
+ self.cross_attention = build_module(
1053
+ submodules.cross_attention, config=self.config, layer_number=layer_number
1054
+ )
1055
+
1056
+ # [Module 6: BiasDropoutFusion]
1057
+ self.cross_attn_bda = build_module(
1058
+ submodules.cross_attn_bda, config=self.config
1059
+ )
1060
+
1061
+ # [Module 7: Pre MLP] Optional Layernorm before MLP
1062
+ self.pre_mlp_layernorm = build_module(
1063
+ submodules.pre_mlp_layernorm,
1064
+ config=self.config,
1065
+ hidden_size=self.config.hidden_size,
1066
+ eps=self.config.layernorm_epsilon,
1067
+ )
1068
+
1069
+ # [Module 8: MLP block]
1070
+ self.mlp = build_module(submodules.mlp, config=self.config)
1071
+ if hasattr(self.mlp, "set_layer_number"):
1072
+ self.mlp.set_layer_number(self.layer_number)
1073
+
1074
+ # [Module 9: BiasDropoutFusion]
1075
+ self.mlp_bda = build_module(submodules.mlp_bda)
1076
+
1077
+ def num_parameter(self):
1078
+ result = self.input_layernorm.num_parameter()
1079
+ result += self.self_attention.num_parameter()
1080
+ result += self.pre_cross_attn_layernorm.num_parameter()
1081
+ result += self.cross_attention.num_parameter()
1082
+ result += self.cross_attn_bda.num_parameter()
1083
+ result += self.pre_mlp_layernorm.num_parameter()
1084
+ result += self.mlp.num_parameter()
1085
+
1086
+ return result
1087
+
1088
+ def num_activation(self, input_shape: list[int]):
1089
+ result = 0
1090
+ result += self.self_attention.num_activation(input_shape)
1091
+ result += self.mlp.num_activation(input_shape)
1092
+ # __import__('ipdb').set_trace()
1093
+ # sequence parallel
1094
+ if self.config.sequence_parallel and self.config.tensor_model_parallel_size > 1:
1095
+ input_shape = deepcopy(input_shape)
1096
+ input_shape[1] /= self.config.tensor_model_parallel_size
1097
+ result += self.input_layernorm.num_activation(input_shape)
1098
+ result += self.pre_mlp_layernorm.num_activation(input_shape)
1099
+ result += self.self_attn_bda.num_activation(input_shape)
1100
+ result += self.mlp_bda.num_activation(input_shape)
1101
+ return result
1102
+
1103
+ def mock_forward(self, input_shape: list[int]):
1104
+ return input_shape
1105
+
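+
+ # Sequence-parallel sketch: with sequence parallelism enabled and
+ # tensor_model_parallel_size = 4 (hypothetical), the two layernorms and the two
+ # bias-dropout-add modules above are charged on a quarter of the elements of the
+ # full [s, b, h] input, while the attention and MLP estimates use the full shape.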
1106
+
1107
+ class SelfAttention(MemEstimator):
1108
+
1109
+ def __init__(
1110
+ self,
1111
+ config: TransformerConfig,
1112
+ submodules,
1113
+ layer_number: int,
1114
+ attn_mask_type,
1115
+ ):
1116
+ super().__init__()
1117
+
1118
+ self.config = config
1119
+ self.layer_number = layer_number
1120
+ self.attn_mask_type = attn_mask_type
1121
+ self.attention_type = ""
1122
+
1123
+ # For normal attention without groups, num_query_groups == num_attention_heads,
1124
+ # so these two will be the same
1125
+ self.query_projection_size = (
1126
+ self.config.kv_channels * self.config.num_attention_heads
1127
+ )
1128
+ self.kv_projection_size = self.config.kv_channels * self.config.num_query_groups
1129
+
1130
+ # Per attention head and per partition values.
1131
+ world_size = get_tensor_model_parallel_world_size()
1132
+ self.hidden_size_per_attention_head = divide(
1133
+ self.query_projection_size, self.config.num_attention_heads
1134
+ )
1135
+ self.num_attention_heads_per_partition = divide(
1136
+ self.config.num_attention_heads, world_size
1137
+ )
1138
+ self.num_query_groups_per_partition = divide(
1139
+ self.config.num_query_groups, world_size
1140
+ )
1141
+ self.core_attention = build_module(
1142
+ submodules.core_attention,
1143
+ config=self.config,
1144
+ layer_number=self.layer_number,
1145
+ attn_mask_type=self.attn_mask_type,
1146
+ )
1147
+ self.linear_qkv = build_module(
1148
+ submodules.linear_qkv,
1149
+ self.config.hidden_size,
1150
+ self.query_projection_size + 2 * self.kv_projection_size,
1151
+ config=self.config,
1152
+ init_method=self.config.init_method,
1153
+ gather_output=False,
1154
+ bias=self.config.add_bias_linear or self.config.add_qkv_bias,
1155
+ skip_bias_add=False,
1156
+ is_expert=False,
1157
+ tp_comm_buffer_name="qkv",
1158
+ )
1159
+
1160
+ if submodules.q_layernorm is not None:
1161
+ self.q_layernorm = build_module(
1162
+ submodules.q_layernorm,
1163
+ hidden_size=self.hidden_size_per_attention_head,
1164
+ config=self.config,
1165
+ eps=self.config.layernorm_epsilon,
1166
+ )
1167
+ else:
1168
+ self.q_layernorm = None
1169
+
1170
+ if submodules.k_layernorm is not None:
1171
+ self.k_layernorm = build_module(
1172
+ submodules.k_layernorm,
1173
+ hidden_size=self.hidden_size_per_attention_head,
1174
+ config=self.config,
1175
+ eps=self.config.layernorm_epsilon,
1176
+ )
1177
+ else:
1178
+ self.k_layernorm = None
1179
+ self.linear_proj = build_module(
1180
+ submodules.linear_proj,
1181
+ self.query_projection_size,
1182
+ self.config.hidden_size,
1183
+ config=self.config,
1184
+ init_method=self.config.output_layer_init_method,
1185
+ bias=self.config.add_bias_linear,
1186
+ input_is_parallel=True,
1187
+ skip_bias_add=True,
1188
+ is_expert=False,
1189
+ tp_comm_buffer_name="proj",
1190
+ )
1191
+ self.checkpoint_core_attention = (
1192
+ self.config.recompute_granularity == "selective"
1193
+ )
1194
+
1195
+ def num_parameter(self):
1196
+ result = 0
1197
+ result += self.core_attention.num_parameter()
1198
+ result += self.linear_proj.num_parameter()
1199
+ result += self.linear_qkv.num_parameter()
1200
+ if self.q_layernorm is not None:
1201
+ result += self.q_layernorm.num_parameter()
1202
+ if self.k_layernorm is not None:
1203
+ result += self.k_layernorm.num_parameter()
1204
+
1205
+ return result
1206
+
1207
+ def num_activation(self, input_shape: list[int]):
1208
+ ret = 0
1209
+ ## in estimator: act(linear) = 1.5*cum_mul(input_shape)
1210
+ ## in reality: act(linear) = cum_mul(input_shape), act(rotary) = cum_mul(input_shape), act(attn_forward_func_with_cp) = cum_mul(input_shape)
1211
+ # ret += self.linear_qkv.num_activation(input_shape)
1212
+ mixed_qkv_shape = self.linear_qkv.mock_forward(input_shape)
1213
+ new_tensor_shape = mixed_qkv_shape[:-1] + [
1214
+ self.num_query_groups_per_partition,
1215
+ (
1216
+ (
1217
+ self.num_attention_heads_per_partition
1218
+ // self.num_query_groups_per_partition
1219
+ + 2
1220
+ )
1221
+ * self.hidden_size_per_attention_head
1222
+ ),
1223
+ ]
1224
+ split_arg_list = [
1225
+ (
1226
+ self.num_attention_heads_per_partition
1227
+ // self.num_query_groups_per_partition
1228
+ * self.hidden_size_per_attention_head
1229
+ ),
1230
+ self.hidden_size_per_attention_head,
1231
+ self.hidden_size_per_attention_head,
1232
+ ]
1233
+ # [sq, b, ng, (np/ng + 2) * hn]
1234
+ # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
1235
+ q_shape = new_tensor_shape[:-1] + [split_arg_list[0]]
1236
+ k_shape = new_tensor_shape[:-1] + [split_arg_list[1]]
1237
+ v_shape = new_tensor_shape[:-1] + [split_arg_list[2]]
1238
+ # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
1239
+ q_shape = (
1240
+ q_shape[:2]
1241
+ + [cum_mul(q_shape[-2:]) // self.hidden_size_per_attention_head]
1242
+ + [self.hidden_size_per_attention_head]
1243
+ )
1244
+
1245
+ if not self.checkpoint_core_attention:
1246
+ ret += self.core_attention.num_activation(q_shape, k_shape, v_shape)
1247
+ ret += self.linear_proj.num_activation(input_shape)
1248
+ ## in reality: act(linear) = cum_mul(input_shape), act(rotary) = cum_mul(input_shape), act(attn_forward_func_with_cp) = cum_mul(input_shape)
1249
+ ret += self.linear_proj.num_activation(input_shape) * 3
1250
+
1251
+ return ret
1252
+
1253
+ def mock_forward(self, input_shape: list[int]):
1254
+ return input_shape
1255
+
1256
+
1257
+ class Linear(MemEstimator):
1258
+ def __init__(
1259
+ self,
1260
+ in_features: int,
1261
+ out_features: int,
1262
+ bias: bool = True,
1263
+ device=None,
1264
+ dtype=None,
1265
+ ) -> None:
1266
+
1267
+ super().__init__()
1268
+ self.weight = (in_features, out_features)
1269
+
1270
+ def num_parameter(self):
1271
+ return self.weight[0] * self.weight[1]
1272
+
1273
+ def num_activation(self, input_shape: list[int]):
1274
+ return cum_mul(input_shape[:-1]) * self.weight[1]
1275
+
1276
+ def mock_forward(self, input_shape: list[int]):
1277
+ return input_shape[:-1] + [self.weight[1]]
1278
+
1279
+
1280
+ class MLASelfAttention(MemEstimator):
1281
+ """MLA Self-attention layer class
1282
+
1283
+ Self-attention layer takes input with size [s, b, h]
1284
+ and returns output of the same size.
1285
+ """
1286
+
1287
+ def __init__(
1288
+ self,
1289
+ config: MLATransformerConfig,
1290
+ submodules,
1291
+ layer_number: int,
1292
+ attn_mask_type=AttnMaskType.padding,
1293
+ ) -> None:
1294
+
1295
+ super().__init__()
1296
+ self.config = config
1297
+ self.layer_number = layer_number
1298
+ self.attn_mask_type = attn_mask_type
1299
+ self.attention_type = "self"
1300
+ self.world_size = get_tensor_model_parallel_world_size()
1301
+ # assert (
1302
+ # world_size == 1
1303
+ # ), "MLA is not supported with Tensor Parallelism yet, \
1304
+ # use Expert Parallelism and Pipeline Parallelism for better performance."
1305
+
1306
+ self.query_projection_size = (
1307
+ self.config.v_head_dim * self.config.num_attention_heads
1308
+ )
1309
+
1310
+ self.q_head_dim = self.config.qk_head_dim + self.config.qk_pos_emb_head_dim
1311
+
1312
+ mscale = _yarn_get_mscale(self.config.rotary_scaling_factor, self.config.mscale)
1313
+ self.softmax_scale = mscale * mscale / math.sqrt(self.q_head_dim)
1314
+
1315
+ # Per attention head and per partition values.
1316
+ world_size = get_tensor_model_parallel_world_size()
1317
+ self.hidden_size_per_attention_head = divide(
1318
+ self.query_projection_size, self.config.num_attention_heads
1319
+ )
1320
+ self.num_attention_heads_per_partition = divide(
1321
+ self.config.num_attention_heads, world_size
1322
+ )
1323
+ self.num_query_groups_per_partition = divide(
1324
+ self.config.num_query_groups, world_size
1325
+ )
1326
+ # TODO Rotary Embedding
1327
+ # self.rotary_pos_emb = YarnRotaryEmbedding(
1328
+ # self.config.qk_pos_emb_head_dim,
1329
+ # rotary_base=self.config.rotary_base,
1330
+ # scaling_factor=self.config.rotary_scaling_factor,
1331
+ # original_max_position_embeddings=self.config.max_position_embeddings,
1332
+ # beta_fast=self.config.beta_fast,
1333
+ # beta_slow=self.config.beta_slow,
1334
+ # mscale=self.config.mscale,
1335
+ # mscale_all_dim=self.config.mscale_all_dim,
1336
+ # )
1337
+
1338
+ self.core_attention = build_module(
1339
+ submodules.core_attention,
1340
+ config=self.config,
1341
+ layer_number=self.layer_number,
1342
+ attn_mask_type=self.attn_mask_type,
1343
+ attention_type=self.attention_type,
1344
+ softmax_scale=self.softmax_scale,
1345
+ k_channels=self.q_head_dim,
1346
+ v_channels=self.config.v_head_dim,
1347
+ )
1348
+
1349
+ if self.config.q_lora_rank is None:
1350
+ # Not projecting the query
1351
+ self.linear_q_proj = build_module(
1352
+ submodules.linear_q_proj,
1353
+ self.config.hidden_size,
1354
+ self.config.num_attention_heads * self.q_head_dim,
1355
+ config=self.config,
1356
+ init_method=self.config.init_method,
1357
+ gather_output=False,
1358
+ bias=False,
1359
+ skip_bias_add=False,
1360
+ is_expert=False,
1361
+ is_mla=True,
1362
+ )
1363
+
1364
+ else:
1365
+ self.linear_q_down_proj = Linear(
1366
+ self.config.hidden_size, self.config.q_lora_rank, bias=False
1367
+ )
1368
+
1369
+ self.linear_q_up_proj = build_module(
1370
+ submodules.linear_q_up_proj,
1371
+ self.config.q_lora_rank,
1372
+ self.config.num_attention_heads * self.q_head_dim,
1373
+ config=self.config,
1374
+ init_method=self.config.init_method,
1375
+ gather_output=False,
1376
+ bias=False,
1377
+ skip_bias_add=False,
1378
+ is_expert=False,
1379
+ is_mla=True,
1380
+ )
1381
+ self.linear_kv_down_proj = Linear(
1382
+ self.config.hidden_size,
1383
+ self.config.kv_lora_rank + self.config.qk_pos_emb_head_dim,
1384
+ bias=False,
1385
+ )
1386
+
1387
+ self.linear_kv_up_proj = build_module(
1388
+ submodules.linear_kv_up_proj,
1389
+ self.config.kv_lora_rank,
1390
+ self.config.num_attention_heads
1391
+ * (self.config.qk_head_dim + self.config.v_head_dim),
1392
+ config=self.config,
1393
+ init_method=self.config.init_method,
1394
+ gather_output=False,
1395
+ bias=False,
1396
+ skip_bias_add=False,
1397
+ is_expert=False,
1398
+ is_mla=True,
1399
+ )
1400
+
1401
+ if self.config.q_lora_rank is not None:
1402
+ self.q_layernorm = build_module(
1403
+ submodules.q_layernorm,
1404
+ hidden_size=self.config.q_lora_rank,
1405
+ config=self.config,
1406
+ eps=self.config.layernorm_epsilon,
1407
+ )
1408
+
1409
+ self.kv_layernorm = build_module(
1410
+ submodules.kv_layernorm,
1411
+ hidden_size=self.config.kv_lora_rank,
1412
+ config=self.config,
1413
+ eps=self.config.layernorm_epsilon,
1414
+ )
1415
+
1416
+ # Output.
1417
+ self.linear_proj = build_module(
1418
+ submodules.linear_proj,
1419
+ self.query_projection_size,
1420
+ self.config.hidden_size,
1421
+ config=self.config,
1422
+ init_method=self.config.output_layer_init_method,
1423
+ bias=self.config.add_bias_linear,
1424
+ input_is_parallel=True,
1425
+ skip_bias_add=True,
1426
+ is_expert=False,
1427
+ tp_comm_buffer_name="proj",
1428
+ )
1429
+
1430
+ self.checkpoint_core_attention = (
1431
+ self.config.recompute_granularity == "selective"
1432
+ )
1433
+
1434
+ def num_parameter(self):
1435
+ result = 0
1436
+ result += self.core_attention.num_parameter()
1437
+ result += self.linear_proj.num_parameter()
1438
+ if self.config.q_lora_rank is None:
1439
+ result += self.linear_q_proj.num_parameter()
1440
+ else:
1441
+ result += self.linear_q_down_proj.num_parameter()
1442
+ result += self.linear_q_up_proj.num_parameter()
1443
+ result += self.linear_kv_down_proj.num_parameter()
1444
+ result += self.linear_kv_up_proj.num_parameter()
1445
+ result += self.kv_layernorm.num_parameter()
1446
+ if self.config.q_lora_rank is not None:
1447
+ result += self.q_layernorm.num_parameter()
1448
+
1449
+ return result
1450
+
1451
+ def num_activation(self, input_shape: list[int]):
1452
+ q_len, bsz, _ = input_shape
1453
+ ret = 0
1454
+ if self.config.q_lora_rank is not None:
1455
+ ret += self.linear_q_down_proj.num_activation(input_shape)
1456
+ q_compressed_shape = self.linear_q_down_proj.mock_forward(input_shape)
1457
+ ret += self.q_layernorm.num_activation(q_compressed_shape)
1458
+ ret += self.linear_q_up_proj.num_activation(q_compressed_shape)
1459
+ q_shape = self.linear_q_up_proj.mock_forward(q_compressed_shape)
1460
+ else:
1461
+ # hidden_states:[s, b, 2048], q: [s, b, n * 192]
1462
+ ret += self.linear_q_proj.num_activation(input_shape)
1463
+ q_shape = self.linear_q_proj.mock_forward(input_shape)
1464
+
1465
+ # kv_combined: [s, b, 576]
1466
+ ret += self.linear_kv_down_proj.num_activation(input_shape)
1467
+ kv_combined_shape = self.linear_kv_down_proj.mock_forward(input_shape)
1468
+ # kv_compressed:[s, b, 512], k_pos_emb: [s, b, 64]
1469
+ kv_compressed_shape = kv_combined_shape[:-1] + [self.config.kv_lora_rank]
1470
+
1471
+ # kv: [s, b, 2048]
1472
+ ret += self.kv_layernorm.num_activation(kv_compressed_shape)
1473
+ ret += self.linear_kv_up_proj.num_activation(kv_compressed_shape)
1474
+
1475
+ q_shape = [q_len, bsz, self.num_attention_heads_per_partition, self.q_head_dim]
1476
+ k_shape = [q_len, bsz, self.num_attention_heads_per_partition, self.q_head_dim]
1477
+ v_shape = [
1478
+ q_len,
1479
+ bsz,
1480
+ self.num_attention_heads_per_partition,
1481
+ self.config.v_head_dim,
1482
+ ]
1483
+
1484
+ if not self.checkpoint_core_attention:
1485
+ ret += self.core_attention.num_activation(q_shape, k_shape, v_shape)
1486
+
1487
+ ret += self.linear_proj.num_activation(input_shape)
1488
+
1489
+ return ret
1490
+
1491
+ def mock_forward(self, input_shape: list[int]):
1492
+ return input_shape
1493
+
1494
+
1495
+ class TENorm:
1496
+ def __new__(cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5):
1497
+ from megatron.core.extensions.transformer_engine import _get_extra_te_kwargs, te
1498
+
1499
+ if config.normalization == "LayerNorm":
1500
+ # TODO layernorm
1501
+ pass
1502
+ elif config.normalization == "RMSNorm":
1503
+ assert hasattr(
1504
+ te.pytorch, "RMSNorm"
1505
+ ), "Transformer-Engine >= v0.11 required to use this feature"
1506
+ instance = RMSNorm(
1507
+ hidden_size=hidden_size,
1508
+ eps=eps,
1509
+ sequence_parallel=config.sequence_parallel,
1510
+ zero_centered_gamma=config.layernorm_zero_centered_gamma,
1511
+ **_get_extra_te_kwargs(config),
1512
+ )
1513
+ else:
1514
+ raise Exception("Only LayerNorm and RMSNorm are currently supported")
1515
+
1516
+ return instance
1517
+
1518
+
1519
+ def build_module(
1520
+ spec_or_module: Union[ModuleSpec, type], *args, **kwargs
1521
+ ) -> MemEstimator:
1522
+ """replace module with MemEstimators"""
1523
+ if isinstance(spec_or_module, types.FunctionType):
1524
+ return globals()[spec_or_module.__name__]
1525
+
1526
+ if isinstance(spec_or_module, ModuleSpec) and isinstance(
1527
+ spec_or_module.module, types.FunctionType
1528
+ ):
1529
+ assert False
1530
+ return spec_or_module.module
1531
+
1532
+ if isinstance(spec_or_module, type):
1533
+ module = spec_or_module
1534
+ elif hasattr(spec_or_module, "module") and isinstance(spec_or_module.module, type):
1535
+ module = spec_or_module.module
1536
+ else:
1537
+ module = import_module(spec_or_module.module)
1538
+
1539
+ if isinstance(module, types.FunctionType):
1540
+ assert False
1541
+ return module
1542
+
1543
+ if hasattr(spec_or_module, "submodules") and spec_or_module.submodules is not None:
1544
+ kwargs["submodules"] = spec_or_module.submodules
1545
+
1546
+ try:
1547
+ module = globals()[module.__name__]
1548
+ return module(
1549
+ *args,
1550
+ **spec_or_module.params if hasattr(spec_or_module, "params") else {},
1551
+ **kwargs,
1552
+ )
1553
+ except Exception as e:
1554
+ # import ipdb
1555
+
1556
+ # ipdb.set_trace()
1557
+ # improve the error message since we hide the module name in the line above
1558
+ import sys
1559
+
1560
+ raise type(e)(f"{str(e)} when instantiating {module.__name__}").with_traceback(
1561
+ sys.exc_info()[2]
1562
+ )
1563
+
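+
+ # Minimal usage sketch (hypothetical spec): build_module resolves the class named in
+ # a ModuleSpec against the same-named estimator classes defined in this file, e.g.
+ #
+ #   spec = ModuleSpec(module=MLP, submodules=mlp_submodules)
+ #   estimator = build_module(spec, config=cfg)   # returns the MLP estimator above
+ #   estimator.num_parameter()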
1564
+
1565
+ from megatron.core.transformer.transformer_block import (
1566
+ TransformerBlockSubmodules,
1567
+ BaseTransformerLayer,
1568
+ LayerNormImpl,
1569
+ )
1570
+
1571
+
1572
+ def _get_block_submodules(
1573
+ config: TransformerConfig, spec: Union[TransformerBlockSubmodules, ModuleSpec]
1574
+ ) -> TransformerBlockSubmodules:
1575
+ """
1576
+ Retrieve or construct TransformerBlockSubmodules based on the provided specification.
1577
+
1578
+ Args:
1579
+ config (TransformerConfig): Configuration object for the transformer model.
1580
+ spec (Union[TransformerBlockSubmodules, ModuleSpec]): Specification for the
1581
+ transformer block submodules. Can be either a TransformerBlockSubmodules
1582
+ instance or a ModuleSpec.
1583
+
1584
+ Returns:
1585
+ TransformerBlockSubmodules: The submodules for the transformer block.
1586
+ """
1587
+
1588
+ # Transformer block submodules.
1589
+ if isinstance(spec, TransformerBlockSubmodules):
1590
+ return spec
1591
+
1592
+ # ModuleSpec here is generally assumed to be for a transformer layer that
1593
+ # is implemented in `transformer_layer.py` or if it subclasses
1594
+ # `BaseTransformerLayer` from the `transformer_layer.py` file.
1595
+ elif isinstance(spec, ModuleSpec):
1596
+ if issubclass(spec.module, TransformerBlock):
1597
+ return spec.submodules
1598
+ elif issubclass(spec.module, BaseTransformerLayer):
1599
+ num_layers = get_num_layers_to_build(config)
1600
+ return TransformerBlockSubmodules(
1601
+ layer_specs=[spec] * num_layers, layer_norm=LayerNormImpl
1602
+ )
1603
+ else:
1604
+ raise Exception(f"specialize for {spec.module.__name__}.")
1605
+ else:
1606
+ raise Exception(f"specialize for {type(spec).__name__}.")
1607
+
1608
+
1609
+ def get_num_layers_to_build(config: TransformerConfig) -> int:
1610
+ """
1611
+ Determine the number of transformer layers to build for the current pipeline stage.
1612
+ Args:
1613
+ config (TransformerConfig): Configuration object containing transformer model parameters.
1614
+
1615
+ Returns:
1616
+ int: The number of layers to be built for the current pipeline stage.
1617
+ """
1618
+ if (
1619
+ config.num_layers_in_first_pipeline_stage is not None
1620
+ or config.num_layers_in_last_pipeline_stage is not None
1621
+ ):
1622
+
1623
+ assert not (
1624
+ config.account_for_embedding_in_pipeline_split
1625
+ or config.account_for_loss_in_pipeline_split
1626
+ ), "Does not support standalone embedding stage and standalone loss stage with uneven pp"
1628
+ # Number of layers to distribute over rest of pipeline stages
1629
+ layers_to_distribute = config.num_layers
1630
+ # Number of pipeline stages left for distributing transformer layers
1631
+ pipeline_stages_left = get_pipeline_model_parallel_world_size()
1632
+
1633
+ # If the uneven first (last) pipeline stage is enabled, remove the specified number
1634
+ # of layers to calculate the number of layers on each middle pipeline stage.
1635
+ if config.num_layers_in_first_pipeline_stage is not None:
1636
+ layers_to_distribute -= config.num_layers_in_first_pipeline_stage
1637
+ pipeline_stages_left -= 1
1638
+
1639
+ if config.num_layers_in_last_pipeline_stage is not None:
1640
+ layers_to_distribute -= config.num_layers_in_last_pipeline_stage
1641
+ pipeline_stages_left -= 1
1642
+
1643
+ assert (
1644
+ layers_to_distribute % pipeline_stages_left == 0
1645
+ ), "With uneven pipelining the leftover layers must be divisible by the leftover stages"
1646
+ num_layers_per_pipeline_rank = layers_to_distribute // pipeline_stages_left
1647
+
1648
+ # If the uneven first (last) pipeline stage is enabled, return the specified number
1649
+ # of layers for all virtual pipeline parallel stages within the first (last) pipeline
1650
+ # parallel stage.
1651
+ if (
1652
+ is_pipeline_first_stage(ignore_virtual=True)
1653
+ and config.num_layers_in_first_pipeline_stage is not None
1654
+ ):
1655
+ num_layers_per_pipeline_rank = config.num_layers_in_first_pipeline_stage
1656
+
1657
+ if (
1658
+ is_pipeline_last_stage(ignore_virtual=True)
1659
+ and config.num_layers_in_last_pipeline_stage is not None
1660
+ ):
1661
+ num_layers_per_pipeline_rank = config.num_layers_in_last_pipeline_stage
1662
+ else:
1663
+ # Include the embedding layer and loss layer into pipeline parallelism partition
1664
+ num_layers = config.num_layers
1665
+ if config.account_for_embedding_in_pipeline_split:
1666
+ num_layers += 1
1667
+
1668
+ if config.account_for_loss_in_pipeline_split:
1669
+ num_layers += 1
1670
+
1671
+ assert (
1672
+ num_layers % config.pipeline_model_parallel_size == 0
1673
+ ), "num_layers should be divisible by pipeline_model_parallel_size"
1674
+ num_layers_per_pipeline_rank = num_layers // config.pipeline_model_parallel_size
1675
+
1676
+ # if get_virtual_pipeline_model_parallel_world_size() is not None:
1677
+ # # Interleaved pipeline parallelism:
1678
+ # # Number of layers in each model chunk is the number of layers in the stage,
1679
+ # # divided by the number of model chunks in a stage.
1680
+ # # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
1681
+ # # layers to stages like (each list is a model chunk):
1682
+ # # Stage 0: [0] [2] [4] [6]
1683
+ # # Stage 1: [1] [3] [5] [7]
1684
+ # # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
1685
+ # # layers to stages like (each list is a model chunk):
1686
+ # # Stage 0: [0, 1] [4, 5]
1687
+ # # Stage 1: [2, 3] [6, 7]
1688
+ # vp_size = get_virtual_pipeline_model_parallel_world_size()
1689
+
1690
+ # assert (
1691
+ # num_layers_per_pipeline_rank % vp_size == 0
1692
+ # ), "num_layers_per_pipeline_rank should be divisible by vp_size"
1693
+ # num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size
1694
+
1695
+ # num_layers_to_build = num_layers_per_virtual_rank
1696
+
1697
+ # else:
1698
+ # # Non-interleaved pipeline parallelism:
1699
+ # # Each stage gets a contiguous set of layers.
1700
+ # num_layers_to_build = num_layers_per_pipeline_rank
1701
+ num_layers_to_build = num_layers_per_pipeline_rank
1702
+ # The embedding (or loss) layer cannot function as a standalone transformer layer
1703
+ # Reduce the number of layers to construct by 1 on the first (or last) stage if the
1704
+ # embedding (or loss) layer is included in the pipeline parallelism partition and placement.
1705
+ if is_pipeline_first_stage() and config.account_for_embedding_in_pipeline_split:
1706
+ num_layers_to_build -= 1
1707
+ assert (
1708
+ num_layers_to_build >= 0
1709
+ ), "Not enough layers in the first virtual pipeline stage"
1710
+
1711
+ if is_pipeline_last_stage() and config.account_for_loss_in_pipeline_split:
1712
+ num_layers_to_build -= 1
1713
+ assert (
1714
+ num_layers_to_build >= 0
1715
+ ), "Not enough layers in the last virtual pipeline stage"
1716
+
1717
+ return num_layers_to_build
1718
+
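+
+ # Worked example (hypothetical): num_layers = 60, pipeline_model_parallel_size = 4,
+ # num_layers_in_first_pipeline_stage = 12 -> the remaining 48 layers are spread over
+ # 3 stages (16 each) while the first stage builds 12; without uneven staging every
+ # stage simply builds 60 // 4 = 15 layers.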
1719
+
1720
+ def get_transformer_layer_offset(config: TransformerConfig):
1721
+ """Get the index offset of current pipeline stage, given the level of pipelining."""
1722
+ pipeline_rank = get_pipeline_model_parallel_rank()
1723
+ # if not is_inside_encoder():
1724
+ if True:
1725
+ pp_decoder_start = 0
1726
+ if pp_decoder_start is not None:
1727
+ pipeline_rank = pipeline_rank - pp_decoder_start
1728
+
1729
+ if config.pipeline_model_parallel_size > 1:
1730
+
1731
+ if (
1732
+ config.num_layers_in_first_pipeline_stage is not None
1733
+ or config.num_layers_in_last_pipeline_stage is not None
1734
+ ):
1735
+ # Calculate number of pipeline stages to distribute the remaining Transformer
1736
+ # layers after deducting the Transformer layers in the first or the last stages
1737
+ middle_pipeline_stages = config.pipeline_model_parallel_size
1738
+ middle_pipeline_stages -= sum(
1739
+ [
1740
+ 1 if x is not None else 0
1741
+ for x in (
1742
+ config.num_layers_in_first_pipeline_stage,
1743
+ config.num_layers_in_last_pipeline_stage,
1744
+ )
1745
+ ]
1746
+ )
1747
+
1748
+ # Calculate layers to distribute in each pipeline stage. If the
1749
+ # num_layers_in_first_pipeline_stage and num_layers_in_last_pipeline_stage
1750
+ # are not set, we will not enable uneven pipeline. All layers will be treated
1751
+ # as middle layers.
1752
+ num_layers_in_first_pipeline_stage = (
1753
+ 0
1754
+ if config.num_layers_in_first_pipeline_stage is None
1755
+ else config.num_layers_in_first_pipeline_stage
1756
+ )
1757
+ num_layers_in_last_pipeline_stage = (
1758
+ 0
1759
+ if config.num_layers_in_last_pipeline_stage is None
1760
+ else config.num_layers_in_last_pipeline_stage
1761
+ )
1762
+
1763
+ middle_num_layers = (
1764
+ config.num_layers
1765
+ - num_layers_in_first_pipeline_stage
1766
+ - num_layers_in_last_pipeline_stage
1767
+ )
1768
+
1769
+ if middle_pipeline_stages > 0:
1770
+ num_layers_per_pipeline_rank = (
1771
+ middle_num_layers // middle_pipeline_stages
1772
+ )
1773
+ else:
1774
+ num_layers_per_pipeline_rank = 0
1775
+
1776
+ middle_pipeline_rank = (
1777
+ pipeline_rank
1778
+ if config.num_layers_in_first_pipeline_stage is None
1779
+ else pipeline_rank - 1
1780
+ )
1781
+
1782
+ if pipeline_rank == 0:
1783
+ offset = 0
1784
+ else:
1785
+ offset = (
1786
+ middle_pipeline_rank * num_layers_per_pipeline_rank
1787
+ ) + num_layers_in_first_pipeline_stage
1788
+ else:
1789
+ num_layers = config.num_layers
1790
+
1791
+ # Increase the number of layers by one if we include the embedding (loss)
1792
+ # layer into pipeline parallelism partition and placement
1793
+ if config.account_for_embedding_in_pipeline_split:
1794
+ num_layers += 1
1795
+
1796
+ if config.account_for_loss_in_pipeline_split:
1797
+ num_layers += 1
1798
+
1799
+ num_layers_per_pipeline_rank = (
1800
+ num_layers // config.pipeline_model_parallel_size
1801
+ )
1802
+
1803
+ offset = pipeline_rank * num_layers_per_pipeline_rank
1804
+
1805
+ # Reduce the offset of embedding layer from the total layer number
1806
+ if (
1807
+ config.account_for_embedding_in_pipeline_split
1808
+ and not is_pipeline_first_stage()
1809
+ ):
1810
+ offset -= 1
1811
+ else:
1812
+ offset = 0
1813
+ return offset
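+
+ # Worked example (hypothetical): with pipeline_model_parallel_size = 4, num_layers = 60
+ # and num_layers_in_first_pipeline_stage = 12, the middle stages hold 48 // 3 = 16
+ # layers each, so the per-rank offsets are 0, 12, 28 and 44; without uneven staging
+ # the offset is simply pipeline_rank * (60 // 4).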
webui/index.html ADDED
@@ -0,0 +1,163 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Megatron Memory Estimator</title>
7
+ <link rel="stylesheet" href="style.css">
8
+ </head>
9
+ <body>
10
+ <div class="container">
11
+ <h1>Megatron Memory Estimator</h1>
12
+ <div class="disclaimer-banner">
13
+ Note: This estimator only measures the GPU memory directly managed by PyTorch when running Megatron. It does not include extra consumption from NCCL communication buffers, kernel fusion, overlap optimizations, CUDA Graphs, etc. Please use the "Overhead per GPU" option below to account for these additional costs.
14
+ </div>
15
+
16
+ <div class="main-layout">
17
+ <div class="top-section">
18
+ <div class="config-column">
19
+ <form id="config-form">
20
+ <h2>Configuration</h2>
21
+ <div class="form-group">
22
+ <label for="model-select">Select a Local Config:</label>
23
+ <select id="model-select" name="model">
24
+ <option value="">Loading...</option>
25
+ </select>
26
+ </div>
27
+
28
+ <!-- All settings are now in one block -->
29
+ <div class="form-row">
30
+ <div class="form-group">
31
+ <label for="num-gpus">Total GPUs:</label>
32
+ <input type="number" id="num-gpus" name="num_gpus" value="8" step="8" min="8">
33
+ </div>
34
+ <div class="form-group">
35
+ <label for="mbs">Micro Batch Size:</label>
36
+ <input type="number" id="mbs" name="mbs" value="1" min="1">
37
+ </div>
38
+ <div class="form-group">
39
+ <label for="seq-len">SeqLen:</label>
40
+ <input type="number" id="seq-len" name="seq-len" value="4096" min="1">
41
+ </div>
42
+ </div>
43
+
44
+ <div class="form-group">
45
+ <input type="checkbox" id="use-distributed-optimizer" name="use_distributed_optimizer" checked>
46
+ <label for="use-distributed-optimizer" class="inline-label">Use Distributed Optimizer</label>
47
+ </div>
48
+
49
+ <div class="form-row">
50
+ <div class="form-group">
51
+ <label for="recompute-granularity">Recomputation:</label>
52
+ <select id="recompute-granularity" name="recompute_granularity">
53
+ <option value="none">None</option>
54
+ <option value="selective">Selective</option>
55
+ <option value="full">Full</option>
56
+ </select>
57
+ </div>
58
+ <div class="form-group recompute-options" style="display: none;">
59
+ <label for="recompute-method">Method:</label>
60
+ <select id="recompute-method" name="recompute_method">
61
+ <option value="uniform">Uniform</option>
62
+ <option value="block">Block</option>
63
+ </select>
64
+ </div>
65
+ <div class="form-group recompute-options" style="display: none;">
66
+ <label for="recompute-num-layers">Layers:</label>
67
+ <input type="number" id="recompute-num-layers" name="recompute_num_layers" value="1" min="1">
68
+ </div>
69
+ </div>
70
+
71
+ <div class="form-row">
72
+ <div class="form-group">
73
+ <label for="tp">TP:</label>
74
+ <select id="tp" name="tp"></select>
75
+ </div>
76
+ <div class="form-group">
77
+ <label for="pp">PP:</label>
78
+ <input type="number" id="pp" name="pp" value="1" min="1">
79
+ </div>
80
+ <div class="form-group">
81
+ <label for="ep">EP:</label>
82
+ <select id="ep" name="ep"></select>
83
+ </div>
84
+ <div class="form-group">
85
+ <label for="cp">CP:</label>
86
+ <select id="cp" name="cp"></select>
87
+ </div>
88
+ </div>
89
+ <div class="form-row">
90
+ <div class="form-group">
91
+ <label for="vpp">VPP:</label>
92
+ <input type="number" id="vpp" name="vpp" placeholder="None" min="1">
93
+ </div>
94
+ <div class="form-group">
95
+ <label for="etp">ETP:</label>
96
+ <input type="number" id="etp" name="etp" placeholder="None" min="1">
97
+ </div>
98
+ </div>
99
+ <div class="form-row">
100
+ <div class="form-group">
101
+ <label for="num_layers_in_first_pipeline_stage">First Stage Layers:</label>
102
+ <input type="number" id="num_layers_in_first_pipeline_stage" name="num_layers_in_first_pipeline_stage" placeholder="None" min="0">
103
+ </div>
104
+ <div class="form-group">
105
+ <label for="num_layers_in_last_pipeline_stage">Last Stage Layers:</label>
106
+ <input type="number" id="num_layers_in_last_pipeline_stage" name="num_layers_in_last_pipeline_stage" placeholder="None" min="0">
107
+ </div>
108
+ </div>
109
+ <div class="form-row">
110
+ <div class="form-group">
111
+ <label for="overhead">Overhead per GPU:</label>
112
+ <select id="overhead" name="overhead">
113
+ <option value="5">5GB</option>
114
+ <option value="10" selected>10GB</option>
115
+ </select>
116
+ </div>
117
+ </div>
118
+
119
+ <div id="validation-message" class="error-message" style="display: none;"></div>
120
+ <div class="button-container">
121
+ <button type="submit">Estimate</button>
122
+ </div>
123
+ </form>
124
+ </div>
125
+
126
+ <div class="output-column">
127
+ <div class="config-editor-wrapper">
128
+ <h2>Model Config (Editable)</h2>
129
+ <textarea id="config-editor" rows="20"></textarea>
130
+ </div>
131
+ </div>
132
+ </div>
133
+
134
+ <div class="bottom-section">
135
+ <div id="output-container">
136
+ <div id="loading" style="display: none;">Calculating...</div>
137
+ <div id="history-wrapper">
138
+ <h3>History</h3>
139
+ <table id="history-table">
140
+ <thead>
141
+ <tr>
142
+ <th>Model</th>
143
+ <th>Weight Optimizer (GB)</th>
144
+ <th>Activation (GB)</th>
145
+ <th>Total (GB/GPU)</th>
146
+ <th>Actions</th>
147
+ </tr>
148
+ </thead>
149
+ <tbody>
150
+ </tbody>
151
+ </table>
152
+ <button id="clear-history" style="margin-top: 1em;">Clear History</button>
153
+ </div>
154
+ </div>
155
+ </div>
156
+ </div>
157
+ </div>
158
+ <script src="script.js"></script>
159
+ <footer class="footer">
160
+ <p>&copy; 2025 <a href="https://github.com/ISEEKYAN" target="_blank">ISEEKYAN</a>. Developed at NVIDIA.</p>
161
+ </footer>
162
+ </body>
163
+ </html>
webui/main.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import glob
3
+ from fastapi import FastAPI, Body
4
+ from fastapi.staticfiles import StaticFiles
5
+ from fastapi.responses import FileResponse
6
+ import requests
7
+ from pydantic import BaseModel, field_validator
8
+ from typing import Optional
9
+ from mbridge import AutoBridge
10
+ from estimate import estimate_from_config
11
+ from megatron.core import parallel_state as mpu
12
+ import argparse
13
+ import json
14
+ import tempfile
15
+
16
+ # The directory of the current script (main.py)
17
+ WEBUI_DIR = os.path.dirname(os.path.abspath(__file__))
18
+
19
+ app = FastAPI()
20
+
21
+ # Mount static files from the webui directory
22
+ app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="static")
23
+
24
+
25
+ @app.get("/")
26
+ async def read_index():
27
+ return FileResponse(os.path.join(WEBUI_DIR, 'index.html'))
28
+
29
+ @app.get("/style.css")
30
+ async def read_css():
31
+ return FileResponse(os.path.join(WEBUI_DIR, 'style.css'))
32
+
33
+ @app.get("/script.js")
34
+ async def read_js():
35
+ return FileResponse(os.path.join(WEBUI_DIR, 'script.js'))
36
+
37
+
38
+ SUPPORTED_MODELS = [
39
+ "Qwen/Qwen3-235B-A22B",
40
+ "Qwen/Qwen3-30B-A3B",
41
+ "Qwen/Qwen3-32B",
42
+ "Qwen/Qwen3-14B",
43
+ "Qwen/Qwen3-8B",
44
+ "Qwen/Qwen2.5-7B",
45
+ "Qwen/Qwen2.5-14B",
46
+ "Qwen/Qwen2.5-32B",
47
+ "Qwen/Qwen2.5-72B",
48
+ "moonshotai/Moonlight-16B-A3B",
49
+ "moonshotai/Kimi-K2-Instruct",
50
+ "deepseek-ai/DeepSeek-V3",
51
+ ]
52
+
53
+
54
+ @app.get("/local-hf-configs")
55
+ async def get_supported_models():
56
+ """Return the list of HF model identifiers supported by the UI."""
57
+ return SUPPORTED_MODELS
58
+
59
+ @app.get("/get-megatron-config/{model_path:path}")
60
+ async def get_remote_hf_config(model_path: str):
61
+ """Fetch the HuggingFace config.json for the given model id."""
62
+ url = f"https://huggingface.co/{model_path}/raw/main/config.json"
63
+ try:
64
+ resp = requests.get(url, timeout=10)
65
+ resp.raise_for_status()
66
+ return resp.json()
67
+ except Exception as e:
68
+ return {"error": f"Failed to fetch config from {url}: {str(e)}"}
69
+
70
+
71
+ class MBridgeEstimateConfig(BaseModel):
72
+ hf_model_path: str
73
+ custom_hf_config: Optional[dict] = None # Renamed for clarity
74
+
75
+ # Hardware & Training
76
+ num_gpus: int = 8
77
+ mbs: int = 1
78
+ seq_len: int = 4096
79
+ use_distributed_optimizer: bool = True
80
+ # Recompute settings are now part of the main config
81
+ recompute_granularity: str = "selective"
82
+ recompute_method: str = "uniform"
83
+ recompute_num_layers: Optional[int] = 1
84
+
85
+ # Parallelism
86
+ tp: int = 1
87
+ pp: int = 1
88
+ ep: int = 1
89
+ cp: int = 1
90
+ vpp: Optional[int] = None
91
+ etp: Optional[int] = None
92
+
93
+ # Pipeline stage layer counts
94
+ num_layers_in_first_pipeline_stage: Optional[int] = None
95
+ num_layers_in_last_pipeline_stage: Optional[int] = None
96
+
97
+ @field_validator('num_gpus')
98
+ def num_gpus_must_be_multiple_of_8(cls, v):
99
+ if v <= 0 or v % 8 != 0:
100
+ raise ValueError('must be a positive multiple of 8')
101
+ return v
102
+
103
+ def patch_parallel_states(config: MBridgeEstimateConfig):
104
+ from mbridge.core.parallel_states import ParallelStates
105
+ ParallelStates.get_default_parallel_states = lambda: ParallelStates(
106
+ tp_size=config.tp,
107
+ pp_size=config.pp,
108
+ ep_size=config.ep,
109
+ cp_size=config.cp,
110
+ vpp_size=config.vpp,
111
+ etp_size=config.etp,
112
+ )
113
+
114
+ @app.post("/estimate_with_mbridge")
115
+ async def estimate_with_mbridge(config: MBridgeEstimateConfig):
116
+ # Validate Inputs
117
+ if config.num_gpus <= 0 or config.num_gpus % 8 != 0:
118
+ return {"error": "Total number of GPUs must be a positive multiple of 8."}
119
+
120
+ parallel_product = config.tp * config.pp * config.cp
121
+ if parallel_product == 0: # Avoid division by zero
122
+ return {"error": "Parallelism dimensions (TP, PP, CP) cannot be zero."}
123
+
124
+ if config.num_gpus % parallel_product != 0:
125
+ return {"error": f"Number of GPUs ({config.num_gpus}) must be divisible by the product of TP*PP*CP ({parallel_product})."}
126
+
127
+ patch_parallel_states(config)
128
+
129
+ # If the path is just a filename, assume it's in our local model-configs dir
+ hf_model_path = config.hf_model_path
+ # The custom config pasted in the UI is a Hugging Face config, not a Megatron config,
+ # so it is written to a temporary file and loaded through the bridge from there.
+ if config.custom_hf_config:
+ try:
+ # Create a temporary file to save the custom HF config
+ with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix=".json", dir=os.path.join(WEBUI_DIR, 'model-configs')) as tmp:
+ json.dump(config.custom_hf_config, tmp)
+ tmp_path = tmp.name
+
+ # Load the bridge from the temporary config file
+ from transformers import AutoConfig
+ AutoConfig.trust_remote_code = True
+ bridge = AutoBridge.from_pretrained(tmp_path)
+ tf_config = bridge.config
+ hf_config = bridge.hf_config
+
+ finally:
+ # Ensure the temporary file is deleted
+ if 'tmp_path' in locals() and os.path.exists(tmp_path):
+ os.remove(tmp_path)
+ else:
+ # If no custom config, load from the original path
+ if not os.path.isabs(hf_model_path) and not hf_model_path.startswith(('http', './', '../')):
+ hf_model_path = os.path.join(WEBUI_DIR, 'model-configs', hf_model_path)
+ bridge = AutoBridge.from_pretrained(hf_model_path)
+ tf_config = bridge.config
+ hf_config = bridge.hf_config
+
+ # --- Configuration Unification ---
+ # Update tf_config with values from the form so that tf_config is the single source of truth.
+ tf_config.tensor_model_parallel_size = config.tp
+ tf_config.pipeline_model_parallel_size = config.pp
+ tf_config.expert_model_parallel_size = config.ep
+ tf_config.context_parallel_size = config.cp
+ tf_config.recompute_granularity = config.recompute_granularity
+ tf_config.recompute_method = config.recompute_method
+ tf_config.recompute_num_layers = config.recompute_num_layers
+ tf_config.num_layers_per_virtual_pipeline_stage = config.vpp if config.vpp and config.vpp > 1 else None
+
+ if config.num_layers_in_first_pipeline_stage is not None:
+ tf_config.num_layers_in_first_pipeline_stage = config.num_layers_in_first_pipeline_stage
+ if config.num_layers_in_last_pipeline_stage is not None:
+ tf_config.num_layers_in_last_pipeline_stage = config.num_layers_in_last_pipeline_stage
+ # print(tf_config)
+
+ # Create a minimal 'args' namespace with parameters not present in TransformerConfig
+ args = argparse.Namespace()
+ args.micro_batch_size = config.mbs
+ args.seq_length = config.seq_len
+ args.use_distributed_optimizer = config.use_distributed_optimizer
+ args.data_parallel_size = config.num_gpus // parallel_product
+ args.expert_tensor_parallel_size = config.etp if config.etp else 1
+
+ # These are required by the estimator but can be derived or defaulted
+ args.transformer_impl = "transformer_engine"
+ args.fp8 = False
+ args.num_experts = getattr(tf_config, 'num_moe_experts', 1) # Needed for layer spec
+ args.moe_grouped_gemm = True # Default
+ args.qk_layernorm = tf_config.qk_layernorm
+ args.multi_latent_attention = "deepseek" in getattr(hf_config, "model_type", "")
+ args.padded_vocab_size = getattr(hf_config, "vocab_size")
+ args.max_position_embeddings = getattr(hf_config, "max_position_embeddings")
+ args.tie_word_embeddings = getattr(hf_config, "tie_word_embeddings", False)
+
+ # estimate_from_config returns one report per pipeline-parallel (PP) rank
+ raw_reports_list = estimate_from_config(tf_config, args)
+
+ # The reports from estimate.py are already in GB, so no unit conversion is needed;
+ # only the verbose 'details' field is stripped for the main display table.
+ processed_reports = []
+ for report in raw_reports_list:
+ # Copy the report and drop the 'details' key
+ processed_report = report.copy()
+ processed_report.pop('details', None)
+ processed_reports.append(processed_report)
+
+ return {
+ "processed_report": processed_reports,
+ "raw_report": raw_reports_list
+ }
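For reference, the endpoint wired up above can also be exercised without the web UI. The sketch below is illustrative only: the field names mirror the payload built in webui/script.js, the response keys match the return value above, and the local URL/port as well as the requests dependency are assumptions rather than part of this commit.

# Minimal client sketch for the /estimate_with_mbridge endpoint (illustrative,
# not part of this commit). Field names mirror webui/script.js; the URL/port
# and the requests dependency are assumptions.
import requests

payload = {
    "hf_model_path": "Qwen/Qwen3-235B-A22B",   # default model used by the UI
    "custom_hf_config": None,                  # or a dict holding a pasted HF config
    "num_gpus": 8,
    "mbs": 1,
    "seq_len": 4096,
    "use_distributed_optimizer": True,
    "recompute_granularity": "selective",
    "recompute_method": "uniform",
    "recompute_num_layers": 1,
    "tp": 2, "pp": 2, "ep": 2, "cp": 1,        # TP*PP*CP*EP = 8 <= num_gpus
    "vpp": None, "etp": None,
    "num_layers_in_first_pipeline_stage": None,
    "num_layers_in_last_pipeline_stage": None,
    "overhead": 10,
}

resp = requests.post("http://localhost:7860/estimate_with_mbridge", json=payload, timeout=600)
resp.raise_for_status()
result = resp.json()
for row in result["processed_report"]:         # one summary row per PP rank
    print(row)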
webui/model-configs/qwen3-14b.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "Qwen3ForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "initializer_range": 0.02,
+ "intermediate_size": 17408,
+ "max_position_embeddings": 40960,
+ "max_window_layers": 40,
+ "model_type": "qwen3",
+ "num_attention_heads": 40,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.51.0",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
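A config like the one above is enough for a rough, back-of-the-envelope parameter count. The sketch below is illustrative only: it ignores RMSNorm weights and biases, assumes untied embeddings (tie_word_embeddings is false here), and the file path is an assumption relative to the repository root.

# Rough parameter count from a local HF config (illustrative sketch only).
import json

with open("webui/model-configs/qwen3-14b.json") as f:   # path is an assumption
    cfg = json.load(f)

h = cfg["hidden_size"]
n_layers = cfg["num_hidden_layers"]
ffn = cfg["intermediate_size"]
q_heads = cfg["num_attention_heads"]
kv_heads = cfg["num_key_value_heads"]
head_dim = cfg["head_dim"]
vocab = cfg["vocab_size"]

attn = h * q_heads * head_dim + 2 * h * kv_heads * head_dim + q_heads * head_dim * h  # Q, K+V, O projections
mlp = 3 * h * ffn                                                                      # gate, up, down projections
embed = 2 * vocab * h                                                                  # input embeddings + LM head (untied)
total = n_layers * (attn + mlp) + embed
print(f"~{total / 1e9:.1f}B parameters (norms and biases ignored)")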
webui/model-configs/qwen3-235b-a22b.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "architectures": [
+ "Qwen3MoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "decoder_sparse_step": 1,
+ "eos_token_id": 151645,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 40960,
+ "max_window_layers": 94,
+ "mlp_only_layers": [],
+ "model_type": "qwen3_moe",
+ "moe_intermediate_size": 1536,
+ "norm_topk_prob": true,
+ "num_attention_heads": 64,
+ "num_experts": 128,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 94,
+ "num_key_value_heads": 4,
+ "output_router_logits": false,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "router_aux_loss_coef": 0.001,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.51.0",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
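For MoE configs such as this one, the memory picture is dominated by the gap between the expert weights that stay resident on the GPUs and the few experts actually used per token, which is also why expert parallelism (EP) matters for the estimate. The sketch below is illustrative only and counts expert MLP weights alone; attention, embedding, and router weights are left out.

# Resident vs. per-token-active expert weights for a MoE config (illustrative sketch).
import json

with open("webui/model-configs/qwen3-235b-a22b.json") as f:   # path is an assumption
    cfg = json.load(f)

h = cfg["hidden_size"]
n_layers = cfg["num_hidden_layers"]
moe_ffn = cfg["moe_intermediate_size"]
n_experts = cfg["num_experts"]
top_k = cfg["num_experts_per_tok"]

per_expert = 3 * h * moe_ffn                    # gate, up, down projections of one expert
resident = n_layers * n_experts * per_expert    # sharded across EP ranks in practice
active = n_layers * top_k * per_expert          # experts actually hit per token
print(f"expert weights resident: ~{resident / 1e9:.0f}B, active per token: ~{active / 1e9:.1f}B")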
webui/model-configs/qwen3-30b-a3b.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "architectures": [
+ "Qwen3MoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "decoder_sparse_step": 1,
+ "eos_token_id": 151645,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 6144,
+ "max_position_embeddings": 40960,
+ "max_window_layers": 48,
+ "mlp_only_layers": [],
+ "model_type": "qwen3_moe",
+ "moe_intermediate_size": 768,
+ "norm_topk_prob": true,
+ "num_attention_heads": 32,
+ "num_experts": 128,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 48,
+ "num_key_value_heads": 4,
+ "output_router_logits": false,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "router_aux_loss_coef": 0.001,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.51.0",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
webui/model-configs/qwen3-32b.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "Qwen3ForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "initializer_range": 0.02,
+ "intermediate_size": 25600,
+ "max_position_embeddings": 40960,
+ "max_window_layers": 64,
+ "model_type": "qwen3",
+ "num_attention_heads": 64,
+ "num_hidden_layers": 64,
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.51.0",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
webui/model-configs/qwen3-8b.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "architectures": [
+ "Qwen3ForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 40960,
+ "max_window_layers": 36,
+ "model_type": "qwen3",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 36,
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.51.0",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
webui/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ fastapi
+ uvicorn[standard]
+ mbridge
webui/script.js ADDED
@@ -0,0 +1,715 @@
1
+ document.addEventListener('DOMContentLoaded', () => {
2
+ // Initial UI setup
3
+ loadLocalConfigs();
4
+ updateHistoryView();
5
+ setupEventListeners();
6
+ updateParallelismOptions();
7
+ validateParallelismLive();
8
+ toggleEpBasedOnConfig(); // Disable EP initially
9
+ });
10
+
11
+ // Utility: convert ANSI color codes (red 31, green 32) to HTML spans for display
12
+ function ansiToHtml(str) {
13
+ if (!str) return '';
14
+ // Replace known ANSI codes
15
+ return str
16
+ .replace(/\u001b\[31m/g, '<span class="ansi-red">')
17
+ .replace(/\u001b\[32m/g, '<span class="ansi-green">')
18
+ .replace(/\u001b\[33m/g, '<span class="ansi-yellow">')
19
+ .replace(/\u001b\[34m/g, '<span class="ansi-blue">')
20
+ .replace(/\u001b\[35m/g, '<span class="ansi-magenta">')
21
+ .replace(/\u001b\[36m/g, '<span class="ansi-cyan">')
22
+ .replace(/\u001b\[0m/g, '</span>');
23
+ }
24
+
25
+ function setupEventListeners() {
26
+ document.getElementById('config-form').addEventListener('submit', (e) => {
27
+ e.preventDefault();
28
+ submitForm();
29
+ });
30
+
31
+ document.getElementById('model-select').addEventListener('change', loadSelectedModelConfig);
32
+
33
+ document.getElementById('recompute-granularity').addEventListener('change', (e) => {
34
+ const recomputeOptions = document.querySelectorAll('.recompute-options');
35
+ recomputeOptions.forEach(opt => {
36
+ opt.style.display = e.target.value === 'full' ? 'block' : 'none';
37
+ });
38
+ });
39
+
40
+ const liveValidationInputs = ['num-gpus', 'tp', 'pp', 'ep', 'cp', 'etp', 'config-editor'];
41
+ liveValidationInputs.forEach(id => {
42
+ const input = document.getElementById(id);
43
+ if(input) {
44
+ input.addEventListener('change', validateParallelismLive);
45
+ if (id === 'num-gpus') {
46
+ input.addEventListener('change', updateParallelismOptions);
47
+ }
48
+ }
49
+ });
50
+
51
+ document.getElementById('config-editor').addEventListener('input', toggleEpBasedOnConfig);
52
+ document.getElementById('history-table').addEventListener('click', handleHistoryAction);
53
+ document.getElementById('clear-history').addEventListener('click', clearHistory);
54
+ }
55
+
56
+
57
+ async function loadLocalConfigs() {
58
+ const modelSelect = document.getElementById('model-select');
59
+ const defaultConfigName = 'Qwen/Qwen3-235B-A22B'; // Updated default model
60
+
61
+ try {
62
+ const response = await fetch('/local-hf-configs');
63
+ const configs = await response.json();
64
+
65
+ modelSelect.innerHTML = '<option value="">Select a model...</option>';
66
+ // Add custom option to allow user supplied configs
67
+ modelSelect.innerHTML += '<option value="__custom__">Custom (paste JSON below)...</option>';
68
+ configs.forEach(config => {
69
+ modelSelect.innerHTML += `<option value="${config}">${config}</option>`;
70
+ });
71
+
72
+ // Check if the default config exists and select it
73
+ if (configs.includes(defaultConfigName)) {
74
+ modelSelect.value = defaultConfigName;
75
+ // Await the loading of the model config to ensure it's ready
76
+ await loadSelectedModelConfig();
77
+ }
78
+
79
+ } catch (error) {
80
+ modelSelect.innerHTML = '<option value="">Error loading configs</option>';
81
+ console.error('Error loading local configs:', error);
82
+ }
83
+ }
84
+
85
+ async function loadSelectedModelConfig() {
86
+ const modelSelect = document.getElementById('model-select');
87
+ const editor = document.getElementById('config-editor');
88
+ const selectedConfig = modelSelect.value;
89
+ const messageDiv = document.getElementById('validation-message'); // move early for use in all branches
90
+ let configData = null; // declare for wider scope
91
+
92
+ if (!selectedConfig) {
93
+ editor.value = '';
94
+ toggleEpBasedOnConfig();
95
+ if (messageDiv) messageDiv.style.display = 'none';
96
+ return;
97
+ } else if (selectedConfig === '__custom__') {
98
+ // Custom config: do not fetch, user must paste JSON
99
+ editor.value = '';
100
+ toggleEpBasedOnConfig();
101
+ if (messageDiv) messageDiv.style.display = 'none';
102
+ return;
103
+ }
104
+
105
+ try {
106
+ const response = await fetch(`/get-megatron-config/${encodeURIComponent(selectedConfig)}`);
107
+ configData = await response.json();
108
+ if (configData.error) {
109
+ editor.value = `Error: ${configData.error}`;
110
+ } else {
111
+ editor.value = JSON.stringify(configData, null, 2);
112
+ }
113
+ } catch (error) {
114
+ editor.value = 'Failed to fetch model configuration.';
115
+ console.error('Error fetching model config:', error);
116
+ }
117
+
118
+ // Trigger validation and UI updates after loading new config
119
+ validateParallelismLive();
120
+ toggleEpBasedOnConfig();
121
+
122
+ // Show Kimi-K2-Instruct warning if needed
123
+ if (selectedConfig.includes('Kimi-K2-Instruct') && configData && configData.model_type !== 'deepseek_v3') {
124
+ messageDiv.textContent = 'Notice: For Kimi-K2-Instruct the config field "model_type" must be set to "deepseek_v3" before memory estimation.';
125
+ messageDiv.style.display = 'block';
126
+ } else if (messageDiv) {
127
+ messageDiv.style.display = 'none';
128
+ }
129
+ }
130
+
131
+
132
+ function getFormValues(isSubmission = false) {
133
+ const form = document.getElementById('config-form');
134
+ const formData = new FormData(form);
135
+ const modelSelect = document.getElementById('model-select');
136
+
137
+ const hfPath = modelSelect.value;
138
+ if (!hfPath) {
139
+ // We will now handle this case in the submitForm function instead of an alert.
140
+ return null;
141
+ }
142
+
143
+ const editor = document.getElementById('config-editor');
144
+ let customConfig = null;
145
+ try {
146
+ // Only parse if the editor has content
147
+ if (editor.value) {
148
+ customConfig = JSON.parse(editor.value);
149
+ }
150
+ } catch (e) {
151
+ // Only alert on final submission, not on live validation
152
+ if (isSubmission) {
153
+ // alert('Model Config is not valid JSON.'); // Removing alert
154
+ }
155
+ return null; // Return null if JSON is invalid
156
+ }
157
+
158
+ const vppInput = formData.get('vpp');
159
+ const etpInput = formData.get('etp');
160
+
161
+ return {
162
+ hf_model_path: hfPath,
163
+ custom_hf_config: customConfig, // Renamed for clarity
164
+ num_gpus: parseInt(formData.get('num_gpus')),
165
+ mbs: parseInt(formData.get('mbs')),
166
+ seq_len: parseInt(formData.get('seq-len')),
167
+ use_distributed_optimizer: document.getElementById('use-distributed-optimizer').checked,
168
+ recompute_granularity: formData.get('recompute_granularity'),
169
+ recompute_method: formData.get('recompute_method'),
170
+ recompute_num_layers: parseInt(formData.get('recompute_num_layers')),
171
+ tp: parseInt(formData.get('tp')),
172
+ pp: parseInt(formData.get('pp')),
173
+ ep: parseInt(formData.get('ep')) || 1, // Default to 1 if disabled/null
174
+ cp: parseInt(formData.get('cp')),
175
+ vpp: vppInput ? parseInt(vppInput) : null,
176
+ etp: etpInput ? parseInt(etpInput) : null,
177
+ num_layers_in_first_pipeline_stage: formData.get('num_layers_in_first_pipeline_stage') ? parseInt(formData.get('num_layers_in_first_pipeline_stage')) : null,
178
+ num_layers_in_last_pipeline_stage: formData.get('num_layers_in_last_pipeline_stage') ? parseInt(formData.get('num_layers_in_last_pipeline_stage')) : null,
179
+ overhead: parseInt(formData.get('overhead')),
180
+ };
181
+ }
182
+
183
+ async function submitForm() {
184
+ const messageDiv = document.getElementById('validation-message');
185
+ messageDiv.textContent = '';
186
+ messageDiv.style.display = 'none';
187
+
188
+ // Get all form values first. We use getFormValues(false) to avoid any legacy alerts
189
+ // and handle all validation directly within this function for clarity.
190
+ const formValues = getFormValues(false);
191
+
192
+ // === START SUBMISSION VALIDATION ===
193
+
194
+ // 1. Check if form values could be retrieved. This catches both missing model selection
195
+ // and invalid JSON, as getFormValues returns null in those cases.
196
+ if (!formValues) {
197
+ if (!document.getElementById('model-select').value) {
198
+ messageDiv.textContent = 'Validation Error: Please select a model config.';
199
+ } else {
200
+ messageDiv.textContent = 'Validation Error: Model Config is not valid JSON.';
201
+ }
202
+ messageDiv.style.display = 'block';
203
+ return;
204
+ }
205
+
206
+ // Custom config must have valid JSON
207
+ if (document.getElementById('model-select').value === '__custom__' && !formValues.custom_hf_config) {
208
+ messageDiv.textContent = 'Validation Error: Please paste a valid model configuration JSON for the custom model.';
209
+ messageDiv.style.display = 'block';
210
+ return;
211
+ }
212
+
213
+ // 2. Perform all numeric and parallelism validation.
214
+ const { num_gpus, tp, pp, ep, cp, etp, custom_hf_config } = formValues;
215
+ const num_kv_heads = custom_hf_config?.num_key_value_heads || null;
216
+
217
+ let errors = [];
218
+ if (tp * pp * cp > num_gpus) {
219
+ errors.push(`TP*PP*CP (${tp * pp * cp}) > GPUs (${num_gpus}).`);
220
+ }
221
+ if (etp){
222
+ if (etp * pp * cp * ep > num_gpus) {
223
+ errors.push(`ETP*PP*CP*EP (${etp * pp * cp * ep}) > GPUs (${num_gpus}).`);
224
+ }
225
+ } else {
226
+ if (tp * pp * cp * ep > num_gpus) {
227
+ errors.push(`TP*PP*CP*EP (${tp * pp * cp * ep}) > GPUs (${num_gpus}) when ETP is not set.`);
228
+ }
229
+ }
230
+ if (num_kv_heads && tp > num_kv_heads) {
231
+ errors.push(`TP (${tp}) > Num KV Heads (${num_kv_heads}).`);
232
+ }
233
+
234
+ if (errors.length > 0) {
235
+ messageDiv.textContent = 'Validation Error: ' + errors.join(' ');
236
+ messageDiv.style.display = 'block';
237
+ return;
238
+ }
239
+ // === END SUBMISSION VALIDATION ===
240
+
241
+ const loading = document.getElementById('loading');
242
+ const submitBtn = document.querySelector('#config-form button[type="submit"]');
243
+ loading.style.display = 'block';
244
+ if (submitBtn) submitBtn.disabled = true;
245
+
246
+ try {
247
+ const response = await fetch('/estimate_with_mbridge', {
248
+ method: 'POST',
249
+ headers: { 'Content-Type': 'application/json' },
250
+ body: JSON.stringify(formValues) // Send the now fully-validated formValues
251
+ });
252
+
253
+ console.log('Response Status:', response.status);
254
+
255
+ if (response.ok) {
256
+ const data = await response.json();
257
+
258
+ // FIX: Ensure history wrapper is visible before updating and showing details
259
+ document.getElementById('history-wrapper').style.display = 'block';
260
+
261
+ saveToHistory(formValues, data);
262
+ updateHistoryView();
263
+ const newEntryRow = document.querySelector('#history-table tbody tr:first-child');
264
+ if (newEntryRow) {
265
+ const detailBtn = newEntryRow.querySelector('.detail-btn');
266
+ if (detailBtn) {
267
+ // We need to pass the event object structure to handleHistoryAction
268
+ handleHistoryAction({ target: detailBtn });
269
+ }
270
+ }
271
+ } else {
272
+ const error = await response.text();
273
+ console.error('Server error response:', error);
274
+ // Since we removed the main results display, show error in the validation div
275
+ messageDiv.textContent = `Server Error: ${error}`;
276
+ messageDiv.style.display = 'block';
277
+ }
278
+ } catch (error) {
279
+ console.error('Fetch API Error:', error);
280
+ messageDiv.textContent = `Client Error: ${error.message}`;
281
+ messageDiv.style.display = 'block';
282
+ } finally {
283
+ loading.style.display = 'none';
284
+ if (submitBtn) submitBtn.disabled = false;
285
+ }
286
+ }
287
+
288
+ function renderTable(details, rawFullReport) {
289
+ if (!details || details.length === 0) {
290
+ return '<p>No detailed memory breakdown available.</p>';
291
+ }
292
+
293
+ const headers = Object.keys(details[0]);
294
+ headers.push('Breakdown');
295
+
296
+ let table = '<table><thead><tr>';
297
+ headers.forEach(h => table += `<th>${h}</th>`);
298
+ table += '</tr></thead><tbody>';
299
+
300
+ details.forEach(row => {
301
+ const ppRank = row.pp_rank;
302
+ // FIX: Look in the full raw report array passed in.
303
+ const rawDataForRank = rawFullReport ? rawFullReport.find(r => r.pp_rank === ppRank) : null;
304
+
305
+ // FIX: Change to `let` to allow modification for highlighting.
306
+ let modelBreakdown = (rawDataForRank && rawDataForRank.model_breakdown)
307
+ ? rawDataForRank.model_breakdown
308
+ : 'No breakdown available.';
309
+
310
+ // Add syntax-like highlighting for params and activations
311
+ // Basic HTML escaping for safety before inserting spans
312
+ modelBreakdown = modelBreakdown.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
313
+ modelBreakdown = modelBreakdown
314
+ .replace(/(n_params=[0-9.]+[a-zA-Z]*)/g, '<span class="highlight-red">$1</span>')
315
+ .replace(/(n_act=[0-9.]+[a-zA-Z]*)/g, '<span class="highlight-red">$1</span>');
316
+
317
+ // Main row with data
318
+ table += `<tr data-pp-rank="${ppRank}">`;
319
+ headers.forEach(h => {
320
+ if (h !== 'Breakdown') {
321
+ table += `<td>${row[h]}</td>`;
322
+ }
323
+ });
324
+ table += `<td><button class="action-btn raw-per-rank-btn" data-pp-rank="${ppRank}">Raw</button></td>`;
325
+ table += '</tr>';
326
+
327
+ // Hidden row for the breakdown
328
+ table += `<tr class="raw-breakdown-row" data-pp-rank="${ppRank}" style="display: none;">
329
+ <td colspan="${headers.length}">
330
+ <pre>${modelBreakdown}</pre>
331
+ </td>
332
+ </tr>`;
333
+ });
334
+
335
+ table += '</tbody></table>';
336
+ return table;
337
+ }
338
+
339
+ function saveToHistory(params, resultData) {
340
+ let history = JSON.parse(localStorage.getItem('estimationHistory')) || [];
341
+ const historyEntry = {
342
+ params: params,
343
+ result: resultData, // Store the full result object { processed_report, raw_report }
344
+ id: new Date().getTime()
345
+ };
346
+ history.unshift(historyEntry); // Add to the beginning
347
+ if (history.length > 20) { // Keep history size manageable
348
+ history.pop();
349
+ }
350
+ localStorage.setItem('estimationHistory', JSON.stringify(history));
351
+ }
352
+
353
+ function updateHistoryView() {
354
+ const history = JSON.parse(localStorage.getItem('estimationHistory')) || [];
355
+ const historyTableBody = document.querySelector('#history-table tbody');
356
+ const historyWrapper = document.getElementById('history-wrapper');
357
+ historyTableBody.innerHTML = '';
358
+
359
+ if (history.length === 0) {
360
+ historyWrapper.style.display = 'none';
361
+ return;
362
+ }
363
+
364
+ historyWrapper.style.display = 'block';
365
+
366
+ history.forEach(item => {
367
+ const row = document.createElement('tr');
368
+
369
+ const params = item.params;
370
+ const resultData = item.result || {};
371
+
372
+ // FIX: Handle both old and new data structures for compatibility.
373
+ const details = (resultData.report && resultData.report.details) ? resultData.report.details : (resultData.processed_report || []);
374
+ const pp0Result = details.find(r => r.pp_rank === 0) || details[0] || {};
375
+
376
+ const modelName = params.hf_model_path.split('/').pop();
377
+
378
+ // Build parallelism string, e.g., "TP2 PP2 VPP2"
379
+ const parallelismParts = [];
380
+ ['tp', 'pp', 'ep', 'cp', 'vpp', 'etp'].forEach(p => {
381
+ const value = params[p];
382
+ if (value && value > 1) {
383
+ parallelismParts.push(`${p.toUpperCase()}${value}`);
384
+ }
385
+ });
386
+ const parallelismInfo = parallelismParts.join(' ') || 'No Parallelism';
387
+
388
+ const overheadGb = params.overhead ? parseInt(params.overhead) : 0;
389
+ const baseTotal = details.length > 0 ? Math.max(...details.map(r => r.total_gb || 0)) : null;
390
+ const totalGb = baseTotal !== null ? (baseTotal + overheadGb).toFixed(2) : 'N/A';
391
+
392
+ const seqLen = params.seq_len || 0;
393
+ const formattedSeqLen = seqLen >= 1024 ? `${seqLen / 1024}k` : seqLen;
394
+ const sequenceInfo = `${params.mbs || 'N/A'}*${formattedSeqLen}`;
395
+
396
+ row.innerHTML = `
397
+ <td>
398
+ <div>${modelName}</div>
399
+ <div class="model-meta-info">
400
+ <span>GPUs: ${params.num_gpus || 'N/A'}</span>
401
+ <span>${parallelismInfo}</span>
402
+ <span>Sequence: ${sequenceInfo}</span>
403
+ </div>
404
+ </td>
405
+ <td>${pp0Result.weight_optimizer_gb || 'N/A'}</td>
406
+ <td>${pp0Result.activation_gb || 'N/A'}</td>
407
+ <td>${totalGb}</td>
408
+ <td>
409
+ <button class="restore-btn" data-id="${item.id}">Restore</button>
410
+ <button class="detail-btn" data-id="${item.id}">Detail</button>
411
+ <button class="delete-btn" data-id="${item.id}">Delete</button>
412
+ </td>
413
+ `;
414
+ historyTableBody.appendChild(row);
415
+ });
416
+ }
417
+
418
+ async function handleHistoryAction(e) {
419
+ const button = e.target.closest('button');
420
+ if (!button) return;
421
+
422
+ // Handle breakdown toggle first
423
+ if (button.classList.contains('breakdown-btn')) {
424
+ const ppRank = button.dataset.ppRank;
425
+ const detailTable = button.closest('table');
426
+ if (!detailTable) return;
427
+
428
+ const breakdownRow = detailTable.querySelector(`tr.breakdown-row[data-pp-rank="${ppRank}"]`);
429
+ if (!breakdownRow) return;
430
+
431
+ const isVisible = breakdownRow.style.display !== 'none';
432
+ breakdownRow.style.display = isVisible ? 'none' : 'table-row';
433
+ button.textContent = isVisible ? 'Breakdown' : 'Hide';
434
+ return; // Do not continue to other handlers
435
+ }
436
+
437
+ if (!button.matches('.detail-btn, .restore-btn, .delete-btn')) return;
438
+
439
+ const id = parseInt(button.dataset.id, 10);
440
+ const history = JSON.parse(localStorage.getItem('estimationHistory')) || [];
441
+ const entry = history.find(item => item.id === id);
442
+
443
+ if (!entry) {
444
+ console.error('History entry not found for id:', id);
445
+ return;
446
+ }
447
+
448
+ const row = button.closest('tr');
449
+
450
+ if (button.classList.contains('detail-btn')) {
451
+ const isDetailsVisible = row.nextElementSibling && row.nextElementSibling.classList.contains('detail-row');
452
+
453
+ document.querySelectorAll('.detail-row').forEach(detailRow => {
454
+ const prevRow = detailRow.previousElementSibling;
455
+ const detailBtn = prevRow.querySelector('.detail-btn');
456
+ if (detailRow !== row.nextElementSibling) {
457
+ detailRow.remove();
458
+ if (detailBtn) detailBtn.textContent = 'Detail';
459
+ }
460
+ });
461
+
462
+ if (isDetailsVisible) {
463
+ row.nextElementSibling.remove();
464
+ button.textContent = 'Detail';
465
+ } else {
466
+ const detailRow = document.createElement('tr');
467
+ detailRow.classList.add('detail-row');
468
+ const detailCell = detailRow.insertCell();
469
+ detailCell.colSpan = row.cells.length;
470
+
471
+ // FIX: Handle both old and new data structures for compatibility.
472
+ const report = entry.result.report;
473
+ const details = (report && report.details) ? report.details : (entry.result.processed_report || []);
474
+ const modelBreakdown = (report && report.model_breakdown) ? report.model_breakdown : null;
475
+
476
+ if (details && details.length > 0) {
477
+ const newTable = document.createElement('table');
478
+ // Determine if breakdown information exists per-row or globally
479
+ let headers = Object.keys(details[0]);
480
+
481
+ // If old-format data, there is a 'model_breakdown' key on each detail row
482
+ const hasRowBreakdown = headers.includes('model_breakdown');
483
+
484
+ // Remove the raw model_breakdown column from headers to keep table compact
485
+ if (hasRowBreakdown) {
486
+ headers = headers.filter(h => h !== 'model_breakdown');
487
+ }
488
+
489
+ // Include global breakdown if provided, or row breakdowns if present
490
+ const includeBreakdown = hasRowBreakdown || (modelBreakdown && typeof modelBreakdown === 'string');
491
+
492
+ if (includeBreakdown) {
493
+ headers.push('Breakdown');
494
+ }
495
+
496
+ const headerRow = newTable.insertRow();
497
+ headers.forEach(h => {
498
+ const th = document.createElement('th');
499
+ th.textContent = h;
500
+ headerRow.appendChild(th);
501
+ });
502
+
503
+ details.forEach(detail => {
504
+ const newRow = newTable.insertRow();
505
+ headers.forEach(header => {
506
+ if (header === 'Breakdown') {
507
+ const cell = newRow.insertCell();
508
+ cell.innerHTML = `<button class="breakdown-btn" data-pp-rank="${detail.pp_rank}">Breakdown</button>`;
509
+ } else {
510
+ const cell = newRow.insertCell();
511
+ let value = detail[header];
512
+ if (typeof value === 'number' && !Number.isInteger(value)) {
513
+ value = value.toFixed(4);
514
+ }
515
+ cell.textContent = value;
516
+ }
517
+ });
518
+
519
+ // Hidden breakdown row
520
+ if (includeBreakdown) {
521
+ const breakdownRow = newTable.insertRow();
522
+ breakdownRow.classList.add('breakdown-row');
523
+ breakdownRow.dataset.ppRank = detail.pp_rank;
524
+ breakdownRow.style.display = 'none';
525
+ const breakdownCell = breakdownRow.insertCell();
526
+ breakdownCell.colSpan = headers.length;
527
+ const rowSpecificBreakdown = hasRowBreakdown ? (detail.model_breakdown || '') : modelBreakdown;
528
+ const htmlBreakdown = ansiToHtml(rowSpecificBreakdown);
529
+ breakdownCell.innerHTML = `<pre class="model-breakdown-view">${htmlBreakdown || 'No breakdown available.'}</pre>`;
530
+ }
531
+ });
532
+
533
+ detailCell.appendChild(newTable);
534
+ } else {
535
+ detailCell.innerHTML = 'No detailed per-rank results available.';
536
+ }
537
+
538
+ row.after(detailRow);
539
+ button.textContent = 'Hide';
540
+ }
541
+ } else if (button.classList.contains('restore-btn')) {
542
+ restoreForm(entry.params);
543
+ } else if (button.classList.contains('delete-btn')) {
544
+ deleteHistoryEntry(id);
545
+ }
546
+ }
547
+
548
+ function deleteHistoryEntry(id) {
549
+ let history = JSON.parse(localStorage.getItem('estimationHistory')) || [];
550
+ const updatedHistory = history.filter(item => item.id != id);
551
+ localStorage.setItem('estimationHistory', JSON.stringify(updatedHistory));
552
+ updateHistoryView();
553
+
554
+ // If history is now empty, hide the whole output container
555
+ if (updatedHistory.length === 0) {
556
+ // document.getElementById('output-container').style.display = 'none';
557
+ }
558
+ }
559
+
560
+ function clearHistory() {
561
+ localStorage.removeItem('estimationHistory');
562
+ updateHistoryView();
563
+ // document.getElementById('output-container').style.display = 'none';
564
+ }
565
+
566
+
567
+ function restoreForm(params) {
568
+ if (!params) return;
569
+
570
+ const setElementValue = (id, value, defaultValue = '') => {
571
+ const element = document.getElementById(id);
572
+ if (element) {
573
+ if (element.type === 'checkbox') {
574
+ element.checked = value ?? defaultValue;
575
+ } else {
576
+ element.value = value ?? defaultValue;
577
+ }
578
+ }
579
+ };
580
+
581
+ setElementValue('num-gpus', params.num_gpus, 8);
582
+ setElementValue('mbs', params.mbs, 1);
583
+ setElementValue('seq-len', params.seq_len, 4096);
584
+ setElementValue('use-distributed-optimizer', params.use_distributed_optimizer, true);
585
+ setElementValue('recompute_granularity', params.recompute_granularity, 'selective');
586
+ setElementValue('recompute_method', params.recompute_method, 'uniform');
587
+ setElementValue('recompute_num_layers', params.recompute_num_layers, 1);
588
+ setElementValue('tp', params.tp, 1);
589
+ setElementValue('pp', params.pp, 1);
590
+ setElementValue('ep', params.ep, 1);
591
+ setElementValue('cp', params.cp, 1);
592
+ setElementValue('vpp', params.vpp);
593
+ setElementValue('etp', params.etp);
594
+ setElementValue('num_layers_in_first_pipeline_stage', params.num_layers_in_first_pipeline_stage);
595
+ setElementValue('num_layers_in_last_pipeline_stage', params.num_layers_in_last_pipeline_stage);
596
+ setElementValue('overhead', params.overhead, 10);
597
+
598
+ const modelSelect = document.getElementById('model-select');
599
+ if (modelSelect && params.hf_model_path) {
600
+ modelSelect.value = params.hf_model_path;
601
+ }
602
+
603
+ // Manually trigger change event for UI updates
604
+ const recomputeSelect = document.getElementById('recompute_granularity');
605
+ if (recomputeSelect) {
606
+ recomputeSelect.dispatchEvent(new Event('change'));
607
+ }
608
+ }
609
+
610
+ function updateParallelismOptions() {
611
+ const numGpusInput = document.getElementById('num-gpus');
612
+ if (!numGpusInput) return;
613
+
614
+ const numGpus = parseInt(numGpusInput.value);
615
+ if (isNaN(numGpus) || numGpus <= 0) {
616
+ return; // Don't update if GPU count is invalid
617
+ }
618
+
619
+ const tpSelect = document.getElementById('tp');
620
+ const epSelect = document.getElementById('ep');
621
+ const cpSelect = document.getElementById('cp');
622
+
623
+ // PP is now a manual input, so we only handle TP, EP, CP here.
624
+ const selects = [tpSelect, epSelect, cpSelect];
625
+
626
+ const powersOfTwo = [1];
627
+ for (let i = 1; (1 << i) <= numGpus; i++) {
628
+ powersOfTwo.push(1 << i);
629
+ }
630
+
631
+ selects.forEach(select => {
632
+ if (!select) return;
633
+ const currentVal = select.value;
634
+ select.innerHTML = ''; // Clear existing options
635
+
636
+ powersOfTwo.forEach(val => {
637
+ const option = document.createElement('option');
638
+ option.value = val;
639
+ option.textContent = val;
640
+ select.appendChild(option);
641
+ });
642
+
643
+ // Try to restore the previous value, otherwise default to 1
644
+ if (powersOfTwo.includes(parseInt(currentVal))) {
645
+ select.value = currentVal;
646
+ } else {
647
+ select.value = 1;
648
+ }
649
+ });
650
+ }
651
+
652
+ function validateParallelismLive() {
653
+ const messageDiv = document.getElementById('validation-message');
654
+ // Pass isSubmission = false to getFormValues to prevent alerts during live validation
655
+ const formValues = getFormValues(false);
656
+
657
+ if (!formValues) {
658
+ messageDiv.textContent = '';
659
+ return true;
660
+ }
661
+
662
+ const { num_gpus, tp, pp, ep, cp, etp, custom_hf_config } = formValues;
663
+ // The key is the same in the HF config, so this logic remains valid.
664
+ const num_kv_heads = custom_hf_config?.num_key_value_heads || null;
665
+
666
+ let errors = [];
667
+ if (tp * pp * cp > num_gpus) {
668
+ errors.push(`TP*PP*CP (${tp*pp*cp}) > GPUs (${num_gpus}).`);
669
+ }
670
+ if (etp) {
671
+ if (etp * pp * cp * ep > num_gpus) {
672
+ errors.push(`ETP*PP*CP*EP (${etp*pp*cp*ep}) > GPUs (${num_gpus}).`);
673
+ }
674
+ } else {
675
+ if (tp * pp * cp * ep > num_gpus) {
676
+ errors.push(`TP*PP*CP*EP (${tp*pp*cp*ep}) > GPUs (${num_gpus}) when ETP is not set.`);
677
+ }
678
+ }
679
+ if (num_kv_heads && tp > num_kv_heads) {
680
+ errors.push(`TP (${tp}) > Num KV Heads (${num_kv_heads}).`);
681
+ }
682
+
683
+ if (errors.length > 0) {
684
+ messageDiv.textContent = 'Validation Error: ' + errors.join(' ');
685
+ messageDiv.style.display = 'block';
686
+ } else {
687
+ messageDiv.textContent = '';
688
+ messageDiv.style.display = 'none';
689
+ }
690
+ return errors.length === 0;
691
+ }
692
+
693
+ function toggleEpBasedOnConfig() {
694
+ const editor = document.getElementById('config-editor');
695
+ const epSelect = document.getElementById('ep');
696
+ if (!editor || !epSelect) return;
697
+
698
+ let config = null;
699
+ try {
700
+ if (editor.value) {
701
+ config = JSON.parse(editor.value);
702
+ }
703
+ } catch (e) {
704
+ // Invalid JSON, disable EP as a safety measure
705
+ epSelect.disabled = true;
706
+ return;
707
+ }
708
+
709
+ if (config && config.num_experts && config.num_experts > 0) {
710
+ epSelect.disabled = false;
711
+ } else {
712
+ epSelect.disabled = true;
713
+ epSelect.value = 1; // Reset to 1 if disabled
714
+ }
715
+ }
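The submit-time checks above reduce to a few products that must not exceed the GPU count. The Python sketch below mirrors that arithmetic (as in validateParallelismLive/submitForm) for use when scripting the endpoint without the UI; it is illustrative only and does not define any new rules.

# Mirror of the front-end parallelism checks (illustrative sketch).
def validate_parallelism(num_gpus, tp, pp, cp, ep=1, etp=None, num_kv_heads=None):
    errors = []
    if tp * pp * cp > num_gpus:
        errors.append(f"TP*PP*CP ({tp * pp * cp}) > GPUs ({num_gpus})")
    base_tp = etp if etp else tp                   # ETP replaces TP in the expert product
    if base_tp * pp * cp * ep > num_gpus:
        errors.append(f"{'ETP' if etp else 'TP'}*PP*CP*EP ({base_tp * pp * cp * ep}) > GPUs ({num_gpus})")
    if num_kv_heads and tp > num_kv_heads:
        errors.append(f"TP ({tp}) > num_key_value_heads ({num_kv_heads})")
    return errors

# 8 GPUs with TP=2, PP=2, CP=1, EP=2 passes; with the usual Megatron convention
# DP = GPUs / (TP*PP*CP), this leaves a data-parallel size of 2.
print(validate_parallelism(8, tp=2, pp=2, cp=1, ep=2, num_kv_heads=4))   # -> []
print(validate_parallelism(8, tp=4, pp=4, cp=1))                         # -> two "> GPUs (8)" errors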
webui/style.css ADDED
@@ -0,0 +1,383 @@
1
+ body {
2
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
3
+ line-height: 1.6;
4
+ background-color: #f4f4f4;
5
+ color: #333;
6
+ margin: 0;
7
+ padding: 1em;
8
+ }
9
+
10
+ .container {
11
+ max-width: 1600px;
12
+ margin: auto;
13
+ background: #fff;
14
+ padding: 2em;
15
+ border-radius: 8px;
16
+ box-shadow: 0 0 20px rgba(0, 0, 0, 0.05);
17
+ }
18
+
19
+ .main-layout {
20
+ display: flex;
21
+ flex-direction: column; /* Main axis is vertical */
22
+ gap: 2em;
23
+ }
24
+
25
+ .top-section {
26
+ display: flex;
27
+ flex-direction: row; /* Children are horizontal */
28
+ gap: 2em;
29
+ }
30
+
31
+ .config-column, .output-column {
32
+ flex: 1; /* Each column takes up half the space */
33
+ display: flex;
34
+ flex-direction: column;
35
+ }
36
+
37
+ /* The editor wrapper should grow to fill the space */
38
+ .config-editor-wrapper {
39
+ flex-grow: 1;
40
+ display: flex;
41
+ flex-direction: column;
42
+ }
43
+
44
+ #config-editor {
45
+ flex-grow: 1; /* The textarea itself should grow */
46
+ width: 100%;
47
+ box-sizing: border-box; /* Include padding and border in the element's total width and height */
48
+ resize: vertical; /* Allow vertical resizing */
49
+ }
50
+
51
+
52
+ .bottom-section {
53
+ width: 100%;
54
+ }
55
+
56
+ .form-row {
57
+ display: flex;
58
+ gap: 1em;
59
+ align-items: flex-end;
60
+ }
61
+
62
+ .form-row .form-group {
63
+ flex: 1; /* Allow groups to grow and fill space */
64
+ margin-bottom: 0.8em;
65
+ }
66
+
67
+ .form-group {
68
+ margin-bottom: 0.8em; /* Reduced from default */
69
+ }
70
+
71
+ .form-group label {
72
+ display: block;
73
+ margin-bottom: 0.25em; /* Reduced */
74
+ font-weight: 500;
75
+ }
76
+
77
+ .form-group label.inline-label {
78
+ display: inline-block;
79
+ margin-left: 0.5em;
80
+ font-weight: normal;
81
+ }
82
+
83
+ .form-group input[type="number"],
84
+ .form-group select {
85
+ width: 100%;
86
+ padding: 6px 10px; /* Reduced padding */
87
+ border-radius: 4px;
88
+ border: 1px solid #ccc;
89
+ box-sizing: border-box;
90
+ }
91
+
92
+ button {
93
+ background-color: #3498db;
94
+ color: white;
95
+ padding: 10px 15px;
96
+ border: none;
97
+ border-radius: 4px;
98
+ cursor: pointer;
99
+ font-size: 16px;
100
+ margin-top: 10px;
101
+ }
102
+
103
+ button:hover {
104
+ background-color: #2980b9;
105
+ }
106
+
107
+ #results {
108
+ background-color: #ecf0f1;
109
+ padding: 15px;
110
+ border-radius: 4px;
111
+ white-space: pre-wrap;
112
+ word-wrap: break-word;
113
+ min-height: 100px;
114
+ }
115
+
116
+ .results-container {
117
+ margin-top: 20px;
118
+ }
119
+
120
+ /* New styles for results table */
121
+ table {
122
+ width: 100%;
123
+ border-collapse: collapse;
124
+ margin-top: 20px;
125
+ }
126
+
127
+ th, td {
128
+ border: 1px solid #ddd;
129
+ padding: 12px;
130
+ text-align: left;
131
+ }
132
+
133
+ th {
134
+ background-color: #f2f2f2;
135
+ font-weight: bold;
136
+ }
137
+
138
+ tbody tr:nth-child(even) {
139
+ background-color: #f9f9f9;
140
+ }
141
+
142
+ tbody tr:hover {
143
+ background-color: #f1f1f1;
144
+ }
145
+
146
+ .error {
147
+ color: #e74c3c;
148
+ font-weight: bold;
149
+ }
150
+
151
+ .button-container {
152
+ grid-column: 1 / -1; /* Span across all columns */
153
+ text-align: center;
154
+ margin-top: 20px;
155
+ }
156
+
157
+ /* History Section */
158
+ .history-container {
159
+ margin-top: 40px;
160
+ border-top: 1px solid #e0e0e0;
161
+ padding-top: 20px;
162
+ }
163
+
164
+ .history-container h2 {
165
+ display: flex;
166
+ justify-content: space-between;
167
+ align-items: center;
168
+ }
169
+
170
+ #history-list table {
171
+ margin-top: 10px;
172
+ }
173
+
174
+ .small-button {
175
+ padding: 4px 8px;
176
+ font-size: 0.8em;
177
+ background-color: #e74c3c;
178
+ }
179
+
180
+ .small-button:hover {
181
+ background-color: #c0392b;
182
+ }
183
+
184
+ .history-item-actions {
185
+ display: flex;
186
+ gap: 10px;
187
+ }
188
+
189
+ #output-container {
190
+ margin-top: 2em;
191
+ padding: 1.5em;
192
+ background-color: #f9f9f9;
193
+ border: 1px solid #ddd;
194
+ border-radius: 8px;
195
+ }
196
+
197
+ #results-wrapper h3, #history-wrapper h3 {
198
+ margin-top: 0;
199
+ border-bottom: 2px solid #eee;
200
+ padding-bottom: 0.5em;
201
+ margin-bottom: 1em;
202
+ }
203
+
204
+ #results-display table {
205
+ width: 100%;
206
+ border-collapse: collapse;
207
+ }
208
+
209
+ #results-display th, #results-display td {
210
+ padding: 8px 12px;
211
+ border: 1px solid #ddd;
212
+ text-align: left;
213
+ }
214
+
215
+ #results-display th {
216
+ background-color: #f2f2f2;
217
+ }
218
+
219
+ #history-table {
220
+ width: 100%;
221
+ border-collapse: collapse;
222
+ }
223
+
224
+ #history-table th, #history-table td {
225
+ padding: 8px 12px;
226
+ border: 1px solid #ddd;
227
+ text-align: left;
228
+ }
229
+
230
+ #history-table th {
231
+ background-color: #f2f2f2;
232
+ }
233
+
234
+ #history-table td:last-child {
235
+ text-align: right;
236
+ }
237
+
238
+ #raw-json-output {
239
+ background-color: #2d2d2d;
240
+ color: #f1f1f1;
241
+ padding: 1em;
242
+ border-radius: 5px;
243
+ max-height: 500px;
244
+ overflow-y: auto;
245
+ }
246
+
247
+ #clear-history {
248
+ background-color: #dc3545;
249
+ }
250
+
251
+ #clear-history:hover {
252
+ background-color: #c82333;
253
+ }
254
+
255
+ .error-message {
256
+ color: #dc3545;
257
+ background-color: #f8d7da;
258
+ border: 1px solid #f5c6cb;
259
+ padding: 0.75rem 1.25rem;
260
+ margin-top: 1rem;
261
+ margin-bottom: 1rem;
262
+ border-radius: 0.25rem;
263
+ text-align: center;
264
+ }
265
+
266
+ /* Responsive Design for smaller screens */
267
+ @media (max-width: 992px) {
268
+ .top-section {
269
+ flex-direction: column;
270
+ }
271
+ }
272
+
273
+ .history-detail-row td {
274
+ background-color: #333;
275
+ padding: 15px;
276
+ border-top: 2px solid #555;
277
+ text-align: left; /* Align content to the left */
278
+ }
279
+
280
+ .history-detail-row pre {
281
+ background-color: #1e1e1e;
282
+ color: #d4d4d4;
283
+ padding: 10px;
284
+ border-radius: 4px;
285
+ white-space: pre-wrap;
286
+ word-break: break-all;
287
+ }
288
+
289
+ .history-detail-row table {
290
+ width: 100%;
291
+ border-collapse: collapse;
292
+ margin: 0;
293
+ }
294
+
295
+ .history-detail-row table th {
296
+ background-color: #e0e0e0;
297
+ color: #333;
298
+ padding: 8px 12px;
299
+ border: 1px solid #555;
300
+ }
301
+
302
+ .history-detail-row table td {
303
+ color: #d4d4d4;
304
+ padding: 8px 12px;
305
+ border: 1px solid #555;
306
+ background-color: #2a2a2a;
307
+ }
308
+
309
+ .model-breakdown-view {
310
+ max-height: 400px; /* Or any other suitable height */
311
+ overflow-y: auto;
312
+ overflow-x: auto;
313
+ background-color: #2d2d2d;
314
+ color: #f1f1f1;
315
+ padding: 1em;
316
+ border-radius: 5px;
317
+ white-space: pre-wrap; /* Ensures the pre content wraps */
318
+ margin: 0;
319
+ font-family: monospace;
320
+ font-size: 0.85em;
321
+ }
322
+
323
+ .model-meta-info {
324
+ font-size: 0.9em;
325
+ color: #666;
326
+ margin-top: 4px;
327
+ }
328
+
329
+ .model-meta-info span {
330
+ margin-right: 15px;
331
+ }
332
+
333
+ .action-btn.raw-btn {
334
+ background-color: #555;
335
+ color: white;
336
+ }
337
+
338
+ .highlight-red {
339
+ color: #ff6b6b;
340
+ }
341
+
342
+ .ansi-red { color: #e74c3c; }
343
+ .ansi-green { color: #2ecc71; }
344
+ .ansi-yellow { color: #f1c40f; }
345
+ .ansi-blue { color: #3498db; }
346
+ .ansi-magenta { color: #9b59b6; }
347
+ .ansi-cyan { color: #1abc9c; }
348
+
349
+ .breakdown-row td {
350
+ text-align: left !important;
351
+ }
352
+
353
+ .footer {
354
+ margin-top: 2em;
355
+ font-size: 0.85em;
356
+ color: #555;
357
+ text-align: center;
358
+ }
359
+
360
+ .footer a {
361
+ color: #2a77d4;
362
+ text-decoration: none;
363
+ }
364
+
365
+ .footer a:hover {
366
+ text-decoration: underline;
367
+ }
368
+
369
+ .disclaimer {
370
+ margin-top: 0.5em;
371
+ font-style: italic;
372
+ }
373
+
374
+ .disclaimer-banner {
375
+ background-color: #fff3cd;
376
+ color: #856404;
377
+ border: 1px solid #ffeeba;
378
+ padding: 10px 15px;
379
+ border-radius: 4px;
380
+ margin: 15px 0;
381
+ font-weight: bold;
382
+ text-align: center;
383
+ }