# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
"""Pretrain GPT."""
import warnings

# Suppress all warnings (including DeprecationWarning and FutureWarning) so the
# estimator output stays readable.
warnings.filterwarnings("ignore")
from megatron.training import get_args
from megatron.training.arguments import core_transformer_config_from_args
from megatron.training.yaml_arguments import core_transformer_config_from_yaml
from megatron.training.initialize import initialize_megatron
from megatron.core.transformer.spec_utils import import_module
from megatron.core.models.gpt.gpt_layer_specs import (
    get_gpt_layer_local_spec,
    get_gpt_layer_with_transformer_engine_spec,
)

from moe_mem_estimator.gpt_model import GPTModel
from moe_mem_estimator.base import (
    is_pipeline_first_stage,
    is_pipeline_last_stage,
    set_global_config,
    set_pipeline_model_parallel_rank,
)
from moe_mem_estimator.layers import MLASelfAttention, MoELayer
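
# Illustrative invocation (the script is driven by the standard Megatron-LM
# argument parser through initialize_megatron; the script name and flag values
# below are placeholders, not a prescribed command line):
#
#   python estimate.py \
#       --num-layers 32 --hidden-size 4096 --num-attention-heads 32 \
#       --seq-length 4096 --max-position-embeddings 4096 --micro-batch-size 1 \
#       --tensor-model-parallel-size 2 --pipeline-model-parallel-size 4 \
#       --num-experts 64 --use-distributed-optimizer
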
def _calculate_rank_memory(config, args, input_shape, pp_rank=0, pp_size=1):
"""
Calculates the memory for a single pipeline parallel rank, containing the detailed logic.
"""
# Build the model for the current rank
set_global_config(config)
pre_process = (pp_rank == 0)
post_process = (pp_rank == pp_size - 1)
use_te = True
if hasattr(config, 'spec') and config.spec is not None:
transformer_layer_spec = import_module(config.spec)
else:
if use_te:
transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
config.num_moe_experts, config.moe_grouped_gemm, config.qk_layernorm,
config.multi_latent_attention, config.fp8
)
else:
transformer_layer_spec = get_gpt_layer_local_spec(
config.num_moe_experts, config.moe_grouped_gemm, config.qk_layernorm,
config.multi_latent_attention
)
model = GPTModel(
config=config,
transformer_layer_spec=transformer_layer_spec,
vocab_size=args.padded_vocab_size,
max_sequence_length=args.max_position_embeddings,
pre_process=pre_process,
post_process=post_process,
fp16_lm_cross_entropy=getattr(config, 'fp16_lm_cross_entropy', False),
parallel_output=True,
        # Fall back to Megatron's untie flag if a tie_word_embeddings arg is absent.
        share_embeddings_and_output_weights=getattr(
            args, "tie_word_embeddings", not getattr(args, "untie_embeddings_and_output_weights", False)
        ),
position_embedding_type="rope",
rotary_percent=getattr(args, 'rotary_percent', 1.0),
rotary_base=getattr(args, 'rotary_base', 10000),
rope_scaling=getattr(config, 'use_rope_scaling', False),
)
# --- Start of detailed memory calculation logic ---
num_parameter_this_shard = model.num_parameter()
num_activation = model.num_activation(input_shape)
output_shape = model.mock_forward(input_shape)
    # Shared experts are replicated like dense weights, so exclude them here.
    num_parameter_this_shard_sparse = sum(
        layer.mlp.num_parameter()
        - (layer.mlp.shared_experts.num_parameter()
           if getattr(layer.mlp, "shared_experts", None) is not None else 0)
        for layer in model.decoder.layers.modules
        if isinstance(layer.mlp, MoELayer)
    )
num_activation_this_shard_mlp = sum(
m.mlp.num_activation() for m in model.decoder.layers.modules
)
num_microbatch_this_pp_rank = pp_size - pp_rank
    # Interleaved (virtual) pipeline schedule changes the in-flight microbatch count.
    num_layers_per_vpp_stage = getattr(
        config, "num_layers_per_virtual_pipeline_stage",
        getattr(args, "num_layers_per_virtual_pipeline_stage", None),
    )
    if num_layers_per_vpp_stage is not None:
        layers_this_pprank = len(model.decoder.layers.modules)
        vpp_size = layers_this_pprank // num_layers_per_vpp_stage
        if vpp_size > 0:
            num_microbatch_this_pp_rank = (pp_size * (vpp_size - 1) + (pp_size - pp_rank) * 2 - 1) / vpp_size
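            # Illustrative numbers: pp_size=4, pp_rank=0, vpp_size=2 gives
            # (4 * (2 - 1) + (4 - 0) * 2 - 1) / 2 = 5.5 in-flight microbatches,
            # versus pp_size - pp_rank = 4 without interleaving.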
    # Pipeline parallelism keeps multiple microbatches in flight, so the
    # per-microbatch activations (everything except the post-processing / loss
    # part) are scaled by the number of in-flight microbatches. Recomputation,
    # handled below, reduces what actually has to be stored.
    num_activation = (num_activation - model.num_act_post) * num_microbatch_this_pp_rank + model.num_act_post
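    # Illustrative numbers (not from a real run): 10e9 activation elements per
    # microbatch, of which 1e9 are post-processing, with 4 in-flight microbatches
    # gives (10e9 - 1e9) * 4 + 1e9 = 37e9 elements.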
    if config.recompute_granularity == "full":
        # Full recomputation: only layer inputs plus the pre/post-processing
        # activations are kept; recomputed layers are re-run during the backward
        # pass instead of storing their internal activations.
        recompute_num_layers = config.recompute_num_layers
        num_layers = model.num_layers
        common_act = (
            model.num_act_pre
            + model.num_act_between_layers * num_layers * num_microbatch_this_pp_rank
        )
if config.recompute_method == "block":
num_layers_with_loss = num_layers - recompute_num_layers
if num_layers_with_loss == 0:
peak1 = common_act + model.num_act_post
peak2 = common_act + model.num_act_per_layer
recomputed_activation = max(peak1, peak2)
else:
recomputed_activation = (
common_act
+ model.num_act_post
+ model.num_act_per_layer
* num_layers_with_loss
* num_microbatch_this_pp_rank
)
elif config.recompute_method == "uniform":
peak1 = common_act + model.num_act_post
peak2 = (
common_act
+ model.num_act_per_layer
* recompute_num_layers
* num_microbatch_this_pp_rank
)
recomputed_activation = max(peak1, peak2)
if isinstance(model.decoder.layers.modules[0].self_attention, MLASelfAttention):
recomputed_activation += model.decoder.layers.modules[0].self_attention.core_attention.num_activation()
num_activation = recomputed_activation
    elif config.recompute_granularity == "selective":
        # Selective recomputation (the Transformer Engine default) is already
        # reflected in GPTModel.num_activation(), so only the in-flight
        # microbatch scaling applied above is needed.
        pass
    # Context parallelism splits the sequence dimension, so activations outside
    # the MoE MLP are divided across CP ranks.
    if config.context_parallel_size > 1:
        num_activation = (
            num_activation - num_activation_this_shard_mlp
        ) / config.context_parallel_size + num_activation_this_shard_mlp
    # Bytes per parameter for weights, gradients, and Adam optimizer states
    # (fp16/bf16 params with fp32 main grads and fp32 optimizer states).
    if args.use_distributed_optimizer:
        base_optim_bytes = 6    # fp16/bf16 param (2) + fp32 main grad (4), kept on every rank
        world_optim_bytes = 12  # fp32 master weight + momentum + variance, sharded over DP*CP
    else:
        base_optim_bytes = 18   # all of the above kept on every rank
        world_optim_bytes = 0
    num_bytes_per_parameter = base_optim_bytes + (
        world_optim_bytes / (args.data_parallel_size * config.context_parallel_size)
    )
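    # Illustrative numbers: with data_parallel_size=8 and context_parallel_size=1,
    # the distributed optimizer gives 6 + 12 / 8 = 7.5 bytes per dense parameter
    # instead of 18 bytes without it.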
# Handle MoE optimizer state sharding if applicable
if num_parameter_this_shard_sparse > 0 and config.expert_model_parallel_size > 1:
        moe_dp_size = (
            args.data_parallel_size * config.tensor_model_parallel_size
            // (config.expert_model_parallel_size * args.expert_tensor_parallel_size)
        )
        num_bytes_per_parameter_moe = base_optim_bytes + (world_optim_bytes / moe_dp_size)
weight_and_optimizer_memory = (
(num_parameter_this_shard - num_parameter_this_shard_sparse) * num_bytes_per_parameter +
num_parameter_this_shard_sparse * num_bytes_per_parameter_moe
) / NUM_BYTES_IN_GIGABYTE
else:
weight_and_optimizer_memory = (num_parameter_this_shard * num_bytes_per_parameter) / NUM_BYTES_IN_GIGABYTE
    activation_memory = num_activation * 2 / NUM_BYTES_IN_GIGABYTE  # 2 bytes per element (fp16/bf16)
total_memory = weight_and_optimizer_memory + activation_memory
report = {
"pp_rank": pp_rank,
"parameters_b": num_parameter_this_shard / 1e9,
"activation_b": num_activation / 1e9, # Renamed from _gb to _b
"weight_optimizer_gb": round(weight_and_optimizer_memory, 2),
"activation_gb": round(activation_memory, 2),
"total_gb": round(total_memory, 2),
"details": model.dump(),
"model_breakdown": str(model)
}
print(model)
return report, output_shape
def estimate_from_config(config, args):
    """
    Estimate memory usage from an explicit config and args rather than global
    state, iterating over pipeline-parallel ranks and feeding each stage's
    output shape into the next stage.
    """
reports = []
input_shape = [args.micro_batch_size, args.seq_length]
pp_size = config.pipeline_model_parallel_size
if pp_size > 1:
for pp_rank in range(pp_size):
set_pipeline_model_parallel_rank(pp_rank)
report_for_rank, new_input_shape = _calculate_rank_memory(config, args, input_shape, pp_rank, pp_size)
reports.append(report_for_rank)
input_shape = new_input_shape # Pass output shape to the next stage
else:
report_for_rank, _ = _calculate_rank_memory(config, args, input_shape, 0, 1)
reports.append(report_for_rank)
return reports
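

# A minimal usage sketch (hypothetical helper, not part of the estimator's
# existing API): reduce the per-rank reports returned by estimate_from_config
# to a single peak-memory figure.
def summarize_reports(reports):
    """Print per-rank totals and return the peak total memory in GB."""
    peak_gb = 0.0
    for report in reports:
        print(
            f"pp_rank={report['pp_rank']}: "
            f"weight+optimizer={report['weight_optimizer_gb']} GB, "
            f"activation={report['activation_gb']} GB, "
            f"total={report['total_gb']} GB"
        )
        peak_gb = max(peak_gb, report["total_gb"])
    return peak_gb
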
def model_provider() -> GPTModel:
    """Build a mock GPTModel for the current pipeline stage from the global args."""
    args = get_args()
use_te = args.transformer_impl == "transformer_engine"
    # Experimental: load arguments from a YAML config.
if args.yaml_cfg is not None:
config = core_transformer_config_from_yaml(args, "language_model")
else:
config = core_transformer_config_from_args(args)
assert not args.use_legacy_models
if args.spec is not None:
transformer_layer_spec = import_module(args.spec)
else:
if use_te:
transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
args.num_experts,
args.moe_grouped_gemm,
args.qk_layernorm,
args.multi_latent_attention,
args.fp8,
)
else:
transformer_layer_spec = get_gpt_layer_local_spec(
args.num_experts,
args.moe_grouped_gemm,
args.qk_layernorm,
args.multi_latent_attention,
)
set_global_config(config)
pre_process = is_pipeline_first_stage()
post_process = is_pipeline_last_stage()
# TODO fp8
model = GPTModel(
config=config,
transformer_layer_spec=transformer_layer_spec,
vocab_size=args.padded_vocab_size,
max_sequence_length=args.max_position_embeddings,
pre_process=pre_process,
post_process=post_process,
fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
parallel_output=True,
share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
position_embedding_type=args.position_embedding_type,
rotary_percent=args.rotary_percent,
rotary_base=args.rotary_base,
rope_scaling=args.use_rope_scaling,
)
return model
NUM_BYTES_IN_MEGABYTE = 1024 * 1024
NUM_BYTES_IN_GIGABYTE = 1024 * 1024 * 1024
def report_memory_usage():
    """Print a theoretical memory report for every pipeline-parallel rank."""
    args = get_args()
if args.yaml_cfg is not None:
config = core_transformer_config_from_yaml(args, "language_model")
else:
config = core_transformer_config_from_args(args)
input_shape = [args.micro_batch_size, args.seq_length]
if config.pipeline_model_parallel_size > 1:
for pp_rank in range(config.pipeline_model_parallel_size):
set_pipeline_model_parallel_rank(pp_rank)
print(f"\n----------[Pipeline_Parallelism_Rank={pp_rank}]----------")
input_shape = report_memory_usage_one_pp_rank(
input_shape, pp_rank, config.pipeline_model_parallel_size
)
else:
report_memory_usage_one_pp_rank(input_shape)
def report_memory_usage_one_pp_rank(
input_shape: list[int], pp_rank=0, pp_size=1
) -> list[int]:
args = get_args()
print(f"{input_shape=}")
model: GPTModel = model_provider()
num_parameter_this_shard = model.num_parameter()
num_activation = model.num_activation(input_shape)
output_shape = model.mock_forward(input_shape)
    num_parameter_this_shard_sparse = 0
    for layer in model.decoder.layers.modules:
        if isinstance(layer.mlp, MoELayer):
            num_parameter_this_shard_sparse += layer.mlp.num_parameter()
            # Shared experts are replicated like dense weights, so exclude them.
            if getattr(layer.mlp, "shared_experts", None) is not None:
                num_parameter_this_shard_sparse -= (
                    layer.mlp.shared_experts.num_parameter()
                )
    num_activation_this_shard_mlp = sum(
        m.mlp.num_activation() for m in model.decoder.layers.modules
    )
    num_microbatch_this_pp_rank = pp_size - pp_rank
    # Interleaved (virtual) pipeline schedule changes the in-flight microbatch count.
    if args.num_layers_per_virtual_pipeline_stage is not None:
        layers_this_pprank = len(model.decoder.layers.modules)
        vpp_size = layers_this_pprank // args.num_layers_per_virtual_pipeline_stage
        num_microbatch_this_pp_rank = (
            pp_size * (vpp_size - 1) + (pp_size - pp_rank) * 2 - 1
        ) / vpp_size
    print(model)
    print(
        f"Number of parameters on each GPU in billions: "
        f"{num_parameter_this_shard / 10**9: .2f}, of which the MoE MLP part is"
        f"{num_parameter_this_shard_sparse / 10**9: .2f}"
    )
# recompute
if args.recompute_granularity == "full":
recompute_num_layers = args.recompute_num_layers
num_layers = model.num_layers
common_act = (
model.num_act_pre
+ model.num_act_between_layers * num_layers * num_microbatch_this_pp_rank
) # recompute with pipeline parallel
    info = (
        "With this recompute setting, activation memory peaks when "
    )
if args.recompute_method == "block":
num_layers_with_loss = num_layers - recompute_num_layers
if num_layers_with_loss == 0:
peak1 = common_act + model.num_act_post
peak2 = common_act + model.num_act_per_layer
                if peak1 > peak2:
                    info += "calculating the loss"
                else:
                    info += "back-propagating the loss"
num_activation = max(peak1, peak2)
            else:
                info += (
                    f"calculating the loss with {num_layers_with_loss} non-recomputed layers"
                )
num_activation = (
common_act
+ model.num_act_post
+ model.num_act_per_layer
* num_layers_with_loss
* num_microbatch_this_pp_rank
)
elif args.recompute_method == "uniform":
peak1 = common_act + model.num_act_post
peak2 = (
common_act
+ model.num_act_per_layer
* recompute_num_layers
* num_microbatch_this_pp_rank
)
        if peak1 > peak2:
            info += "calculating the loss"
        else:
            info += f"back-propagating the loss while recomputing every {recompute_num_layers} layers"
num_activation = max(peak1, peak2)
if isinstance(
model.decoder.layers.modules[0].self_attention, MLASelfAttention
        ):  # MLA recomputation peaks during the backward pass
num_activation += model.decoder.layers.modules[
0
].self_attention.core_attention.num_activation()
print(info)
else:
num_activation = (
num_activation - model.num_act_post
) * num_microbatch_this_pp_rank + model.num_act_post
    # Context parallelism: activations outside the MoE MLP are split across CP ranks.
    num_activation = (
        num_activation - num_activation_this_shard_mlp
    ) / args.context_parallel_size + num_activation_this_shard_mlp
    if pp_size == 1:
        print(
            f"Number of activation elements on each GPU in billions: "
            f"{num_activation / 10**9: .2f}, of which the MoE MLP part is"
            f"{num_activation_this_shard_mlp / 10**9: .2f}"
        )
    else:
        print(
            f"Number of activation elements per microbatch on each GPU in billions: "
            f"{num_activation / 10**9: .2f}, of which the MoE MLP part is"
            f"{num_activation_this_shard_mlp / 10**9: .2f}"
            f", {num_microbatch_this_pp_rank=}"
        )
    # 18 bytes/param without the distributed optimizer; with it, the 12 bytes of
    # fp32 optimizer state (master weight, momentum, variance) are sharded over DP*CP.
    num_bytes_per_parameter = (
        18
        if not args.use_distributed_optimizer
        else 6 + (12 / args.data_parallel_size / args.context_parallel_size)
    )
if args.expert_model_parallel_size * args.expert_tensor_parallel_size > 1:
num_bytes_per_parameter_dense = num_bytes_per_parameter
num_bytes_per_parameter_moe = (
18
if not args.use_distributed_optimizer
else 6
+ (
12
/ (
args.data_parallel_size
* args.context_parallel_size
* args.tensor_model_parallel_size
/ args.expert_model_parallel_size
/ args.expert_tensor_parallel_size
)
)
)
print(f"{num_bytes_per_parameter_dense=} {num_bytes_per_parameter_moe=}")
weight_and_optimizer_memory = (
(num_parameter_this_shard - num_parameter_this_shard_sparse)
* num_bytes_per_parameter_dense
+ num_parameter_this_shard_sparse * num_bytes_per_parameter_moe
) / NUM_BYTES_IN_GIGABYTE
else:
print(f"{num_bytes_per_parameter=}")
weight_and_optimizer_memory = (
num_parameter_this_shard * num_bytes_per_parameter / NUM_BYTES_IN_GIGABYTE
)
    activation_memory = num_activation * 2 / NUM_BYTES_IN_GIGABYTE  # 2 bytes per element (fp16/bf16)
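    # Illustrative numbers: 3e9 activation elements at 2 bytes each is roughly
    # 6e9 / 2**30 ≈ 5.6 GB of activation memory.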
total_memory = weight_and_optimizer_memory + activation_memory
    print(
        f"Theoretical memory footprints: weight and optimizer={weight_and_optimizer_memory:.2f} GB, "
        f"activation={activation_memory:.2f} GB, total={total_memory:.2f} GB\n"
    )
    return output_shape
if __name__ == "__main__":
initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
    report_memory_usage()