|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
|
import nemo_run as run |
|
from huggingface_hub import snapshot_download |
|
from nemo.collections import llm |
|
from nemo.collections.diffusion.models.model import DiT7BConfig, DiT14BConfig |
|
from nemo.collections.diffusion.train import pretrain, videofolder_datamodule |
|
from nemo.lightning.pytorch.strategies.utils import RestoreConfig |
|
|
|
|
|
@run.cli.factory(target=llm.train)
def cosmos_diffusion_7b_text2world_finetune() -> run.Partial:
    """Build a finetuning recipe for the Cosmos-1.0 Diffusion 7B Text2World model.

    Starts from the generic DiT pretraining recipe, swaps in the 7B model
    config, configures an 8-way tensor-parallel run with Megatron FSDP
    sharding, and resumes from the pretrained NeMo checkpoint downloaded
    from the Hugging Face Hub.

    Returns:
        run.Partial: a configured ``llm.train`` task. The caller must still
        set ``recipe.data.path`` to a video-folder dataset before launching.
    """
    recipe = pretrain()
    recipe.model.config = run.Config(DiT7BConfig)

    # Short finetuning run with a conservative learning rate.
    recipe.trainer.max_steps = 1000
    recipe.optim.config.lr = 1e-6

    # Parallelism: TP=8 with sequence parallelism. Checkpoint saving is
    # synchronous (async save is disabled alongside the FSDP path below).
    recipe.trainer.strategy.tensor_model_parallel_size = 8
    recipe.trainer.strategy.sequence_parallel = True
    recipe.trainer.strategy.ckpt_async_save = False

    # Megatron FSDP: shard both model and optimizer states, and overlap
    # param gather / grad reduce communication with compute. CPU
    # initialization materializes weights on host memory first to avoid
    # GPU OOM before sharding.
    recipe.trainer.strategy.ddp.with_megatron_fsdp_code_path = True
    recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = "MODEL_AND_OPTIMIZER_STATES"
    recipe.trainer.strategy.ddp.overlap_param_gather = True
    recipe.trainer.strategy.ddp.overlap_grad_reduce = True
    recipe.model.config.use_cpu_initialization = True

    # Data: video-folder datamodule; path is intentionally blank and must
    # be provided by the user (e.g. via the CLI).
    recipe.data = videofolder_datamodule()
    recipe.data.path = ""

    # Restore the pretrained checkpoint: download only the "nemo/*" files
    # from the Hub snapshot and point the restore path at that subfolder.
    recipe.resume.restore_config = run.Config(RestoreConfig, load_artifacts=False)
    recipe.resume.restore_config.path = os.path.join(
        snapshot_download("nvidia/Cosmos-1.0-Diffusion-7B-Text2World", allow_patterns=["nemo/*"]), "nemo"
    )
    recipe.resume.resume_if_exists = False

    # Experiment log directory. Fixed from the original ``recipe.log_log_dir``
    # typo, which set a dead attribute instead of the logger's directory.
    recipe.log.log_dir = "nemo_experiments/cosmos_diffusion_7b_text2world_finetune"

    return recipe
|
|
|
|
|
@run.cli.factory(target=llm.train)
def cosmos_diffusion_14b_text2world_finetune() -> run.Partial:
    """Build a finetuning recipe for the Cosmos-1.0 Diffusion 14B Text2World model.

    Mirrors the 7B recipe (TP=8, Megatron FSDP sharding, pretrained
    checkpoint restore from the Hugging Face Hub) but uses the 14B model
    config and enables full activation recomputation to fit the larger
    model in memory.

    Returns:
        run.Partial: a configured ``llm.train`` task. The caller must still
        set ``recipe.data.path`` to a video-folder dataset before launching.
    """
    recipe = pretrain()
    recipe.model.config = run.Config(DiT14BConfig)

    # Short finetuning run with a conservative learning rate.
    recipe.trainer.max_steps = 1000
    recipe.optim.config.lr = 1e-6

    # Parallelism: TP=8 with sequence parallelism. Checkpoint saving is
    # synchronous (async save is disabled alongside the FSDP path below).
    recipe.trainer.strategy.tensor_model_parallel_size = 8
    recipe.trainer.strategy.sequence_parallel = True
    recipe.trainer.strategy.ckpt_async_save = False

    # Megatron FSDP: shard both model and optimizer states, and overlap
    # param gather / grad reduce communication with compute. CPU
    # initialization materializes weights on host memory first to avoid
    # GPU OOM before sharding.
    recipe.trainer.strategy.ddp.with_megatron_fsdp_code_path = True
    recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = "MODEL_AND_OPTIMIZER_STATES"
    recipe.trainer.strategy.ddp.overlap_param_gather = True
    recipe.trainer.strategy.ddp.overlap_grad_reduce = True
    recipe.model.config.use_cpu_initialization = True

    # Activation recomputation: full, uniform, one layer per chunk — trades
    # extra compute for the memory headroom the 14B model needs.
    recipe.model.config.recompute_granularity = "full"
    recipe.model.config.recompute_method = "uniform"
    recipe.model.config.recompute_num_layers = 1

    # Data: video-folder datamodule; path is intentionally blank and must
    # be provided by the user (e.g. via the CLI).
    recipe.data = videofolder_datamodule()
    recipe.data.path = ""

    # Restore the pretrained checkpoint: download only the "nemo/*" files
    # from the Hub snapshot and point the restore path at that subfolder.
    recipe.resume.restore_config = run.Config(RestoreConfig, load_artifacts=False)
    recipe.resume.restore_config.path = os.path.join(
        snapshot_download("nvidia/Cosmos-1.0-Diffusion-14B-Text2World", allow_patterns=["nemo/*"]), "nemo"
    )
    recipe.resume.resume_if_exists = False

    # Experiment log directory. Fixed from the original ``recipe.log_log_dir``
    # typo, which set a dead attribute instead of the logger's directory.
    recipe.log.log_dir = "nemo_experiments/cosmos_diffusion_14b_text2world_finetune"

    return recipe
|
|
|
|
|
if __name__ == "__main__":
    # Launch training through the nemo_run CLI; the 7B Text2World finetune
    # recipe is the default factory when none is specified on the command line.
    run.cli.main(llm.train, default_factory=cosmos_diffusion_7b_text2world_finetune)
|
|