# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
from typing import Tuple, List
import warnings
warnings.filterwarnings("ignore")
import os
import torch
import argparse
from pathlib import Path
from omegaconf import OmegaConf, DictConfig
from einops._torch_specific import allow_ops_in_compiled_graph # requires einops>=0.6.1
allow_ops_in_compiled_graph()
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, Callback
from pytorch_lightning.strategies import DDPStrategy, DeepSpeedStrategy
from pytorch_lightning.loggers import Logger, TensorBoardLogger
from pytorch_lightning.utilities import rank_zero_info
from hy3dshape.utils import get_config_from_file, instantiate_from_config


class SetupCallback(Callback):
    """Create the log and checkpoint directories on the rank-0 process before training starts."""

    def __init__(self, config: DictConfig, basedir: Path, logdir: str = "log", ckptdir: str = "ckpt") -> None:
        super().__init__()
        self.logdir = basedir / logdir
        self.ckptdir = basedir / ckptdir
        self.config = config

    def on_fit_start(self, trainer: pl.trainer.Trainer, pl_module: pl.LightningModule) -> None:
        if trainer.global_rank == 0:
            os.makedirs(self.logdir, exist_ok=True)
            os.makedirs(self.ckptdir, exist_ok=True)


def setup_callbacks(config: DictConfig) -> Tuple[List[Callback], Logger]:
    """Build the setup/checkpoint callbacks, any callbacks declared in the config, and a TensorBoard logger."""
    training_cfg = config.training
    basedir = Path(training_cfg.output_dir)
    os.makedirs(basedir, exist_ok=True)

    all_callbacks = []

    setup_callback = SetupCallback(config, basedir)
    all_callbacks.append(setup_callback)

    checkpoint_callback = ModelCheckpoint(
        dirpath=setup_callback.ckptdir,
        filename="ckpt-{step:08d}",
        monitor=training_cfg.monitor,
        mode="max",
        save_top_k=-1,  # keep every checkpoint; `monitor`/`mode` only affect best-model bookkeeping
        verbose=False,
        every_n_train_steps=training_cfg.every_n_train_steps)
    all_callbacks.append(checkpoint_callback)

    # Optional extra callbacks declared in the config file
    if "callbacks" in config:
        for key, value in config['callbacks'].items():
            custom_callback = instantiate_from_config(value)
            all_callbacks.append(custom_callback)

    logger = TensorBoardLogger(save_dir=str(setup_callback.logdir), name="tensorboard")
    return all_callbacks, logger
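
# The configuration file is expected to provide `training`, `dataset` and `model` sections,
# plus an optional `callbacks` section; `dataset`, `model` and each callback entry are built
# with `instantiate_from_config`. A minimal sketch, assuming the conventional
# `target:`/`params:` layout used by `instantiate_from_config` (class paths and parameter
# names below are placeholders, not the real ones):
#
#   training:
#     output_dir: outputs/run0
#     steps: 300000
#   dataset:
#     target: hy3dshape.data.SomeDataModule        # placeholder class path
#     params:
#       batch_size: 4
#   model:
#     target: hy3dshape.models.SomeDiffusionModel  # placeholder class path
#     params: {}
#   callbacks:
#     lr_monitor:
#       target: pytorch_lightning.callbacks.LearningRateMonitor
#       params:
#         logging_interval: step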


def merge_cfg(cfg, arg_cfg):
    """Merge command-line arguments into the config; values in the YAML `training` section take precedence."""
    for key in arg_cfg.keys():
        if key in cfg.training:
            arg_cfg[key] = cfg.training[key]
    cfg.training = DictConfig(arg_cfg)
    return cfg
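
# For example, if the YAML's `training` section sets `steps: 100000`, that value replaces
# the argparse default of 50000000, while flags absent from the YAML keep their
# command-line values. Keys that appear only in the YAML `training` section (with no
# matching command-line argument) are not carried over into the merged config.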


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--fast", action='store_true')
    parser.add_argument("-c", "--config", type=str, required=True)
    parser.add_argument("-s", "--seed", type=int, default=0)
    parser.add_argument("-nn", "--num_nodes", type=int, default=1)
    parser.add_argument("-ng", "--num_gpus", type=int, default=1)
    parser.add_argument("-u", "--update_every", type=int, default=1)
    parser.add_argument("-st", "--steps", type=int, default=50000000)
    parser.add_argument("-lr", "--base_lr", type=float, default=4.5e-6)
    parser.add_argument("-a", "--use_amp", default=False, action="store_true")
    parser.add_argument("--amp_type", type=str, default="16")
    parser.add_argument("--gradient_clip_val", type=float, default=None)
    parser.add_argument("--gradient_clip_algorithm", type=str, default=None)
    parser.add_argument("--every_n_train_steps", type=int, default=50000)
    parser.add_argument("--log_every_n_steps", type=int, default=50)
    parser.add_argument("--val_check_interval", type=int, default=1024)
    parser.add_argument("--limit_val_batches", type=int, default=64)
    parser.add_argument("--monitor", type=str, default="val/total_loss")
    parser.add_argument("--output_dir", type=str, help="output directory where logs and checkpoints are saved.")
    parser.add_argument("--ckpt_path", type=str, default="", help="checkpoint path to resume training from.")
    parser.add_argument("--deepspeed", default=False, action="store_true")
    parser.add_argument("--deepspeed2", default=False, action="store_true")
    parser.add_argument("--scale_lr", type=bool, nargs="?", const=True, default=False,
                        help="scale base-lr by ngpu * batch_size * n_accumulate")
    return parser.parse_args()
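
# Example launch (illustrative only; the script filename and config path are placeholders,
# and any hyper-parameter not overridden here falls back to the argparse defaults above or
# to the values in the YAML's `training` section):
#
#   python train.py -c configs/shape_diffusion.yaml \
#       --output_dir output/run0 -ng 8 -u 2 -a --amp_type bf16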


if __name__ == "__main__":
    args = get_args()

    if args.fast:
        # Allow TF32 matmuls and lower matmul precision for extra throughput,
        # and poll DataLoader workers more frequently than the default.
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.set_float32_matmul_precision('medium')
        torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL = 0.05

    # Set random seed
    pl.seed_everything(args.seed, workers=True)

    # Load configuration and merge the command-line arguments into it
    config = get_config_from_file(args.config)
    config = merge_cfg(config, vars(args))
    training_cfg = config.training

    # Print the resolved configuration
    rank_zero_info("Begin to print configuration ...")
    rank_zero_info(OmegaConf.to_yaml(config))
    rank_zero_info("Finish printing configuration ...")

    # Setup callbacks and logger
    callbacks, loggers = setup_callbacks(config)

    # Build data module
    data: pl.LightningDataModule = instantiate_from_config(config.dataset)

    # Build model
    model: pl.LightningModule = instantiate_from_config(config.model)

    nodes = args.num_nodes
    ngpus = args.num_gpus
    base_lr = training_cfg.base_lr
    accumulate_grad_batches = training_cfg.update_every
    batch_size = config.dataset.params.batch_size
    if 'NNODES' in os.environ:
        # Allow the cluster launcher to override the node count via the NNODES environment variable
        nodes = int(os.environ['NNODES'])
        training_cfg.num_nodes = nodes
        args.num_nodes = nodes

    if args.scale_lr:
        model.learning_rate = accumulate_grad_batches * nodes * ngpus * batch_size * base_lr
        info = f"Setting learning rate to {model.learning_rate:.2e} = {accumulate_grad_batches} (accumulate)"
        info += f" * {nodes} (nodes) * {ngpus} (num_gpus) * {batch_size} (batchsize) * {base_lr:.2e} (base_lr)"
        rank_zero_info(info)
    else:
        model.learning_rate = base_lr
        rank_zero_info("++++ NOT USING LR SCALING ++++")
        rank_zero_info(f"Setting learning rate to {model.learning_rate:.2e}")

    # Choose the distributed strategy
    if args.num_nodes > 1 or args.num_gpus > 1:
        if args.deepspeed:
            ddp_strategy = DeepSpeedStrategy(stage=1)
        elif args.deepspeed2:
            ddp_strategy = 'deepspeed_stage_2'
        else:
            ddp_strategy = DDPStrategy(find_unused_parameters=False, bucket_cap_mb=1500)
    else:
        ddp_strategy = None  # single-device run ('auto')

    rank_zero_info('*' * 100)
    if training_cfg.use_amp:
        amp_type = training_cfg.amp_type
        assert amp_type in ['bf16', '16', '32'], f"Invalid amp_type: {amp_type}"
        rank_zero_info(f'Using {amp_type} precision')
    else:
        amp_type = 32
        rank_zero_info('Using 32 bit precision')
    rank_zero_info('*' * 100)

    # Build trainer
    trainer = pl.Trainer(
        max_steps=training_cfg.steps,
        precision=amp_type,
        callbacks=callbacks,
        accelerator="gpu",
        devices=training_cfg.num_gpus,
        num_nodes=training_cfg.num_nodes,
        strategy=ddp_strategy,
        gradient_clip_val=training_cfg.get('gradient_clip_val'),
        gradient_clip_algorithm=training_cfg.get('gradient_clip_algorithm'),
        accumulate_grad_batches=args.update_every,
        logger=loggers,
        log_every_n_steps=training_cfg.log_every_n_steps,
        val_check_interval=training_cfg.val_check_interval,
        limit_val_batches=training_cfg.limit_val_batches,
        check_val_every_n_epoch=None,
    )

    # Train; an empty --ckpt_path means training starts from scratch
    if training_cfg.ckpt_path == '':
        training_cfg.ckpt_path = None
    trainer.fit(model, datamodule=data, ckpt_path=training_cfg.ckpt_path)
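
# Multi-node / DeepSpeed notes (descriptive, not prescriptive):
# - If the cluster launcher exports NNODES, it overrides --num_nodes above.
# - --deepspeed selects DeepSpeed ZeRO stage 1 and --deepspeed2 selects stage 2; otherwise
#   plain DDP (with find_unused_parameters disabled) is used whenever more than one GPU
#   or node is requested.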