#!/bin/bash

set -e -x

# Log to W&B offline and enable verbose finetrainers logging. NCCL
# peer-to-peer transfers and the NCCL watchdog monitor are disabled, which
# can help avoid hangs on some multi-GPU hosts.
export WANDB_MODE="offline"
export NCCL_P2P_DISABLE=1
export TORCH_NCCL_ENABLE_MONITORING=0
export FINETRAINERS_LOG_LEVEL="DEBUG"

# Distributed backend: "ptd" launches through torchrun below, while
# "accelerate" goes through `accelerate launch` with a matching config file.
# BACKEND="accelerate"
BACKEND="ptd"

# Number of GPUs to use and which physical devices they map to.
NUM_GPUS=2
CUDA_VISIBLE_DEVICES="2,3"

# Training and validation dataset configs; see the JSON files for the
# expected format.
TRAINING_DATASET_CONFIG="examples/training/sft/wan/crush_smol_lora/training.json"
VALIDATION_DATASET_FILE="examples/training/sft/wan/crush_smol_lora/validation.json"

# Parallelism presets, named <strategy>_<degree>. DDP replicates the model
# across data-parallel ranks, FSDP shards it (dp_shards), and HSDP combines
# replication with sharding. The product of all degrees must equal NUM_GPUS.
DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"

# Pick the preset matching NUM_GPUS. Deliberately left unquoted so the flag
# string word-splits into individual arguments.
parallel_cmd=(
  $DDP_2
)
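
# For example, to combine 2-way replication with 2-way sharding on four
# GPUs, one would instead set NUM_GPUS=4, CUDA_VISIBLE_DEVICES="0,1,2,3",
# and:
#   parallel_cmd=( $HSDP_2_2 )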

# Model to finetune.
model_cmd=(
  --model_name "wan"
  --pretrained_model_name_or_path "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
)

# Dataset and precomputation settings: conditions and latents for 25 items
# are precomputed a single time up front.
dataset_cmd=(
  --dataset_config "$TRAINING_DATASET_CONFIG"
  --dataset_shuffle_buffer_size 10
  --precomputation_items 25
  --precomputation_once
)

# Load data in the main process (no worker subprocesses).
dataloader_cmd=(
  --dataloader_num_workers 0
)

# Flow-matching timestep weighting.
diffusion_cmd=(
  --flow_weighting_scheme "logit_normal"
)

# LoRA training hyperparameters. The target_modules regex attaches rank-32
# adapters to the attention projections of every transformer block.
training_cmd=(
  --training_type "lora"
  --seed 42
  --batch_size 1
  --train_steps 3000
  --rank 32
  --lora_alpha 32
  --target_modules "blocks.*(to_q|to_k|to_v|to_out.0)"
  --gradient_accumulation_steps 1
  --gradient_checkpointing
  --checkpointing_steps 500
  --checkpointing_limit 2
  # VAE slicing/tiling to lower decode memory
  --enable_slicing
  --enable_tiling
)
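
# With the DDP_2 preset above, the effective global batch size is
# batch_size x dp_degree x gradient_accumulation_steps = 1 x 2 x 1 = 2
# samples per optimizer step.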

# AdamW with a linear warmup over the first 1000 steps, then a constant LR.
optimizer_cmd=(
  --optimizer "adamw"
  --lr 5e-5
  --lr_scheduler "constant_with_warmup"
  --lr_warmup_steps 1000
  --lr_num_cycles 1
  --beta1 0.9
  --beta2 0.99
  --weight_decay 1e-4
  --epsilon 1e-8
  --max_grad_norm 1.0
)
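
# Note that the warmup covers the first third of training: 1000 of the 3000
# total steps ramp up to the peak LR of 5e-5 before it holds constant.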

# Run validation on the prompts in VALIDATION_DATASET_FILE every 500 steps,
# matching the checkpointing cadence.
validation_cmd=(
  --validation_dataset_file "$VALIDATION_DATASET_FILE"
  --validation_steps 500
)

# Logging, output and timeout settings.
miscellaneous_cmd=(
  --tracker_name "finetrainers-wan"
  --output_dir "/raid/aryan/wan"
  --init_timeout 600
  --nccl_timeout 600
  --report_to "wandb"
)
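
# Since WANDB_MODE="offline" above, runs are written locally and can be
# uploaded later with the standard W&B CLI, e.g.:
#   wandb sync wandb/offline-run-*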

if [ "$BACKEND" == "accelerate" ]; then

  # Pick the accelerate config matching the GPU count; bail out early on an
  # unsupported count instead of passing an empty config path.
  ACCELERATE_CONFIG_FILE=""
  if [ "$NUM_GPUS" == 1 ]; then
    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
  elif [ "$NUM_GPUS" == 2 ]; then
    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_2.yaml"
  elif [ "$NUM_GPUS" == 4 ]; then
    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_4.yaml"
  elif [ "$NUM_GPUS" == 8 ]; then
    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_8.yaml"
  else
    echo "No accelerate config available for NUM_GPUS=$NUM_GPUS" >&2
    exit 1
  fi

  accelerate launch --config_file "$ACCELERATE_CONFIG_FILE" --gpu_ids "$CUDA_VISIBLE_DEVICES" train.py \
    "${parallel_cmd[@]}" \
    "${model_cmd[@]}" \
    "${dataset_cmd[@]}" \
    "${dataloader_cmd[@]}" \
    "${diffusion_cmd[@]}" \
    "${training_cmd[@]}" \
    "${optimizer_cmd[@]}" \
    "${validation_cmd[@]}" \
    "${miscellaneous_cmd[@]}"

elif [ "$BACKEND" == "ptd" ]; then

  # Export the device mask so the worker processes spawned by torchrun
  # inherit it.
  export CUDA_VISIBLE_DEVICES

  torchrun \
    --standalone \
    --nnodes=1 \
    --nproc_per_node=$NUM_GPUS \
    --rdzv_backend c10d \
    --rdzv_endpoint="localhost:0" \
    train.py \
    "${parallel_cmd[@]}" \
    "${model_cmd[@]}" \
    "${dataset_cmd[@]}" \
    "${dataloader_cmd[@]}" \
    "${diffusion_cmd[@]}" \
    "${training_cmd[@]}" \
    "${optimizer_cmd[@]}" \
    "${validation_cmd[@]}" \
    "${miscellaneous_cmd[@]}"

fi
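
# Typical invocation, assuming this script is saved as train.sh at the
# repository root:
#   chmod +x train.sh && ./train.sh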

echo -ne "-------------------- Finished executing script --------------------\n\n"