#!/bin/bash

# This script launches Pyramid-Flow video generation training (autoregressive training with a temporal pyramid).
# Make sure that NUM_FRAMES % VIDEO_SYNC_GROUP == 0 and GPUS % VIDEO_SYNC_GROUP == 0
# (an optional sanity check below enforces this).

GPUS=8  # The number of GPUs
SHARD_STRATEGY=zero2   # zero2 or zero3
VIDEO_SYNC_GROUP=8     # One of 4, 8, or 16. The number of processes that receive the same input video, used for temporal pyramid AR training.
MODEL_NAME=pyramid_flux     # The model name, `pyramid_flux` or `pyramid_mmdit`
MODEL_PATH=/PATH/pyramid-flow-miniflux  # The downloaded checkpoint dir. IMPORTANT: it must match MODEL_NAME, flux or mmdit (sd3)
VARIANT=diffusion_transformer_384p  # The DiT Variant
OUTPUT_DIR=/PATH/output_dir    # The checkpoint saving dir

BATCH_SIZE=4    # Must satisfy BATCH_SIZE % 4 == 0
GRAD_ACCU_STEPS=2
RESOLUTION="384p"     # 384p or 768p
NUM_FRAMES=16         # e.g., 16 for 5s, 32 for 10s
ANNO_FILE=annotation/video_text.jsonl   # The video annotation file path
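
# Optional sanity check (a sketch, not part of the original script): fail fast if the
# divisibility constraints noted above are violated.
if (( GPUS % VIDEO_SYNC_GROUP != 0 )) || (( NUM_FRAMES % VIDEO_SYNC_GROUP != 0 )); then
    echo "ERROR: GPUS and NUM_FRAMES must both be divisible by VIDEO_SYNC_GROUP" >&2
    exit 1
fi
if (( BATCH_SIZE % 4 != 0 )); then
    echo "ERROR: BATCH_SIZE must be divisible by 4" >&2
    exit 1
fi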

# For the 768p variant, make sure to add the arg: --gradient_checkpointing (see the sketch below).
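# A minimal sketch (not in the original script): gate the flag on RESOLUTION and append
# $EXTRA_ARGS to the torchrun command below if you adopt this pattern.
EXTRA_ARGS=""
if [ "$RESOLUTION" = "768p" ]; then
    EXTRA_ARGS="--gradient_checkpointing"
fi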

torchrun --nproc_per_node $GPUS \
    train/train_pyramid_flow.py \
    --num_workers 8 \
    --task t2v \
    --use_fsdp \
    --fsdp_shard_strategy $SHARD_STRATEGY \
    --use_temporal_causal \
    --use_temporal_pyramid \
    --interp_condition_pos \
    --sync_video_input \
    --video_sync_group $VIDEO_SYNC_GROUP \
    --load_text_encoder \
    --model_name $MODEL_NAME \
    --model_path $MODEL_PATH \
    --model_dtype bf16 \
    --model_variant $VARIANT \
    --schedule_shift 1.0 \
    --gradient_accumulation_steps $GRAD_ACCU_STEPS \
    --output_dir $OUTPUT_DIR \
    --batch_size $BATCH_SIZE \
    --max_frames $NUM_FRAMES \
    --resolution $RESOLUTION \
    --anno_file $ANNO_FILE \
    --frame_per_unit 1 \
    --lr_scheduler constant_with_warmup \
    --opt adamw \
    --opt_beta1 0.9 \
    --opt_beta2 0.95 \
    --seed 42 \
    --weight_decay 1e-4 \
    --clip_grad 1.0 \
    --lr 5e-5 \
    --warmup_steps 1000 \
    --epochs 20 \
    --iters_per_epoch 2000 \
    --report_to tensorboard \
    --print_freq 40 \
    --save_ckpt_freq 1