# ================ Train Config ================ #
lyric_processor:
  max_dur: 150
  min_dur: 30
  prompt_len: 10
  pad_to_max: true
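# Annotation (not in the original file): the durations here appear to be in
# seconds; prompt_len also drives the prompt_audio conditioner's max_len
# expression further down (prompt_len * audio_tokenizer_frame_rate tokens,
# plus special tokens).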
# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000
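# Annotation (assumption, not documented here): each checkpoint string looks
# like "<TokenizerClass>_<path>", selecting the Flow1dVAE1rvq /
# Flow1dVAESeparate builder plus the safetensors file after the underscore.
# At frame_rate 25 and sample_rate 48000, one token frame spans
# 48000 / 25 = 1920 samples, matching the stable_audio_1920_vae.json naming below.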
# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt
# ================== LM =========================== #
lm:
  lm_type: Llama # [Llama]
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  num_layers_sub: 12
  code_depth: 3
  code_size: 16384
  max_position_embeddings: 8196
  max_position_embeddings_sub: 10000
  rope_theta: 100000.0
  rope_theta_sub: 500000.0
  dropout: 0.0
  use_flash_attn_2: true
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
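# Annotation (observation, not an author statement): dim=1536,
# intermediate_size=8960, num_heads=12, num_layers=28 match the Qwen2-1.5B
# backbone shape; the *_sub keys (num_layers_sub, max_position_embeddings_sub,
# rope_theta_sub) presumably configure a smaller secondary transformer
# alongside the main one.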
codebooks_pattern:
  modeling: delay
  delay:
    delays: [ 0, 250, 250 ]
    flatten_first: 0
    empty_initial: 0
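# Annotation (assumption): under delay modeling, codebook k is generated
# delays[k] steps behind codebook 0, so the two residual codebooks here trail
# the first by 250 frames, i.e. 250 / 25 = 10 s at the tokenizer frame rate.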
# ================ Conditioners ===================== #
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0.15
  inference_coef: 1.5
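# Annotation (standard CFG reading, assumed): sampling combines the
# conditional and unconditional passes as
# uncond + inference_coef * (cond - uncond); training_dropout is the
# probability of dropping all conditions at once during training, which
# supplies the unconditional branch.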
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0
use_text_training: true
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ] # this order is the SAME as the input concatenation order
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} # = 10*25+2 = 252
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50
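# Annotation (assumption): ${eval:...} and ${load_yaml:...} look like custom
# OmegaConf-style resolvers; ${prompt_len} and ${audio_tokenizer_frame_rate}
# interpolate the values defined near the top of this file.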
offload:
  audiolm:
    offload_module: self
    cpu_mem_gb: 0
    pre_copy_step: 1
    clean_cache_after_forward: false
    dtype: torch.float16
    offload_layer_dict:
      transformer: 4
      transformer2: 4
    ignore_layer_list: []
    clean_cache_wrapper:
      module: self
      method_name: _sample_next_token
      diff_mem_gb_thre: 2
    debug: false
  wav_tokenizer_diffusion:
    offload_module: self.model.model
    pre_copy_step: 1
    clean_cache_after_forward: false
    cpu_mem_gb: -1
    dtype: null
    offload_layer_dict:
      cfm_wrapper: 5
      hubert: 4
    ignore_layer_list: []
    clean_cache_wrapper:
      module: self.model.model.cfm_wrapper.estimator
      method_name: forward
      diff_mem_gb_thre: 1
    debug: false
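# Annotation (read from key names, not documented here): offload_layer_dict
# appears to map a submodule to how many of its layers stay resident on GPU
# while the rest are offloaded; cpu_mem_gb 0 vs -1 plausibly means no vs
# unbounded CPU staging memory; pre_copy_step prefetches weights that many
# steps ahead; clean_cache_wrapper wraps the named method and empties the
# CUDA cache when free memory falls below diff_mem_gb_thre (GB).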