# NOTE(review): this copy of the config had lost its indentation and carried a
# stray trailing "|" on every line; both are repaired here. Keys below are kept
# at top level because later interpolations (e.g. ${prompt_len},
# ${audio_tokenizer_frame_rate}) reference them without a parent path.

# Lyric pre-processing settings.
lyric_processor:

# Clip duration bounds — presumably seconds; TODO(review): confirm units
# against the data loader.
max_dur: 150
min_dur: 30

# Length of the audio prompt; consumed below as ${prompt_len} to size the
# prompt_audio conditioner's max_len.
prompt_len: 10
pad_to_max: true
# --- Main audio tokenizer (1 RVQ level) ---
# The checkpoint string encodes "<ModelName>_<path>" — TODO(review): confirm
# the loader splits on this underscore-joined prefix.
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000

# --- Separated-stems tokenizer variant (2 RVQ levels) ---
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000

# --- VAE used for waveform decoding ---
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt
# --- Token language model over the audio codebook streams ---
# NOTE(review): nesting reconstructed after indentation loss; verify against
# the upstream config.
lm:
  lm_type: Llama
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  num_layers_sub: 12
  code_depth: 3
  code_size: 16384
  # NOTE(review): 8196 is not a power of two — possible typo for 8192; left
  # unchanged pending confirmation against the trained checkpoint.
  max_position_embeddings: 8196
  max_position_embeddings_sub: 10000
  rope_theta: 100000.0
  rope_theta_sub: 500000.0
  dropout: 0.0
  use_flash_attn_2: true
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1

# Interleaving pattern applied to the codebook streams before LM modeling.
codebooks_pattern:
  modeling: delay
  delay:
    # One frame offset per codebook; length matches code_depth (= 3 above).
    delays: [ 0, 250, 250 ]
    flatten_first: 0
    empty_initial: 0
# Classifier-free guidance: probability of dropping the conditioning during
# training, and the guidance coefficient applied at inference.
classifier_free_guidance:
  training_dropout: 0.15
  inference_coef: 1.5
# Per-attribute dropout probabilities for individual conditioning signals,
# grouped by modality. active_on_eval: false keeps dropout training-only.
attribute_dropout:
  args:
    active_on_eval: false
  text:
    description: 0.0
    # type_info is dropped half the time during training.
    type_info: 0.5
  audio:
    prompt_audio: 0.0
use_text_training: True

# How conditioner outputs are fused into the LM input: nothing is summed,
# all three conditioners are prepended to the token sequence. The names here
# must match keys under `conditioners`.
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ]
# Conditioner definitions; each key names a signal used by fuser.prepend.
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      # Prompt token budget: seconds * frame rate, plus 2 — presumably for
      # begin/end tokens, TODO(review) confirm. Resolved via OmegaConf-style
      # ${eval:...} interpolation against the top-level keys.
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2}
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      # Extra vocabulary merged in from an external YAML file at load time.
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50
# CPU-offload configuration (for reduced-VRAM inference).
# NOTE(review): nesting below `clean_cache_wrapper` was reconstructed after
# indentation loss — confirm whether diff_mem_gb_thre/debug belong there or
# one level up.
offload:
  audiolm:
    offload_module: self
    cpu_mem_gb: 0
    pre_copy_step: 1
    clean_cache_after_forward: false
    dtype: torch.float16
    # Number of layers to offload, per named submodule.
    offload_layer_dict:
      transformer: 4
      transformer2: 4
    ignore_layer_list: []
    clean_cache_wrapper:
      module: self
      method_name: _sample_next_token
      diff_mem_gb_thre: 2
    debug: false

  wav_tokenizer_diffusion:
    offload_module: self.model.model
    pre_copy_step: 1
    clean_cache_after_forward: false
    # -1 presumably means "no CPU memory cap" — TODO(review): confirm.
    cpu_mem_gb: -1
    dtype: null
    offload_layer_dict:
      cfm_wrapper: 5
      hubert: 4
    ignore_layer_list: []
    clean_cache_wrapper:
      module: self.model.model.cfm_wrapper.estimator
      method_name: forward
      diff_mem_gb_thre: 1
    debug: false