# ================ Logging ====================== #
root_dir: exp/song/${get_fname:}
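# NOTE: ${get_fname:} is a custom resolver (assumption: it returns the name of
# this config file), so each run would land under exp/song/<config_name>/.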
# ================ Checkpoints ================== #
use_pretrained: deepspeed # ['ddp', 'continue', 'deepspeed']
pretrained:
  ddp_checkpoint:
  deepspeed_checkpoint: ./ckpt/60000_alnew.pt
  continue_checkpoint:
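# Sketch of how the switch above is presumably resolved (assumption: the entry
# named "<use_pretrained>_checkpoint" is the one that gets loaded):
#   use_pretrained: deepspeed -> loads pretrained.deepspeed_checkpoint
#   use_pretrained: continue  -> loads pretrained.continue_checkpoint (resume a run)
#   use_pretrained: ddp       -> loads pretrained.ddp_checkpoint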
# ================ Data & loader ================== #
prompt_select: random
train_jsonl_list:
  - .jsonl
val_jsonl_list:
  - .jsonl
train_scp_list:
  - .scp
val_scp_list:
  - .scp
lyric_processor:
max_dur: 150
min_dur: 30
batch_size: 2
prompt_len: 10
pad_to_max: true
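# Assuming durations are in seconds (consistent with the 25 Hz token arithmetic
# used for max_len further down): clips shorter than 30 s or longer than 150 s
# would be filtered out, and the audio prompt is a 10 s excerpt, i.e.
#   10 s * 25 tokens/s = 250 prompt tokens per codebook stream.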
# ================ Training ======================= #
accelerator: gpu
devices: 8
num_nodes: 4
val_check_interval: 2500
accumulate_grad_batches: 1
strategy: 'deepspeed_stage_2' # ['ddp', 'fsdp', 'deepspeed_stage_2', 'ddp_find_unused_parameters_true']
precision: 'bf16-mixed' # ['16-mixed', 'bf16-mixed']
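# Effective global batch size implied by the Lightning-style flags above:
#   batch_size * devices * num_nodes * accumulate_grad_batches
#   = 2 * 8 * 4 * 1 = 64 sequences per optimizer step.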
optim:
  optimizer: adamw
  updates_per_epoch: 1000
  epochs: 100
  old_lr: 0 # 1e-4
  new_lr: 1e-4
  max_norm: 0.5
  adam:
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.00001 # 0.1
    eps: 1e-8
  schedule:
    lr_scheduler: cosine
    cosine:
      warmup: 4000
      lr_min_ratio: 0.0
      cycle_length: 1.0
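# A minimal sketch of the schedule these values describe, assuming the usual
# cosine-with-warmup formulation (not necessarily this codebase's exact code):
#   step < warmup:  lr = new_lr * step / warmup
#   otherwise:      progress = (step - warmup) / (total_steps - warmup)
#                   lr = new_lr * (lr_min_ratio + (1 - lr_min_ratio)
#                        * 0.5 * (1 + cos(pi * progress / cycle_length)))
# With lr_min_ratio 0.0 the rate decays from 1e-4 toward 0; old_lr: 0 suggests
# parameter groups loaded from the checkpoint are frozen while new ones train.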
# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000
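# Token-count arithmetic implied by the settings above: at 25 Hz, a 10 s prompt
# is 250 frames; the mixture tokenizer emits 1 code per frame (code_depth 1),
# while the separated-stem tokenizer emits 2 codes per frame (code_depth_sep 2).
# The "Flow1dVAE1rvq_" / "Flow1dVAESeparate_" prefixes presumably select the
# tokenizer class, with the checkpoint path following the underscore.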
# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt
# ================== LM =========================== #
lm:
  lm_type: Llama # [Llama]
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  code_depth: 3
  code_size: 16384
  dropout: 0.0
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
  codebooks_pattern:
    modeling: delay
    delay:
      delays: [ 0, 250, 250 ]
      flatten_first: 0
      empty_initial: 0
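      # Illustration of the delay pattern above (MusicGen-style; assumption:
      # one step = one 25 Hz frame, so 250 steps = 10 s). Stream k is shifted
      # right by delays[k] before the streams are interleaved autoregressively:
      #   step:      0       1       ... 249      250      251      ...
      #   stream 0:  c0[0]   c0[1]   ... c0[249]  c0[250]  c0[251]  ...
      #   stream 1:  -       -       ... -        c1[0]    c1[1]    ...
      #   stream 2:  -       -       ... -        c2[0]    c2[1]    ...
      # so streams 1-2 condition on 10 s of stream-0 context ("-" = empty token).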
# ================ Conditioners ===================== #
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0.15
  inference_coef: 1.5
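  # Standard classifier-free guidance mixing (assumption: this codebase follows
  # the usual formulation), with the coefficient above:
  #   logits = logits_uncond + inference_coef * (logits_cond - logits_uncond)
  # training_dropout: 0.15 means all conditions are dropped together 15% of the
  # time in training, so the model also learns the unconditional distribution.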
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0
use_text_training: true
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ] # this order is the same as the input concatenation order
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} # 10*25+2 = 252
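      # The resolver above evaluates to 10 * 25 + 2 = 252: the 250 prompt
      # tokens plus (presumably) 2 special positions such as start/end markers.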
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50
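# Each entry under conditioners: matches a name in fuser.prepend above, so the
# final LM input would be, following that prepend order (an assumption, not a
# confirmed trace of this codebase):
#   [description tokens, <=300] [prompt_audio embeddings, 252] [type_info tokens, <=50] [music codes]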