# File size: 3,271 Bytes
# Base configs this file builds on (presumably a Hydra defaults list; confirm
# that `common` resolves to a sibling config).
defaults:
  - common

# Training-loop settings: optimisation, logging, checkpointing, batch sampling.
train:
  batch_size: 128
  betas: [0.8, 0.99]
  c_kl: 1.0
  c_mel: 45
  # BUG: multi-gpu is not working, so both switches below stay disabled.
  distributed: false
  use_multiprocessing: false
  epochs: 20
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML read
  # a bare `1e-9` as a string instead of a float. The numeric value is unchanged.
  eps: 1.0e-9
  fp16_run: false
  init_lr_ratio: 1
  raise_error: false
  # Explicit mantissa dot so YAML 1.1 loaders parse a float (was `2e-4`).
  learning_rate: 2.0e-4
  log_interval: 10
  # Interpolated from a top-level `log_level` key that this file does not
  # define (presumably supplied by the `common` defaults entry; confirm).
  log_level: ${log_level}
  lr_decay: 0.98
  max_speclen: 128
  port: 8005
  # Set to false to finetune from a model (see model.finetune_from_model).
  resume_training: false
  seed: 1234
  # 8960 = 28 * 320, i.e. a whole multiple of data.hop_length.
  segment_size: 8960
  use_sr: false
  valid_epoch_interval: 1
  valid_steps_interval: 1000
  save_epoch_interval: 10
  save_steps_interval: 1000
  warmup_epochs: 0
  # Alternative values kept from the original file for reference:
  #   weighted_batch_speaker_sampling: false
  #   weighted_batch_lang_sampling: false
  weighted_batch_speaker_sampling: 0.5
  weighted_batch_lang_sampling: 0.5

# Dataset locations, audio front-end parameters, and precomputed-feature paths.
data:
  # --- Paths, file lists, loader ---
  dataset_dir: /raid/lucasgris/free-svc/data
  training_files: data/train.csv
  validation_files: data/valid.csv
  num_workers: 64

  # --- Waveform / spectrogram parameters ---
  sampling_rate: 24000
  max_wav_value: 32768.0
  filter_length: 1280
  win_length: 1280
  hop_length: 320
  n_mel_channels: 80
  mel_fmin: 0.0
  mel_fmax: null
  # It is recommended NOT to set this if you have small disk space.
  # Example value from the original file: ${data.dataset_dir}/spectrograms
  spectrogram_dir: null

  # --- Pitch ---
  # For pitch extraction, set pitch_predictor (will compute in dataloader) or
  # pitch_features_dir (will load from disk).
  # NOTE(review): both are set here; confirm which one the loader prefers.
  # Predictor choices: pm | crepe | harvest | dio | rmvpe | fcpe
  pitch_predictor: rmvpe
  pitch_features_dir: ${data.dataset_dir}/pitch_features/

  # --- Speaker embeddings ---
  # For speaker embedding extraction, set use_spk_emb to true together with
  # spk_embeddings_dir (will load from disk), or configure the model to compute
  # it on forward.
  use_spk_emb: true
  spk_embeddings_dir: ${data.dataset_dir}/spk_embeddings

  # --- Content features ---
  # For content feature extraction, set content_feature_dir (will load from
  # disk) or configure the model to compute it on forward.
  content_feature_dir: null

  # --- SR augmentation ---
  # SR augmentation is deprecated; keep train.use_sr set to false.
  sr_min_max: [68, 92]

# Network architecture, starting checkpoints, and optional online encoders.
model:
  save_dir: null
  filter_channels: 768
  # Checkpoints to start from (presumably used when train.resume_training is
  # false; confirm against the training script).
  finetune_from_model:
    discriminator: /raid/lucasgris/free-svc/D-freevc-24.pth
    generator: /raid/lucasgris/free-svc/freevc-24.pth
  hidden_channels: 192
  inter_channels: 192
  kernel_size: 3
  n_heads: 2
  n_layers_q: 3
  n_layers: 6
  p_dropout: 0.1
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  resblock_kernel_sizes: [3, 7, 11]
  # NOTE(review): upstream VITS/FreeVC configs spell this as the string "1" and
  # compare against '1'; confirm the consumer here accepts an integer.
  resblock: 1
  c_dim: 768
  upsample_initial_channel: 512
  upsample_kernel_sizes: [16, 16, 4, 4]
  # 10 * 8 * 2 * 2 = 320, the same value as data.hop_length.
  upsample_rates: [10, 8, 2, 2]
  use_spectral_norm: false
  freeze_external_spk: true
  device: cuda
  # For online speaker embedding extraction, set use_spk_emb to true and choose
  # a spk_encoder_type.
  use_spk_emb: false
  # gin_channels = spk_encoder.embedding_dim
  gin_channels: null
  # Listed choice: ECAPA2SpeakerEncoder16k (the original list ends with a bare
  # `|`, so further choices may exist).
  spk_encoder_type: null
  # For content feature extraction, set content_encoder_type and
  # content_encoder_ckpt; null means load from disk (see the data section).
  # Types: hubert | wavlm
  content_encoder_type: null
  # Checkpoints: [path] | models/wavlm/WavLM-Large.pt | lengyue233/content-vec-best
  content_encoder_ckpt: null
  # Alternative: freevc-bottleneck
  post_content_encoder_type: vits-encoder-with-uv-emb
  coarse_f0: true
  cond_f0_on_flow: false