##########################################################
# FastDiff vocoder training configuration
##########################################################
#############
# Custom dataset preprocess
#############
# Mel-spectrogram / STFT parameters used when preparing audio features.
audio_num_mel_bins: 80
audio_sample_rate: 22050
hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
fmax: 7600 # To be increased/reduced depending on data.
fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
min_level_db: -100 # NOTE(review): presumably the dB floor used for spectrogram normalization — confirm in preprocessing code
ref_level_db: 20 # NOTE(review): presumably the reference level subtracted before normalization — confirm
griffin_lim_iters: 60 # iteration count for Griffin-Lim (name suggests debug/reference phase reconstruction)
num_spk: 1 # number of speakers
mel_vmin: -6 # NOTE(review): looks like a value range for mel features (clipping or plotting) — confirm
mel_vmax: 1.5
#############
# FastDiff Model
#############
# Architecture hyper-parameters for the FastDiff network.
audio_channels: 1
inner_channels: 32
cond_channels: 80 # equals audio_num_mel_bins above (mel conditioning)
upsample_ratios: [8, 8, 4] # 8*8*4 = 256 = hop_size above
lvc_layers_each_block: 4
lvc_kernel_size: 3
kpnet_hidden_channels: 64
kpnet_conv_size: 3
dropout: 0.0
diffusion_step_embed_dim_in: 128
diffusion_step_embed_dim_mid: 512
diffusion_step_embed_dim_out: 512
# Canonical lowercase boolean: `True` is not a YAML 1.2 bool and trips
# strict parsers/linters (yamllint `truthy`); value is unchanged for PyYAML.
use_weight_norm: true
###########
# Diffusion
###########
T: 1000 # number of diffusion time steps
beta_0: 0.000001 # noise-variance schedule start value
beta_T: 0.01 # noise-variance schedule end value
noise_schedule: '' # empty string — NOTE(review): presumably derived from beta_0/beta_T/T at runtime; confirm in task code
N: '' # NOTE(review): likely the number of reverse (inference) steps when a custom schedule is used — confirm
###########
# train and eval
###########
task_cls: modules.FastDiff.task.FastDiff.FastDiffTask # dotted import path of the training task class
max_updates: 1000000 # max training steps
max_samples: 25600 # audio length in training
max_sentences: 20 # max batch size in training
num_sanity_val_steps: -1 # NOTE(review): -1 looks like "run full validation as sanity check" (Lightning convention) — confirm
max_valid_sentences: 1 # max batch size in validation (mirrors max_sentences)
valid_infer_interval: 10000 # NOTE(review): presumably steps between inference during validation — confirm
val_check_interval: 2000 # steps between validation runs
num_test_samples: 0
num_valid_plots: 10 # number of validation examples to plot
#############
# Stage 1 of data processing
#############
pre_align_cls: egs.datasets.audio.pre_align.PreAlign # dotted import path of the pre-align class
# Children of pre_align_args must be indented under it: flush-left they would
# parse as independent top-level keys and leave pre_align_args null.
pre_align_args:
  nsample_per_mfa_group: 1000
  txt_processor: en # text front-end language
  use_tone: true # for ZH
  sox_resample: false
  sox_to_wav: false
  allow_no_txt: true # keep items even when no transcript is available
  trim_sil: false
  denoise: false
#############
# Stage 2 of data processing
#############
binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer # dotted import path of the binarizer class
# Children of binarization_args must be indented under it: flush-left they
# would parse as independent top-level keys and leave binarization_args null.
binarization_args:
  with_wav: true # vocoder training needs raw waveforms
  with_spk_embed: false
  with_align: false
  with_word: false
  with_txt: false
  with_f0: false
  shuffle: false
  with_spk_id: true
  with_f0cwt: false
  with_linear: false
  trim_eos_bos: false
  reset_phone_dict: true
  reset_word_dict: true
###########
# optimization
###########
# Without a decimal point, `2e-4` is resolved as the *string* "2e-4" by
# YAML 1.1 parsers such as PyYAML; `2.0e-4` resolves to the float 0.0002.
lr: 2.0e-4 # learning rate
weight_decay: 0
scheduler: rsqrt # rsqrt|none
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
clip_grad_norm: 1 # NOTE(review): presumably 0 disables the corresponding clipping — confirm in trainer code
clip_grad_value: 0
#############
# Setting for this Pytorch framework
#############
max_input_tokens: 1550
frames_multiple: 1
use_word_input: false
vocoder: FastDiff
vocoder_ckpt: checkpoints/FastDiff
vocoder_denoise_c: 0.0
max_tokens: 30000
max_valid_tokens: 60000
test_ids: [] # empty flow sequence (canonical spelling of `[ ]`)
profile_infer: false
out_wav_norm: false
save_gt: true
save_f0: false
aux_context_window: 0
test_input_dir: '' # 'wavs' # wav->wav inference
test_mel_dir: '' # 'mels' # mel->wav inference
# Canonical lowercase boolean (`True` is not a YAML 1.2 bool; yamllint `truthy`).
use_wav: true # mel->wav inference
pitch_extractor: parselmouth
loud_norm: false
endless_ds: true
test_num: 100
min_frames: 0
max_frames: 1548
ds_workers: 1 # dataloader worker count
gen_dir_name: ''
accumulate_grad_batches: 1
tb_log_interval: 100
print_nan_grads: false
work_dir: '' # experiment directory.
infer: false # inference
amp: false
debug: false
save_codes: []
save_best: true
num_ckpt_keep: 3
sort_by_len: true
load_ckpt: ''
check_val_every_n_epoch: 10
max_epochs: 1000
eval_max_batches: -1 # NOTE(review): -1 presumably means "all batches" — confirm
resume_from_checkpoint: 0
rename_tmux: true
valid_monitor_key: 'val_loss'
valid_monitor_mode: 'min'
train_set_name: 'train'
train_sets: ''
valid_set_name: 'valid'
test_set_name: 'test'
seed: 1234