| config: conf/ar_prior_train.yaml | |
| print_config: false | |
| log_level: INFO | |
| dry_run: false | |
| iterator_type: sequence | |
| output_dir: exp/tts_finetune_ar_prior | |
| ngpu: 1 | |
| seed: 0 | |
| num_workers: 1 | |
| num_att_plot: 3 | |
| dist_backend: nccl | |
| dist_init_method: env:// | |
| dist_world_size: null | |
| dist_rank: null | |
| local_rank: 0 | |
| dist_master_addr: null | |
| dist_master_port: null | |
| dist_launcher: null | |
| multiprocessing_distributed: false | |
| unused_parameters: false | |
| sharded_ddp: false | |
| cudnn_enabled: true | |
| cudnn_benchmark: false | |
| cudnn_deterministic: true | |
| collect_stats: false | |
| write_collected_feats: false | |
| max_epoch: 500 | |
| patience: null | |
| val_scheduler_criterion: | |
| - valid | |
| - loss | |
| early_stopping_criterion: | |
| - valid | |
| - loss | |
| - min | |
| best_model_criterion: | |
| - - valid | |
| - loss | |
| - min | |
| - - train | |
| - loss | |
| - min | |
| keep_nbest_models: 5 | |
| grad_clip: 1.0 | |
| grad_clip_type: 2.0 | |
| grad_noise: false | |
| accum_grad: 8 | |
| no_forward_run: false | |
| resume: true | |
| train_dtype: float32 | |
| use_amp: false | |
| log_interval: null | |
| use_tensorboard: true | |
| use_wandb: false | |
| wandb_project: null | |
| wandb_id: null | |
| detect_anomaly: false | |
| pretrain_path: null | |
| init_param: | |
| - /data/leuven/339/vsc33942/espnet-mirror/egs2/acapela_blizzard/tts1/exp/tts_train_raw_phn_none/valid.loss.best.pth:::tts.prosody_encoder.ar_prior | |
| freeze_param: | |
| - encoder.,prosody_encoder.ref_encoder.,prosody_encoder.fg_encoder.,prosody_encoder.global_encoder.,prosody_encoder.global_projection.,prosody_encoder.vq_layer.,prosody_encoder.qfg_projection,duration_predictor.,length_regulator,decoder.,feat_out,postnet | |
| num_iters_per_epoch: 50 | |
| batch_size: 20 | |
| valid_batch_size: null | |
| batch_bins: 3000000 | |
| valid_batch_bins: null | |
| train_shape_file: | |
| - exp/tts_stats_raw_phn_none/train/text_shape.phn | |
| - exp/tts_stats_raw_phn_none/train/speech_shape | |
| valid_shape_file: | |
| - exp/tts_stats_raw_phn_none/valid/text_shape.phn | |
| - exp/tts_stats_raw_phn_none/valid/speech_shape | |
| batch_type: numel | |
| valid_batch_type: null | |
| fold_length: | |
| - 150 | |
| - 204800 | |
| sort_in_batch: descending | |
| sort_batch: descending | |
| multiple_iterator: false | |
| chunk_length: 500 | |
| chunk_shift_ratio: 0.5 | |
| num_cache_chunks: 1024 | |
| train_data_path_and_name_and_type: | |
| - - dump/raw/tr_no_dev/text | |
| - text | |
| - text | |
| - - data/durations/tr_no_dev/durations | |
| - durations | |
| - text_int | |
| - - dump/raw/tr_no_dev/wav.scp | |
| - speech | |
| - sound | |
| valid_data_path_and_name_and_type: | |
| - - dump/raw/dev/text | |
| - text | |
| - text | |
| - - data/durations/dev/durations | |
| - durations | |
| - text_int | |
| - - dump/raw/dev/wav.scp | |
| - speech | |
| - sound | |
| allow_variable_data_keys: false | |
| max_cache_size: 0.0 | |
| max_cache_fd: 32 | |
| valid_max_cache_size: null | |
| optim: adam | |
| optim_conf: | |
| lr: 1.0 | |
| scheduler: noamlr | |
| scheduler_conf: | |
| model_size: 384 | |
| warmup_steps: 4000 | |
| token_list: | |
| - <blank> | |
| - <unk> | |
| - n | |
| - '@' | |
| - t | |
| - _ | |
| - s | |
| - I | |
| - r | |
| - d | |
| - l | |
| - m | |
| - i | |
| - '{' | |
| - z | |
| - D | |
| - w | |
| - r= | |
| - f | |
| - v | |
| - E1 | |
| - b | |
| - t_h | |
| - h | |
| - V | |
| - u | |
| - k | |
| - I1 | |
| - '{1' | |
| - k_h | |
| - N | |
| - EI1 | |
| - V1 | |
| - O1 | |
| - AI | |
| - H | |
| - S | |
| - p_h | |
| - '@U1' | |
| - i1 | |
| - g | |
| - AI1 | |
| - j | |
| - O | |
| - p | |
| - u1 | |
| - r=1 | |
| - tS | |
| - Or | |
| - '4' | |
| - A | |
| - Or1 | |
| - E | |
| - dZ | |
| - T | |
| - aU1 | |
| - U | |
| - Er1 | |
| - '@U' | |
| - U1 | |
| - Ar1 | |
| - Er | |
| - aU | |
| - EI | |
| - ir1 | |
| - l= | |
| - OI1 | |
| - Ar | |
| - Ur1 | |
| - n= | |
| - A1 | |
| - Z | |
| - '?' | |
| - ir | |
| - Ur | |
| - OI | |
| - <sos/eos> | |
| odim: null | |
| model_conf: {} | |
| use_preprocessor: true | |
| token_type: phn | |
| bpemodel: null | |
| non_linguistic_symbols: null | |
| cleaner: null | |
| g2p: null | |
| feats_extract: fbank | |
| feats_extract_conf: | |
| fs: 22050 | |
| fmin: 80 | |
| fmax: 7600 | |
| n_mels: 80 | |
| hop_length: 256 | |
| n_fft: 1024 | |
| win_length: null | |
| normalize: global_mvn | |
| normalize_conf: | |
| stats_file: feats_stats.npz | |
| tts: fastespeech | |
| tts_conf: | |
| adim: 128 | |
| aheads: 2 | |
| elayers: 4 | |
| eunits: 1536 | |
| dlayers: 4 | |
| dunits: 1536 | |
| positionwise_layer_type: conv1d | |
| positionwise_conv_kernel_size: 3 | |
| duration_predictor_layers: 2 | |
| duration_predictor_chans: 128 | |
| duration_predictor_kernel_size: 3 | |
| duration_predictor_dropout_rate: 0.2 | |
| postnet_layers: 5 | |
| postnet_filts: 5 | |
| postnet_chans: 256 | |
| use_masking: true | |
| use_scaled_pos_enc: true | |
| encoder_normalize_before: true | |
| decoder_normalize_before: true | |
| reduction_factor: 1 | |
| init_type: xavier_uniform | |
| init_enc_alpha: 1.0 | |
| init_dec_alpha: 1.0 | |
| transformer_enc_dropout_rate: 0.2 | |
| transformer_enc_positional_dropout_rate: 0.2 | |
| transformer_enc_attn_dropout_rate: 0.2 | |
| transformer_dec_dropout_rate: 0.2 | |
| transformer_dec_positional_dropout_rate: 0.2 | |
| transformer_dec_attn_dropout_rate: 0.2 | |
| ref_enc_conv_layers: 2 | |
| ref_enc_conv_kernel_size: 3 | |
| ref_enc_conv_stride: 2 | |
| ref_enc_gru_layers: 1 | |
| ref_enc_gru_units: 32 | |
| ref_emb_integration_type: add | |
| prosody_num_embs: 32 | |
| prosody_hidden_dim: 3 | |
| prosody_emb_integration_type: add | |
| pitch_extract: null | |
| pitch_extract_conf: {} | |
| pitch_normalize: null | |
| pitch_normalize_conf: {} | |
| energy_extract: null | |
| energy_extract_conf: {} | |
| energy_normalize: null | |
| energy_normalize_conf: {} | |
| required: | |
| - output_dir | |
| - token_list | |
| version: 0.9.9 | |
| distributed: false | |