tts / recipes /vctk /resnet_speaker_encoder /train_encoder.py

Upload 542 files

127d53c over 2 years ago

4.79 kB

	import os
	import subprocess

	from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

	# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
	from TTS.tts.configs.shared_configs import BaseDatasetConfig

	# Remember the directory the recipe was launched from. OUTPUT_PATH is built
	# from it, so it has to be captured before the working directory changes.
	CURRENT_PATH = os.getcwd()
	# Change the working directory to the TTS repository root; the trainer is
	# later invoked through the relative path "TTS/bin/train_encoder.py".
	# NOTE(review): "../../../" is resolved against the launch directory, so this
	# presumably assumes the script is started from its own recipe folder — confirm.
	os.chdir("../../../")

	### Definitions ###
	# dataset
	# NOTE(review): the directory name suggests a locally preprocessed copy of
	# VCTK (16 kHz, silence removed); the link below is for the raw corpus — confirm.
	VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/" # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
	# Simulated room impulse responses, used by the "rir" augmentation.
	RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/" # download: https://www.openslr.org/28/ (RIRS_NOISES)
	# MUSAN corpus, used as the sound source for the "additive" augmentation.
	MUSAN_PATH = "/raid/datasets/DA/musan/" # download: https://www.openslr.org/17/

	# training
	# The output folder lives under the directory the script was launched from
	# (CURRENT_PATH), not under the TTS root that the process changes into.
	OUTPUT_PATH = os.path.join(
	CURRENT_PATH, "resnet_speaker_encoder_training_output/"
	) # path to save the train logs and checkpoint
	# File the generated config is written to; its path is handed to the trainer.
	CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
	RESTORE_PATH = None # checkpoint to start from for transfer learning; if None, no --restore_path is passed to the trainer

	# Create the encoder configuration object.
	# SpeakerEncoderConfig is used for a speaker encoder; for an emotion encoder,
	# enable the EmotionEncoderConfig import and use instead:
	# config = EmotionEncoderConfig()
	config = SpeakerEncoderConfig()


	#### DATASET CONFIG ####
	# The formatter has to return the key "speaker_name" for the speaker encoder
	# and the key "emotion_name" for the emotion encoder.
	dataset_config = BaseDatasetConfig(
	    path=VCTK_PATH,
	    formatter="vctk",
	    language="en-us",
	    meta_file_train="",
	)

	# The config takes a list of datasets; this recipe trains on VCTK only.
	config.datasets = [dataset_config]


	#### TRAINING CONFIG ####
	# The encoder data loader balances the items in every batch so that each
	# speaker is equally represented, which the losses require. Two settings fix
	# the batch layout: the number of speakers per batch and the number of
	# utterances drawn for each speaker, i.e.
	#   final batch size = num_classes_in_batch * num_utter_per_class

	# Speakers (classes) per batch, same value for training and evaluation.
	config.num_classes_in_batch = config.eval_num_classes_in_batch = 100
	# Utterances per speaker in a batch, same value for training and evaluation.
	config.num_utter_per_class = config.eval_num_utter_per_class = 4

	# Data loader workers, same value for the training and validation loaders.
	config.num_loader_workers = config.num_val_loader_workers = 8

	# Training length, in epochs.
	config.epochs = 10000
	# Training objective.
	config.loss = "softmaxproto"

	# Evaluation runs are switched off in this recipe.
	config.run_eval = False

	# Checkpoints are written under OUTPUT_PATH ...
	config.output_path = OUTPUT_PATH
	# ... with a local checkpoint saved every save_step steps.
	config.save_step = 2000

	### Model Config ###
	# Encoder architecture settings. "model_name" supports "lstm" and "resnet".
	# NOTE(review): input_dim equals num_mels in the audio config (both 64);
	# presumably the two have to stay in sync — confirm before changing either.
	config.model_params = dict(
	    model_name="resnet",
	    input_dim=64,
	    use_torch_spec=True,
	    log_input=True,
	    proj_dim=512,  # embedding dim
	)

	### Audio Config ###
	# To train faster, the audio is divided into short parts; voice_len is the
	# length of each part in seconds.
	config.voice_len = 2.0
	# All remaining audio settings, passed as one dictionary.
	config.audio = dict(
	    fft_size=512,
	    win_length=400,
	    hop_length=160,
	    frame_shift_ms=None,
	    frame_length_ms=None,
	    stft_pad_mode="reflect",
	    sample_rate=16000,
	    resample=False,
	    preemphasis=0.97,
	    ref_level_db=20,
	    do_sound_norm=False,
	    do_trim_silence=False,
	    trim_db=60,
	    power=1.5,
	    griffin_lim_iters=60,
	    num_mels=64,
	    mel_fmin=0.0,
	    mel_fmax=8000.0,
	    spec_gain=20,
	    signal_norm=False,
	    min_level_db=-100,
	    symmetric_norm=False,
	    max_norm=4.0,
	    clip_norm=False,
	    stats_path=None,
	    do_rms_norm=True,
	    db_level=-27.0,
	)


	### Augmentation Config ###
	# Additive noise and room impulse response (RIR) simulation similar to:
	# https://arxiv.org/pdf/2009.14153.pdf

	# Every additive category uses exactly one noise per sample; the fragment is
	# unpacked into each category so that every category gets its own dict.
	one_noise_per_sample = {"min_num_noises": 1, "max_num_noises": 1}

	# RIR files: RIRS_NOISES corpus, download: https://www.openslr.org/28/
	rir_settings = {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"}

	# Additive sounds: MUSAN corpus, download: https://www.openslr.org/17/
	additive_settings = {
	    "sounds_path": MUSAN_PATH,
	    "speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, **one_noise_per_sample},
	    "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, **one_noise_per_sample},
	    "music": {"min_snr_in_db": 5, "max_snr_in_db": 15, **one_noise_per_sample},
	}

	gaussian_settings = {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05}

	config.audio_augmentation = {
	    "p": 0.5,  # probability of using one of the augmentations; 0 means disabled
	    "rir": rir_settings,
	    "additive": additive_settings,
	    "gaussian": gaussian_settings,
	}

	# Create the output folder before writing into it: nothing earlier in this
	# script creates OUTPUT_PATH, so on a first run the config file could not be
	# opened for writing.
	# NOTE(review): Coqpit's save_json appears to open the target file directly
	# without creating parent directories; the call below is harmless either way.
	os.makedirs(OUTPUT_PATH, exist_ok=True)

	# Write the assembled config to disk; the trainer loads it by path.
	config.save_json(CONFIG_OUT_PATH)

	print(CONFIG_OUT_PATH)

	# Build the trainer invocation as an argument list instead of a shell string,
	# so paths containing spaces or shell metacharacters are passed through
	# unchanged. The script path is relative to the TTS root, which is the
	# current working directory at this point.
	command = ["python", "TTS/bin/train_encoder.py", "--config_path", CONFIG_OUT_PATH]
	if RESTORE_PATH is not None:
	    # Continue from an existing checkpoint (transfer learning).
	    command += ["--restore_path", RESTORE_PATH]

	# As with the os.system call this replaces, the trainer's exit status is not
	# checked and does not affect this script's own exit status.
	subprocess.run(command, check=False)