diff --git a/ckpts_freevc/freevc.json b/ckpts_freevc/freevc.json new file mode 100644 index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1 --- /dev/null +++ b/ckpts_freevc/freevc.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/ckpts_freevc/freevc.pth b/ckpts_freevc/freevc.pth new file mode 100644 index 0000000000000000000000000000000000000000..976143bef5d846836704a38f7ad57cb0535d40b8 --- /dev/null +++ b/ckpts_freevc/freevc.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2cc2d047f63b80d1d6780e37611cec11a01d597560393b1fe6118158b3bd47f +size 472644351 diff --git a/dreamvoice/freevc/.gitattributes b/dreamvoice/freevc/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..717eda91d34e790b2de5140dd1c46748bdddef26 --- /dev/null +++ b/dreamvoice/freevc/.gitattributes @@ -0,0 +1,34 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text 
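A minimal usage sketch for the FreeVC assets added in this patch, based on the helpers defined in dreamvoice/freevc_wrapper.py further below. It assumes the checkpoint and config under ckpts_freevc/ have been fetched via Git LFS and that an example utterance such as p225_001.wav is available locally; the speaker embedding is a placeholder here, since get_freevc_models does not return a speaker encoder.

import torch
import librosa
import soundfile as sf

from dreamvoice.freevc_wrapper import get_freevc_models, convert

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load ckpts_freevc/freevc.json + ckpts_freevc/freevc.pth and the WavLM content model.
freevc, cmodel, hps = get_freevc_models(path='ckpts_freevc', device=device)

# WavLM content features from the source utterance (16 kHz per hps.data.sampling_rate).
wav_src, _ = librosa.load('p225_001.wav', sr=hps.data.sampling_rate)
wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
with torch.no_grad():
    content = cmodel(wav_src).last_hidden_state.transpose(1, 2)

# Placeholder [1, 256] speaker embedding; in practice this would come from the
# pretrained speaker encoder used elsewhere in the repo (assumption).
g_tgt = torch.randn(1, 256, device=device)

audio, sr = convert(freevc, content, g_tgt)  # returns a float waveform and 16000
sf.write('output.wav', audio, sr)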
diff --git a/dreamvoice/freevc/.gitignore b/dreamvoice/freevc/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e4008401fb75eb82773c4bdb3f4b886e2e6d34c4 --- /dev/null +++ b/dreamvoice/freevc/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +flagged \ No newline at end of file diff --git a/dreamvoice/freevc/README.md b/dreamvoice/freevc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9 --- /dev/null +++ b/dreamvoice/freevc/README.md @@ -0,0 +1,13 @@ +--- +title: FreeVC +emoji: 🚀 +colorFrom: gray +colorTo: red +sdk: gradio +sdk_version: 3.13.0 +app_file: app.py +pinned: false +license: mit +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/dreamvoice/freevc/app.py b/dreamvoice/freevc/app.py new file mode 100644 index 0000000000000000000000000000000000000000..982821f01caea503d8451f6c8e99096918705d79 --- /dev/null +++ b/dreamvoice/freevc/app.py @@ -0,0 +1,92 @@ +import os +import torch +import librosa +import gradio as gr +from scipy.io.wavfile import write +from transformers import WavLMModel + +import utils +from models import SynthesizerTrn +from mel_processing import mel_spectrogram_torch +from speaker_encoder.voice_encoder import SpeakerEncoder + +''' +def get_wavlm(): + os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU') + shutil.move('WavLM-Large.pt', 'wavlm') +''' + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# print("Loading FreeVC...") +# hps = utils.get_hparams_from_file("configs/freevc.json") +# freevc = SynthesizerTrn( +# hps.data.filter_length // 2 + 1, +# hps.train.segment_size // hps.data.hop_length, +# **hps.model).to(device) +# _ = freevc.eval() +# _ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading FreeVC(24k)...") +hps = utils.get_hparams_from_file("configs/freevc-24.json") +freevc_24 = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc_24.eval() +_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None) + +# print("Loading FreeVC-s...") +# hps = utils.get_hparams_from_file("configs/freevc-s.json") +# freevc_s = SynthesizerTrn( +# hps.data.filter_length // 2 + 1, +# hps.train.segment_size // hps.data.hop_length, +# **hps.model).to(device) +# _ = freevc_s.eval() +# _ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None) +# +# print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +def convert(model, cmodel, src, tgt): + with torch.no_grad(): + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + # infer + if model == "FreeVC": + audio = freevc.infer(c, g=g_tgt) + elif model == "FreeVC-s": + audio = freevc_s.infer(c, mel=mel_tgt) + else: + audio = freevc_24.infer(c, g=g_tgt) + audio = audio[0][0].data.cpu().float().numpy() + if model == "FreeVC" or model == "FreeVC-s": + write("out.wav", hps.data.sampling_rate, audio) + 
else: + write("out.wav", 24000, audio) + out = "out.wav" + return out + +# model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model") +# audio1 = gr.inputs.Audio(label="Source Audio", type='filepath') +# audio2 = gr.inputs.Audio(label="Reference Audio", type='filepath') +# inputs = [model, audio1, audio2] +# outputs = gr.outputs.Audio(label="Output Audio", type='filepath') +# +# title = "FreeVC" +# description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please trim it before submitting." +# article = "
Paper | Github Repo
" +# +# examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']] +# +# gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch() +convert(freevc_24, cmodel, 'p225_001.wav', 'p226_002.wav') \ No newline at end of file diff --git a/dreamvoice/freevc/commons.py b/dreamvoice/freevc/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1 --- /dev/null +++ b/dreamvoice/freevc/commons.py @@ -0,0 +1,171 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def rand_spec_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d( + length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (num_timescales - 1)) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def 
cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1. 
/ norm_type) + return total_norm diff --git a/dreamvoice/freevc/configs/freevc-24.json b/dreamvoice/freevc/configs/freevc-24.json new file mode 100644 index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2 --- /dev/null +++ b/dreamvoice/freevc/configs/freevc-24.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8640, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8008" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,4,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/freevc/configs/freevc-s.json b/dreamvoice/freevc/configs/freevc-s.json new file mode 100644 index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1 --- /dev/null +++ b/dreamvoice/freevc/configs/freevc-s.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": false + } +} diff --git a/dreamvoice/freevc/configs/freevc.json b/dreamvoice/freevc/configs/freevc.json new file mode 100644 index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1 --- /dev/null +++ b/dreamvoice/freevc/configs/freevc.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + 
"use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/freevc/mel_processing.py b/dreamvoice/freevc/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54 --- /dev/null +++ b/dreamvoice/freevc/mel_processing.py @@ -0,0 +1,112 @@ +import math +import os +import random +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.data +import numpy as np +import librosa +import librosa.util as librosa_util +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + '_' + str(spec.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = 
spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/dreamvoice/freevc/models.py b/dreamvoice/freevc/models.py new file mode 100644 index 0000000000000000000000000000000000000000..11d3247337c6cd49351490c7f17cb33cea52e361 --- /dev/null +++ b/dreamvoice/freevc/models.py @@ -0,0 +1,351 @@ +import copy +import math +import torch +from torch import nn +from torch.nn import functional as F + +from .commons import sequence_mask, rand_slice_segments +from .modules import ResidualCouplingLayer, WN, Flip, ResBlock1, ResBlock2, LRELU_SLOPE + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .commons import init_weights, get_padding + + +class ResidualCouplingBlock(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class Encoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = 
nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) + resblock = ResBlock1 if resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append(weight_norm( + ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), + k, u, padding=(k-u)//2))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x) + else: + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + 
+ +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2,3,5,7,11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class SpeakerEncoder(torch.nn.Module): + def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): + mel_slices = [] + for i in range(0, total_frames-partial_frames, partial_hop): + mel_range = torch.arange(i, i+partial_frames) + mel_slices.append(mel_range) + + return mel_slices + + def embed_utterance(self, mel, partial_frames=128, partial_hop=64): + mel_len = mel.size(1) + last_mel = mel[:,-partial_frames:] + + if mel_len > partial_frames: + mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) + mels = list(mel[:,s] for s in mel_slices) + mels.append(last_mel) + mels = torch.stack(tuple(mels), 0).squeeze(1) + + with torch.no_grad(): + partial_embeds = self(mels) + embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) + #embed = embed / torch.linalg.norm(embed, 2) + else: + with torch.no_grad(): + embed = self(last_mel) + + return embed + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + use_spk, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = 
hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + self.use_spk = use_spk + + self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16) + self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + if not self.use_spk: + self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels) + + def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if spec_lengths == None: + spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) + + if not self.use_spk: + g = self.enc_spk(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + _, m_p, logs_p, _ = self.enc_p(c, c_lengths) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + z_p = self.flow(z, spec_mask, g=g) + + z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size) + o = self.dec(z_slice, g=g) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, c, g=None, mel=None, c_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if not self.use_spk: + g = self.enc_spk.embed_utterance(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g) + + return o diff --git a/dreamvoice/freevc/modules.py b/dreamvoice/freevc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..53a51558f78899cb0e77c595fe2ca9b3d3c762f5 --- /dev/null +++ b/dreamvoice/freevc/modules.py @@ -0,0 +1,341 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + 
self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers-1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size ** i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, + groups=channels, dilation=dilation, padding=padding + )) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + self.hidden_channels =hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, 
g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + else: + g_l = torch.zeros_like(x_in) + + acts = fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:,:self.hidden_channels,:] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:,self.hidden_channels:,:] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class 
Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels,1)) + self.logs = nn.Parameter(torch.zeros(channels,1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1,2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels]*2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1,2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/dreamvoice/freevc/requirements.txt b/dreamvoice/freevc/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c --- /dev/null +++ b/dreamvoice/freevc/requirements.txt @@ -0,0 +1,8 @@ +altair +httpx==0.24.1 +numpy +scipy +torch +transformers +librosa +webrtcvad==2.0.10 diff --git a/dreamvoice/freevc/utils.py b/dreamvoice/freevc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e931b1f56a976674425c5637b0767d3485c51f69 --- /dev/null +++ b/dreamvoice/freevc/utils.py @@ -0,0 +1,305 @@ +import os +import sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch +from torch.nn import functional as F +from .commons import sequence_mask + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def get_cmodel(rank): + checkpoint = torch.load('wavlm/WavLM-Large.pt') + cfg = WavLMConfig(checkpoint['cfg']) + cmodel = WavLM(cfg).cuda(rank) + cmodel.load_state_dict(checkpoint['model']) + cmodel.eval() + return cmodel + + +def get_content(cmodel, y): + with torch.no_grad(): + c = cmodel.extract_features(y.squeeze(1))[0] + c = c.transpose(1, 2) + return c + + +def get_vocoder(rank): + with open("hifigan/config.json", "r") as f: + config = json.load(f) + config = 
hifigan.AttrDict(config) + vocoder = hifigan.Generator(config) + ckpt = torch.load("hifigan/generator_v1") + vocoder.load_state_dict(ckpt["generator"]) + vocoder.eval() + vocoder.remove_weight_norm() + vocoder.cuda(rank) + return vocoder + + +def transform(mel, height): # 68-92 + #r = np.random.random() + #rate = r * 0.3 + 0.85 # 0.85-1.15 + #height = int(mel.size(-2) * rate) + tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1))) + if height >= mel.size(-2): + return tgt[:, :mel.size(-2), :] + else: + silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1) + silence += torch.randn_like(silence) / 10 + return torch.cat((tgt, silence), 1) + + +def stretch(mel, width): # 0.5-2 + return torchvision.transforms.functional.resize(mel, (mel.size(-2), width)) + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + if optimizer is not None: + optimizer.load_state_dict(checkpoint_dict['optimizer']) + saved_state_dict = checkpoint_dict['model'] + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict= {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info("Loaded checkpoint '{}' (iteration {})" .format( + checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info("Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path)) + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save({'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate}, checkpoint_path) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = 
data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding='utf-8') as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default="./configs/base.json", + help='JSON file for configuration') + parser.add_argument('-m', '--model', type=str, required=True, + help='Model name') + + args = parser.parse_args() + model_dir = os.path.join("./logs", args.model) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + )) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/dreamvoice/freevc_wrapper.py b/dreamvoice/freevc_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..958cd74a44e119cbafb2365ca1ebb4a7eac66c7e --- /dev/null +++ b/dreamvoice/freevc_wrapper.py @@ -0,0 +1,63 @@ +import os +import torch +import librosa +import soundfile as sf +from pathlib import Path + +from transformers import WavLMModel +from .freevc.utils import load_checkpoint, get_hparams_from_file +from .freevc.models import SynthesizerTrn +# from mel_processing import mel_spectrogram_torch +# from free_vc.speaker_encoder.voice_encoder import SpeakerEncoder +# from speaker_encoder.voice_encoder import SpeakerEncoder + + +def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'): + hps = get_hparams_from_file(f"{path}/freevc.json") + freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) + freevc.eval() + load_checkpoint(f"{path}/freevc.pth", freevc, None) + + cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + cmodel.eval() + + # smodel = spk_encoder.load_model(Path(speaker_path), device) + # smodel = spk_encoder.load_model(Path(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt"), 'cuda') + # smodel = SpeakerEncoder(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt", device) + + return freevc, cmodel, hps + + +@torch.no_grad() +def convert(freevc, content, speaker): + audio = freevc.infer(content, g=speaker) + audio = audio[0][0].data.cpu().float().numpy() + return audio, 16000 + + +if __name__ == '__main__': + freevc_24, cmodel, smodel, hps = get_freevc_models() + + tgt = 'p226_002.wav' + # src = 'p226_002.wav' + src = 'p225_001.wav' + device = 'cuda' + + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + # g_tgt = spk_encoder.embed_utterance_batch(torch.tensor(wav_tgt).unsqueeze(0).cuda()) + + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + content = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + + output, sr = convert(freevc_24, content, g_tgt) + + sf.write('output.wav', output, 
sr) \ No newline at end of file diff --git a/dreamvoice/plugin.py b/dreamvoice/plugin.py index 12243ecb47d63270aef13fd44c4dbd040198879d..aeef16a90eaa8851293ca2090bdcecb5544dde02 100644 --- a/dreamvoice/plugin.py +++ b/dreamvoice/plugin.py @@ -108,7 +108,6 @@ class DreamVoice_Plugin: self.spk_encoder = spk_encoder self.spk_embed_cache = None - @torch.no_grad() def gen_spk(self, prompt, prompt_guidance_scale=3, prompt_guidance_rescale=0.0, diff --git a/dreamvoice/plugin_ckpts/freevc.pt b/dreamvoice/plugin_ckpts/freevc.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5676c4bbc95085ed5a7da8b7d1d479849b1bd39 --- /dev/null +++ b/dreamvoice/plugin_ckpts/freevc.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0589fd38d965a7f8aab6eb3bedae5d1c007acb0f305e04bbe0fd4a771fff717d +size 104892189 diff --git a/dreamvoice/plugin_freevc.yaml b/dreamvoice/plugin_freevc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e67b8c26e8d4c0eb36d0650639e8a547f6e90691 --- /dev/null +++ b/dreamvoice/plugin_freevc.yaml @@ -0,0 +1,8 @@ +version: 1.1 + +lm_path: 'google/flan-t5-base' + +dreamvg: + config_path: 'src/configs/plugin_cross_freevc.yaml' + ckpt_path: 'plugin_ckpts/freevc.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/plugin_ckpts/freevc.pt' \ No newline at end of file diff --git a/dreamvoice/src/configs/plugin_cross.yaml b/dreamvoice/src/configs/plugin_cross_freevc.yaml similarity index 100% rename from dreamvoice/src/configs/plugin_cross.yaml rename to dreamvoice/src/configs/plugin_cross_freevc.yaml diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/README.md @@ -0,0 +1,13 @@ +--- +title: FreeVC +emoji: 🚀 +colorFrom: gray +colorTo: red +sdk: gradio +sdk_version: 3.13.0 +app_file: app.py +pinned: false +license: mit +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/app.py b/dreamvoice/train_utils/prepare_freevc/freevc/app.py new file mode 100644 index 0000000000000000000000000000000000000000..040c13a7f789e9edf88565c756d1059c2a3f1e01 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/app.py @@ -0,0 +1,103 @@ +import os +import torch +import librosa +import gradio as gr +from scipy.io.wavfile import write +from transformers import WavLMModel + +import utils +from models import SynthesizerTrn +from mel_processing import mel_spectrogram_torch +from speaker_encoder.voice_encoder import SpeakerEncoder + +''' +def get_wavlm(): + os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU') + shutil.move('WavLM-Large.pt', 'wavlm') +''' + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +print("Loading FreeVC...") +hps = utils.get_hparams_from_file("configs/freevc.json") +freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc.eval() +_ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading FreeVC(24k)...") +hps = utils.get_hparams_from_file("configs/freevc-24.json") +freevc_24 = SynthesizerTrn( + 
hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc_24.eval() +_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None) + +print("Loading FreeVC-s...") +hps = utils.get_hparams_from_file("configs/freevc-s.json") +freevc_s = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc_s.eval() +_ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None) + +print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +def convert(model, src, tgt): + with torch.no_grad(): + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + if model == "FreeVC" or model == "FreeVC (24kHz)": + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + else: + wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device) + mel_tgt = mel_spectrogram_torch( + wav_tgt, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + # infer + if model == "FreeVC": + audio = freevc.infer(c, g=g_tgt) + elif model == "FreeVC-s": + audio = freevc_s.infer(c, mel=mel_tgt) + else: + audio = freevc_24.infer(c, g=g_tgt) + audio = audio[0][0].data.cpu().float().numpy() + if model == "FreeVC" or model == "FreeVC-s": + write("out.wav", hps.data.sampling_rate, audio) + else: + write("out.wav", 24000, audio) + out = "out.wav" + return out + +model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model") +audio1 = gr.Audio(label="Source Audio", type='filepath') +audio2 = gr.Audio(label="Reference Audio", type='filepath') +inputs = [model, audio1, audio2] +outputs = gr.Audio(label="Output Audio", type='filepath') + +title = "FreeVC" +description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please trim it before submitting." +article = "
Paper | Github Repo
" + +examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']] + +gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch() diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/commons.py b/dreamvoice/train_utils/prepare_freevc/freevc/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/commons.py @@ -0,0 +1,171 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def rand_spec_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d( + length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (num_timescales - 1)) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def 
cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1. 
/ norm_type) + return total_norm diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json new file mode 100644 index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8640, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8008" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,4,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json new file mode 100644 index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": false + } +} diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json new file mode 100644 index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + 
"epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py b/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..e16a7adcabb167ddc2c95e6d4bc722542f5fb716 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py @@ -0,0 +1,69 @@ +import os +import torch +import torch.nn.functional as F +import librosa +import sounddevice as sd +from transformers import WavLMModel +from scipy.io.wavfile import write +from models import SynthesizerTrn +from speaker_encoder.voice_encoder import SpeakerEncoder +import utils +import numpy as np +from transformers import T5Tokenizer, T5EncoderModel +from src.plugin_wrapper import DreamVG +import soundfile as sf + + +# Load configurations and models +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +print("Loading FreeVC...") +hps = utils.get_hparams_from_file("configs/freevc.json") +freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +freevc.eval() +utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) + +print("Loading Speaker Encoder...") +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +lm_path = 'google/flan-t5-base' +tokenizer = T5Tokenizer.from_pretrained(lm_path) +text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval() + +dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml', + ckpt_path='checkpoints/dreamvc_plugin.pt', + device=device) + + +prompt = "girl's voice, very young and cute" +prompt_guidance_scale = 3.0 + +text_batch = tokenizer(prompt, max_length=32, + padding='max_length', truncation=True, return_tensors="pt") +text, text_mask = text_batch.input_ids.to(device), \ + text_batch.attention_mask.to(device) +text = text_encoder(input_ids=text, attention_mask=text_mask)[0] +target_embedding = dreamvg.inference([text, text_mask], + guidance_scale=prompt_guidance_scale, + guidance_rescale=0.0, + ddim_steps=100, eta=1, + random_seed=None) + +# Convert to tensor and pad +audio, sr = librosa.load('segment_1.mp3', sr=16000) +audio = torch.from_numpy(audio).unsqueeze(0).to(device).float() +audio = F.pad(audio, (40, 40)) + +# Extract content features using WavLM +c = 
cmodel(audio).last_hidden_state.transpose(1, 2).to(device) + +audio = freevc.infer(c, g=target_embedding) +audio = audio[0][0].data.cpu().float().numpy() + +sf.write('freevc_out.wav', audio, 16000) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py b/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py @@ -0,0 +1,112 @@ +import math +import os +import random +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.data +import numpy as np +import librosa +import librosa.util as librosa_util +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + '_' + str(spec.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = 
librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/models.py b/dreamvoice/train_utils/prepare_freevc/freevc/models.py new file mode 100644 index 0000000000000000000000000000000000000000..f732af47416bc0ed884a821e063fed5b7eab7957 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/models.py @@ -0,0 +1,351 @@ +import copy +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import modules + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from commons import init_weights, get_padding + + +class ResidualCouplingBlock(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class Encoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, 
upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) + resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append(weight_norm( + ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), + k, u, padding=(k-u)//2))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x) + else: + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2,3,5,7,11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class SpeakerEncoder(torch.nn.Module): + def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): + mel_slices = [] + for i in range(0, total_frames-partial_frames, partial_hop): + mel_range = torch.arange(i, i+partial_frames) + mel_slices.append(mel_range) + + return mel_slices + + def embed_utterance(self, mel, partial_frames=128, partial_hop=64): + mel_len = mel.size(1) + last_mel = mel[:,-partial_frames:] + + if mel_len > partial_frames: + mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) + mels = list(mel[:,s] for s in mel_slices) + mels.append(last_mel) + mels = torch.stack(tuple(mels), 0).squeeze(1) + + with torch.no_grad(): + partial_embeds = self(mels) + embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) + #embed = embed / torch.linalg.norm(embed, 2) + else: + with torch.no_grad(): + embed = self(last_mel) + + return embed + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + use_spk, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + 
self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + self.use_spk = use_spk + + self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16) + self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + if not self.use_spk: + self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels) + + def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if spec_lengths == None: + spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) + + if not self.use_spk: + g = self.enc_spk(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + _, m_p, logs_p, _ = self.enc_p(c, c_lengths) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + z_p = self.flow(z, spec_mask, g=g) + + z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size) + o = self.dec(z_slice, g=g) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, c, g=None, mel=None, c_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if not self.use_spk: + g = self.enc_spk.embed_utterance(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g) + + return o diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/modules.py b/dreamvoice/train_utils/prepare_freevc/freevc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..1eeb47c190cdc4d42d5de5fa47f94ecc1b931c5d --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/modules.py @@ -0,0 +1,342 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +import commons +from commons import init_weights, get_padding + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers-1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size ** i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, + groups=channels, dilation=dilation, padding=padding + )) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + self.hidden_channels =hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) 
+ + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:,:self.hidden_channels,:] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:,self.hidden_channels:,:] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = 
torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels,1)) + self.logs = nn.Parameter(torch.zeros(channels,1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1,2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels]*2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1,2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt b/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt @@ -0,0 +1,8 @@ +altair +httpx==0.24.1 +numpy +scipy +torch +transformers +librosa +webrtcvad==2.0.10 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e084cf69514c429559d9a086b97f3721bd7a8b23 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml @@ -0,0 +1,47 @@ +version: 1.0 + +system: "base" + +model: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + 
clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d8b894cd095accdcb9eab7788e8088d0430eae1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml @@ -0,0 +1,34 @@ +version: 1.0 + +system: "base" + +diffwrap: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [128, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c41681e2b762ad7d037780e560f706eba443fd66 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml @@ -0,0 +1,45 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af34723cf72c0cdbb079f0d8797a39527c04f0ff --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml @@ -0,0 +1,33 @@ +version: 1.0 + +system: "cross" + +diffwrap: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [100, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7189aa2355830ed46a97fcb3f29b94b2e423198e --- /dev/null +++ 
b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml @@ -0,0 +1,39 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + + unet: + sample_size: [1, 1] + in_channels: 256 + out_channels: 256 + layers_per_block: 2 + block_out_channels: [256] + down_block_types: + [ + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 0.05 + shift: -0.035 + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/debug.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e1e827b1e8f82be63a40ce6204d1d83c10afc3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py @@ -0,0 +1,103 @@ +import os +import torch +import librosa +import numpy as np +import soundfile as sf +import pandas as pd +# from feats.hubert_model import get_soft_model, get_hubert_soft_content +from feats.contentvec_hf import get_content_model, get_content +# from modules.speaker_encoder.encoder import inference as spk_encoder +# from pathlib import Path +from tqdm import tqdm +from multiprocessing import Process +import pyworld as pw + + +def resample_save(infolder, audio_path, model, + audio_sr=24000, content_sr=16000, min_length=1.92, + content_resolution=50, + save_path='features'): + if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False: + audio, sr = librosa.load(infolder + audio_path, sr=content_sr) + final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution) + # final_length = final_length / content_sr + + length = max(round(min_length*content_sr), round(final_length)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + + # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = get_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = content.cpu() + os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True) + torch.save(content, save_path + '/' + 'content/' + audio_path+'.pt') + # print(audio_save.shape) + # print(content.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr)) + # print(save_path + '/' + 'audio_16k/' + audio_path) + + audio, sr = librosa.load(infolder + audio_path, sr=audio_sr) + length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + # print(audio_save.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' 
+ 'audio_24k/' + audio_path, audio_save, int(sr)) + + +def extract_f0(in_folder, audio_path, save_path): + audio, sr = librosa.load(in_folder + audio_path, sr=None) + assert sr == 16000 + if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False: + # wav = audio + # wav = np.pad(wav, int((1024-320)/2), mode='reflect') + # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr, + # fmin=librosa.note_to_hz('C2'), + # fmax=librosa.note_to_hz('C6')) + + _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000) + f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1] + + f0 = np.nan_to_num(f0) + os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True) + # print(save_path + '/' + 'f0/' + audio_path + '.pt') + torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt') + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_f0_main(in_folder, audio_paths, save_path): + for audio_path in tqdm(audio_paths): + extract_f0(in_folder, audio_path, save_path) + + +if __name__ == '__main__': + df = pd.read_csv('../test_data/vc_meta.csv') + # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda') + model = get_content_model().to('cuda') + # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda") + for i in tqdm(range(len(df))): + row = df.iloc[i] + in_path = row['path'] + resample_save('../test_data/', in_path, model, save_path='../features/') + + in_folder = '../features/audio_16k/' + audio_files = list(df['path']) + save_path = '../features/' + cores = 6 + + subsets = chunks(audio_files, cores) + + for subset in subsets: + t = Process(target=extract_f0_main, args=(in_folder, subset, save_path)) + t.start() \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py new file mode 100644 index 0000000000000000000000000000000000000000..099f5888a5f0e1eb5e9cf3c68814a0365ff75c30 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py @@ -0,0 +1,42 @@ +import torch +import librosa +from fairseq import checkpoint_utils +import torch.nn.functional as F + + +def get_model(vec_path): + print("load model(s) from {}".format(vec_path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [vec_path], + suffix="", + ) + model = models[0] + model.eval() + return model + + +@torch.no_grad() +def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + feats = wav_16k_tensor + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.to(wav_16k_tensor.device), + "padding_mask": padding_mask.to(wav_16k_tensor.device), + "output_layer": layer + } + logits = hmodel.extract_features(**inputs)[0] + # feats = hmodel.final_proj(logits[0]) + return logits + + +if __name__ == '__main__': + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + model = get_model('../../ckpts/checkpoint_best_legacy_500.pt') + model = model.cuda() + content = get_content(model, torch.tensor([audio])) + print(content) \ No newline at end of file diff --git 
a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..1dad4889234a27fd1631d9265684af14560c2638 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py @@ -0,0 +1,40 @@ +from transformers import HubertModel +import torch.nn as nn +import torch +import torch.nn.functional as F +import librosa + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + + # The final projection layer is only used for backward compatibility. + # Following https://github.com/auspicious3000/contentvec/issues/6 + # Remove this layer is necessary to achieve the desired outcome. + self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def get_content_model(config='lengyue233/content-vec-best'): + model = HubertModelWithFinalProj.from_pretrained(config) + model.eval() + return model + + +@torch.no_grad() +def get_content(model, wav_16k_tensor, device='cuda'): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + logits = model(wav_16k_tensor)['last_hidden_state'] + return logits + + +if __name__ == '__main__': + model = get_content_model().cuda() + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + audio = torch.tensor([audio]) + content = get_content(model, audio, 'cuda') + print(content) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0202868f93e8b1be2f925f2ec6b22f3df691e8c3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore @@ -0,0 +1,132 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# VSCode project settings +.vscode + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6eb2af050447968cc32481fcfe67b5a4c6cdc69e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Benjamin van Niekerk + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..68602858ed726acd4f99ce9fecca008f3511dc90 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md @@ -0,0 +1,161 @@ +# HuBERT + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2111.02392) +[![demo](https://img.shields.io/static/v1?message=Audio%20Samples&logo=Github&labelColor=grey&color=blue&logoColor=white&label=%20&style=flat)](https://bshall.github.io/soft-vc/) +[![colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb) + +Training and inference scripts for the HuBERT content encoders in [A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion](https://ieeexplore.ieee.org/abstract/document/9746484). +For more details see [soft-vc](https://github.com/bshall/soft-vc). Audio samples can be found [here](https://bshall.github.io/soft-vc/). Colab demo can be found [here](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb). + +
+[Figure: Soft-VC architecture diagram, "Soft-VC" (content-encoder.png in this directory)]
+
+**Fig 1:** Architecture of the voice conversion system. a) The discrete content encoder clusters audio features to produce a sequence of discrete speech units. b) The soft content encoder is trained to predict the discrete units. The acoustic model transforms the discrete/soft speech units into a target spectrogram. The vocoder converts the spectrogram into an audio waveform.
+
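+In this repository the encoder is also wrapped by `src/feats/hubert_model.py` (added later in this diff), which loads a local HuBERT-Soft checkpoint instead of going through `torch.hub`. A minimal sketch, where the import path and the checkpoint/audio paths are assumptions for illustration:
+
+```python
+import torch
+import librosa
+
+# Helpers defined in src/feats/hubert_model.py; import path is an assumption.
+from src.feats.hubert_model import get_soft_model, get_hubert_soft_content
+
+model = get_soft_model('pre_ckpts/hubert_soft.pt').to('cuda')  # checkpoint path is illustrative
+
+# 16 kHz mono audio, shaped (1, T); the helper adds the channel dimension itself.
+audio, sr = librosa.load('test.wav', sr=16000)
+units = get_hubert_soft_content(model, torch.tensor([audio]), device='cuda')
+print(units.shape)  # (1, N, 256) soft units at roughly 50 frames per second
+```
+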
+ +## Example Usage + +### Programmatic Usage + +```python +import torch, torchaudio + +# Load checkpoint (either hubert_soft or hubert_discrete) +hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda() + +# Load audio +wav, sr = torchaudio.load("path/to/wav") +assert sr == 16000 +wav = wav.unsqueeze(0).cuda() + +# Extract speech units +units = hubert.units(x) +``` + +### Script-Based Usage + +``` +usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir + +Encode an audio dataset. + +positional arguments: + {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete) + in-dir path to the dataset directory. + out-dir path to the output directory. + +optional arguments: + -h, --help show this help message and exit + --extension EXTENSION + extension of the audio files (defaults to .flac). +``` + +## Training + +### Step 1: Dataset Preparation + +Download and extract the [LibriSpeech](https://www.openslr.org/12) corpus. The training script expects the following tree structure for the dataset directory: + +``` +│ lengths.json +│ +└───wavs + ├───dev-* + │ ├───84 + │ ├───... + │ └───8842 + └───train-* + ├───19 + ├───... + └───8975 +``` + +The `train-*` and `dev-*` directories should contain the training and validation splits respectively. Note that there can be multiple `train` and `dev` folders e.g., `train-clean-100`, `train-other-500`, etc. Finally, the `lengths.json` file should contain key-value pairs with the file path and number of samples: + +```json +{ + "dev-clean/1272/128104/1272-128104-0000": 93680, + "dev-clean/1272/128104/1272-128104-0001": 77040, +} +``` + +### Step 2: Extract Discrete Speech Units + +Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script: + +``` +usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir + +Encode an audio dataset. + +positional arguments: + {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete) + in-dir path to the dataset directory. + out-dir path to the output directory. + +optional arguments: + -h, --help show this help message and exit + --extension EXTENSION + extension of the audio files (defaults to .flac). +``` + +for example: + +``` +python encode.py discrete path/to/LibriSpeech/wavs path/to/LibriSpeech/discrete +``` + +At this point the directory tree should look like: + +``` +│ lengths.json +│ +├───discrete +│ ├───... +└───wavs + ├───... +``` + +### Step 3: Train the HuBERT-Soft Content Encoder + +``` +usage: train.py [-h] [--resume RESUME] [--warmstart] [--mask] [--alpha ALPHA] dataset-dir checkpoint-dir + +Train HuBERT soft content encoder. + +positional arguments: + dataset-dir path to the data directory. + checkpoint-dir path to the checkpoint directory. + +optional arguments: + -h, --help show this help message and exit + --resume RESUME path to the checkpoint to resume from. + --warmstart whether to initialize from the fairseq HuBERT checkpoint. + --mask whether to use input masking. + --alpha ALPHA weight for the masked loss. 
+``` + +## Links + +- [Soft-VC repo](https://github.com/bshall/soft-vc) +- [Soft-VC paper](https://ieeexplore.ieee.org/abstract/document/9746484) +- [Official HuBERT repo](https://github.com/pytorch/fairseq) +- [HuBERT paper](https://arxiv.org/abs/2106.07447) + +## Citation + +If you found this work helpful please consider citing our paper: + +``` +@inproceedings{ + soft-vc-2022, + author={van Niekerk, Benjamin and Carbonneau, Marc-André and Zaïdi, Julian and Baas, Matthew and Seuté, Hugo and Kamper, Herman}, + booktitle={ICASSP}, + title={A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion}, + year={2022} +} +``` diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..18b754c73c63b79e943d51e76414f0056f05589f --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py @@ -0,0 +1,66 @@ +from pathlib import Path +import logging +import argparse + +import torch +import numpy as np +from sklearn.cluster import KMeans + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def cluster(args): + with open(args.subset) as file: + subset = [line.strip() for line in file] + + logger.info(f"Loading features from {args.in_dir}") + features = [] + for path in subset: + in_path = args.in_dir / path + features.append(np.load(in_path.with_suffix(".npy"))) + features = np.concatenate(features, axis=0) + + logger.info(f"Clustering features of shape: {features.shape}") + kmeans = KMeans(n_clusters=args.n_clusters).fit(features) + + checkpoint_path = args.checkpoint_dir / f"kmeans_{args.n_clusters}.pt" + checkpoint_path.parent.mkdir(exist_ok=True, parents=True) + torch.save( + checkpoint_path, + { + "n_features_in_": kmeans.n_features_in_, + "_n_threads": kmeans._n_threads, + "cluster_centers_": kmeans.cluster_centers_, + }, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Cluster speech features features.") + parser.add_argument( + "in_dir", + metavar="in-dir", + help="path to the encoded dataset", + type=Path, + ) + parser.add_argument( + "subset", + matavar="subset", + help="path to the .txt file containing the list of files to cluster", + type=Path, + ) + parser.add_argument( + "checkpoint_dir", + metavar="checkpoint-dir", + help="path to the checkpoint directory", + type=Path, + ) + parser.add_argument( + "--n-clusters", + help="number of clusters", + type=int, + default=100, + ) + args = parser.parse_args() + cluster(args) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png new file mode 100644 index 0000000000000000000000000000000000000000..fc59d538a9383896cf0c36e1d4a3f5030fce38fe Binary files /dev/null and b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png differ diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py new file mode 100644 index 0000000000000000000000000000000000000000..14246e985fb0e9dc157d290853af6dcf6036f61c --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py @@ -0,0 +1,60 @@ +import argparse +import logging +import numpy as np +from pathlib import Path +from tqdm import tqdm + +import torch 
+import torchaudio +from torchaudio.functional import resample + + +def encode_dataset(args): + print(f"Loading hubert checkpoint") + hubert = torch.hub.load( + "bshall/hubert:main", + f"hubert_{args.model}", + trust_repo=True, + ).cuda() + + print(f"Encoding dataset at {args.in_dir}") + for in_path in tqdm(list(args.in_dir.rglob(f"*{args.extension}"))): + wav, sr = torchaudio.load(in_path) + wav = resample(wav, sr, 16000) + wav = wav.unsqueeze(0).cuda() + + with torch.inference_mode(): + units = hubert.units(wav) + + out_path = args.out_dir / in_path.relative_to(args.in_dir) + out_path.parent.mkdir(parents=True, exist_ok=True) + np.save(out_path.with_suffix(".npy"), units.squeeze().cpu().numpy()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Encode an audio dataset.") + parser.add_argument( + "model", + help="available models (HuBERT-Soft or HuBERT-Discrete)", + choices=["soft", "discrete"], + ) + parser.add_argument( + "in_dir", + metavar="in-dir", + help="path to the dataset directory.", + type=Path, + ) + parser.add_argument( + "out_dir", + metavar="out-dir", + help="path to the output directory.", + type=Path, + ) + parser.add_argument( + "--extension", + help="extension of the audio files (defaults to .flac).", + default=".flac", + type=str, + ) + args = parser.parse_args() + encode_dataset(args) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..b58749e4a40b29eab470686b27e06a97bfecb321 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py @@ -0,0 +1,80 @@ +dependencies = ["torch", "torchaudio", "sklearn"] + +URLS = { + "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-discrete-96b248c5.pt", + "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-soft-35d9f29f.pt", + "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.2/kmeans100-50f36a95.pt", +} + +import torch +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +from sklearn.cluster import KMeans + +from hubert import HubertDiscrete, HubertSoft + + +def hubert_discrete( + pretrained: bool = True, + progress: bool = True, +) -> HubertDiscrete: + r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + pretrained (bool): load pretrained weights into the model + progress (bool): show progress bar when downloading model + """ + kmeans = kmeans100(pretrained=pretrained, progress=progress) + hubert = HubertDiscrete(kmeans) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-discrete"], progress=progress + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +def hubert_soft( + pretrained: bool = True, + progress: bool = True, +) -> HubertSoft: + r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + pretrained (bool): load pretrained weights into the model. + progress (bool): show progress bar when downloading model. 
+ """ + hubert = HubertSoft() + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-soft"], + progress=progress, + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +def _kmeans( + num_clusters: int, pretrained: bool = True, progress: bool = True +) -> KMeans: + kmeans = KMeans(num_clusters) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS[f"kmeans{num_clusters}"], progress=progress + ) + kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"] + kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"] + kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy() + return kmeans + + +def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans: + r""" + k-means checkpoint for HuBERT-Discrete with 100 clusters. + Args: + pretrained (bool): load pretrained weights into the model + progress (bool): show progress bar when downloading model + """ + return _kmeans(100, pretrained, progress) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e07f859e99f51dcf35639f26a3eb53a81c993f3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py @@ -0,0 +1,5 @@ +from .model import ( + Hubert, + HubertDiscrete, + HubertSoft, +) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3ac2b84f95340e088913e06db8e5db0a68e83c2e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py @@ -0,0 +1,91 @@ +import random +from pathlib import Path +import numpy as np +import json + +import torch +import torch.nn.functional as F +from torch.utils.data import Dataset +import torchaudio + + +class AcousticUnitsDataset(Dataset): + def __init__( + self, + root: Path, + sample_rate: int = 16000, + label_rate: int = 50, + min_samples: int = 32000, + max_samples: int = 250000, + train: bool = True, + ): + self.wavs_dir = root / "wavs" + self.units_dir = root / "discrete" + + with open(root / "lengths.json") as file: + self.lenghts = json.load(file) + + pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac" + metadata = ( + (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix()) + for path in self.wavs_dir.rglob(pattern) + ) + metadata = ((path, key) for path, key in metadata if key in self.lenghts) + self.metadata = [ + path for path, key in metadata if self.lenghts[key] > min_samples + ] + + self.sample_rate = sample_rate + self.label_rate = label_rate + self.min_samples = min_samples + self.max_samples = max_samples + self.train = train + + def __len__(self): + return len(self.metadata) + + def __getitem__(self, index): + wav_path = self.metadata[index] + units_path = self.units_dir / wav_path.relative_to(self.wavs_dir) + + wav, _ = torchaudio.load(wav_path) + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + codes = np.load(units_path.with_suffix(".npy")) + + return wav, torch.from_numpy(codes).long() + + def collate(self, batch): + wavs, codes = zip(*batch) + wavs, codes = list(wavs), list(codes) + + wav_lengths = [wav.size(-1) for wav in 
wavs] + code_lengths = [code.size(-1) for code in codes] + + wav_frames = min(self.max_samples, *wav_lengths) + + collated_wavs, wav_offsets = [], [] + for wav in wavs: + wav_diff = wav.size(-1) - wav_frames + wav_offset = random.randint(0, wav_diff) + wav = wav[:, wav_offset : wav_offset + wav_frames] + + collated_wavs.append(wav) + wav_offsets.append(wav_offset) + + rate = self.label_rate / self.sample_rate + code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets] + code_frames = round(wav_frames * rate) + remaining_code_frames = [ + length - offset for length, offset in zip(code_lengths, code_offsets) + ] + code_frames = min(code_frames, *remaining_code_frames) + + collated_codes = [] + for code, code_offset in zip(codes, code_offsets): + code = code[code_offset : code_offset + code_frames] + collated_codes.append(code) + + wavs = torch.stack(collated_wavs, dim=0) + codes = torch.stack(collated_codes, dim=0) + + return wavs, codes diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py new file mode 100644 index 0000000000000000000000000000000000000000..523dd95633ba73babff8b6836324ae0a7c2d267f --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py @@ -0,0 +1,241 @@ +import copy +from typing import Optional, Tuple +import random + +from sklearn.cluster import KMeans + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Hubert(nn.Module): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu", batch_first=True + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) + x[mask] = self.masked_spec_embed.to(x.dtype) + return x, mask + + def encode( + self, x: torch.Tensor, layer: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = self.feature_extractor(x) + x = self.feature_projection(x.transpose(1, 2)) + x, mask = self.mask(x) + x = x + self.positional_embedding(x) + x = self.dropout(self.norm(x)) + x = self.encoder(x, output_layer=layer) + return x, mask + + def logits(self, x: torch.Tensor) -> torch.Tensor: + logits = torch.cosine_similarity( + x.unsqueeze(2), + self.label_embedding.weight.unsqueeze(0).unsqueeze(0), + dim=-1, + ) + return logits / 0.1 + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x, mask = self.encode(x) + x = self.proj(x) + logits = self.logits(x) + return logits, mask + + +class HubertSoft(Hubert): + """HuBERT-Soft content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.""" + + def __init__(self): + super().__init__() + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.Tensor: + """Extract soft speech units. 
+ + Args: + wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples. + + Returns: + Tensor: soft speech units of shape (1, N, D), where N is the number of frames and D is the unit dimensions. + """ + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav) + return self.proj(x) + + +class HubertDiscrete(Hubert): + """HuBERT-Discrete content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.""" + + def __init__(self, kmeans: KMeans): + super().__init__(504) + self.kmeans = kmeans + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.LongTensor: + """Extract discrete speech units. + + Args: + wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples. + + Returns: + LongTensor: soft speech units of shape (N,), where N is the number of frames. + """ + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav, layer=7) + x = self.kmeans.predict(x.squeeze().cpu().numpy()) + return torch.tensor(x, dtype=torch.long, device=wav.device) + + +class FeatureExtractor(nn.Module): + def __init__(self): + super().__init__() + self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) + self.norm0 = nn.GroupNorm(512, 512) + self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) + self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.gelu(self.norm0(self.conv0(x))) + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = F.gelu(self.conv3(x)) + x = F.gelu(self.conv4(x)) + x = F.gelu(self.conv5(x)) + x = F.gelu(self.conv6(x)) + return x + + +class FeatureProjection(nn.Module): + def __init__(self): + super().__init__() + self.norm = nn.LayerNorm(512) + self.projection = nn.Linear(512, 768) + self.dropout = nn.Dropout(0.1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.norm(x) + x = self.projection(x) + x = self.dropout(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + 768, + 768, + kernel_size=128, + padding=128 // 2, + groups=16, + ) + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x.transpose(1, 2)) + x = F.gelu(x[:, :, :-1]) + return x.transpose(1, 2) + + +class TransformerEncoder(nn.Module): + def __init__( + self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int + ) -> None: + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for _ in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: torch.Tensor, + mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + output_layer: Optional[int] = None, + ) -> torch.Tensor: + output = src + for layer in self.layers[:output_layer]: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask + ) + return output + + +def _compute_mask( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + device: torch.device, + min_masks: int = 0, +) -> torch.Tensor: + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > 
sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = torch.ones( + (batch_size, sequence_length - (mask_length - 1)), device=device + ) + + # get random indices to mask + mask_indices = torch.multinomial(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + mask_indices = ( + mask_indices.unsqueeze(dim=-1) + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + offsets = ( + torch.arange(mask_length, device=device)[None, None, :] + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + mask_idxs = mask_indices + offsets + + # scatter indices to mask + mask = mask.scatter(1, mask_idxs, True) + + return mask diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d42ba3acb822938f246dba27b3de81ec51aa72b0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py @@ -0,0 +1,61 @@ +import torch + + +class Metric: + def __init__(self): + self.steps = 0 + self.value = 0 + + def update(self, value): + self.steps += 1 + self.value += (value - self.value) / self.steps + return self.value + + def reset(self): + self.steps = 0 + self.value = 0 + + +def save_checkpoint( + checkpoint_dir, + hubert, + optimizer, + scaler, + step, + loss, + best, + logger, +): + state = { + "hubert": hubert.state_dict(), + "optimizer": optimizer.state_dict(), + "scaler": scaler.state_dict(), + "step": step, + "loss": loss, + } + checkpoint_dir.mkdir(exist_ok=True, parents=True) + checkpoint_path = checkpoint_dir / f"model-{step}.pt" + torch.save(state, checkpoint_path) + if best: + best_path = checkpoint_dir / "model-best.pt" + torch.save(state, best_path) + logger.info(f"Saved checkpoint: {checkpoint_path.stem}") + + +def load_checkpoint( + load_path, + hubert, + optimizer, + scaler, + rank, + logger, +): + logger.info(f"Loading checkpoint from {load_path}") + checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"}) + hubert.load_state_dict(checkpoint["hubert"]) + if "scaler" in checkpoint: + scaler.load_state_dict(checkpoint["scaler"]) + if "optimizer" in checkpoint: + optimizer.load_state_dict(checkpoint["optimizer"]) + step, loss = checkpoint.get("step", 0), checkpoint.get("loss", float("inf")) + return step, loss diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5ca9de087f72e343ffb4e5ef00cdbb90765097 --- /dev/null +++ 
b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py @@ -0,0 +1,459 @@ +import argparse +import logging +from pathlib import Path + +import torch +import torch.cuda.amp as amp +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.distributed as dist +from torch.utils.data.distributed import DistributedSampler +import torch.multiprocessing as mp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +from hubert.model import Hubert, URLS +from hubert.dataset import AcousticUnitsDataset +from hubert.utils import Metric, save_checkpoint, load_checkpoint + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +######################################################################################## +# Define hyperparameters for training: +######################################################################################## + +BATCH_SIZE = 32 +LEARNING_RATE = 2e-5 +BETAS = (0.9, 0.98) +EPS = 1e-06 +WEIGHT_DECAY = 1e-2 +MAX_NORM = 10 +STEPS = 25000 +LOG_INTERVAL = 5 +VALIDATION_INTERVAL = 1000 +CHECKPOINT_INTERVAL = 5000 +BACKEND = "nccl" +INIT_METHOD = "tcp://localhost:54321" + + +def train(rank, world_size, args): + dist.init_process_group( + BACKEND, + rank=rank, + world_size=world_size, + init_method=INIT_METHOD, + ) + + #################################################################################### + # Setup logging utilities: + #################################################################################### + + log_dir = args.checkpoint_dir / "logs" + log_dir.mkdir(exist_ok=True, parents=True) + + if rank == 0: + logger.setLevel(logging.INFO) + handler = logging.FileHandler(log_dir / f"{args.checkpoint_dir.stem}.log") + handler.setLevel(logging.INFO) + formatter = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S" + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + else: + logger.setLevel(logging.ERROR) + + writer = SummaryWriter(log_dir) if rank == 0 else None + + #################################################################################### + # Initialize models + #################################################################################### + + hubert = Hubert(mask=args.mask).to(rank) + + if args.warmstart: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-discrete"], map_location={"cuda:0": f"cuda:{rank}"} + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + + # don't use warmstart weights for label embeddings and proj layer + del checkpoint["hubert"]["label_embedding.weight"] + del checkpoint["hubert"]["proj.weight"] + del checkpoint["hubert"]["proj.bias"] + + hubert.load_state_dict(checkpoint["hubert"], strict=False) + + hubert = DDP(hubert, device_ids=[rank]) + + #################################################################################### + # Initialze optimizer and grad scaler + #################################################################################### + + optimizer = optim.AdamW( + hubert.parameters(), + lr=LEARNING_RATE, + betas=BETAS, + eps=EPS, + weight_decay=WEIGHT_DECAY, + ) + scaler = amp.GradScaler() + + #################################################################################### + # Initialize datasets and dataloaders + 
#################################################################################### + + train_dataset = AcousticUnitsDataset( + root=args.dataset_dir, + train=True, + ) + train_sampler = DistributedSampler(train_dataset, drop_last=True) + train_loader = DataLoader( + train_dataset, + collate_fn=train_dataset.collate, + batch_size=BATCH_SIZE, + sampler=train_sampler, + num_workers=8, + pin_memory=True, + shuffle=False, + drop_last=True, + ) + + validation_dataset = AcousticUnitsDataset( + root=args.dataset_dir, + train=False, + ) + validation_loader = DataLoader( + validation_dataset, + batch_size=1, + shuffle=False, + num_workers=8, + pin_memory=True, + ) + + #################################################################################### + # Load checkpoint if args.resume is set + #################################################################################### + + if args.resume is not None: + global_step, best_loss = load_checkpoint( + load_path=args.resume, + hubert=hubert, + optimizer=optimizer, + scaler=scaler, + rank=rank, + logger=logger, + ) + else: + global_step, best_loss = 0, float("inf") + + # =================================================================================# + # Start training loop + # =================================================================================# + + n_epochs = STEPS // len(train_loader) + 1 + start_epoch = global_step // len(train_loader) + 1 + + logger.info("**" * 40) + logger.info(f"PyTorch version: {torch.__version__}") + logger.info(f"CUDA version: {torch.version.cuda}") + logger.info(f"CUDNN version: {torch.backends.cudnn.version()}") + logger.info(f"CUDNN enabled: {torch.backends.cudnn.enabled}") + logger.info(f"CUDNN deterministic: {torch.backends.cudnn.deterministic}") + logger.info(f"CUDNN benchmark: {torch.backends.cudnn.benchmark}") + logger.info(f"# of GPUS: {torch.cuda.device_count()}") + logger.info(f"batch size: {BATCH_SIZE}") + logger.info(f"iterations per epoch: {len(train_loader)}") + logger.info(f"# of epochs: {n_epochs}") + logger.info(f"started at epoch: {start_epoch}") + logger.info("**" * 40 + "\n") + + if args.mask: + average_masked_loss = Metric() + average_unmasked_loss = Metric() + average_masked_accuracy = Metric() + average_unmasked_accuracy = Metric() + + epoch_masked_loss = Metric() + epoch_unmasked_loss = Metric() + epoch_masked_accuracy = Metric() + epoch_unmasked_accuracy = Metric() + else: + average_loss = Metric() + average_accuracy = Metric() + + epoch_loss = Metric() + epoch_accuracy = Metric() + + validation_loss = Metric() + validation_accuracy = Metric() + + for epoch in range(start_epoch, n_epochs + 1): + train_sampler.set_epoch(epoch) + + hubert.train() + if args.mask: + epoch_masked_loss.reset() + epoch_unmasked_loss.reset() + epoch_masked_accuracy.reset() + epoch_unmasked_accuracy.reset() + else: + epoch_loss.reset() + epoch_accuracy.reset() + + for wavs, codes in train_loader: + global_step += 1 + wavs, codes = wavs.to(rank), codes.to(rank) + + ############################################################################ + # Compute training loss + ############################################################################ + + optimizer.zero_grad() + + with amp.autocast(): + logits, mask = hubert(wavs) + length = min( + mask.size(-1) if args.mask else float("inf"), codes.size(-1) + ) + logits = logits[:, :length, :] + codes = codes[:, :length] + if args.mask: + mask = mask[:, :length] + + if args.mask: + masked_loss = F.cross_entropy(logits[mask], codes[mask]) + unmasked_loss = 
F.cross_entropy(logits[~mask], codes[~mask]) + loss = args.alpha * masked_loss + (1 - args.alpha) * unmasked_loss + else: + loss = F.cross_entropy(logits.transpose(1, 2), codes) + + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + + nn.utils.clip_grad_norm_(hubert.parameters(), MAX_NORM) + + scaler.step(optimizer) + scaler.update() + + if args.mask: + masked_accuracy = logits[mask].argmax(dim=-1) == codes[mask] + masked_accuracy = torch.mean(masked_accuracy.float()) + + unmasked_accuracy = logits[~mask].argmax(dim=-1) == codes[~mask] + unmasked_accuracy = torch.mean(unmasked_accuracy.float()) + else: + accuracy = logits.argmax(dim=-1) == codes + accuracy = torch.mean(accuracy.float()) + + ############################################################################ + # Update and log training metrics + ############################################################################ + + if args.mask: + average_masked_loss.update(masked_loss.item()) + average_unmasked_loss.update(unmasked_loss.item()) + average_masked_accuracy.update(masked_accuracy.item()) + average_unmasked_accuracy.update(unmasked_accuracy.item()) + + epoch_masked_loss.update(masked_loss.item()) + epoch_unmasked_loss.update(unmasked_loss.item()) + epoch_masked_accuracy.update(masked_accuracy.item()) + epoch_unmasked_accuracy.update(unmasked_accuracy.item()) + else: + average_loss.update(loss.item()) + average_accuracy.update(accuracy.item()) + + epoch_loss.update(loss.item()) + epoch_accuracy.update(accuracy.item()) + + if rank == 0 and global_step % LOG_INTERVAL == 0: + if args.mask: + writer.add_scalar( + "train/masked_loss", + average_masked_loss.value, + global_step, + ) + writer.add_scalar( + "train/unmasked_loss", + average_unmasked_loss.value, + global_step, + ) + writer.add_scalar( + "train/masked_accuracy", + average_masked_accuracy.value * 100, + global_step, + ) + writer.add_scalar( + "train/unmasked_accuracy", + average_unmasked_accuracy.value * 100, + global_step, + ) + average_masked_loss.reset() + average_unmasked_loss.reset() + average_masked_accuracy.reset() + average_unmasked_accuracy.reset() + else: + writer.add_scalar( + "train/loss", + average_loss.value, + global_step, + ) + writer.add_scalar( + "train/accuracy", + average_accuracy.value, + global_step, + ) + average_loss.reset() + average_accuracy.reset() + + # --------------------------------------------------------------------------# + # Start validation loop + # --------------------------------------------------------------------------# + + if global_step % VALIDATION_INTERVAL == 0: + hubert.eval() + validation_loss.reset() + validation_accuracy.reset() + for wavs, codes in validation_loader: + wavs, codes = wavs.to(rank), codes.to(rank) + + with torch.no_grad(): + logits, _ = hubert(wavs) + logits = logits.transpose(1, 2) + + loss = F.cross_entropy(logits, codes) + + accuracy = logits.argmax(dim=1) == codes + accuracy = torch.mean(accuracy.float()) + + #################################################################### + # Update validation metrics + #################################################################### + + validation_loss.update(loss.item()) + validation_accuracy.update(accuracy.item()) + + hubert.train() + + ############################################################################ + # Log validation metrics + ############################################################################ + + if rank == 0: + writer.add_scalar( + "validation/unit_loss", + validation_loss.value, + global_step, + ) + writer.add_scalar( + 
"validation/unit_accuracy", + validation_accuracy.value * 100, + global_step, + ) + logger.info( + f"valid -- epoch: {epoch}, loss: {validation_loss.value:.4f}, accuracy: {validation_accuracy.value * 100:.2f}" + ) + + ############################################################################ + # Save model checkpoint + ############################################################################ + + new_best = best_loss > validation_loss.value + if new_best or global_step % CHECKPOINT_INTERVAL == 0: + if new_best: + logger.info("-------- new best model found!") + best_loss = validation_loss.value + + if rank == 0: + save_checkpoint( + checkpoint_dir=args.checkpoint_dir, + hubert=hubert, + optimizer=optimizer, + scaler=scaler, + step=global_step, + loss=validation_loss.value, + best=new_best, + logger=logger, + ) + + # -----------------------------------------------------------------------------# + # End validation loop + # -----------------------------------------------------------------------------# + + #################################################################################### + # Log training metrics + #################################################################################### + + logger.info( + f""" + train -- epoch: {epoch}, masked loss: {epoch_masked_loss.value:.4f}, unmasked loss: {epoch_unmasked_loss.value:.4f}, + masked accuracy: {epoch_masked_accuracy.value * 100:.2f}, umasked accuracy: {epoch_unmasked_accuracy.value * 100:.2f} + """ + ) + + # ==================================================================================# + # End training loop + # ==================================================================================# + + dist.destroy_process_group() + + +def train_hubert(args): + world_size = torch.cuda.device_count() + mp.spawn( + train, + args=(world_size, args), + nprocs=world_size, + join=True, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Train HuBERT soft content encoder.") + parser.add_argument( + "dataset_dir", + metavar="dataset-dir", + help="path to the data directory.", + type=Path, + ) + parser.add_argument( + "checkpoint_dir", + metavar="checkpoint-dir", + help="path to the checkpoint directory.", + type=Path, + ) + parser.add_argument( + "--resume", + help="path to the checkpoint to resume from.", + type=Path, + ) + parser.add_argument( + "--warmstart", + help="whether to initialize from the fairseq HuBERT checkpoint.", + action="store_true", + ) + parser.add_argument( + "--mask", + help="whether to use input masking.", + action="store_true", + ) + parser.add_argument( + "--alpha", + help="weight for the masked loss.", + default=1, + type=float, + ) + args = parser.parse_args() + + world_size = torch.cuda.device_count() + mp.spawn( + train, + args=(world_size, args), + nprocs=world_size, + join=True, + ) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a385090553f7106d30d530ea319f82c66a788ffd --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py @@ -0,0 +1,24 @@ +import torch, torchaudio +from .hubert.hubert import HubertSoft +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present +import librosa + + +def get_soft_model(model_path): + hubert = HubertSoft() + # Load checkpoint (either hubert_soft or hubert_discrete) + # hubert = torch.hub.load("bshall/hubert:main", 
"hubert_soft", trust_repo=True) + checkpoint = torch.load(model_path) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +@torch.no_grad() +def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'): + wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1) + # print(wav_16k_tensor.shape) + units = hmodel.units(wav_16k_tensor) + # print(units.shape) + return units.cpu() \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fea82f9f64f7ae37aee38d799f703f11812ff2 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class DiffVC(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.speaker_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['speaker_dim'], self.config['cls_embedding']['feature_dim']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['feature_dim'], self.config['cls_embedding']['feature_dim'])) + self.uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['speaker_dim']) / + self.config['cls_embedding']['speaker_dim'] ** 0.5) + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, speaker, pitch, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + uncond = repeat(self.uncond, "c-> b c", b=B).to(target.dtype) + batch_mask = rand_bool(shape=(B, 1), proba=speaker_cfg, device=target.device) + speaker = torch.where(batch_mask, uncond, speaker) + + if 
self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + speaker = self.speaker_embedding(speaker) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, class_labels=speaker)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_base_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + p = torch.rand(2, 256, 1).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + spk = torch.rand(2, 256).to(device) + + output = model(x, t, y, spk, pitch=p, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py new file mode 100644 index 0000000000000000000000000000000000000000..774d3481fd23105e6f161e2b64ed2a757acba9c2 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class DiffVC_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + 
self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, prompt, prompt_mask=None, pitch=None, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=speaker_cfg, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + prompt = self.context_embedding(prompt) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_cross_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + p = torch.rand(2, 256, 1).to(device) + + output = model(x, t, y, prompt, prompt_mask, p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py new file mode 100644 index 0000000000000000000000000000000000000000..23e878e4daa06309e7ca9b6d970f333bcf9d4524 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class P2E_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + # self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = 
FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + def forward(self, target, t, prompt, prompt_mask=None, + train_cfg=False, cfg_prob=0.0): + B, C = target.shape + target = target.unsqueeze(-1).unsqueeze(-1) + + if train_cfg: + if cfg_prob > 0.0: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + prompt = self.context_embedding(prompt) + # fix the bug that prompt will copy dtype from target in diffusers + target = target.to(prompt.dtype) + + output = self.unet(sample=target, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output.squeeze(-1).squeeze(-1) + + +if __name__ == "__main__": + with open('p2e_cross.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = P2E_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 256)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + + output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e9663595cc28938f88d6299acd3ba791542e4c0c --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 NVIDIA CORPORATION. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6cff37786a486deb55bc070254027aa492c2e92 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md @@ -0,0 +1,95 @@ +## BigVGAN: A Universal Neural Vocoder with Large-Scale Training +#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon + +
+ + +### [Paper](https://arxiv.org/abs/2206.04658) +### [Audio demo](https://bigvgan-demo.github.io/) + +## Installation +Clone the repository and install dependencies. +```shell +# the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries +git clone https://github.com/NVIDIA/BigVGAN +pip install -r requirements.txt +``` + +Create symbolic link to the root of the dataset. The codebase uses filelist with the relative path from the dataset. Below are the example commands for LibriTTS dataset. +``` shell +cd LibriTTS && \ +ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \ +ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \ +ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \ +ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \ +ln -s /path/to/your/LibriTTS/dev-other dev-other && \ +ln -s /path/to/your/LibriTTS/test-clean test-clean && \ +ln -s /path/to/your/LibriTTS/test-other test-other && \ +cd .. +``` + +## Training +Train BigVGAN model. Below is an example command for training BigVGAN using LibriTTS dataset at 24kHz with a full 100-band mel spectrogram as input. +```shell +python train.py \ +--config configs/bigvgan_24khz_100band.json \ +--input_wavs_dir LibriTTS \ +--input_training_file LibriTTS/train-full.txt \ +--input_validation_file LibriTTS/val-full.txt \ +--list_input_unseen_wavs_dir LibriTTS LibriTTS \ +--list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \ +--checkpoint_path exp/bigvgan +``` + +## Synthesis +Synthesize from BigVGAN model. Below is an example command for generating audio from the model. +It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`. +```shell +python inference.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_wavs_dir /path/to/your/input_wav \ +--output_dir /path/to/your/output_wav +``` + +`inference_e2e.py` supports synthesis directly from the mel spectrogram saved in `.npy` format, with shapes `[1, channel, frame]` or `[channel, frame]`. +It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`. + +Make sure that the STFT hyperparameters for mel spectrogram are the same as the model, which are defined in `config.json` of the corresponding model. +```shell +python inference_e2e.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_mels_dir /path/to/your/input_mel \ +--output_dir /path/to/your/output_wav +``` + +## Pretrained Models +We provide the [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq). +One can download the checkpoints of generator (e.g., g_05000000) and discriminator (e.g., do_05000000) within the listed folders. + +|Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned| +|------|---|---|---|---|------|---| +|bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No| +|bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No| +|bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No| +|bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No| + +The paper results are based on 24kHz BigVGAN models trained on LibriTTS dataset. +We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications. +Note that, the latest checkpoints use ``snakebeta`` activation with log scale parameterization, which have the best overall quality. 
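+
+The ``snakebeta`` activation referred to above is implemented in `activations/activations.py`, included later in this diff: each channel gets a trainable `alpha` (frequency) and `beta` (magnitude) parameter, stored in log scale when `alpha_logscale=True`. A minimal sketch of how it acts on a `(B, C, T)` feature map; the import path and tensor sizes are assumptions for illustration:
+
+```python
+import torch
+
+# SnakeBeta from activations/activations.py in this module; import path is an assumption.
+from activations.activations import SnakeBeta
+
+act = SnakeBeta(in_features=512, alpha_logscale=True)  # one alpha/beta pair per channel
+x = torch.randn(4, 512, 256)                           # (B, C, T)
+y = act(x)                                             # x + (1/beta) * sin^2(alpha * x), elementwise
+print(y.shape)                                         # torch.Size([4, 512, 256]), shape is preserved
+```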
+ + +## TODO + +Current codebase only provides a plain PyTorch implementation for the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future. + + +## References +* [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator) + +* [Snake](https://github.com/EdwardDixon/snake) (for periodic activation) + +* [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing) + +* [Julius](https://github.com/adefossez/julius) (for low-pass filter) + +* [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..61f2808a5466b3cf4d041059700993af5527dd29 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py @@ -0,0 +1,120 @@ +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. + +import torch +from torch import nn, sin, pow +from torch.nn import Parameter + + +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. 
+ Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x + + +class SnakeBeta(nn.Module): + ''' + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + ''' + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + self.beta = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + self.beta = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + SnakeBeta ∶= x + 1/b * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2318b63198250856809c0cb46210a4147b829bc --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py @@ -0,0 +1,6 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
+ +from .filter import * +from .resample import * +from .act import * \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py new file mode 100644 index 0000000000000000000000000000000000000000..028debd697dd60458aae75010057df038bd3518a --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py @@ -0,0 +1,28 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch.nn as nn +from .resample import UpSample1d, DownSample1d + + +class Activation1d(nn.Module): + def __init__(self, + activation, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = activation + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad6ea87c1f10ddd94c544037791d7a4634d5ae1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py @@ -0,0 +1,95 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] + even = (kernel_size % 2 == 0) + half_size = kernel_size // 2 + + #For kaiser window + delta_f = 4 * half_width + A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 + if A > 50.: + beta = 0.1102 * (A - 8.7) + elif A >= 21.: + beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) + else: + beta = 0. + window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) + + # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio + if even: + time = (torch.arange(-half_size, half_size) + 0.5) + else: + time = torch.arange(kernel_size) - half_size + if cutoff == 0: + filter_ = torch.zeros_like(time) + else: + filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) + # Normalize filter to have sum = 1, otherwise we will have a small leakage + # of the constant component in the input signal. 
+ filter_ /= filter_.sum() + filter = filter_.view(1, 1, kernel_size) + + return filter + + +class LowPassFilter1d(nn.Module): + def __init__(self, + cutoff=0.5, + half_width=0.6, + stride: int = 1, + padding: bool = True, + padding_mode: str = 'replicate', + kernel_size: int = 12): + # kernel_size should be even number for stylegan3 setup, + # in this implementation, odd number is also possible. + super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, self.filter.expand(C, -1, -1), + stride=self.stride, groups=C) + + return out \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..750e6c3402cc5ac939c4b9d075246562e0e1d1a7 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py @@ -0,0 +1,49 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
+ +import torch.nn as nn +from torch.nn import functional as F +from .filter import LowPassFilter1d +from .filter import kaiser_sinc_filter1d + + +class UpSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.stride = ratio + self.pad = self.kernel_size // ratio - 1 + self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + kernel_size=self.kernel_size) + self.register_buffer("filter", filter) + + # x: [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + x = F.pad(x, (self.pad, self.pad), mode='replicate') + x = self.ratio * F.conv_transpose1d( + x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) + x = x[..., self.pad_left:-self.pad_right] + + return x + + +class DownSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + stride=ratio, + kernel_size=self.kernel_size) + + def forward(self, x): + xx = self.lowpass(x) + + return xx \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py new file mode 100644 index 0000000000000000000000000000000000000000..b8be238d4db710c8c9a338d336baea0138f18d1f --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py @@ -0,0 +1,18 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a739344db3ec9ae08560e5477a394cca32d4a6d9 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py @@ -0,0 +1,36 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
+ +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +from scipy.io.wavfile import write +from .env import AttrDict +from .utils import MAX_WAV_VALUE +from .models import BigVGAN as Generator +import librosa + + +def load_model(model_path, device='cuda'): + config_file = os.path.join(os.path.split(model_path)[0], 'config.json') + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + + h = AttrDict(json_config) + + generator = Generator(h).to(device) + + cp_dict = torch.load(model_path, map_location=device) + generator.load_state_dict(cp_dict['generator']) + generator.eval() + generator.remove_weight_norm() + del cp_dict + return generator, h + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb40e0cff7819dcbe69555520253afd64580720 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py @@ -0,0 +1,381 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + + +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from .activations import activations +from .utils import init_weights, get_padding +from .alias_free_torch import * + +LRELU_SLOPE = 0.1 + + +class AMPBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. 
check the config file and look for 'activation'.") + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class AMPBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + def forward(self, x): + for c, a in zip (self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class BigVGAN(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super(BigVGAN, self).__init__() + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. 
does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(nn.ModuleList([ + weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k - u) // 2)) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + + # weight initialization + for i in range(len(self.ups)): + self.ups[i].apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + for l_i in l: + remove_weight_norm(l_i) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.d_mult = h.discriminator_channel_mult + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, int(32*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(32*self.d_mult), int(128*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(128*self.d_mult), int(512*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(512*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(1024*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(int(1024*self.d_mult), 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in 
self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, h): + super(MultiPeriodDiscriminator, self).__init__() + self.mpd_reshapes = h.mpd_reshapes + print("mpd_reshapes: {}".format(self.mpd_reshapes)) + discriminators = [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] + self.discriminators = nn.ModuleList(discriminators) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorR(nn.Module): + def __init__(self, cfg, resolution): + super().__init__() + + self.resolution = resolution + assert len(self.resolution) == 3, \ + "MRD layer requires list with len=3, got {}".format(self.resolution) + self.lrelu_slope = LRELU_SLOPE + + norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm + if hasattr(cfg, "mrd_use_spectral_norm"): + print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm)) + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm + self.d_mult = cfg.discriminator_channel_mult + if hasattr(cfg, "mrd_channel_mult"): + print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult)) + self.d_mult = cfg.mrd_channel_mult + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, int(32*self.d_mult), (3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 3), padding=(1, 1))), + ]) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) + + def forward(self, x): + fmap = [] + + x = self.spectrogram(x) + x = x.unsqueeze(1) + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.lrelu_slope) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def spectrogram(self, x): + n_fft, hop_length, win_length = self.resolution + x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') + x = x.squeeze(1) + x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True) + x = torch.view_as_real(x) # [B, F, TT, 2] + mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] + + return mag + + +class MultiResolutionDiscriminator(nn.Module): + def __init__(self, cfg, debug=False): + super().__init__() + self.resolutions = cfg.resolutions + assert len(self.resolutions) == 3,\ + "MRD requires list of list with len=3, each element having a list with len=3. 
got {}".\ + format(self.resolutions) + self.discriminators = nn.ModuleList( + [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(x=y) + y_d_g, fmap_g = d(x=y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ed67f356aef6ce3af01b43d97d8aafb31c57b017 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py @@ -0,0 +1,81 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm +matplotlib.use("Agg") +import matplotlib.pylab as plt +from scipy.io.wavfile import write + +MAX_WAV_VALUE = 32768.0 + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def plot_spectrogram_clipped(spectrogram, clip_max=2.): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none', vmin=1e-6, vmax=clip_max) + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] + +def save_audio(audio, path, sr): + # wav: torch with 1d shape + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype('int16') + write(path, 
sr, audio) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py new file mode 100644 index 0000000000000000000000000000000000000000..e550b871f5cd9564f4cf043ec4aa649a48b0b41f --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +import torchaudio +import torchaudio.transforms as transforms + + +class LogMelSpectrogram(torch.nn.Module): + def __init__(self, sr=24000, frame_length=1920, hop_length=480, n_mel=128, f_min=0, f_max=12000,): + super().__init__() + self.frame_length = frame_length + self.hop_length = hop_length + self.mel = transforms.MelSpectrogram( + sample_rate=sr, + n_fft=frame_length, + win_length=frame_length, + hop_length=hop_length, + center=False, + power=1.0, + norm="slaney", + n_mels=n_mel, + mel_scale="slaney", + f_min=f_min, + f_max=f_max + ) + + @torch.no_grad() + def forward(self, x, target_length=None): + x = F.pad(x, ((self.frame_length - self.hop_length) // 2, + (self.frame_length - self.hop_length) // 2), "reflect") + mel = self.mel(x) + + target_length = mel.shape[-1] if target_length is None else target_length + logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device) + logmel[:, :, :mel.shape[2]] = mel + + logmel = torch.log(torch.clamp(logmel, min=1e-5)) + return logmel \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..5ed721bf8f29f5c8d947c2d333cc371021135fb0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE @@ -0,0 +1,24 @@ +MIT License + +Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) +Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) +Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) +Original work Copyright (c) 2015 braindead (https://github.com/braindead) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
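For reference, `src/modules/mel.py` above defines the `LogMelSpectrogram` front end used alongside these modules. A minimal usage sketch follows, assuming `mel.py` is importable from the working directory; the input is placeholder audio.

```python
# Usage sketch for the LogMelSpectrogram module defined in mel.py above.
import torch
from mel import LogMelSpectrogram  # assumes src/modules/ is the working directory

mel_fn = LogMelSpectrogram(sr=24000, frame_length=1920, hop_length=480, n_mel=128)
wav = torch.randn(2, 24000)   # batch of two 1-second waveforms at 24 kHz (placeholder)
logmel = mel_fn(wav)          # -> [2, 128, T] log-compressed mel features
print(logmel.shape)
```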
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..95663cf5b29be905a8422176f661a8f7745b5cb0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md @@ -0,0 +1,64 @@ +# Real-Time Voice Cloning +This repository is an implementation of [Transfer Learning from Speaker Verification to +Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801). + +SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text. + +**Video demonstration** (click the picture): + +[![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) + + + +### Papers implemented +| URL | Designation | Title | Implementation source | +| --- | ----------- | ----- | --------------------- | +|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | +|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | +|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) +|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | + +## News +**10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion. + +**28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below. + +**14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish. + +**13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this: +- **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors. +- **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info. + +**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it. + + +## Setup + +### 1. Install Requirements +1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory. +2. Python 3.7 is recommended. 
Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
+3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
+4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
+5. Install the remaining requirements with `pip install -r requirements.txt`
+
+### 2. (Optional) Download Pretrained Models
+Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
+
+### 3. (Optional) Test Configuration
+Before you download any dataset, you can begin by testing your configuration with:
+
+`python demo_cli.py`
+
+If all tests pass, you're good to go.
+
+### 4. (Optional) Download Datasets
+For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
+
+### 5. Launch the Toolbox
+You can then try the toolbox:
+
+`python demo_toolbox.py -d <datasets_root>`
+or
+`python demo_toolbox.py`
+
+depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).
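Beyond the GUI toolbox, the `encoder` package in this directory can also be used programmatically. A minimal, hedged sketch is shown below; the checkpoint and audio paths are placeholders, the package directory is assumed to be on `PYTHONPATH`, and the usual dependencies (PyTorch, librosa, webrtcvad) must be installed.

```python
# Sketch: embed a single utterance with the encoder package (paths are placeholders).
from pathlib import Path
from encoder import inference as encoder

encoder.load_model(Path("path/to/encoder_checkpoint.pt"), device="cpu")
wav = encoder.preprocess_wav("path/to/some_utterance.wav")  # resample, normalize volume, trim silences
embed = encoder.embed_utterance(wav)                        # float32 speaker embedding, shape (model_embedding_size,)
print(embed.shape)
```

`embed_utterance` splits the audio into overlapping partial utterances and averages their embeddings (see `compute_partial_slices` in `encoder/inference.py`).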
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py @@ -0,0 +1 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..de650b972fc7a4f3f8a698c128ee4642a373a6d6 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py @@ -0,0 +1,157 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from scipy.ndimage.morphology import binary_dilation +from .params_data import * +from pathlib import Path +from typing import Optional, Union +import numpy as np +import webrtcvad +import librosa +import struct + +import torch +from torchaudio.transforms import Resample +from librosa.filters import mel as librosa_mel_fn + + +int16_max = (2 ** 15) - 1 + + +def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], + source_sr: Optional[int] = None): + """ + Applies the preprocessing operations used in training the Speaker Encoder to a waveform + either on disk or in memory. The waveform will be resampled to match the data hyperparameters. + + :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not + just .wav), either the waveform as a numpy array of floats. + :param source_sr: if passing an audio waveform, the sampling rate of the waveform before + preprocessing. After preprocessing, the waveform's sampling rate will match the data + hyperparameters. If passing a filepath, the sampling rate will be automatically detected and + this argument will be ignored. + """ + # Load the wav from disk if needed + if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + wav, source_sr = librosa.load(fpath_or_wav, sr=None) + else: + wav = fpath_or_wav + + # Resample the wav if needed + if source_sr is not None and source_sr != sampling_rate: + wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences + wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) + wav = trim_long_silences(wav) + + return wav + + +def preprocess_wav_batch(wavs, source_sr=22050): + # This torch version is designed to cope with a batch of same lengths wavs + if sampling_rate != source_sr: + resample = Resample(source_sr, sampling_rate) + wavs = resample(wavs) + wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS, + increase_only=True) + # Trimming silence is not implemented in this version yet! + return wavs_preprocessed + + +def wav_to_mel_spectrogram(wav): + """ + Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. + Note: this not a log-mel spectrogram. 
+ """ + frames = librosa.feature.melspectrogram( + y=wav, + sr=sampling_rate, + n_fft=int(sampling_rate * mel_window_length / 1000), + hop_length=int(sampling_rate * mel_window_step / 1000), + n_mels=mel_n_channels + ) + return frames.astype(np.float32).T + + +def wav_to_mel_spectrogram_batch(wavs): + # This torch version is designed to cope with a batch of same lengths wavs + n_fft = int(sampling_rate * mel_window_length / 1000) + hop_length = int(sampling_rate * mel_window_step / 1000) + win_length = int(sampling_rate * mel_window_length / 1000) + window = torch.hann_window(n_fft).to(wavs) + mel_basis = torch.from_numpy(librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, + n_mels=mel_n_channels)).to(wavs) + s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=window, center=True, return_complex=False) + real_part, imag_part = s.unbind(-1) + stftm = real_part**2 + imag_part**2 + mels = torch.matmul(mel_basis, stftm) + return torch.transpose(mels, 1, 2) + + +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) + if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): + return wav + return wav * (10 ** (dBFS_change / 20)) + + +def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False): + # This torch version is designed to cope with a batch of same lengths wavs + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1)) + scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype) + if increase_only: + mask = (dBFS_change > 0).to(scales) + elif decrease_only: + mask = (dBFS_change < 0).to(scales) + else: + mask = torch.zeros_like(scales) + scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0) + return wavs * scales.unsqueeze(-1) + + +def trim_long_silences(wav): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. 
+ + :param wav: the raw waveform as a numpy array of floats + :return: the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask == True] diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ce1f5aab0d3899c5e5045b40d4cecee1a11d844c --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py @@ -0,0 +1,47 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +librispeech_datasets = { + "train": { + "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], + "other": ["LibriSpeech/train-other-500"] + }, + "test": { + "clean": ["LibriSpeech/test-clean"], + "other": ["LibriSpeech/test-other"] + }, + "dev": { + "clean": ["LibriSpeech/dev-clean"], + "other": ["LibriSpeech/dev-other"] + }, +} +libritts_datasets = { + "train": { + "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], + "other": ["LibriTTS/train-other-500"] + }, + "test": { + "clean": ["LibriTTS/test-clean"], + "other": ["LibriTTS/test-other"] + }, + "dev": { + "clean": ["LibriTTS/dev-clean"], + "other": ["LibriTTS/dev-other"] + }, +} +voxceleb_datasets = { + "voxceleb1" : { + "train": ["VoxCeleb1/wav"], + "test": ["VoxCeleb1/test_wav"] + }, + "voxceleb2" : { + "train": ["VoxCeleb2/dev/aac"], + "test": ["VoxCeleb2/test_wav"] + } +} + +other_datasets = [ + "LJSpeech-1.1", + "VCTK-Corpus/wav48", +] + +anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9af30b406f2a8debe81a8275cb2682cbd896245a --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py @@ -0,0 +1,4 @@ +""" from 
https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .speaker_verification_dataset import SpeakerVerificationDataset +from .speaker_verification_dataset import SpeakerVerificationDataLoader diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd5bb005923852327581e2dcaa03fec7dbce5b8 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py @@ -0,0 +1,39 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import random + +class RandomCycler: + """ + Creates an internal copy of a sequence and allows access to its items in a constrained random + order. For a source sequence of n items and one or several consecutive queries of a total + of m items, the following guarantees hold (one implies the other): + - Each item will be returned between m // n and ((m - 1) // n) + 1 times. + - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. + """ + + def __init__(self, source): + if len(source) == 0: + raise Exception("Can't create RandomCycler from an empty collection") + self.all_items = list(source) + self.next_items = [] + + def sample(self, count: int): + shuffle = lambda l: random.sample(l, len(l)) + + out = [] + while count > 0: + if count >= len(self.all_items): + out.extend(shuffle(list(self.all_items))) + count -= len(self.all_items) + continue + n = min(count, len(self.next_items)) + out.extend(self.next_items[:n]) + count -= n + self.next_items = self.next_items[n:] + if len(self.next_items) == 0: + self.next_items = shuffle(list(self.all_items)) + return out + + def __next__(self): + return self.sample(1)[0] + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d189c835859efefa686d49b53f4e79aa444d96 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py @@ -0,0 +1,42 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .random_cycler import RandomCycler +from .utterance import Utterance +from pathlib import Path + +# Contains the set of utterances of a single speaker +class Speaker: + def __init__(self, root: Path): + self.root = root + self.name = root.name + self.utterances = None + self.utterance_cycler = None + + def _load_utterances(self): + with self.root.joinpath("_sources.txt").open("r") as sources_file: + sources = [l.split(",") for l in sources_file] + sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} + self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] + self.utterance_cycler = RandomCycler(self.utterances) + + def random_partial(self, count, n_frames): + """ + Samples a batch of unique partial utterances from the disk in a way that all + utterances come up at least once every two cycles and in a random order every time. + + :param count: The number of partial utterances to sample from the set of utterances from + that speaker. 
Utterances are guaranteed not to be repeated if is not larger than + the number of utterances available. + :param n_frames: The number of frames in the partial utterance. + :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, + frames are the frames of the partial utterances and range is the range of the partial + utterance with regard to the complete utterance. + """ + if self.utterances is None: + self._load_utterances() + + utterances = self.utterance_cycler.sample(count) + + a = [(u,) + u.random_partial(n_frames) for u in utterances] + + return a diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..4080d636338bedcb8d1b8fc77945057027fd0ac1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py @@ -0,0 +1,14 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np +from typing import List +from .speaker import Speaker + +class SpeakerBatch: + def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): + self.speakers = speakers + self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} + + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with + # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) + self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc31fee9e0d62545caa2599aebc22decfb50aa0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py @@ -0,0 +1,58 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .random_cycler import RandomCycler +from .speaker_batch import SpeakerBatch +from .speaker import Speaker +from ..params_data import partials_n_frames +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +# TODO: improve with a pool of speakers for data efficiency + +class SpeakerVerificationDataset(Dataset): + def __init__(self, datasets_root: Path): + self.root = datasets_root + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + if len(speaker_dirs) == 0: + raise Exception("No speakers found. 
Make sure you are pointing to the directory " + "containing all preprocessed speaker directories.") + self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] + self.speaker_cycler = RandomCycler(self.speakers) + + def __len__(self): + return int(1e10) + + def __getitem__(self, index): + return next(self.speaker_cycler) + + def get_logs(self): + log_string = "" + for log_fpath in self.root.glob("*.txt"): + with log_fpath.open("r") as log_file: + log_string += "".join(log_file.readlines()) + return log_string + + +class SpeakerVerificationDataLoader(DataLoader): + def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, + batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, + worker_init_fn=None): + self.utterances_per_speaker = utterances_per_speaker + + super().__init__( + dataset=dataset, + batch_size=speakers_per_batch, + shuffle=False, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=self.collate, + pin_memory=pin_memory, + drop_last=False, + timeout=timeout, + worker_init_fn=worker_init_fn + ) + + def collate(self, speakers): + return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py new file mode 100644 index 0000000000000000000000000000000000000000..2b878c58fd7d70d3ba0b33def66912adc1c1a45d --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py @@ -0,0 +1,28 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np + + +class Utterance: + def __init__(self, frames_fpath, wave_fpath): + self.frames_fpath = frames_fpath + self.wave_fpath = wave_fpath + + def get_frames(self): + return np.load(self.frames_fpath) + + def random_partial(self, n_frames): + """ + Crops the frames into a partial utterance of n_frames + + :param n_frames: The number of frames of the partial utterance + :return: the partial utterance frames and a tuple indicating the start and end of the + partial utterance in the complete utterance. + """ + frames = self.get_frames() + if frames.shape[0] == n_frames: + start = 0 + else: + start = np.random.randint(0, frames.shape[0] - n_frames) + end = start + n_frames + return frames[start:end], (start, end) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..37f1dc4fb86bbab07892e5e94464cc3e377f9b64 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py @@ -0,0 +1,211 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_data import * +from .model import SpeakerEncoder +from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram +from matplotlib import cm +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device="cpu"): + """ + Loads the model in memory. 
If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath, map_location="cpu") + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + _model = _model.to(device) + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +@torch.no_grad() +def embed_frames_batch(frames, use_torch=False): + if _model is None: + raise Exception("Model was not loaded. Call load_model() before inference.") + + if not use_torch: + frames = torch.from_numpy(frames) + frames = frames.to(_device) + + embeds = _model.forward(frames) + if not use_torch: + embeds = embeds.detach().cpu().numpy() + return embeds + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. 
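+    Worked example (illustrative only, assuming the defaults from params_data.py, i.e. a
+    16 kHz sampling rate, a 10 ms mel hop and partial_utterance_n_frames = 160): a 2 second
+    utterance gives n_samples = 32000, so samples_per_frame = 160, n_frames = 201 and
+    frame_step = 80. Two partials are produced, with mel slices [0:160] and [80:240] and
+    wav slices [0:25600] and [12800:38400]. The last partial covers
+    (32000 - 12800) / 25600 = 0.75 of real samples, which is not below the default
+    min_pad_coverage, so it is kept and the caller is expected to zero-pad the waveform up
+    to sample 38400 (as embed_utterance does below).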
+ """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +@torch.no_grad() +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. + :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. 
+ """ + # Process the entire utterance if not using partials + if not using_partials: + frames = wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +@torch.no_grad() +def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): + # This torch version is designed to cope with a batch of same lengths wavs + if not using_partials: + frames = wav_to_mel_spectrogram_batch(wavs) + embeds = embed_frames_batch(frames) + if return_partials: + return embeds, None, None + return embeds + + wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= wavs.shape[-1]: + wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), + dtype=wavs.dtype, device=wavs.device)], 1) + + frames = wav_to_mel_spectrogram_batch(wavs) + frames_batch = [] + for i in range(len(frames)): + frames_batch += [frames[i][s] for s in mel_slices] + frames_batch = torch.stack(frames_batch, 0) + partial_embeds = embed_frames_batch(frames_batch, use_torch=True) + partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) + + raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) + embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) + + if return_partials: + return embeds, partial_embeds, wave_slices + return embeds + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8d246bc359ce1ffc6229ba8a4ced24d07b77e703 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py @@ -0,0 +1,137 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_model import * +from .params_data import * +from scipy.interpolate import interp1d +from sklearn.metrics import roc_curve +from torch.nn.utils import clip_grad_norm_ +from scipy.optimize import brentq +from torch import nn +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def 
__init__(self, device, loss_device): + super().__init__() + self.loss_device = loss_device + + # Network defition + self.lstm = nn.LSTM(input_size=mel_n_channels, + hidden_size=model_hidden_size, + num_layers=model_num_layers, + batch_first=True).to(device) + self.linear = nn.Linear(in_features=model_hidden_size, + out_features=model_embedding_size).to(device) + self.relu = torch.nn.ReLU().to(device) + + # Cosine similarity scaling (with fixed initial parameter values) + self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) + self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) + + # Loss + self.loss_fn = nn.CrossEntropyLoss().to(loss_device) + + def do_gradient_ops(self): + # Gradient scale + self.similarity_weight.grad *= 0.01 + self.similarity_bias.grad *= 0.01 + + # Gradient clipping + clip_grad_norm_(self.parameters(), 3, norm_type=2) + + def forward(self, utterances, hidden_init=None): + """ + Computes the embeddings of a batch of utterance spectrograms. + + :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape + (batch_size, n_frames, n_channels) + :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, + batch_size, hidden_size). Will default to a tensor of zeros if None. + :return: the embeddings as a tensor of shape (batch_size, embedding_size) + """ + # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state + # and the final cell state. + out, (hidden, cell) = self.lstm(utterances, hidden_init) + + # We take only the hidden state of the last layer + embeds_raw = self.relu(self.linear(hidden[-1])) + + # L2-normalize it + embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + return embeds + + def similarity_matrix(self, embeds): + """ + Computes the similarity matrix according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the similarity matrix as a tensor of shape (speakers_per_batch, + utterances_per_speaker, speakers_per_batch) + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation + centroids_incl = torch.mean(embeds, dim=1, keepdim=True) + centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) + + # Exclusive centroids (1 per utterance) + centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) + + # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot + # product of these vectors (which is just an element-wise multiplication reduced by a sum). + # We vectorize the computation for efficiency. 
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, + speakers_per_batch).to(self.loss_device) + mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) + for j in range(speakers_per_batch): + mask = np.where(mask_matrix[j])[0] + sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) + sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) + + ## Even more vectorized version (slower maybe because of transpose) + # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker + # ).to(self.loss_device) + # eye = np.eye(speakers_per_batch, dtype=np.int) + # mask = np.where(1 - eye) + # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) + # mask = np.where(eye) + # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) + # sim_matrix2 = sim_matrix2.transpose(1, 2) + + sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias + return sim_matrix + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, + speakers_per_batch)) + ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) + target = torch.from_numpy(ground_truth).long().to(self.loss_device) + loss = self.loss_fn(sim_matrix, target) + + # EER (not backpropagated) + with torch.no_grad(): + inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] + labels = np.array([inv_argmax(i) for i in ground_truth]) + preds = sim_matrix.detach().cpu().numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py new file mode 100644 index 0000000000000000000000000000000000000000..62d04121aed3d7862889ad6c771055db9b74ab6e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py @@ -0,0 +1,30 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms +# Number of spectrogram frames at inference +inference_n_frames = 80 # 800 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. 
+vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9c535205028bfec75ba7c58ea7e750ba3fff1633 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py @@ -0,0 +1,12 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 + + +## Training parameters +learning_rate_init = 1e-4 +speakers_per_batch = 64 +utterances_per_speaker = 10 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..c59165a54e509fa63793fb1503bc6d6e346c741e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py @@ -0,0 +1,177 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from multiprocess.pool import ThreadPool +from .params_data import * +from .config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. + """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." 
% dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. + if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + nationality.lower() in anglophone_nationalites] + print("VoxCeleb1: using samples from %d (presumed 
anglophone) speakers out of %d." % + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." % + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..250d038a33b72d09dfe67811c917708aa0ea6714 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py @@ -0,0 +1,127 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .visualizations import Visualizations +from .data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset +from .params_model import * +from .model import SpeakerEncoder +from .utils.profiler import Profiler +from pathlib import Path +import torch + +def sync(device: torch.device): + # FIXME + return + # For correct profiling (cuda operations are async) + if device.type == "cuda": + torch.cuda.synchronize(device) + +def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, + backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, + no_visdom: bool): + # Create a dataset and a dataloader + dataset = SpeakerVerificationDataset(clean_data_root) + loader = SpeakerVerificationDataLoader( + dataset, + speakers_per_batch, + utterances_per_speaker, + num_workers=8, + ) + + # Setup the device on which to run the forward pass and the loss. These can be different, + # because the forward pass is faster on the GPU whereas the loss is often (depending on your + # hyperparameters) faster on the CPU. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # FIXME: currently, the gradient is None if loss_device is cuda + loss_device = torch.device("cpu") + + # Create the model and the optimizer + model = SpeakerEncoder(device, loss_device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) + init_step = 1 + + # Configure file path for the model + state_fpath = models_dir.joinpath(run_id + ".pt") + backup_dir = models_dir.joinpath(run_id + "_backups") + + # Load any existing model + if not force_restart: + if state_fpath.exists(): + print("Found existing model \"%s\", loading it and resuming training." 
% run_id) + checkpoint = torch.load(state_fpath) + init_step = checkpoint["step"] + model.load_state_dict(checkpoint["model_state"]) + optimizer.load_state_dict(checkpoint["optimizer_state"]) + optimizer.param_groups[0]["lr"] = learning_rate_init + else: + print("No model \"%s\" found, starting training from scratch." % run_id) + else: + print("Starting the training from scratch.") + model.train() + + # Initialize the visualization environment + vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) + vis.log_dataset(dataset) + vis.log_params() + device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") + vis.log_implementation({"Device": device_name}) + + # Training loop + profiler = Profiler(summarize_every=10, disabled=False) + for step, speaker_batch in enumerate(loader, init_step): + profiler.tick("Blocking, waiting for batch (threaded)") + + # Forward pass + inputs = torch.from_numpy(speaker_batch.data).to(device) + sync(device) + profiler.tick("Data to %s" % device) + embeds = model(inputs) + sync(device) + profiler.tick("Forward pass") + embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) + loss, eer = model.loss(embeds_loss) + sync(loss_device) + profiler.tick("Loss") + + # Backward pass + model.zero_grad() + loss.backward() + profiler.tick("Backward pass") + model.do_gradient_ops() + optimizer.step() + profiler.tick("Parameter update") + + # Update visualizations + # learning_rate = optimizer.param_groups[0]["lr"] + vis.update(loss.item(), eer, step) + + # Draw projections and save them to the backup folder + if umap_every != 0 and step % umap_every == 0: + print("Drawing and saving projections (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + embeds = embeds.detach().cpu().numpy() + vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) + vis.save() + + # Overwrite the latest version of the model + if save_every != 0 and step % save_every == 0: + print("Saving the model (step %d)" % step) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, state_fpath) + + # Make a backup + if backup_every != 0 and step % backup_every == 0: + print("Making a backup (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, backup_fpath) + + profiler.tick("Extras (visualizations, saving)") + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py @@ -0,0 +1 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py new file mode 100644 index 
0000000000000000000000000000000000000000..6de50f3ec61f6b61798299726b13a1caa1638abb --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py @@ -0,0 +1,42 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from pathlib import Path +import numpy as np +import argparse + +_type_priorities = [ # In decreasing order + Path, + str, + int, + float, + bool, +] + +def _priority(o): + p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) + if p is not None: + return p + p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None) + if p is not None: + return p + return len(_type_priorities) + +def print_args(args: argparse.Namespace, parser=None): + args = vars(args) + if parser is None: + priorities = list(map(_priority, args.values())) + else: + all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] + priority = lambda p: all_params.index(p) if p in all_params else len(all_params) + priorities = list(map(priority, args.keys())) + + pad = max(map(len, args.keys())) + 3 + indices = np.lexsort((list(args.keys()), priorities)) + items = list(args.items()) + + print("Arguments:") + for i in indices: + param, value = items[i] + print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) + print("") + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py new file mode 100644 index 0000000000000000000000000000000000000000..43de43e4c29821df5d20d8303ce491101a041a86 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py @@ -0,0 +1,222 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np +import math +from scipy.special import expn +from collections import namedtuple + +NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") + + +def profile_noise(noise, sampling_rate, window_size=0): + """ + Creates a profile of the noise in a given waveform. + + :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. + :param sampling_rate: the sampling rate of the audio + :param window_size: the size of the window the logmmse algorithm operates on. A default value + will be picked if left as 0. + :return: a NoiseProfile object + """ + noise, dtype = to_float(noise) + noise += np.finfo(np.float64).eps + + if window_size == 0: + window_size = int(math.floor(0.02 * sampling_rate)) + + if window_size % 2 == 1: + window_size = window_size + 1 + + perc = 50 + len1 = int(math.floor(window_size * perc / 100)) + len2 = int(window_size - len1) + + win = np.hanning(window_size) + win = win * len2 / np.sum(win) + n_fft = 2 * window_size + + noise_mean = np.zeros(n_fft) + n_frames = len(noise) // window_size + for j in range(0, window_size * n_frames, window_size): + noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) + noise_mu2 = (noise_mean / n_frames) ** 2 + + return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) + + +def denoise(wav, noise_profile: NoiseProfile, eta=0.15): + """ + Cleans the noise from a speech waveform given a noise profile. The waveform must have the + same sampling rate as the one used to create the noise profile. 
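+    Minimal usage sketch (illustrative; it assumes the first half second of the recording
+    contains background noise only):
+
+        profile = profile_noise(wav[:sampling_rate // 2], sampling_rate)
+        clean_wav = denoise(wav, profile)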
+ + :param wav: a speech waveform as a numpy array of floats or ints. + :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of + the same) waveform. + :param eta: voice threshold for noise update. While the voice activation detection value is + below this threshold, the noise profile will be continuously updated throughout the audio. + Set to 0 to disable updating the noise profile. + :return: the clean wav as a numpy array of floats or ints of the same length. + """ + wav, dtype = to_float(wav) + wav += np.finfo(np.float64).eps + p = noise_profile + + nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) + x_final = np.zeros(nframes * p.len2) + + aa = 0.98 + mu = 0.98 + ksi_min = 10 ** (-25 / 10) + + x_old = np.zeros(p.len1) + xk_prev = np.zeros(p.len1) + noise_mu2 = p.noise_mu2 + for k in range(0, nframes * p.len2, p.len2): + insign = p.win * wav[k:k + p.window_size] + + spec = np.fft.fft(insign, p.n_fft, axis=0) + sig = np.absolute(spec) + sig2 = sig ** 2 + + gammak = np.minimum(sig2 / noise_mu2, 40) + + if xk_prev.all() == 0: + ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) + else: + ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) + ksi = np.maximum(ksi_min, ksi) + + log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) + vad_decision = np.sum(log_sigma_k) / p.window_size + if vad_decision < eta: + noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 + + a = ksi / (1 + ksi) + vk = a * gammak + ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) + hw = a * np.exp(ei_vk) + sig = sig * hw + xk_prev = sig ** 2 + xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) + xi_w = np.real(xi_w) + + x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] + x_old = xi_w[p.len1:p.window_size] + + output = from_float(x_final, dtype) + output = np.pad(output, (0, len(wav) - len(output)), mode="constant") + return output + + +## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that +## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of +## webrctvad +# def vad(wav, sampling_rate, eta=0.15, window_size=0): +# """ +# TODO: fix doc +# Creates a profile of the noise in a given waveform. +# +# :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. +# :param sampling_rate: the sampling rate of the audio +# :param window_size: the size of the window the logmmse algorithm operates on. A default value +# will be picked if left as 0. +# :param eta: voice threshold for noise update. While the voice activation detection value is +# below this threshold, the noise profile will be continuously updated throughout the audio. +# Set to 0 to disable updating the noise profile. 
+# """ +# wav, dtype = to_float(wav) +# wav += np.finfo(np.float64).eps +# +# if window_size == 0: +# window_size = int(math.floor(0.02 * sampling_rate)) +# +# if window_size % 2 == 1: +# window_size = window_size + 1 +# +# perc = 50 +# len1 = int(math.floor(window_size * perc / 100)) +# len2 = int(window_size - len1) +# +# win = np.hanning(window_size) +# win = win * len2 / np.sum(win) +# n_fft = 2 * window_size +# +# wav_mean = np.zeros(n_fft) +# n_frames = len(wav) // window_size +# for j in range(0, window_size * n_frames, window_size): +# wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) +# noise_mu2 = (wav_mean / n_frames) ** 2 +# +# wav, dtype = to_float(wav) +# wav += np.finfo(np.float64).eps +# +# nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) +# vad = np.zeros(nframes * len2, dtype=np.bool) +# +# aa = 0.98 +# mu = 0.98 +# ksi_min = 10 ** (-25 / 10) +# +# xk_prev = np.zeros(len1) +# noise_mu2 = noise_mu2 +# for k in range(0, nframes * len2, len2): +# insign = win * wav[k:k + window_size] +# +# spec = np.fft.fft(insign, n_fft, axis=0) +# sig = np.absolute(spec) +# sig2 = sig ** 2 +# +# gammak = np.minimum(sig2 / noise_mu2, 40) +# +# if xk_prev.all() == 0: +# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) +# else: +# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) +# ksi = np.maximum(ksi_min, ksi) +# +# log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) +# vad_decision = np.sum(log_sigma_k) / window_size +# if vad_decision < eta: +# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 +# print(vad_decision) +# +# a = ksi / (1 + ksi) +# vk = a * gammak +# ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) +# hw = a * np.exp(ei_vk) +# sig = sig * hw +# xk_prev = sig ** 2 +# +# vad[k:k + len2] = vad_decision >= eta +# +# vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") +# return vad + + +def to_float(_input): + if _input.dtype == np.float64: + return _input, _input.dtype + elif _input.dtype == np.float32: + return _input.astype(np.float64), _input.dtype + elif _input.dtype == np.uint8: + return (_input - 128) / 128., _input.dtype + elif _input.dtype == np.int16: + return _input / 32768., _input.dtype + elif _input.dtype == np.int32: + return _input / 2147483648., _input.dtype + raise ValueError('Unsupported wave file format') + + +def from_float(_input, dtype): + if dtype == np.float64: + return _input, np.float64 + elif dtype == np.float32: + return _input.astype(np.float32) + elif dtype == np.uint8: + return ((_input * 128) + 128).astype(np.uint8) + elif dtype == np.int16: + return (_input * 32768).astype(np.int16) + elif dtype == np.int32: + print(_input) + return (_input * 2147483648).astype(np.int32) + raise ValueError('Unsupported wave file format') diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..f0176f632b58dfde15e31c04e79543b629bd4499 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py @@ -0,0 +1,47 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from time import perf_counter as timer +from collections import OrderedDict +import numpy as np + + +class Profiler: + def __init__(self, summarize_every=5, disabled=False): + self.last_tick = timer() + self.logs = OrderedDict() + 
self.summarize_every = summarize_every + self.disabled = disabled + + def tick(self, name): + if self.disabled: + return + + # Log the time needed to execute that function + if not name in self.logs: + self.logs[name] = [] + if len(self.logs[name]) >= self.summarize_every: + self.summarize() + self.purge_logs() + self.logs[name].append(timer() - self.last_tick) + + self.reset_timer() + + def purge_logs(self): + for name in self.logs: + self.logs[name].clear() + + def reset_timer(self): + self.last_tick = timer() + + def summarize(self): + n = max(map(len, self.logs.values())) + assert n == self.summarize_every + print("\nAverage execution time over %d steps:" % n) + + name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] + pad = max(map(len, name_msgs)) + for name_msg, deltas in zip(name_msgs, self.logs.values()): + print(" %s mean: %4.0fms std: %4.0fms" % + (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) + print("", flush=True) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b0ffc1f3c54d85158521cac6d09f05dd21de6d --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py @@ -0,0 +1,180 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from datetime import datetime +from time import perf_counter as timer +import matplotlib.pyplot as plt +import numpy as np +# import webbrowser +import visdom +import umap + +colormap = np.array([ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], +], dtype=np.float) / 255 + + +class Visualizations: + def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): + # Tracking data + self.last_update_timestamp = timer() + self.update_every = update_every + self.step_times = [] + self.losses = [] + self.eers = [] + print("Updating the visualizations every %d steps." % update_every) + + # If visdom is disabled TODO: use a better paradigm for that + self.disabled = disabled + if self.disabled: + return + + # Set the environment name + now = str(datetime.now().strftime("%d-%m %Hh%M")) + if env_name is None: + self.env_name = now + else: + self.env_name = "%s (%s)" % (env_name, now) + + # Connect to visdom and open the corresponding window in the browser + try: + self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) + except ConnectionError: + raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " + "start it.") + # webbrowser.open("http://localhost:8097/env/" + self.env_name) + + # Create the windows + self.loss_win = None + self.eer_win = None + # self.lr_win = None + self.implementation_win = None + self.projection_win = None + self.implementation_string = "" + + def log_params(self): + if self.disabled: + return + from encoder import params_data + from encoder import params_model + param_string = "Model parameters:
" + for param_name in (p for p in dir(params_model) if not p.startswith("__")): + value = getattr(params_model, param_name) + param_string += "\t%s: %s
" % (param_name, value) + param_string += "Data parameters:
" + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + param_string += "\t%s: %s
" % (param_name, value) + self.vis.text(param_string, opts={"title": "Parameters"}) + + def log_dataset(self, dataset: SpeakerVerificationDataset): + if self.disabled: + return + dataset_string = "" + dataset_string += "Speakers: %s\n" % len(dataset.speakers) + dataset_string += "\n" + dataset.get_logs() + dataset_string = dataset_string.replace("\n", "
") + self.vis.text(dataset_string, opts={"title": "Dataset"}) + + def log_implementation(self, params): + if self.disabled: + return + implementation_string = "" + for param, value in params.items(): + implementation_string += "%s: %s\n" % (param, value) + implementation_string = implementation_string.replace("\n", "
") + self.implementation_string = implementation_string + self.implementation_win = self.vis.text( + implementation_string, + opts={"title": "Training implementation"} + ) + + def update(self, loss, eer, step): + # Update the tracking data + now = timer() + self.step_times.append(1000 * (now - self.last_update_timestamp)) + self.last_update_timestamp = now + self.losses.append(loss) + self.eers.append(eer) + print(".", end="") + + # Update the plots every steps + if step % self.update_every != 0: + return + time_string = "Step time: mean: %5dms std: %5dms" % \ + (int(np.mean(self.step_times)), int(np.std(self.step_times))) + print("\nStep %6d Loss: %.4f EER: %.4f %s" % + (step, np.mean(self.losses), np.mean(self.eers), time_string)) + if not self.disabled: + self.loss_win = self.vis.line( + [np.mean(self.losses)], + [step], + win=self.loss_win, + update="append" if self.loss_win else None, + opts=dict( + legend=["Avg. loss"], + xlabel="Step", + ylabel="Loss", + title="Loss", + ) + ) + self.eer_win = self.vis.line( + [np.mean(self.eers)], + [step], + win=self.eer_win, + update="append" if self.eer_win else None, + opts=dict( + legend=["Avg. EER"], + xlabel="Step", + ylabel="EER", + title="Equal error rate" + ) + ) + if self.implementation_win is not None: + self.vis.text( + self.implementation_string + ("%s" % time_string), + win=self.implementation_win, + opts={"title": "Training implementation"}, + ) + + # Reset the tracking + self.losses.clear() + self.eers.clear() + self.step_times.clear() + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, + max_speakers=10): + max_speakers = min(max_speakers, len(colormap)) + embeds = embeds[:max_speakers * utterances_per_speaker] + + n_speakers = len(embeds) // utterances_per_speaker + ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) + colors = [colormap[i] for i in ground_truth] + + reducer = umap.UMAP() + projected = reducer.fit_transform(embeds) + plt.scatter(projected[:, 0], projected[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection (step %d)" % step) + if not self.disabled: + self.projection_win = self.vis.matplot(plt, win=self.projection_win) + if out_fpath is not None: + plt.savefig(out_fpath) + plt.clf() + + def save(self): + if not self.disabled: + self.vis.save([self.env_name]) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1878ce622f8077b5a50d950e6a25cfad13b84fb5 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py @@ -0,0 +1,76 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.p2e_cross import P2E_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class DreamVG(object): + def __init__(self, + config_path='configs/plugin_cross.yaml', + ckpt_path='../ckpts/dreamvc_plugin.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = P2E_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + 
timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.spk_shape = config['model']['unet']['in_channels'] + + @torch.no_grad() + def inference(self, text, + guidance_scale=5, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023, + ): + text, text_mask = text + self.model.eval() + + gen_shape = (1, self.spk_shape) + + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, text, text_mask, train_cfg=False) + output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, text, text_mask, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5) + pred = scale_shift_re(latents, 1/self.scale, self.shift) + # pred = torch.clip(pred, min=0.0, max=0.5) + return pred \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/train_plugin.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/train_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/train_vc.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/train_vc.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90f60fdd89ad8575faafe45188bd1d968852fc67 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1c10f81868cda758c332b8abe826634a13610a --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import torch + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def scale_shift(x, scale, shift): + return (x+shift) * scale + + +def scale_shift_re(x, scale, shift): + return (x/scale) - shift + + +def align_seq(source, target_length, mapping_method='hard'): + source_len = source.shape[1] + if mapping_method == 'hard': + mapping_idx = np.round(np.arange(target_length) * source_len / target_length) + output = source[:, mapping_idx] + else: + # TBD + raise NotImplementedError + + return output + + +def save_plot(tensor, savepath): + tensor = tensor.squeeze().cpu() + plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.savefig(savepath) + plt.close() + + +def save_audio(file_path, sampling_rate, audio): + audio = np.clip(audio.cpu().squeeze().numpy(), -0.999, 0.999) + wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) + + +def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, vmin, vmax) + tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1 + return tensor + + +def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, -1.0, 1.0) + tensor = (tensor + 1) / 2 + tensor = tensor * (vmax - vmin) + vmin + return tensor + + +if __name__ == "__main__": + + a = torch.rand(2, 10) + target_len = 15 + + b = align_seq(a, target_len) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3b7f73ffaf1fb97edd55bce29850a2cc21cfd3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py @@ -0,0 +1,144 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.model import DiffVC +from .model.model_cross import DiffVC_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class ReDiffVC(object): + def __init__(self, + config_path='configs/diffvc_base.yaml', + ckpt_path='../ckpts/dreamvc_base.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + spk_embed, 
content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + + +class DreamVC(object): + def __init__(self, + config_path='configs/diffvc_cross.yaml', + ckpt_path='../ckpts/dreamvc_cross.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + text, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + text, text_mask = text + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, 
train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/stream.py b/dreamvoice/train_utils/prepare_freevc/freevc/stream.py new file mode 100644 index 0000000000000000000000000000000000000000..5e83847c3ed3e2db37c1adcef4c635b4ea30ebd0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/stream.py @@ -0,0 +1,158 @@ +import os +import torch +import torch.nn.functional as F +import librosa +import sounddevice as sd +from transformers import WavLMModel +from scipy.io.wavfile import write +from models import SynthesizerTrn +from speaker_encoder.voice_encoder import SpeakerEncoder +import utils +import numpy as np +from transformers import T5Tokenizer, T5EncoderModel +from src.plugin_wrapper import DreamVG + + +# Load configurations and models +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +print("Loading FreeVC...") +hps = utils.get_hparams_from_file("configs/freevc.json") +freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +freevc.eval() +utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) + +print("Loading Speaker Encoder...") +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +lm_path = 'google/flan-t5-base' +tokenizer = T5Tokenizer.from_pretrained(lm_path) +text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval() + +dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml', + ckpt_path='checkpoints/dreamvc_plugin.pt', + device=device) + + +# Constants for overlap-add +CHUNK_SIZE = 47040 +OVERLAP = 960 +BUFFER_SIZE = OVERLAP + CHUNK_SIZE +fade_size = OVERLAP +HANN_WINDOW = np.ones(BUFFER_SIZE) +HANN_WINDOW[:fade_size] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size)) +HANN_WINDOW[-fade_size:] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))[::-1] + +# Initialize buffers +input_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32) +output_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32) + + +@torch.no_grad() +def convert_realtime_with_buffers(audio_chunk, tgt_embedding, freevc, cmodel): + """Process audio in chunks with overlap and manage input/output buffers.""" + global input_buffer, output_buffer, HANN_WINDOW, BUFFER_SIZE, CHUNK_SIZE + + # Add incoming audio chunk to input buffer + input_buffer[:OVERLAP] = input_buffer[-OVERLAP:] + input_buffer[OVERLAP:] = audio_chunk + + # Downsample to 16,000 Hz + chunk = input_buffer + chunk = librosa.resample(chunk, orig_sr=48000, target_sr=16000) + + # Convert to tensor and pad + chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device).float() + chunk_tensor = F.pad(chunk_tensor, (40, 40)) + + # Extract content features using WavLM + c = cmodel(chunk_tensor).last_hidden_state.transpose(1, 2).to(device) + + # Generate converted audio using FreeVC + audio = freevc.infer(c, g=tgt_embedding) + audio = audio[0][0].data.cpu().float().numpy() + + # Upsample back to 48,000 Hz + audio = librosa.resample(audio, orig_sr=16000, target_sr=48000) + + # Apply Hann window to the output + windowed_output = audio * HANN_WINDOW + + # Add the new processed audio to the output buffer with overlap + 
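+    # The lines below implement a simple crossfade between consecutive chunks: the tail
+    # of the previous output (OVERLAP = 960 samples, i.e. 20 ms at 48 kHz, already faded
+    # out by HANN_WINDOW) is moved to the head of output_buffer, the rest of the buffer
+    # is cleared, and the freshly windowed chunk (faded in at its head) is summed on top,
+    # so the two chunks overlap-add smoothly before the first CHUNK_SIZE samples are
+    # returned.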
output_buffer[:OVERLAP] = output_buffer[-OVERLAP:] + output_buffer[OVERLAP:] = 0 + output_buffer += windowed_output + + normalization_factors = np.zeros(BUFFER_SIZE) + normalization_factors[:OVERLAP] += HANN_WINDOW[-OVERLAP:] + normalization_factors += HANN_WINDOW + normalization_factors = np.clip(normalization_factors, 1e-6, None) + # output_buffer[:CHUNK_SIZE] = output_buffer[:CHUNK_SIZE] / normalization_factors[:CHUNK_SIZE] + + return output_buffer[:CHUNK_SIZE] + + +def prepare_target_embedding(tgt_audio_path): + """Preprocess target audio and get speaker embedding.""" + wav_tgt, _ = librosa.load(tgt_audio_path, sr=16000) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + return g_tgt + + +# Prepare the target speaker embedding +# target_audio = "p225_001.wav" # Target speaker audio +# target_embedding = prepare_target_embedding(target_audio) +prompt = "A young girl voice, very cute" +prompt_guidance_scale = 3.0 + +text_batch = tokenizer(prompt, max_length=32, + padding='max_length', truncation=True, return_tensors="pt") +text, text_mask = text_batch.input_ids.to(device), \ + text_batch.attention_mask.to(device) +text = text_encoder(input_ids=text, attention_mask=text_mask)[0] +target_embedding = dreamvg.inference([text, text_mask], + guidance_scale=prompt_guidance_scale, + guidance_rescale=0.0, + ddim_steps=100, eta=1, + random_seed=None) + +# Stream settings +SAMPLING_RATE = 48000 +INPUT_DEVICE = 69 +OUTPUT_DEVICE = 58 + + +def audio_callback(indata, outdata, frames, time, status): + """Callback function for real-time audio processing with input and output buffers.""" + global input_buffer, output_buffer + + if status: + print(f"Status: {status}") + # Reshape and process input audio + indata = indata[:, 0] # Mono input + converted_audio = convert_realtime_with_buffers(indata, target_embedding, freevc, cmodel) + # Write the converted audio to the output stream + outdata[:] = converted_audio.reshape(-1, 1) + + +# Start the audio stream with the updated callback +with sd.Stream( + samplerate=SAMPLING_RATE, + blocksize=CHUNK_SIZE, + channels=1, + dtype='float32', + latency='low', + device=(INPUT_DEVICE, OUTPUT_DEVICE), + callback=audio_callback): + try: + sd.sleep(1000000) + except KeyboardInterrupt: + print("Voice conversion stopped.") diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ff09995743b34dc0c96c81a5fc0ae72c3eda5843 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/utils.py @@ -0,0 +1,305 @@ +import os +import sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch +from torch.nn import functional as F +from commons import sequence_mask + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def get_cmodel(rank): + checkpoint = torch.load('wavlm/WavLM-Large.pt') + cfg = WavLMConfig(checkpoint['cfg']) + cmodel = WavLM(cfg).cuda(rank) + cmodel.load_state_dict(checkpoint['model']) + cmodel.eval() + return cmodel + + +def get_content(cmodel, y): + with torch.no_grad(): + c = cmodel.extract_features(y.squeeze(1))[0] + c = c.transpose(1, 2) + return c + + +def get_vocoder(rank): + with open("hifigan/config.json", "r") as f: + config = json.load(f) + config = hifigan.AttrDict(config) + 
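+    # NOTE: this module references several names that are never imported at the top of
+    # the file: `hifigan` (here), `glob` (latest_checkpoint_path), `torchvision`
+    # (transform/stretch) and WavLM/WavLMConfig (get_cmodel). To actually run these
+    # helpers you would presumably need something like the upstream FreeVC imports:
+    #   import glob
+    #   import torchvision
+    #   import hifigan
+    #   from wavlm import WavLM, WavLMConfig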
vocoder = hifigan.Generator(config) + ckpt = torch.load("hifigan/generator_v1") + vocoder.load_state_dict(ckpt["generator"]) + vocoder.eval() + vocoder.remove_weight_norm() + vocoder.cuda(rank) + return vocoder + + +def transform(mel, height): # 68-92 + #r = np.random.random() + #rate = r * 0.3 + 0.85 # 0.85-1.15 + #height = int(mel.size(-2) * rate) + tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1))) + if height >= mel.size(-2): + return tgt[:, :mel.size(-2), :] + else: + silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1) + silence += torch.randn_like(silence) / 10 + return torch.cat((tgt, silence), 1) + + +def stretch(mel, width): # 0.5-2 + return torchvision.transforms.functional.resize(mel, (mel.size(-2), width)) + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + if optimizer is not None: + optimizer.load_state_dict(checkpoint_dict['optimizer']) + saved_state_dict = checkpoint_dict['model'] + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict= {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info("Loaded checkpoint '{}' (iteration {})" .format( + checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info("Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path)) + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save({'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate}, checkpoint_path) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + 
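+    # NOTE: np.fromstring on binary buffers is deprecated in recent NumPy releases; a
+    # drop-in replacement for the conversion above is
+    #   data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+    # (newer Matplotlib versions also replace tostring_rgb() with buffer_rgba()).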
plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding='utf-8') as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default="./configs/base.json", + help='JSON file for configuration') + parser.add_argument('-m', '--model', type=str, required=True, + help='Model name') + + args = parser.parse_args() + model_dir = os.path.join("./logs", args.model) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + )) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/dreamvoice/train_utils/prepare_freevc/get_dist.py b/dreamvoice/train_utils/prepare_freevc/get_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..f9ad1dcbbc5a83c38ceb9101c5ae6cd744959f6e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/get_dist.py @@ -0,0 +1,49 @@ +import os +import torch +import random +import numpy as np + + +# Function to recursively find all .pt files in a directory +def find_pt_files(root_dir): + pt_files = [] + for dirpath, _, filenames in os.walk(root_dir): + for file in filenames: + if file.endswith('.pt'): + pt_files.append(os.path.join(dirpath, file)) + return pt_files + + +# Function to compute statistics for a given tensor list +def compute_statistics(tensor_list): + all_data = torch.cat(tensor_list) + mean = torch.mean(all_data).item() + std = torch.std(all_data).item() + max_val = torch.max(all_data).item() + min_val = torch.min(all_data).item() + return mean, std, max_val, min_val + + +# Root directory containing .pt files in subfolders +root_dir = "spk" + +# Find all .pt files +pt_files = find_pt_files(root_dir) + +# Randomly sample 1000 .pt files (or fewer if less than 1000 files are available) +sampled_files = random.sample(pt_files, min(1000, len(pt_files))) + +# Load tensors from sampled files +tensor_list = [] +for file in sampled_files: + tensor = torch.load(file) + tensor_list.append(tensor.view(-1)) # Flatten the tensor + +# Compute statistics +mean, std, max_val, min_val = compute_statistics(tensor_list) + +# Print the results +print(f"Mean: {mean}") +print(f"Std: {std}") +print(f"Max: {max_val}") +print(f"Min: {min_val}") \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/__init__.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..dfb47c9e72f3364d8317b79a80ce62030d2403fd --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py @@ -0,0 +1,107 @@ +from scipy.ndimage.morphology import binary_dilation +from speaker_encoder.params_data import * +from 
pathlib import Path +from typing import Optional, Union +import numpy as np +import webrtcvad +import librosa +import struct + +int16_max = (2 ** 15) - 1 + + +def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], + source_sr: Optional[int] = None): + """ + Applies the preprocessing operations used in training the Speaker Encoder to a waveform + either on disk or in memory. The waveform will be resampled to match the data hyperparameters. + + :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not + just .wav), either the waveform as a numpy array of floats. + :param source_sr: if passing an audio waveform, the sampling rate of the waveform before + preprocessing. After preprocessing, the waveform's sampling rate will match the data + hyperparameters. If passing a filepath, the sampling rate will be automatically detected and + this argument will be ignored. + """ + # Load the wav from disk if needed + if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + wav, source_sr = librosa.load(fpath_or_wav, sr=None) + else: + wav = fpath_or_wav + + # Resample the wav if needed + if source_sr is not None and source_sr != sampling_rate: + wav = librosa.resample(wav, source_sr, sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences + wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) + wav = trim_long_silences(wav) + + return wav + + +def wav_to_mel_spectrogram(wav): + """ + Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. + Note: this not a log-mel spectrogram. + """ + frames = librosa.feature.melspectrogram( + y=wav, + sr=sampling_rate, + n_fft=int(sampling_rate * mel_window_length / 1000), + hop_length=int(sampling_rate * mel_window_step / 1000), + n_mels=mel_n_channels + ) + return frames.astype(np.float32).T + + +def trim_long_silences(wav): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. 
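+    (In this copy the VAD parameters are taken from params_data.py.) The waveform is
+    packed to 16-bit PCM, scanned with webrtcvad in windows of vad_window_length ms,
+    and the voiced/unvoiced flags are smoothed with a moving average and dilated by up
+    to vad_max_silence_length windows before the silent samples are masked out.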
+ + :param wav: the raw waveform as a numpy array of floats + :return: the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask == True] + + +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) + if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): + return wav + return wav * (10 ** (dBFS_change / 20)) diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt new file mode 100644 index 0000000000000000000000000000000000000000..662d22b686114b4b6124330a688007d9495d22c8 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca +size 17090379 diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..e45430c7d03d160dc64d450c1af81180f419eb51 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py @@ -0,0 +1,40 @@ +from speaker_encoder import inference as encoder +from multiprocessing.pool import Pool +from functools import partial +from pathlib import Path +# from utils import logmmse +# from tqdm import tqdm +# import numpy as np +# import librosa + + +def embed_utterance(fpaths, encoder_model_fpath): + if not encoder.is_loaded(): + encoder.load_model(encoder_model_fpath) + + # Compute the speaker embedding of the utterance + wav_fpath, embed_fpath = fpaths + wav = np.load(wav_fpath) + wav = encoder.preprocess_wav(wav) + embed = encoder.embed_utterance(wav) + np.save(embed_fpath, embed, allow_pickle=False) + + +def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int): + + wav_dir 
= outdir_root.joinpath("audio") + metadata_fpath = synthesizer_root.joinpath("train.txt") + assert wav_dir.exists() and metadata_fpath.exists() + embed_dir = synthesizer_root.joinpath("embeds") + embed_dir.mkdir(exist_ok=True) + + # Gather the input wave filepath and the target output embed filepath + with metadata_fpath.open("r") as metadata_file: + metadata = [line.split("|") for line in metadata_file] + fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] + + # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. + # Embed the utterances in separate threads + func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) + job = Pool(n_processes).imap(func, fpaths) + list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d12228c81152487da24a6090e5a736f9de0755b0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py @@ -0,0 +1,45 @@ +librispeech_datasets = { + "train": { + "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], + "other": ["LibriSpeech/train-other-500"] + }, + "test": { + "clean": ["LibriSpeech/test-clean"], + "other": ["LibriSpeech/test-other"] + }, + "dev": { + "clean": ["LibriSpeech/dev-clean"], + "other": ["LibriSpeech/dev-other"] + }, +} +libritts_datasets = { + "train": { + "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], + "other": ["LibriTTS/train-other-500"] + }, + "test": { + "clean": ["LibriTTS/test-clean"], + "other": ["LibriTTS/test-other"] + }, + "dev": { + "clean": ["LibriTTS/dev-clean"], + "other": ["LibriTTS/dev-other"] + }, +} +voxceleb_datasets = { + "voxceleb1" : { + "train": ["VoxCeleb1/wav"], + "test": ["VoxCeleb1/test_wav"] + }, + "voxceleb2" : { + "train": ["VoxCeleb2/dev/aac"], + "test": ["VoxCeleb2/test_wav"] + } +} + +other_datasets = [ + "LJSpeech-1.1", + "VCTK-Corpus/wav48", +] + +anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..740f750a9746e5ace34f1bf875d9ac07677e1ed6 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py @@ -0,0 +1,2 @@ +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py new file mode 100644 index 0000000000000000000000000000000000000000..7e5cf738d3ca5214034ce3babdedf6eaea64c469 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py @@ -0,0 +1,37 @@ +import random + +class RandomCycler: + """ + Creates an internal copy of a sequence and allows access to its items in a constrained random + order. 
For a source sequence of n items and one or several consecutive queries of a total + of m items, the following guarantees hold (one implies the other): + - Each item will be returned between m // n and ((m - 1) // n) + 1 times. + - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. + """ + + def __init__(self, source): + if len(source) == 0: + raise Exception("Can't create RandomCycler from an empty collection") + self.all_items = list(source) + self.next_items = [] + + def sample(self, count: int): + shuffle = lambda l: random.sample(l, len(l)) + + out = [] + while count > 0: + if count >= len(self.all_items): + out.extend(shuffle(list(self.all_items))) + count -= len(self.all_items) + continue + n = min(count, len(self.next_items)) + out.extend(self.next_items[:n]) + count -= n + self.next_items = self.next_items[n:] + if len(self.next_items) == 0: + self.next_items = shuffle(list(self.all_items)) + return out + + def __next__(self): + return self.sample(1)[0] + diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..cb320b211f0de5b3a6fbb83380d8a8b9677151b2 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py @@ -0,0 +1,40 @@ +from speaker_encoder.data_objects.random_cycler import RandomCycler +from speaker_encoder.data_objects.utterance import Utterance +from pathlib import Path + +# Contains the set of utterances of a single speaker +class Speaker: + def __init__(self, root: Path): + self.root = root + self.name = root.name + self.utterances = None + self.utterance_cycler = None + + def _load_utterances(self): + with self.root.joinpath("_sources.txt").open("r") as sources_file: + sources = [l.split(",") for l in sources_file] + sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} + self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] + self.utterance_cycler = RandomCycler(self.utterances) + + def random_partial(self, count, n_frames): + """ + Samples a batch of unique partial utterances from the disk in a way that all + utterances come up at least once every two cycles and in a random order every time. + + :param count: The number of partial utterances to sample from the set of utterances from + that speaker. Utterances are guaranteed not to be repeated if is not larger than + the number of utterances available. + :param n_frames: The number of frames in the partial utterance. + :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, + frames are the frames of the partial utterances and range is the range of the partial + utterance with regard to the complete utterance. 
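+        For example, random_partial(4, 160) returns four (utterance, frames, (start, end))
+        tuples, where frames is a (160, mel_n_channels) slice of the stored spectrogram.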
+ """ + if self.utterances is None: + self._load_utterances() + + utterances = self.utterance_cycler.sample(count) + + a = [(u,) + u.random_partial(n_frames) for u in utterances] + + return a diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..d2dd5493a599e74cea594510af94015464072cb3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py @@ -0,0 +1,12 @@ +import numpy as np +from typing import List +from speaker_encoder.data_objects.speaker import Speaker + +class SpeakerBatch: + def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): + self.speakers = speakers + self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} + + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with + # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) + self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..be4568923a21e8f28a229899e137d0186e0b1250 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py @@ -0,0 +1,56 @@ +from speaker_encoder.data_objects.random_cycler import RandomCycler +from speaker_encoder.data_objects.speaker_batch import SpeakerBatch +from speaker_encoder.data_objects.speaker import Speaker +from speaker_encoder.params_data import partials_n_frames +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +# TODO: improve with a pool of speakers for data efficiency + +class SpeakerVerificationDataset(Dataset): + def __init__(self, datasets_root: Path): + self.root = datasets_root + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + if len(speaker_dirs) == 0: + raise Exception("No speakers found. 
Make sure you are pointing to the directory " + "containing all preprocessed speaker directories.") + self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] + self.speaker_cycler = RandomCycler(self.speakers) + + def __len__(self): + return int(1e10) + + def __getitem__(self, index): + return next(self.speaker_cycler) + + def get_logs(self): + log_string = "" + for log_fpath in self.root.glob("*.txt"): + with log_fpath.open("r") as log_file: + log_string += "".join(log_file.readlines()) + return log_string + + +class SpeakerVerificationDataLoader(DataLoader): + def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, + batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, + worker_init_fn=None): + self.utterances_per_speaker = utterances_per_speaker + + super().__init__( + dataset=dataset, + batch_size=speakers_per_batch, + shuffle=False, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=self.collate, + pin_memory=pin_memory, + drop_last=False, + timeout=timeout, + worker_init_fn=worker_init_fn + ) + + def collate(self, speakers): + return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py new file mode 100644 index 0000000000000000000000000000000000000000..ff3185ec781eaf5be2a58d61c22b32586d366126 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py @@ -0,0 +1,26 @@ +import numpy as np + + +class Utterance: + def __init__(self, frames_fpath, wave_fpath): + self.frames_fpath = frames_fpath + self.wave_fpath = wave_fpath + + def get_frames(self): + return np.load(self.frames_fpath) + + def random_partial(self, n_frames): + """ + Crops the frames into a partial utterance of n_frames + + :param n_frames: The number of frames of the partial utterance + :return: the partial utterance frames and a tuple indicating the start and end of the + partial utterance in the complete utterance. + """ + frames = self.get_frames() + if frames.shape[0] == n_frames: + start = 0 + else: + start = np.random.randint(0, frames.shape[0] - n_frames) + end = start + n_frames + return frames[start:end], (start, end) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py new file mode 100644 index 0000000000000000000000000000000000000000..ac64bcc3bd9ec490e988ac894de93921ba20f607 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py @@ -0,0 +1,31 @@ +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. 
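+# With 30 ms VAD windows, the default of 6 below bridges silences of roughly 180 ms.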
+vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c5662912a7cc0eb8818732d0b1d233ba1b195ec7 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py @@ -0,0 +1,177 @@ +from speaker_encoder.params_data import * +from speaker_encoder.model import SpeakerEncoder +from speaker_encoder.audio import preprocess_wav # We want to expose this function from here +from matplotlib import cm +from speaker_encoder import audio +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device=None): + """ + Loads the model in memory. If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath) + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +def embed_frames_batch(frames_batch): + """ + Computes embeddings for a batch of mel spectrogram. + + :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape + (batch_size, n_frames, n_channels) + :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) + """ + if _model is None: + raise Exception("Model was not loaded. Call load_model() before inference.") + + frames = torch.from_numpy(frames_batch).to(_device) + embed = _model.forward(frames).detach().cpu().numpy() + return embed + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 
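+    For instance, with the default parameters (16 kHz audio, 10 ms hop, 160-frame
+    partials, overlap=0.5) a 2 s waveform (n_samples=32000) produces two partials with
+    mel slices [0:160] and [80:240] and wav slices [0:25600] and [12800:38400], so the
+    caller pads the waveform to 38400 samples.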
+ + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. + """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. + :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. 
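+    A minimal usage sketch (the file name is illustrative, and load_model() must have
+    been called first):
+        wav = preprocess_wav("utterance.wav")
+        embed = embed_utterance(wav)   # unit-norm float32 vector of shape (256,)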
+ """ + # Process the entire utterance if not using partials + if not using_partials: + frames = audio.wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = audio.wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py new file mode 100644 index 0000000000000000000000000000000000000000..4493a98b217e4bd082940cbe4d31b8169f18b5d9 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py @@ -0,0 +1,135 @@ +from speaker_encoder.params_model import * +from speaker_encoder.params_data import * +from scipy.interpolate import interp1d +from sklearn.metrics import roc_curve +from torch.nn.utils import clip_grad_norm_ +from scipy.optimize import brentq +from torch import nn +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, device, loss_device): + super().__init__() + self.loss_device = loss_device + + # Network defition + self.lstm = nn.LSTM(input_size=mel_n_channels, # 40 + hidden_size=model_hidden_size, # 256 + num_layers=model_num_layers, # 3 + batch_first=True).to(device) + self.linear = nn.Linear(in_features=model_hidden_size, + out_features=model_embedding_size).to(device) + self.relu = torch.nn.ReLU().to(device) + + # Cosine similarity scaling (with fixed initial parameter values) + self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) + self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) + + # Loss + self.loss_fn = nn.CrossEntropyLoss().to(loss_device) + + def do_gradient_ops(self): + # Gradient scale + self.similarity_weight.grad *= 0.01 + self.similarity_bias.grad *= 0.01 + + # Gradient clipping + clip_grad_norm_(self.parameters(), 3, norm_type=2) + + def forward(self, utterances, hidden_init=None): + """ + Computes the embeddings of a batch of utterance spectrograms. + + :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape + (batch_size, n_frames, n_channels) + :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, + batch_size, hidden_size). Will default to a tensor of zeros if None. 
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size) + """ + # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state + # and the final cell state. + out, (hidden, cell) = self.lstm(utterances, hidden_init) + + # We take only the hidden state of the last layer + embeds_raw = self.relu(self.linear(hidden[-1])) + + # L2-normalize it + embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + return embeds + + def similarity_matrix(self, embeds): + """ + Computes the similarity matrix according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the similarity matrix as a tensor of shape (speakers_per_batch, + utterances_per_speaker, speakers_per_batch) + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation + centroids_incl = torch.mean(embeds, dim=1, keepdim=True) + centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) + + # Exclusive centroids (1 per utterance) + centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) + + # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot + # product of these vectors (which is just an element-wise multiplication reduced by a sum). + # We vectorize the computation for efficiency. + sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, + speakers_per_batch).to(self.loss_device) + mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) + for j in range(speakers_per_batch): + mask = np.where(mask_matrix[j])[0] + sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) + sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) + + ## Even more vectorized version (slower maybe because of transpose) + # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker + # ).to(self.loss_device) + # eye = np.eye(speakers_per_batch, dtype=np.int) + # mask = np.where(1 - eye) + # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) + # mask = np.where(eye) + # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) + # sim_matrix2 = sim_matrix2.transpose(1, 2) + + sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias + return sim_matrix + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. 
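+        With the training defaults (speakers_per_batch=64, utterances_per_speaker=10,
+        embedding_size=256), embeds has shape (64, 10, 256) and the similarity matrix is
+        reshaped to (640, 64) before the cross-entropy loss. Note that the legacy np.int
+        alias used in this file was removed in NumPy 1.24; plain int is a drop-in
+        replacement on newer NumPy versions.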
+ """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, + speakers_per_batch)) + ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) + target = torch.from_numpy(ground_truth).long().to(self.loss_device) + loss = self.loss_fn(sim_matrix, target) + + # EER (not backpropagated) + with torch.no_grad(): + inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] + labels = np.array([inv_argmax(i) for i in ground_truth]) + preds = sim_matrix.detach().cpu().numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py new file mode 100644 index 0000000000000000000000000000000000000000..676e6dc197faf01648de7a830140172d5594b999 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py @@ -0,0 +1,29 @@ + +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms +# Number of spectrogram frames at inference +inference_n_frames = 80 # 800 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. +vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py new file mode 100644 index 0000000000000000000000000000000000000000..32731f295b3b26e9e38bb9f9047d5c784649e127 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py @@ -0,0 +1,11 @@ + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 + + +## Training parameters +learning_rate_init = 1e-4 +speakers_per_batch = 64 +utterances_per_speaker = 10 diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..ecb9041551270629a27baab6d1f1525e380c5378 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py @@ -0,0 +1,285 @@ +from multiprocess.pool import ThreadPool +from speaker_encoder.params_data import * +from speaker_encoder.config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from speaker_encoder import audio +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. 
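+    One "Log_<dataset>.txt" file is written per dataset; it records the params_data
+    values, the duration of every processed utterance, and summary statistics on
+    finalize().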
+ """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from speaker_encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." % dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
+ if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +# Function to preprocess utterances for one speaker +def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
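+    # NOTE: in this variant the resume logic below is commented out, so existing_fnames
+    # stays empty and the per-file skip never triggers; skip_existing only controls
+    # whether _sources.txt is appended to or overwritten.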
+ # if sources_fpath.exists(): + # try: + # with sources_fpath.open("r") as sources_file: + # existing_fnames = {line.split(",")[0] for line in sources_file} + # except: + # existing_fnames = {} + # else: + # existing_fnames = {} + existing_fnames = {} + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + # logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + return len(wav) + +def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + # from multiprocessing import Pool, cpu_count + from pathos.multiprocessing import ProcessingPool as Pool + # Function to preprocess utterances for one speaker + def __preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + existing_fnames = {} + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + wav_lens = [] + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + # logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + wav_lens.append(len(wav)) + sources_file.close() + return wav_lens + + print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) + # Process the utterances for each speaker + # with ThreadPool(8) as pool: + # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + # unit="speakers")) + pool = Pool(processes=20) + for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1): + for wav_len in wav_lens: + logger.add_sample(duration=wav_len / sampling_rate) + print(f'{i}/{len(speaker_dirs)} \r') + + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + # keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + # nationality.lower() in anglophone_nationalites] + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items()] + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..1c2e7fa1b08b75de40adc0e05fa3b104cb02660b --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py @@ -0,0 +1,125 @@ +from speaker_encoder.visualizations import Visualizations +from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset +from speaker_encoder.params_model import * +from speaker_encoder.model import SpeakerEncoder +from utils.profiler import Profiler +from pathlib import Path +import torch + +def sync(device: torch.device): + # FIXME + return + # For correct profiling (cuda operations are async) + if device.type == "cuda": + torch.cuda.synchronize(device) + +def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, + backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, + no_visdom: bool): + # Create a dataset and a dataloader + dataset = SpeakerVerificationDataset(clean_data_root) + loader = SpeakerVerificationDataLoader( + dataset, + speakers_per_batch, # 64 + utterances_per_speaker, # 10 + num_workers=8, + ) + + # Setup the device on which to run the forward pass and the loss. These can be different, + # because the forward pass is faster on the GPU whereas the loss is often (depending on your + # hyperparameters) faster on the CPU. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # FIXME: currently, the gradient is None if loss_device is cuda + loss_device = torch.device("cpu") + + # Create the model and the optimizer + model = SpeakerEncoder(device, loss_device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) + init_step = 1 + + # Configure file path for the model + state_fpath = models_dir.joinpath(run_id + ".pt") + backup_dir = models_dir.joinpath(run_id + "_backups") + + # Load any existing model + if not force_restart: + if state_fpath.exists(): + print("Found existing model \"%s\", loading it and resuming training." % run_id) + checkpoint = torch.load(state_fpath) + init_step = checkpoint["step"] + model.load_state_dict(checkpoint["model_state"]) + optimizer.load_state_dict(checkpoint["optimizer_state"]) + optimizer.param_groups[0]["lr"] = learning_rate_init + else: + print("No model \"%s\" found, starting training from scratch." 
% run_id) + else: + print("Starting the training from scratch.") + model.train() + + # Initialize the visualization environment + vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) + vis.log_dataset(dataset) + vis.log_params() + device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") + vis.log_implementation({"Device": device_name}) + + # Training loop + profiler = Profiler(summarize_every=10, disabled=False) + for step, speaker_batch in enumerate(loader, init_step): + profiler.tick("Blocking, waiting for batch (threaded)") + + # Forward pass + inputs = torch.from_numpy(speaker_batch.data).to(device) + sync(device) + profiler.tick("Data to %s" % device) + embeds = model(inputs) + sync(device) + profiler.tick("Forward pass") + embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) + loss, eer = model.loss(embeds_loss) + sync(loss_device) + profiler.tick("Loss") + + # Backward pass + model.zero_grad() + loss.backward() + profiler.tick("Backward pass") + model.do_gradient_ops() + optimizer.step() + profiler.tick("Parameter update") + + # Update visualizations + # learning_rate = optimizer.param_groups[0]["lr"] + vis.update(loss.item(), eer, step) + + # Draw projections and save them to the backup folder + if umap_every != 0 and step % umap_every == 0: + print("Drawing and saving projections (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + embeds = embeds.detach().cpu().numpy() + vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) + vis.save() + + # Overwrite the latest version of the model + if save_every != 0 and step % save_every == 0: + print("Saving the model (step %d)" % step) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, state_fpath) + + # Make a backup + if backup_every != 0 and step % backup_every == 0: + print("Making a backup (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, backup_fpath) + + profiler.tick("Extras (visualizations, saving)") + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py new file mode 100644 index 0000000000000000000000000000000000000000..5d2c4c073c933d38970a83798f2d0ee37a85c48e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py @@ -0,0 +1,178 @@ +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from datetime import datetime +from time import perf_counter as timer +import matplotlib.pyplot as plt +import numpy as np +# import webbrowser +import visdom +import umap + +colormap = np.array([ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], +], dtype=np.float) / 255 + + +class Visualizations: + def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): + # Tracking data + self.last_update_timestamp = timer() + self.update_every = update_every + self.step_times = [] + 
self.losses = []
+        self.eers = []
+        print("Updating the visualizations every %d steps." % update_every)
+
+        # If visdom is disabled TODO: use a better paradigm for that
+        self.disabled = disabled
+        if self.disabled:
+            return
+
+        # Set the environment name
+        now = str(datetime.now().strftime("%d-%m %Hh%M"))
+        if env_name is None:
+            self.env_name = now
+        else:
+            self.env_name = "%s (%s)" % (env_name, now)
+
+        # Connect to visdom and open the corresponding window in the browser
+        try:
+            self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
+        except ConnectionError:
+            raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
+                            "start it.")
+        # webbrowser.open("http://localhost:8097/env/" + self.env_name)
+
+        # Create the windows
+        self.loss_win = None
+        self.eer_win = None
+        # self.lr_win = None
+        self.implementation_win = None
+        self.projection_win = None
+        self.implementation_string = ""
+
+    def log_params(self):
+        if self.disabled:
+            return
+        from speaker_encoder import params_data
+        from speaker_encoder import params_model
+        param_string = "Model parameters:<br>"
+        for param_name in (p for p in dir(params_model) if not p.startswith("__")):
+            value = getattr(params_model, param_name)
+            param_string += "\t%s: %s<br>" % (param_name, value)
+        param_string += "Data parameters:<br>"
+        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+            value = getattr(params_data, param_name)
+            param_string += "\t%s: %s<br>" % (param_name, value)
+        self.vis.text(param_string, opts={"title": "Parameters"})
+
+    def log_dataset(self, dataset: SpeakerVerificationDataset):
+        if self.disabled:
+            return
+        dataset_string = ""
+        dataset_string += "Speakers: %s\n" % len(dataset.speakers)
+        dataset_string += "\n" + dataset.get_logs()
+        dataset_string = dataset_string.replace("\n", "<br>")
+        self.vis.text(dataset_string, opts={"title": "Dataset"})
+
+    def log_implementation(self, params):
+        if self.disabled:
+            return
+        implementation_string = ""
+        for param, value in params.items():
+            implementation_string += "%s: %s\n" % (param, value)
+            implementation_string = implementation_string.replace("\n", "<br>
") + self.implementation_string = implementation_string + self.implementation_win = self.vis.text( + implementation_string, + opts={"title": "Training implementation"} + ) + + def update(self, loss, eer, step): + # Update the tracking data + now = timer() + self.step_times.append(1000 * (now - self.last_update_timestamp)) + self.last_update_timestamp = now + self.losses.append(loss) + self.eers.append(eer) + print(".", end="") + + # Update the plots every steps + if step % self.update_every != 0: + return + time_string = "Step time: mean: %5dms std: %5dms" % \ + (int(np.mean(self.step_times)), int(np.std(self.step_times))) + print("\nStep %6d Loss: %.4f EER: %.4f %s" % + (step, np.mean(self.losses), np.mean(self.eers), time_string)) + if not self.disabled: + self.loss_win = self.vis.line( + [np.mean(self.losses)], + [step], + win=self.loss_win, + update="append" if self.loss_win else None, + opts=dict( + legend=["Avg. loss"], + xlabel="Step", + ylabel="Loss", + title="Loss", + ) + ) + self.eer_win = self.vis.line( + [np.mean(self.eers)], + [step], + win=self.eer_win, + update="append" if self.eer_win else None, + opts=dict( + legend=["Avg. EER"], + xlabel="Step", + ylabel="EER", + title="Equal error rate" + ) + ) + if self.implementation_win is not None: + self.vis.text( + self.implementation_string + ("%s" % time_string), + win=self.implementation_win, + opts={"title": "Training implementation"}, + ) + + # Reset the tracking + self.losses.clear() + self.eers.clear() + self.step_times.clear() + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, + max_speakers=10): + max_speakers = min(max_speakers, len(colormap)) + embeds = embeds[:max_speakers * utterances_per_speaker] + + n_speakers = len(embeds) // utterances_per_speaker + ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) + colors = [colormap[i] for i in ground_truth] + + reducer = umap.UMAP() + projected = reducer.fit_transform(embeds) + plt.scatter(projected[:, 0], projected[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection (step %d)" % step) + if not self.disabled: + self.projection_win = self.vis.matplot(plt, win=self.projection_win) + if out_fpath is not None: + plt.savefig(out_fpath) + plt.clf() + + def save(self): + if not self.disabled: + self.vis.save([self.env_name]) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3f69320ec75315ff9ce2efa158a53b1a823edd2e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py @@ -0,0 +1,173 @@ +from speaker_encoder.hparams import * +from speaker_encoder import audio +from pathlib import Path +from typing import Union, List +from torch import nn +from time import perf_counter as timer +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, weights_fpath, device: Union[str, torch.device]=None, verbose=True): + """ + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). + If None, defaults to cuda if it is available on your machine, otherwise the model will + run on cpu. Outputs are always returned on the cpu, as numpy arrays. 
+ """ + super().__init__() + + # Define the network + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + # Get the target device + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + device = torch.device(device) + self.device = device + + # Load the pretrained model'speaker weights + # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") + # if not weights_fpath.exists(): + # raise Exception("Couldn't find the voice encoder pretrained model at %s." % + # weights_fpath) + + start = timer() + checkpoint = torch.load(weights_fpath, map_location="cpu") + + self.load_state_dict(checkpoint["model_state"], strict=False) + self.to(device) + + if verbose: + print("Loaded the voice encoder model on %s in %.2f seconds." % + (device.type, timer() - start)) + + def forward(self, mels: torch.FloatTensor): + """ + Computes the embeddings of a batch of utterance spectrograms. + :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape + (batch_size, n_frames, n_channels) + :return: the embeddings as a float 32 tensor of shape (batch_size, embedding_size). + Embeddings are positive and L2-normed, thus they lay in the range [0, 1]. + """ + # Pass the input through the LSTM layers and retrieve the final hidden state of the last + # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings. + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + @staticmethod + def compute_partial_slices(n_samples: int, rate, min_coverage): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to + obtain partial utterances of each. Both the waveform and the + mel spectrogram slices are returned, so as to make each partial utterance waveform + correspond to its spectrogram. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wav_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param rate: how many partial utterances should occur per second. Partial utterances must + cover the span of the entire utterance, thus the rate should not be lower than the inverse + of the duration of a partial utterance. By default, partial utterances are 1.6s long and + the minimum rate is thus 0.625. + :param min_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered by zero-padding the audio. Otherwise, + it will be discarded. If there aren't enough frames for one partial utterance, + this parameter is ignored so that the function always returns at least one slice. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. 
+ """ + assert 0 < min_coverage <= 1 + + # Compute how many frames separate two partial utterances + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) + assert 0 < frame_step, "The rate is too high" + assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \ + (sampling_rate / (samples_per_frame * partials_n_frames)) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partials_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partials_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + """ + Computes an embedding for a single utterance. The utterance is divided in partial + utterances and an embedding is computed for each. The complete utterance embedding is the + L2-normed average embedding of the partial utterances. + + TODO: independent batched version of this function + + :param wav: a preprocessed utterance waveform as a numpy array of float32 + :param return_partials: if True, the partial embeddings will also be returned along with + the wav slices corresponding to each partial utterance. + :param rate: how many partial utterances should occur per second. Partial utterances must + cover the span of the entire utterance, thus the rate should not be lower than the inverse + of the duration of a partial utterance. By default, partial utterances are 1.6s long and + the minimum rate is thus 0.625. + :param min_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered by zero-padding the audio. Otherwise, + it will be discarded. If there aren't enough frames for one partial utterance, + this parameter is ignored so that the function always returns at least one slice. + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. + """ + # Compute where to split the utterance into partials and pad the waveform with zeros if + # the partial utterances cover a larger range. 
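+        # For example, with the usual speaker-encoder hparams (16 kHz audio, 10 ms mel hop,
+        # 1.6 s / 160-frame partials; assumed here, see speaker_encoder/hparams.py) and the
+        # default rate=1.3, compute_partial_slices steps the window forward by
+        # round((16000 / 1.3) / 160) = 77 frames, so consecutive partials overlap by about half.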
+ wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage) + max_wave_length = wav_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials and forward them through the model + mel = audio.wav_to_mel_spectrogram(wav) + mels = np.array([mel[s] for s in mel_slices]) + with torch.no_grad(): + mels = torch.from_numpy(mels).to(self.device) + partial_embeds = self(mels).cpu().numpy() + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wav_slices + return embed + + def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + """ + Compute the embedding of a collection of wavs (presumably from the same speaker) by + averaging their embedding and L2-normalizing it. + + :param wavs: list of wavs a numpy arrays of float32. + :param kwargs: extra arguments to embed_utterance() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + """ + raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \ + for wav in wavs], axis=0) + return raw_embed / np.linalg.norm(raw_embed, 2) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/spk_ext.py b/dreamvoice/train_utils/prepare_freevc/spk_ext.py new file mode 100644 index 0000000000000000000000000000000000000000..c6a71ad6a6131fb67729f1cc6f161dd3fcf276b0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/spk_ext.py @@ -0,0 +1,90 @@ +import os +import torch +import librosa +from tqdm import tqdm +from speaker_encoder.voice_encoder import SpeakerEncoder +from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments + + +@torch.no_grad() +def se_extractor(audio_path, smodel): + # vad + SAMPLE_RATE = 16000 + audio_vad = get_audio_tensor(audio_path) + segments = get_vad_segments( + audio_vad, + output_sample=True, + min_speech_duration=0.1, + min_silence_duration=1, + method="silero", + ) + segments = [(seg["start"], seg["end"]) for seg in segments] + segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments] + + if len(segments) == 0: + segments = [(0, len(audio_vad)/SAMPLE_RATE)] + print(segments) + + # spk + gs = [] + + audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE) + # audio = torch.tensor(audio).float().to(device) + + for s, e in segments: + y = audio[int(SAMPLE_RATE*s):int(SAMPLE_RATE*e)] + g = smodel.embed_utterance(y) + g = torch.from_numpy(g).unsqueeze(0) + gs.append(g) + + gs = torch.stack(gs).mean(0) + return gs.cpu() + + +def process_audio_folder(input_folder, output_folder, model, device): + """ + Process all audio files in a folder and its subfolders, + save the extracted features as .pt files in the output folder with the same structure. + + Args: + input_folder (str): Path to the input folder containing audio files. + output_folder (str): Path to the output folder to save .pt files. + model: Pre-trained model for feature extraction. + device: Torch device (e.g., 'cpu' or 'cuda'). 
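+
+    Note: the output folder mirrors the input folder tree, with one .pt file (the speaker
+    embedding returned by se_extractor) written per audio file; files whose .pt output
+    already exists are skipped.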
+ """ + # Collect all audio file paths + audio_files = [] + for root, _, files in os.walk(input_folder): + for file in files: + if file.endswith(('.wav', '.mp3', '.flac')): # Adjust for the audio formats you want to process + audio_files.append(os.path.join(root, file)) + + # Process each audio file with tqdm for progress + for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"): + # Construct output path + relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder) + output_dir = os.path.join(output_folder, relative_path) + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt') + + # Check if the .pt file already exists + if os.path.exists(output_path): + # print(f"Skipped (already exists): {output_path}") + continue # Skip processing this file + # Extract features + target_se = se_extractor(audio_path, model).to(device) + # Save the feature as .pt + torch.save(target_se, output_path) + # print(f"Processed and saved: {output_path}") + + +if __name__ == '__main__': + smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + device = 'cuda' + # input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360' + # output_folder = 'spk/LibriTTS-R/train-clean-360/' + # process_audio_folder(input_folder, output_folder, smodel, device) + + input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/' + output_folder = 'spk/VCTK/VCTK-Corpus/' + process_audio_folder(input_folder, output_folder, smodel, device) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/configs/plugin.py b/dreamvoice/train_utils/src/configs/plugin.py index 5e9a409af86ef67361bae0b7c3ee3b747ee907eb..70a55a8f5edd7d95486b9d28b076fab96b916fb7 100644 --- a/dreamvoice/train_utils/src/configs/plugin.py +++ b/dreamvoice/train_utils/src/configs/plugin.py @@ -18,7 +18,7 @@ class AttrDict(dict): all_params = { - 'Plugin_base': AttrDict( + 'Plugin_freevc': AttrDict( # Diff params diff=AttrDict( num_train_steps=1000, diff --git a/dreamvoice/train_utils/src/dataset/vcdata.py b/dreamvoice/train_utils/src/dataset/vcdata.py new file mode 100644 index 0000000000000000000000000000000000000000..c7a1d99a243869794900eed6189a32bad930aea4 --- /dev/null +++ b/dreamvoice/train_utils/src/dataset/vcdata.py @@ -0,0 +1,146 @@ +import pandas as pd +import os +import random +import ast +import numpy as np +import torch +from einops import repeat, rearrange +import librosa + +from torch.utils.data import Dataset +import torchaudio + + +def log_f0(f0, f0_min=librosa.note_to_hz('C2'), scales=4): + f0[f0 < f0_min] = 0.0 + f0_log = torch.zeros_like(f0) + f0_log[f0 != 0] = 12*np.log2(f0[f0 != 0]/f0_min) + 1 + # f0_mel_min = 12*np.log2(f0_min/f0_min) + 1 + # f0_mel_max = 12*np.log2(f0_max/f0_min) + 1 + f0_log /= (scales*12) + return f0_log + + +class VCData(Dataset): + def __init__(self, + data_dir, meta_dir, subset, prompt_dir, + seg_length=1.92, speaker_length=4, + sr=24000, content_sr=50, speaker_sr=16000, + plugin_mode=False + ): + self.datadir = data_dir + meta = pd.read_csv(meta_dir) + self.meta = meta[meta['subset'] == subset] + self.subset = subset + self.prompts = pd.read_csv(prompt_dir) + self.seg_len = seg_length + self.speaker_length = speaker_length + self.sr = sr + self.content_sr = content_sr + self.speaker_sr = speaker_sr + self.plugin_mode = plugin_mode + + def get_audio_content(self, audio_path, content_path, f0_path): + audio_path = self.datadir + 
audio_path + audio, sr = torchaudio.load(audio_path) + assert sr == self.sr + + # 1, T, C + content = torch.load(self.datadir + content_path) + + total_length = content.shape[1] + if int(total_length - int(self.content_sr * self.seg_len)) > 0: + start = np.random.randint(0, int(total_length - self.content_sr * self.seg_len) + 1) + else: + start = 0 + end = min(start + int(self.seg_len * self.content_sr), content.shape[1]) + + # use last frame for padding + content_clip = repeat(content[:, -1, :], "b c-> b t c", t=int(self.content_sr * self.seg_len)).clone() + content_clip[:, :end - start, :] = content[:, start: end, :] + + audio_clip = torch.zeros(int(self.seg_len * self.sr)) + # print(start) + # print(end) + audio_start = round(start * self.sr / self.content_sr) + audio_end = round(end * self.sr / self.content_sr) + # print(audio_start) + # print(audio_end) + # print(audio.shape) + + audio_clip[:audio_end - audio_start] = audio[0, audio_start: audio_end].clone() + + if f0_path: + f0 = torch.load(self.datadir + f0_path).float() + f0_clip = torch.zeros(int(self.content_sr * self.seg_len)) + f0_clip[:end-start] = f0[start:end] + f0_clip = log_f0(f0_clip) + f0_clip = f0_clip.unsqueeze(-1) + else: + f0_clip = None + + return audio_clip, content_clip[0], f0_clip + + def get_speaker(self, speaker_path): + audio_path = self.datadir + speaker_path + audio, sr = torchaudio.load(audio_path) + assert sr == self.speaker_sr + # if sr != self.speaker_sr: + # resampler = torchaudio.transforms.Resample(sr, self.speaker_sr, dtype=audio.dtype) + # audio = resampler(audio) + + audio_clip = torch.zeros(self.speaker_length * self.speaker_sr) + + total_length = audio.shape[1] + if int(total_length - self.speaker_sr * self.speaker_length) > 0: + start = np.random.randint(0, int(total_length - self.speaker_sr * self.speaker_length) + 1) + else: + start = 0 + end = min(start + self.speaker_sr * self.speaker_length, total_length) + + audio_clip[:end-start] = audio[0, start: end] + + return audio_clip + + def __getitem__(self, index): + row = self.meta.iloc[index] + + if self.plugin_mode: + audio_clip, content_clip, f0_clip = [''], [''], [''] + else: + # load current audio + audio_path = row['audio_path'] + content_path = row['content_path'] + f0_path = row['f0_path'] + audio_clip, content_clip, f0_clip = self.get_audio_content(audio_path, content_path, f0_path) + + # get speaker + if self.subset == 'train': + speaker = row['speaker'] + else: + speaker = row['speaker_val'] + + speaker_row = self.meta[self.meta['speaker'] == speaker].sample(1) + speaker_path = speaker_row.iloc[0]['speaker_path'] + speaker_clip = self.get_speaker(speaker_path) + # print(speaker_clip.shape) + # print(speaker_path) + # print(speaker) + + # get prompt + prompts = self.prompts[self.prompts['ID'] == speaker]['prompts'].iloc[0] + prompts = ast.literal_eval(prompts) + prompt = random.choice(prompts) + + return audio_clip, content_clip, f0_clip, speaker_clip, prompt + + def __len__(self): + return len(self.meta) + + +if __name__ == '__main__': + from tqdm import tqdm + data = VCData('../../features/', '../../data/meta_val.csv', 'val', '../../data/speaker_gender.csv') + for i in tqdm(range(len(data))): + x = data[i] + # print(x[-1]) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/freevc/.gitattributes b/dreamvoice/train_utils/src/freevc/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..717eda91d34e790b2de5140dd1c46748bdddef26 --- /dev/null +++ 
b/dreamvoice/train_utils/src/freevc/.gitattributes @@ -0,0 +1,34 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/dreamvoice/train_utils/src/freevc/.gitignore b/dreamvoice/train_utils/src/freevc/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e4008401fb75eb82773c4bdb3f4b886e2e6d34c4 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +flagged \ No newline at end of file diff --git a/dreamvoice/train_utils/src/freevc/README.md b/dreamvoice/train_utils/src/freevc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/README.md @@ -0,0 +1,13 @@ +--- +title: FreeVC +emoji: 🚀 +colorFrom: gray +colorTo: red +sdk: gradio +sdk_version: 3.13.0 +app_file: app.py +pinned: false +license: mit +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/dreamvoice/train_utils/src/freevc/app.py b/dreamvoice/train_utils/src/freevc/app.py new file mode 100644 index 0000000000000000000000000000000000000000..982821f01caea503d8451f6c8e99096918705d79 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/app.py @@ -0,0 +1,92 @@ +import os +import torch +import librosa +import gradio as gr +from scipy.io.wavfile import write +from transformers import WavLMModel + +import utils +from models import SynthesizerTrn +from mel_processing import mel_spectrogram_torch +from speaker_encoder.voice_encoder import SpeakerEncoder + +''' +def get_wavlm(): + os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU') + shutil.move('WavLM-Large.pt', 'wavlm') +''' + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# print("Loading FreeVC...") +# hps = utils.get_hparams_from_file("configs/freevc.json") +# freevc = SynthesizerTrn( +# hps.data.filter_length // 2 + 1, +# hps.train.segment_size // hps.data.hop_length, +# **hps.model).to(device) +# _ = freevc.eval() +# _ = 
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading FreeVC(24k)...") +hps = utils.get_hparams_from_file("configs/freevc-24.json") +freevc_24 = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc_24.eval() +_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None) + +# print("Loading FreeVC-s...") +# hps = utils.get_hparams_from_file("configs/freevc-s.json") +# freevc_s = SynthesizerTrn( +# hps.data.filter_length // 2 + 1, +# hps.train.segment_size // hps.data.hop_length, +# **hps.model).to(device) +# _ = freevc_s.eval() +# _ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None) +# +# print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +def convert(model, cmodel, src, tgt): + with torch.no_grad(): + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + # infer + if model == "FreeVC": + audio = freevc.infer(c, g=g_tgt) + elif model == "FreeVC-s": + audio = freevc_s.infer(c, mel=mel_tgt) + else: + audio = freevc_24.infer(c, g=g_tgt) + audio = audio[0][0].data.cpu().float().numpy() + if model == "FreeVC" or model == "FreeVC-s": + write("out.wav", hps.data.sampling_rate, audio) + else: + write("out.wav", 24000, audio) + out = "out.wav" + return out + +# model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model") +# audio1 = gr.inputs.Audio(label="Source Audio", type='filepath') +# audio2 = gr.inputs.Audio(label="Reference Audio", type='filepath') +# inputs = [model, audio1, audio2] +# outputs = gr.outputs.Audio(label="Output Audio", type='filepath') +# +# title = "FreeVC" +# description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please trim it before submitting." +# article = "
Paper | Github Repo
" +# +# examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']] +# +# gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch() +convert(freevc_24, cmodel, 'p225_001.wav', 'p226_002.wav') \ No newline at end of file diff --git a/dreamvoice/train_utils/src/freevc/commons.py b/dreamvoice/train_utils/src/freevc/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/commons.py @@ -0,0 +1,171 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def rand_spec_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d( + length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (num_timescales - 1)) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + 
signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1. 
/ norm_type) + return total_norm diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc-24.json b/dreamvoice/train_utils/src/freevc/configs/freevc-24.json new file mode 100644 index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/configs/freevc-24.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8640, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8008" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,4,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc-s.json b/dreamvoice/train_utils/src/freevc/configs/freevc-s.json new file mode 100644 index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/configs/freevc-s.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": false + } +} diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc.json b/dreamvoice/train_utils/src/freevc/configs/freevc.json new file mode 100644 index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/configs/freevc.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + 
"fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/train_utils/src/freevc/mel_processing.py b/dreamvoice/train_utils/src/freevc/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/mel_processing.py @@ -0,0 +1,112 @@ +import math +import os +import random +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.data +import numpy as np +import librosa +import librosa.util as librosa_util +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + '_' + str(spec.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, 
fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/dreamvoice/train_utils/src/freevc/models.py b/dreamvoice/train_utils/src/freevc/models.py new file mode 100644 index 0000000000000000000000000000000000000000..11d3247337c6cd49351490c7f17cb33cea52e361 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/models.py @@ -0,0 +1,351 @@ +import copy +import math +import torch +from torch import nn +from torch.nn import functional as F + +from .commons import sequence_mask, rand_slice_segments +from .modules import ResidualCouplingLayer, WN, Flip, ResBlock1, ResBlock2, LRELU_SLOPE + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .commons import init_weights, get_padding + + +class ResidualCouplingBlock(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class Encoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = 
n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) + resblock = ResBlock1 if resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append(weight_norm( + ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), + k, u, padding=(k-u)//2))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x) + else: + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, 
c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2,3,5,7,11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class SpeakerEncoder(torch.nn.Module): + def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): + mel_slices = [] + for i in range(0, total_frames-partial_frames, partial_hop): + mel_range = torch.arange(i, i+partial_frames) + mel_slices.append(mel_range) + + return mel_slices + + def embed_utterance(self, mel, partial_frames=128, partial_hop=64): + mel_len = mel.size(1) + last_mel = mel[:,-partial_frames:] + + if mel_len > partial_frames: + mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) + mels = list(mel[:,s] for s in mel_slices) + mels.append(last_mel) + mels = torch.stack(tuple(mels), 0).squeeze(1) + + with torch.no_grad(): + partial_embeds = self(mels) + embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) + #embed = embed / torch.linalg.norm(embed, 2) + else: + with torch.no_grad(): + embed = self(last_mel) + + return embed + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + 
upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + use_spk, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + self.use_spk = use_spk + + self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16) + self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + if not self.use_spk: + self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels) + + def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if spec_lengths == None: + spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) + + if not self.use_spk: + g = self.enc_spk(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + _, m_p, logs_p, _ = self.enc_p(c, c_lengths) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + z_p = self.flow(z, spec_mask, g=g) + + z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size) + o = self.dec(z_slice, g=g) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, c, g=None, mel=None, c_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if not self.use_spk: + g = self.enc_spk.embed_utterance(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g) + + return o diff --git a/dreamvoice/train_utils/src/freevc/modules.py b/dreamvoice/train_utils/src/freevc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..53a51558f78899cb0e77c595fe2ca9b3d3c762f5 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/modules.py @@ -0,0 +1,341 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, 
-1) + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers-1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size ** i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, + groups=channels, dilation=dilation, padding=padding + )) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + self.hidden_channels =hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + 
res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + else: + g_l = torch.zeros_like(x_in) + + acts = fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:,:self.hidden_channels,:] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:,self.hidden_channels:,:] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + 
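+# Illustrative sanity check (a sketch, not used anywhere in this repo): both residual block
+# variants are shape-preserving, since every Conv1d runs with stride 1 and "same"-style
+# padding from get_padding(), e.g.
+#
+#     >>> block = ResBlock1(channels=64, kernel_size=3, dilation=(1, 3, 5))
+#     >>> x = torch.randn(2, 64, 100)
+#     >>> tuple(block(x).shape)
+#     (2, 64, 100)
+#
+# This is what allows the HiFi-GAN style Generator in models.py to sum num_kernels such
+# blocks after each ConvTranspose1d stage and divide by num_kernels, while the transposed
+# convolutions alone set the overall upsampling factor (the product of upsample_rates).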
+class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels,1)) + self.logs = nn.Parameter(torch.zeros(channels,1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1,2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels]*2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1,2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/dreamvoice/train_utils/src/freevc/requirements.txt b/dreamvoice/train_utils/src/freevc/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/requirements.txt @@ -0,0 +1,8 @@ +altair +httpx==0.24.1 +numpy +scipy +torch +transformers +librosa +webrtcvad==2.0.10 diff --git a/dreamvoice/train_utils/src/freevc/utils.py b/dreamvoice/train_utils/src/freevc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e931b1f56a976674425c5637b0767d3485c51f69 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/utils.py @@ -0,0 +1,305 @@ +import os +import sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch +from torch.nn import functional as F +from .commons import sequence_mask + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def get_cmodel(rank): + checkpoint = torch.load('wavlm/WavLM-Large.pt') + cfg = WavLMConfig(checkpoint['cfg']) + cmodel = 
WavLM(cfg).cuda(rank) + cmodel.load_state_dict(checkpoint['model']) + cmodel.eval() + return cmodel + + +def get_content(cmodel, y): + with torch.no_grad(): + c = cmodel.extract_features(y.squeeze(1))[0] + c = c.transpose(1, 2) + return c + + +def get_vocoder(rank): + with open("hifigan/config.json", "r") as f: + config = json.load(f) + config = hifigan.AttrDict(config) + vocoder = hifigan.Generator(config) + ckpt = torch.load("hifigan/generator_v1") + vocoder.load_state_dict(ckpt["generator"]) + vocoder.eval() + vocoder.remove_weight_norm() + vocoder.cuda(rank) + return vocoder + + +def transform(mel, height): # 68-92 + #r = np.random.random() + #rate = r * 0.3 + 0.85 # 0.85-1.15 + #height = int(mel.size(-2) * rate) + tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1))) + if height >= mel.size(-2): + return tgt[:, :mel.size(-2), :] + else: + silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1) + silence += torch.randn_like(silence) / 10 + return torch.cat((tgt, silence), 1) + + +def stretch(mel, width): # 0.5-2 + return torchvision.transforms.functional.resize(mel, (mel.size(-2), width)) + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + if optimizer is not None: + optimizer.load_state_dict(checkpoint_dict['optimizer']) + saved_state_dict = checkpoint_dict['model'] + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict= {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info("Loaded checkpoint '{}' (iteration {})" .format( + checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info("Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path)) + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save({'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate}, checkpoint_path) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = 
plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding='utf-8') as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default="./configs/base.json", + help='JSON file for configuration') + parser.add_argument('-m', '--model', type=str, required=True, + help='Model name') + + args = parser.parse_args() + model_dir = os.path.join("./logs", args.model) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + )) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/dreamvoice/train_utils/src/freevc_wrapper.py b/dreamvoice/train_utils/src/freevc_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..11a46fa184110368939cfc1bf3cc9f47a9c8092d --- /dev/null +++ b/dreamvoice/train_utils/src/freevc_wrapper.py @@ -0,0 +1,63 @@ +import os +import torch +import librosa +import soundfile as sf +from pathlib import Path + +from transformers import WavLMModel +from freevc.utils import load_checkpoint, get_hparams_from_file +from freevc.models import SynthesizerTrn +# from mel_processing import mel_spectrogram_torch +# from free_vc.speaker_encoder.voice_encoder import SpeakerEncoder +from speaker_encoder.voice_encoder import SpeakerEncoder + + +def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'): + hps = get_hparams_from_file("freevc/configs/freevc.json") + freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) + freevc.eval() + load_checkpoint("../prepare_freevc/ckpts/freevc.pth", freevc, None) + + cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + cmodel.eval() + + # smodel = spk_encoder.load_model(Path(speaker_path), device) + # smodel = spk_encoder.load_model(Path(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt"), 'cuda') + smodel = SpeakerEncoder(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt", device) + + return freevc, cmodel, smodel, hps + + +@torch.no_grad() +def convert(freevc, content, speaker): + audio = freevc.infer(content, g=speaker) + audio = audio[0][0].data.cpu().float().numpy() + return audio, 24000 + + +if __name__ == '__main__': + freevc_24, cmodel, smodel, hps = get_freevc_models() + + tgt = 'p226_002.wav' + # src = 'p226_002.wav' + src = 'p225_001.wav' + device = 'cuda' + + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + # g_tgt = spk_encoder.embed_utterance_batch(torch.tensor(wav_tgt).unsqueeze(0).cuda()) + + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + content = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + + output, sr 
= convert(freevc_24, content, g_tgt) + + sf.write('output.wav', output, sr) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/inference_freevc.py b/dreamvoice/train_utils/src/inference_freevc.py new file mode 100644 index 0000000000000000000000000000000000000000..0bdf218f87549460a47cb224c509c3f0fc80d6b0 --- /dev/null +++ b/dreamvoice/train_utils/src/inference_freevc.py @@ -0,0 +1,124 @@ +import os +import torch +import soundfile as sf +import pandas as pd +import librosa +from utils import minmax_norm_diff, reverse_minmax_norm_diff, scale_shift_re +from freevc_wrapper import convert +import time + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +@torch.no_grad() +def inference_timbre(gen_shape, text, + model, scheduler, + guidance_scale=5, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023, + device='cuda', + ): + text, text_mask = text + model.eval() + + if random_seed is not None: + generator = torch.Generator(device=device).manual_seed(random_seed) + else: + generator = torch.Generator(device=device) + generator.seed() + + scheduler.set_timesteps(ddim_steps) + + # init noise + noise = torch.randn(gen_shape, generator=generator, device=device) + latents = noise + + for t in scheduler.timesteps: + latents = scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = model(latents, t, text, text_mask, train_cfg=False) + output_uncond = model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = model(latents, t, text, text_mask, train_cfg=False) + + latents = scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5) + pred = scale_shift_re(latents, 20, -0.035) + pred = torch.clip(pred, min=0.0, max=0.5) + return pred + + +@torch.no_grad() +def eval_plugin(freevc, cmodel, text_model, + timbre_model, timbre_scheduler, timbre_shape, + val_meta, val_folder, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2024, + device='cuda', + epoch=0, save_path='logs/eval/', val_num=10, sr=16000): + + tokenizer, text_encoder = text_model + + df = pd.read_csv(val_meta) + + save_path = save_path + str(epoch) + '/' + os.makedirs(save_path, exist_ok=True) + + step = 0 + + for i in range(len(df)): + row = df.iloc[i] + + source_path = val_folder + row['path'] + # prompt = [row['prompt']] + prompt = ["female's voice"] + with torch.no_grad(): + text_batch = tokenizer(prompt, + max_length=32, + padding='max_length', truncation=True, return_tensors="pt") + text, 
text_mask = text_batch.input_ids.to(device), \ + text_batch.attention_mask.to(device) + text = text_encoder(input_ids=text, attention_mask=text_mask)[0] + + audio_clip = librosa.load(source_path, sr=16000)[0] + audio_clip = torch.tensor(audio_clip).unsqueeze(0).to(device) + + content = cmodel(audio_clip).last_hidden_state.transpose(1, 2).to(device) + + # start_time = time.time() + spk_embed = inference_timbre(timbre_shape, [text, text_mask], + timbre_model, timbre_scheduler, + guidance_scale=guidance_scale, guidance_rescale=guidance_rescale, + ddim_steps=ddim_steps, eta=eta, random_seed=random_seed, + device=device) + spk_embed = spk_embed.squeeze(-1) + + output, out_sr = convert(freevc, content, spk_embed) + # end_time = time.time() + # print(end_time-start_time) + # print(pred.shape) + sf.write(save_path + f'{step}_{prompt[0]}' + '.wav', output, samplerate=sr) + + step += 1 + + if step >= val_num: + break diff --git a/dreamvoice/train_utils/src/speaker_encoder/__init__.py b/dreamvoice/train_utils/src/speaker_encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/src/speaker_encoder/audio.py b/dreamvoice/train_utils/src/speaker_encoder/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..dfb47c9e72f3364d8317b79a80ce62030d2403fd --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/audio.py @@ -0,0 +1,107 @@ +from scipy.ndimage.morphology import binary_dilation +from speaker_encoder.params_data import * +from pathlib import Path +from typing import Optional, Union +import numpy as np +import webrtcvad +import librosa +import struct + +int16_max = (2 ** 15) - 1 + + +def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], + source_sr: Optional[int] = None): + """ + Applies the preprocessing operations used in training the Speaker Encoder to a waveform + either on disk or in memory. The waveform will be resampled to match the data hyperparameters. + + :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not + just .wav), either the waveform as a numpy array of floats. + :param source_sr: if passing an audio waveform, the sampling rate of the waveform before + preprocessing. After preprocessing, the waveform's sampling rate will match the data + hyperparameters. If passing a filepath, the sampling rate will be automatically detected and + this argument will be ignored. + """ + # Load the wav from disk if needed + if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + wav, source_sr = librosa.load(fpath_or_wav, sr=None) + else: + wav = fpath_or_wav + + # Resample the wav if needed + if source_sr is not None and source_sr != sampling_rate: + wav = librosa.resample(wav, source_sr, sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences + wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) + wav = trim_long_silences(wav) + + return wav + + +def wav_to_mel_spectrogram(wav): + """ + Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. + Note: this not a log-mel spectrogram. 
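+    With the mel parameters used here (16 kHz audio, 25 ms windows, 10 ms hop, 40 mel
+    channels) the returned array has shape (n_frames, mel_n_channels), i.e. roughly
+    100 frames per second of audio.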
+ """ + frames = librosa.feature.melspectrogram( + y=wav, + sr=sampling_rate, + n_fft=int(sampling_rate * mel_window_length / 1000), + hop_length=int(sampling_rate * mel_window_step / 1000), + n_mels=mel_n_channels + ) + return frames.astype(np.float32).T + + +def trim_long_silences(wav): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. + + :param wav: the raw waveform as a numpy array of floats + :return: the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask == True] + + +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) + if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): + return wav + return wav * (10 ** (dBFS_change / 20)) diff --git a/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt b/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt new file mode 100644 index 0000000000000000000000000000000000000000..662d22b686114b4b6124330a688007d9495d22c8 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca +size 17090379 diff --git a/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py b/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..e45430c7d03d160dc64d450c1af81180f419eb51 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py @@ -0,0 +1,40 @@ +from speaker_encoder import inference as encoder +from multiprocessing.pool import Pool +from functools import partial +from pathlib import Path +# from utils import logmmse +# from tqdm import tqdm +# import numpy as np +# import librosa + + +def embed_utterance(fpaths, encoder_model_fpath): + if not encoder.is_loaded(): + 
encoder.load_model(encoder_model_fpath) + + # Compute the speaker embedding of the utterance + wav_fpath, embed_fpath = fpaths + wav = np.load(wav_fpath) + wav = encoder.preprocess_wav(wav) + embed = encoder.embed_utterance(wav) + np.save(embed_fpath, embed, allow_pickle=False) + + +def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int): + + wav_dir = outdir_root.joinpath("audio") + metadata_fpath = synthesizer_root.joinpath("train.txt") + assert wav_dir.exists() and metadata_fpath.exists() + embed_dir = synthesizer_root.joinpath("embeds") + embed_dir.mkdir(exist_ok=True) + + # Gather the input wave filepath and the target output embed filepath + with metadata_fpath.open("r") as metadata_file: + metadata = [line.split("|") for line in metadata_file] + fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] + + # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. + # Embed the utterances in separate threads + func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) + job = Pool(n_processes).imap(func, fpaths) + list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/config.py b/dreamvoice/train_utils/src/speaker_encoder/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d12228c81152487da24a6090e5a736f9de0755b0 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/config.py @@ -0,0 +1,45 @@ +librispeech_datasets = { + "train": { + "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], + "other": ["LibriSpeech/train-other-500"] + }, + "test": { + "clean": ["LibriSpeech/test-clean"], + "other": ["LibriSpeech/test-other"] + }, + "dev": { + "clean": ["LibriSpeech/dev-clean"], + "other": ["LibriSpeech/dev-other"] + }, +} +libritts_datasets = { + "train": { + "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], + "other": ["LibriTTS/train-other-500"] + }, + "test": { + "clean": ["LibriTTS/test-clean"], + "other": ["LibriTTS/test-other"] + }, + "dev": { + "clean": ["LibriTTS/dev-clean"], + "other": ["LibriTTS/dev-other"] + }, +} +voxceleb_datasets = { + "voxceleb1" : { + "train": ["VoxCeleb1/wav"], + "test": ["VoxCeleb1/test_wav"] + }, + "voxceleb2" : { + "train": ["VoxCeleb2/dev/aac"], + "test": ["VoxCeleb2/test_wav"] + } +} + +other_datasets = [ + "LJSpeech-1.1", + "VCTK-Corpus/wav48", +] + +anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..740f750a9746e5ace34f1bf875d9ac07677e1ed6 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py @@ -0,0 +1,2 @@ +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py new file mode 100644 index 0000000000000000000000000000000000000000..7e5cf738d3ca5214034ce3babdedf6eaea64c469 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py @@ -0,0 +1,37 @@ +import random + +class 
RandomCycler: + """ + Creates an internal copy of a sequence and allows access to its items in a constrained random + order. For a source sequence of n items and one or several consecutive queries of a total + of m items, the following guarantees hold (one implies the other): + - Each item will be returned between m // n and ((m - 1) // n) + 1 times. + - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. + """ + + def __init__(self, source): + if len(source) == 0: + raise Exception("Can't create RandomCycler from an empty collection") + self.all_items = list(source) + self.next_items = [] + + def sample(self, count: int): + shuffle = lambda l: random.sample(l, len(l)) + + out = [] + while count > 0: + if count >= len(self.all_items): + out.extend(shuffle(list(self.all_items))) + count -= len(self.all_items) + continue + n = min(count, len(self.next_items)) + out.extend(self.next_items[:n]) + count -= n + self.next_items = self.next_items[n:] + if len(self.next_items) == 0: + self.next_items = shuffle(list(self.all_items)) + return out + + def __next__(self): + return self.sample(1)[0] + diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..cb320b211f0de5b3a6fbb83380d8a8b9677151b2 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py @@ -0,0 +1,40 @@ +from speaker_encoder.data_objects.random_cycler import RandomCycler +from speaker_encoder.data_objects.utterance import Utterance +from pathlib import Path + +# Contains the set of utterances of a single speaker +class Speaker: + def __init__(self, root: Path): + self.root = root + self.name = root.name + self.utterances = None + self.utterance_cycler = None + + def _load_utterances(self): + with self.root.joinpath("_sources.txt").open("r") as sources_file: + sources = [l.split(",") for l in sources_file] + sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} + self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] + self.utterance_cycler = RandomCycler(self.utterances) + + def random_partial(self, count, n_frames): + """ + Samples a batch of unique partial utterances from the disk in a way that all + utterances come up at least once every two cycles and in a random order every time. + + :param count: The number of partial utterances to sample from the set of utterances from + that speaker. Utterances are guaranteed not to be repeated if is not larger than + the number of utterances available. + :param n_frames: The number of frames in the partial utterance. + :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, + frames are the frames of the partial utterances and range is the range of the partial + utterance with regard to the complete utterance. 
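+        For example (illustrative values), random_partial(count=4, n_frames=160) returns four
+        tuples (utterance, frames, (start, end)), where each frames array has shape
+        (n_frames, mel_n_channels).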
+ """ + if self.utterances is None: + self._load_utterances() + + utterances = self.utterance_cycler.sample(count) + + a = [(u,) + u.random_partial(n_frames) for u in utterances] + + return a diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..d2dd5493a599e74cea594510af94015464072cb3 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py @@ -0,0 +1,12 @@ +import numpy as np +from typing import List +from speaker_encoder.data_objects.speaker import Speaker + +class SpeakerBatch: + def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): + self.speakers = speakers + self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} + + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with + # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) + self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..be4568923a21e8f28a229899e137d0186e0b1250 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py @@ -0,0 +1,56 @@ +from speaker_encoder.data_objects.random_cycler import RandomCycler +from speaker_encoder.data_objects.speaker_batch import SpeakerBatch +from speaker_encoder.data_objects.speaker import Speaker +from speaker_encoder.params_data import partials_n_frames +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +# TODO: improve with a pool of speakers for data efficiency + +class SpeakerVerificationDataset(Dataset): + def __init__(self, datasets_root: Path): + self.root = datasets_root + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + if len(speaker_dirs) == 0: + raise Exception("No speakers found. 
Make sure you are pointing to the directory " + "containing all preprocessed speaker directories.") + self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] + self.speaker_cycler = RandomCycler(self.speakers) + + def __len__(self): + return int(1e10) + + def __getitem__(self, index): + return next(self.speaker_cycler) + + def get_logs(self): + log_string = "" + for log_fpath in self.root.glob("*.txt"): + with log_fpath.open("r") as log_file: + log_string += "".join(log_file.readlines()) + return log_string + + +class SpeakerVerificationDataLoader(DataLoader): + def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, + batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, + worker_init_fn=None): + self.utterances_per_speaker = utterances_per_speaker + + super().__init__( + dataset=dataset, + batch_size=speakers_per_batch, + shuffle=False, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=self.collate, + pin_memory=pin_memory, + drop_last=False, + timeout=timeout, + worker_init_fn=worker_init_fn + ) + + def collate(self, speakers): + return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) + \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py new file mode 100644 index 0000000000000000000000000000000000000000..ff3185ec781eaf5be2a58d61c22b32586d366126 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py @@ -0,0 +1,26 @@ +import numpy as np + + +class Utterance: + def __init__(self, frames_fpath, wave_fpath): + self.frames_fpath = frames_fpath + self.wave_fpath = wave_fpath + + def get_frames(self): + return np.load(self.frames_fpath) + + def random_partial(self, n_frames): + """ + Crops the frames into a partial utterance of n_frames + + :param n_frames: The number of frames of the partial utterance + :return: the partial utterance frames and a tuple indicating the start and end of the + partial utterance in the complete utterance. + """ + frames = self.get_frames() + if frames.shape[0] == n_frames: + start = 0 + else: + start = np.random.randint(0, frames.shape[0] - n_frames) + end = start + n_frames + return frames[start:end], (start, end) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/hparams.py b/dreamvoice/train_utils/src/speaker_encoder/hparams.py new file mode 100644 index 0000000000000000000000000000000000000000..ac64bcc3bd9ec490e988ac894de93921ba20f607 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/hparams.py @@ -0,0 +1,31 @@ +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. 
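+# Rough intuition: with the 30 ms VAD window above, silence gaps of up to about
+# 6 * 30 ms = 180 ms between voiced regions are bridged and kept (via the binary dilation
+# in audio.py), while the 8-window moving average smooths speech/non-speech decisions over
+# roughly 240 ms of context.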
+vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/inference.py b/dreamvoice/train_utils/src/speaker_encoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c5662912a7cc0eb8818732d0b1d233ba1b195ec7 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/inference.py @@ -0,0 +1,177 @@ +from speaker_encoder.params_data import * +from speaker_encoder.model import SpeakerEncoder +from speaker_encoder.audio import preprocess_wav # We want to expose this function from here +from matplotlib import cm +from speaker_encoder import audio +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device=None): + """ + Loads the model in memory. If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath) + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +def embed_frames_batch(frames_batch): + """ + Computes embeddings for a batch of mel spectrogram. + + :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape + (batch_size, n_frames, n_channels) + :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) + """ + if _model is None: + raise Exception("Model was not loaded. Call load_model() before inference.") + + frames = torch.from_numpy(frames_batch).to(_device) + embed = _model.forward(frames).detach().cpu().numpy() + return embed + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 
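+
+    Worked example (illustrative, for 16 kHz audio with a 10 ms hop): a 1.5 s utterance is
+    24000 samples, i.e. about 150 frames, so with the default partial_utterance_n_frames=160
+    a single mel slice [0, 160) and wav slice [0, 25600) are returned, and the caller pads
+    the waveform with 0.1 s of zeros to cover that last slice.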
+ + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. + """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. + :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. 
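+
+    Example (an illustrative sketch; the file paths are placeholders):
+
+        load_model(Path("speaker_encoder/ckpt/pretrained_bak_5805000.pt"))
+        wav = preprocess_wav("utterance.wav")
+        embed = embed_utterance(wav)  # float32, shape (model_embedding_size,), unit L2 norm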
+ """ + # Process the entire utterance if not using partials + if not using_partials: + frames = audio.wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = audio.wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/train_utils/src/speaker_encoder/model.py b/dreamvoice/train_utils/src/speaker_encoder/model.py new file mode 100644 index 0000000000000000000000000000000000000000..4493a98b217e4bd082940cbe4d31b8169f18b5d9 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/model.py @@ -0,0 +1,135 @@ +from speaker_encoder.params_model import * +from speaker_encoder.params_data import * +from scipy.interpolate import interp1d +from sklearn.metrics import roc_curve +from torch.nn.utils import clip_grad_norm_ +from scipy.optimize import brentq +from torch import nn +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, device, loss_device): + super().__init__() + self.loss_device = loss_device + + # Network defition + self.lstm = nn.LSTM(input_size=mel_n_channels, # 40 + hidden_size=model_hidden_size, # 256 + num_layers=model_num_layers, # 3 + batch_first=True).to(device) + self.linear = nn.Linear(in_features=model_hidden_size, + out_features=model_embedding_size).to(device) + self.relu = torch.nn.ReLU().to(device) + + # Cosine similarity scaling (with fixed initial parameter values) + self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) + self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) + + # Loss + self.loss_fn = nn.CrossEntropyLoss().to(loss_device) + + def do_gradient_ops(self): + # Gradient scale + self.similarity_weight.grad *= 0.01 + self.similarity_bias.grad *= 0.01 + + # Gradient clipping + clip_grad_norm_(self.parameters(), 3, norm_type=2) + + def forward(self, utterances, hidden_init=None): + """ + Computes the embeddings of a batch of utterance spectrograms. + + :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape + (batch_size, n_frames, n_channels) + :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, + batch_size, hidden_size). Will default to a tensor of zeros if None. 
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size) + """ + # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state + # and the final cell state. + out, (hidden, cell) = self.lstm(utterances, hidden_init) + + # We take only the hidden state of the last layer + embeds_raw = self.relu(self.linear(hidden[-1])) + + # L2-normalize it + embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + return embeds + + def similarity_matrix(self, embeds): + """ + Computes the similarity matrix according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the similarity matrix as a tensor of shape (speakers_per_batch, + utterances_per_speaker, speakers_per_batch) + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation + centroids_incl = torch.mean(embeds, dim=1, keepdim=True) + centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) + + # Exclusive centroids (1 per utterance) + centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) + + # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot + # product of these vectors (which is just an element-wise multiplication reduced by a sum). + # We vectorize the computation for efficiency. + sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, + speakers_per_batch).to(self.loss_device) + mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) + for j in range(speakers_per_batch): + mask = np.where(mask_matrix[j])[0] + sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) + sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) + + ## Even more vectorized version (slower maybe because of transpose) + # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker + # ).to(self.loss_device) + # eye = np.eye(speakers_per_batch, dtype=np.int) + # mask = np.where(1 - eye) + # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) + # mask = np.where(eye) + # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) + # sim_matrix2 = sim_matrix2.transpose(1, 2) + + sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias + return sim_matrix + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. 
+ """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, + speakers_per_batch)) + ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) + target = torch.from_numpy(ground_truth).long().to(self.loss_device) + loss = self.loss_fn(sim_matrix, target) + + # EER (not backpropagated) + with torch.no_grad(): + inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] + labels = np.array([inv_argmax(i) for i in ground_truth]) + preds = sim_matrix.detach().cpu().numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/params_data.py b/dreamvoice/train_utils/src/speaker_encoder/params_data.py new file mode 100644 index 0000000000000000000000000000000000000000..676e6dc197faf01648de7a830140172d5594b999 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/params_data.py @@ -0,0 +1,29 @@ + +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms +# Number of spectrogram frames at inference +inference_n_frames = 80 # 800 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. +vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + diff --git a/dreamvoice/train_utils/src/speaker_encoder/params_model.py b/dreamvoice/train_utils/src/speaker_encoder/params_model.py new file mode 100644 index 0000000000000000000000000000000000000000..32731f295b3b26e9e38bb9f9047d5c784649e127 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/params_model.py @@ -0,0 +1,11 @@ + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 + + +## Training parameters +learning_rate_init = 1e-4 +speakers_per_batch = 64 +utterances_per_speaker = 10 diff --git a/dreamvoice/train_utils/src/speaker_encoder/preprocess.py b/dreamvoice/train_utils/src/speaker_encoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..ecb9041551270629a27baab6d1f1525e380c5378 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/preprocess.py @@ -0,0 +1,285 @@ +from multiprocess.pool import ThreadPool +from speaker_encoder.params_data import * +from speaker_encoder.config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from speaker_encoder import audio +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. 
+ """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from speaker_encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." % dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
+ if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +# Function to preprocess utterances for one speaker +def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
+ # if sources_fpath.exists(): + # try: + # with sources_fpath.open("r") as sources_file: + # existing_fnames = {line.split(",")[0] for line in sources_file} + # except: + # existing_fnames = {} + # else: + # existing_fnames = {} + existing_fnames = {} + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + # logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + return len(wav) + +def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + # from multiprocessing import Pool, cpu_count + from pathos.multiprocessing import ProcessingPool as Pool + # Function to preprocess utterances for one speaker + def __preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + existing_fnames = {} + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + wav_lens = [] + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + # logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + wav_lens.append(len(wav)) + sources_file.close() + return wav_lens + + print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) + # Process the utterances for each speaker + # with ThreadPool(8) as pool: + # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + # unit="speakers")) + pool = Pool(processes=20) + for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1): + for wav_len in wav_lens: + logger.add_sample(duration=wav_len / sampling_rate) + print(f'{i}/{len(speaker_dirs)} \r') + + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + # keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + # nationality.lower() in anglophone_nationalites] + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items()] + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/train_utils/src/speaker_encoder/train.py b/dreamvoice/train_utils/src/speaker_encoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..1c2e7fa1b08b75de40adc0e05fa3b104cb02660b --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/train.py @@ -0,0 +1,125 @@ +from speaker_encoder.visualizations import Visualizations +from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset +from speaker_encoder.params_model import * +from speaker_encoder.model import SpeakerEncoder +from utils.profiler import Profiler +from pathlib import Path +import torch + +def sync(device: torch.device): + # FIXME + return + # For correct profiling (cuda operations are async) + if device.type == "cuda": + torch.cuda.synchronize(device) + +def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, + backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, + no_visdom: bool): + # Create a dataset and a dataloader + dataset = SpeakerVerificationDataset(clean_data_root) + loader = SpeakerVerificationDataLoader( + dataset, + speakers_per_batch, # 64 + utterances_per_speaker, # 10 + num_workers=8, + ) + + # Setup the device on which to run the forward pass and the loss. These can be different, + # because the forward pass is faster on the GPU whereas the loss is often (depending on your + # hyperparameters) faster on the CPU. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # FIXME: currently, the gradient is None if loss_device is cuda + loss_device = torch.device("cpu") + + # Create the model and the optimizer + model = SpeakerEncoder(device, loss_device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) + init_step = 1 + + # Configure file path for the model + state_fpath = models_dir.joinpath(run_id + ".pt") + backup_dir = models_dir.joinpath(run_id + "_backups") + + # Load any existing model + if not force_restart: + if state_fpath.exists(): + print("Found existing model \"%s\", loading it and resuming training." % run_id) + checkpoint = torch.load(state_fpath) + init_step = checkpoint["step"] + model.load_state_dict(checkpoint["model_state"]) + optimizer.load_state_dict(checkpoint["optimizer_state"]) + optimizer.param_groups[0]["lr"] = learning_rate_init + else: + print("No model \"%s\" found, starting training from scratch." 
% run_id) + else: + print("Starting the training from scratch.") + model.train() + + # Initialize the visualization environment + vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) + vis.log_dataset(dataset) + vis.log_params() + device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") + vis.log_implementation({"Device": device_name}) + + # Training loop + profiler = Profiler(summarize_every=10, disabled=False) + for step, speaker_batch in enumerate(loader, init_step): + profiler.tick("Blocking, waiting for batch (threaded)") + + # Forward pass + inputs = torch.from_numpy(speaker_batch.data).to(device) + sync(device) + profiler.tick("Data to %s" % device) + embeds = model(inputs) + sync(device) + profiler.tick("Forward pass") + embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) + loss, eer = model.loss(embeds_loss) + sync(loss_device) + profiler.tick("Loss") + + # Backward pass + model.zero_grad() + loss.backward() + profiler.tick("Backward pass") + model.do_gradient_ops() + optimizer.step() + profiler.tick("Parameter update") + + # Update visualizations + # learning_rate = optimizer.param_groups[0]["lr"] + vis.update(loss.item(), eer, step) + + # Draw projections and save them to the backup folder + if umap_every != 0 and step % umap_every == 0: + print("Drawing and saving projections (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + embeds = embeds.detach().cpu().numpy() + vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) + vis.save() + + # Overwrite the latest version of the model + if save_every != 0 and step % save_every == 0: + print("Saving the model (step %d)" % step) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, state_fpath) + + # Make a backup + if backup_every != 0 and step % backup_every == 0: + print("Making a backup (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, backup_fpath) + + profiler.tick("Extras (visualizations, saving)") + \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/visualizations.py b/dreamvoice/train_utils/src/speaker_encoder/visualizations.py new file mode 100644 index 0000000000000000000000000000000000000000..5d2c4c073c933d38970a83798f2d0ee37a85c48e --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/visualizations.py @@ -0,0 +1,178 @@ +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from datetime import datetime +from time import perf_counter as timer +import matplotlib.pyplot as plt +import numpy as np +# import webbrowser +import visdom +import umap + +colormap = np.array([ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], +], dtype=np.float) / 255 + + +class Visualizations: + def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): + # Tracking data + self.last_update_timestamp = timer() + self.update_every = update_every + self.step_times = [] + self.losses = [] + self.eers = [] + 
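# --- Illustrative annotation (not part of the patch): the training loop above reshapes the
# embeddings to (speakers_per_batch, utterances_per_speaker, -1) and calls model.loss(). A
# toy-sized sketch of the target construction used inside that GE2E softmax loss (training
# itself uses 64 speakers x 10 utterances, per params_model.py):
import numpy as np
import torch
import torch.nn.functional as F

speakers_per_batch, utterances_per_speaker = 4, 3
sim_matrix = torch.randn(speakers_per_batch, utterances_per_speaker, speakers_per_batch)
sim_flat = sim_matrix.reshape(speakers_per_batch * utterances_per_speaker, speakers_per_batch)
ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)   # [0,0,0,1,1,1,...]
loss = F.cross_entropy(sim_flat, torch.from_numpy(ground_truth).long())           # each row vs. its true speaker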
print("Updating the visualizations every %d steps." % update_every) + + # If visdom is disabled TODO: use a better paradigm for that + self.disabled = disabled + if self.disabled: + return + + # Set the environment name + now = str(datetime.now().strftime("%d-%m %Hh%M")) + if env_name is None: + self.env_name = now + else: + self.env_name = "%s (%s)" % (env_name, now) + + # Connect to visdom and open the corresponding window in the browser + try: + self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) + except ConnectionError: + raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " + "start it.") + # webbrowser.open("http://localhost:8097/env/" + self.env_name) + + # Create the windows + self.loss_win = None + self.eer_win = None + # self.lr_win = None + self.implementation_win = None + self.projection_win = None + self.implementation_string = "" + + def log_params(self): + if self.disabled: + return + from speaker_encoder import params_data + from speaker_encoder import params_model + param_string = "Model parameters:
" + for param_name in (p for p in dir(params_model) if not p.startswith("__")): + value = getattr(params_model, param_name) + param_string += "\t%s: %s
" % (param_name, value) + param_string += "Data parameters:
" + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + param_string += "\t%s: %s
" % (param_name, value) + self.vis.text(param_string, opts={"title": "Parameters"}) + + def log_dataset(self, dataset: SpeakerVerificationDataset): + if self.disabled: + return + dataset_string = "" + dataset_string += "Speakers: %s\n" % len(dataset.speakers) + dataset_string += "\n" + dataset.get_logs() + dataset_string = dataset_string.replace("\n", "
") + self.vis.text(dataset_string, opts={"title": "Dataset"}) + + def log_implementation(self, params): + if self.disabled: + return + implementation_string = "" + for param, value in params.items(): + implementation_string += "%s: %s\n" % (param, value) + implementation_string = implementation_string.replace("\n", "
") + self.implementation_string = implementation_string + self.implementation_win = self.vis.text( + implementation_string, + opts={"title": "Training implementation"} + ) + + def update(self, loss, eer, step): + # Update the tracking data + now = timer() + self.step_times.append(1000 * (now - self.last_update_timestamp)) + self.last_update_timestamp = now + self.losses.append(loss) + self.eers.append(eer) + print(".", end="") + + # Update the plots every steps + if step % self.update_every != 0: + return + time_string = "Step time: mean: %5dms std: %5dms" % \ + (int(np.mean(self.step_times)), int(np.std(self.step_times))) + print("\nStep %6d Loss: %.4f EER: %.4f %s" % + (step, np.mean(self.losses), np.mean(self.eers), time_string)) + if not self.disabled: + self.loss_win = self.vis.line( + [np.mean(self.losses)], + [step], + win=self.loss_win, + update="append" if self.loss_win else None, + opts=dict( + legend=["Avg. loss"], + xlabel="Step", + ylabel="Loss", + title="Loss", + ) + ) + self.eer_win = self.vis.line( + [np.mean(self.eers)], + [step], + win=self.eer_win, + update="append" if self.eer_win else None, + opts=dict( + legend=["Avg. EER"], + xlabel="Step", + ylabel="EER", + title="Equal error rate" + ) + ) + if self.implementation_win is not None: + self.vis.text( + self.implementation_string + ("%s" % time_string), + win=self.implementation_win, + opts={"title": "Training implementation"}, + ) + + # Reset the tracking + self.losses.clear() + self.eers.clear() + self.step_times.clear() + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, + max_speakers=10): + max_speakers = min(max_speakers, len(colormap)) + embeds = embeds[:max_speakers * utterances_per_speaker] + + n_speakers = len(embeds) // utterances_per_speaker + ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) + colors = [colormap[i] for i in ground_truth] + + reducer = umap.UMAP() + projected = reducer.fit_transform(embeds) + plt.scatter(projected[:, 0], projected[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection (step %d)" % step) + if not self.disabled: + self.projection_win = self.vis.matplot(plt, win=self.projection_win) + if out_fpath is not None: + plt.savefig(out_fpath) + plt.clf() + + def save(self): + if not self.disabled: + self.vis.save([self.env_name]) + \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py b/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3f69320ec75315ff9ce2efa158a53b1a823edd2e --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py @@ -0,0 +1,173 @@ +from speaker_encoder.hparams import * +from speaker_encoder import audio +from pathlib import Path +from typing import Union, List +from torch import nn +from time import perf_counter as timer +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, weights_fpath, device: Union[str, torch.device]=None, verbose=True): + """ + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). + If None, defaults to cuda if it is available on your machine, otherwise the model will + run on cpu. Outputs are always returned on the cpu, as numpy arrays. 
+ """ + super().__init__() + + # Define the network + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + # Get the target device + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + device = torch.device(device) + self.device = device + + # Load the pretrained model'speaker weights + # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") + # if not weights_fpath.exists(): + # raise Exception("Couldn't find the voice encoder pretrained model at %s." % + # weights_fpath) + + start = timer() + checkpoint = torch.load(weights_fpath, map_location="cpu") + + self.load_state_dict(checkpoint["model_state"], strict=False) + self.to(device) + + if verbose: + print("Loaded the voice encoder model on %s in %.2f seconds." % + (device.type, timer() - start)) + + def forward(self, mels: torch.FloatTensor): + """ + Computes the embeddings of a batch of utterance spectrograms. + :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape + (batch_size, n_frames, n_channels) + :return: the embeddings as a float 32 tensor of shape (batch_size, embedding_size). + Embeddings are positive and L2-normed, thus they lay in the range [0, 1]. + """ + # Pass the input through the LSTM layers and retrieve the final hidden state of the last + # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings. + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + @staticmethod + def compute_partial_slices(n_samples: int, rate, min_coverage): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to + obtain partial utterances of each. Both the waveform and the + mel spectrogram slices are returned, so as to make each partial utterance waveform + correspond to its spectrogram. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wav_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param rate: how many partial utterances should occur per second. Partial utterances must + cover the span of the entire utterance, thus the rate should not be lower than the inverse + of the duration of a partial utterance. By default, partial utterances are 1.6s long and + the minimum rate is thus 0.625. + :param min_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered by zero-padding the audio. Otherwise, + it will be discarded. If there aren't enough frames for one partial utterance, + this parameter is ignored so that the function always returns at least one slice. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. 
+ """ + assert 0 < min_coverage <= 1 + + # Compute how many frames separate two partial utterances + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) + assert 0 < frame_step, "The rate is too high" + assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \ + (sampling_rate / (samples_per_frame * partials_n_frames)) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partials_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partials_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + """ + Computes an embedding for a single utterance. The utterance is divided in partial + utterances and an embedding is computed for each. The complete utterance embedding is the + L2-normed average embedding of the partial utterances. + + TODO: independent batched version of this function + + :param wav: a preprocessed utterance waveform as a numpy array of float32 + :param return_partials: if True, the partial embeddings will also be returned along with + the wav slices corresponding to each partial utterance. + :param rate: how many partial utterances should occur per second. Partial utterances must + cover the span of the entire utterance, thus the rate should not be lower than the inverse + of the duration of a partial utterance. By default, partial utterances are 1.6s long and + the minimum rate is thus 0.625. + :param min_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered by zero-padding the audio. Otherwise, + it will be discarded. If there aren't enough frames for one partial utterance, + this parameter is ignored so that the function always returns at least one slice. + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. + """ + # Compute where to split the utterance into partials and pad the waveform with zeros if + # the partial utterances cover a larger range. 
+ wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage) + max_wave_length = wav_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials and forward them through the model + mel = audio.wav_to_mel_spectrogram(wav) + mels = np.array([mel[s] for s in mel_slices]) + with torch.no_grad(): + mels = torch.from_numpy(mels).to(self.device) + partial_embeds = self(mels).cpu().numpy() + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wav_slices + return embed + + def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + """ + Compute the embedding of a collection of wavs (presumably from the same speaker) by + averaging their embedding and L2-normalizing it. + + :param wavs: list of wavs a numpy arrays of float32. + :param kwargs: extra arguments to embed_utterance() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + """ + raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \ + for wav in wavs], axis=0) + return raw_embed / np.linalg.norm(raw_embed, 2) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/spk_ext.py b/dreamvoice/train_utils/src/spk_ext.py index 6f20b75c46bb518143d9d5acd3481e84c71e0e47..5f348d653b2a945493ede4a2d7e55f5fd1d62288 100644 --- a/dreamvoice/train_utils/src/spk_ext.py +++ b/dreamvoice/train_utils/src/spk_ext.py @@ -46,4 +46,69 @@ def se_extractor(audio_path, vc): gs.append(g.detach()) gs = torch.stack(gs).mean(0) - return gs.cpu() \ No newline at end of file + return gs.cpu() + + +def process_audio_folder(input_folder, output_folder, model, device): + """ + Process all audio files in a folder and its subfolders, + save the extracted features as .pt files in the output folder with the same structure. + + Args: + input_folder (str): Path to the input folder containing audio files. + output_folder (str): Path to the output folder to save .pt files. + model: Pre-trained model for feature extraction. + device: Torch device (e.g., 'cpu' or 'cuda'). 
+ """ + # Collect all audio file paths + audio_files = [] + for root, _, files in os.walk(input_folder): + for file in files: + if file.endswith(('.wav', '.mp3', '.flac')): # Adjust for the audio formats you want to process + audio_files.append(os.path.join(root, file)) + + # Process each audio file with tqdm for progress + for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"): + # Construct output path + relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder) + output_dir = os.path.join(output_folder, relative_path) + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt') + + # Check if the .pt file already exists + if os.path.exists(output_path): + # print(f"Skipped (already exists): {output_path}") + continue # Skip processing this file + # Extract features + target_se = se_extractor(audio_path, model).to(device) + # Save the feature as .pt + torch.save(target_se, output_path) + # print(f"Processed and saved: {output_path}") + + +if __name__ == '__main__': + ckpt_converter = 'checkpoints_v2/converter' + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device) + model.load_ckpt(f'{ckpt_converter}/checkpoint.pth') + + # audio_path = 'debug.wav' + # target_se = se_extractor(audio_path, model).to(device) + + # source_path = 'source.wav' + # source_se = se_extractor(source_path, model).to(device) + + # encode_message = "@MyShell" + # model.convert( + # audio_src_path=source_path, + # src_se=source_se, + # tgt_se=target_se, + # output_path='output.wav', + # message=encode_message) + # input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/' + # output_folder = 'spk/VCTK-Corpus/' + # process_audio_folder(input_folder, output_folder, model, device) + + input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360' + output_folder = 'spk/LibriTTS-R/train-clean-360/' + process_audio_folder(input_folder, output_folder, model, device) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/train_freevc.py b/dreamvoice/train_utils/src/train_freevc.py new file mode 100644 index 0000000000000000000000000000000000000000..e8e1fac17288da07a1f2bcb6f42f6f6c7b2e0b81 --- /dev/null +++ b/dreamvoice/train_utils/src/train_freevc.py @@ -0,0 +1,214 @@ +import yaml +import random +import argparse +import os +import time +from tqdm import tqdm +from pathlib import Path + +import torch +from torch.utils.data import DataLoader + +from accelerate import Accelerator +from diffusers import DDIMScheduler + +from configs.plugin import get_params +from model.p2e_cross import P2E_Cross +from modules.speaker_encoder.encoder import inference as spk_encoder +from transformers import T5Tokenizer, T5EncoderModel, AutoModel +from inference_freevc import eval_plugin +from dataset.dreamvc import DreamData +# from vc_wrapper import load_diffvc_models +from freevc_wrapper import get_freevc_models +from utils import minmax_norm_diff, reverse_minmax_norm_diff, scale_shift + +parser = argparse.ArgumentParser() + +# config settings +parser.add_argument('--config-name', type=str, default='Plugin_freevc') +parser.add_argument('--vc-unet-path', type=str, default='freevc') +parser.add_argument('--speaker-path', type=str, default='speaker_encoder/ckpt/pretrained_bak_5805000.pt') + + +# training settings +parser.add_argument("--amp", type=str, default='fp16') 
+parser.add_argument('--epochs', type=int, default=200) +parser.add_argument('--batch-size', type=int, default=32) +parser.add_argument('--num-workers', type=int, default=8) +parser.add_argument('--num-threads', type=int, default=1) +parser.add_argument('--save-every', type=int, default=10) + +# log and random seed +parser.add_argument('--random-seed', type=int, default=2023) +parser.add_argument('--log-step', type=int, default=200) +parser.add_argument('--log-dir', type=str, default='../logs/') +parser.add_argument('--save-dir', type=str, default='../ckpts/') + +args = parser.parse_args() +params = get_params(args.config_name) +args.log_dir = args.log_dir + args.config_name + '/' + +with open('model/p2e_cross.yaml', 'r') as fp: + config = yaml.safe_load(fp) + +if os.path.exists(args.save_dir + args.config_name) is False: + os.makedirs(args.save_dir + args.config_name) + +if os.path.exists(args.log_dir) is False: + os.makedirs(args.log_dir) + +if __name__ == '__main__': + # Fix the random seed + random.seed(args.random_seed) + torch.manual_seed(args.random_seed) + + # Set device + torch.set_num_threads(args.num_threads) + if torch.cuda.is_available(): + args.device = 'cuda' + torch.cuda.manual_seed(args.random_seed) + torch.cuda.manual_seed_all(args.random_seed) + torch.backends.cuda.matmul.allow_tf32 = True + if torch.backends.cudnn.is_available(): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.allow_tf32 = True + torch.backends.cudnn.benchmark = False + else: + args.device = 'cpu' + + train_set = DreamData(data_dir='../prepare_freevc/spk/', meta_dir='../prepare/plugin_meta.csv', + subset='train', prompt_dir='../prepare/prompts.csv',) + train_loader = DataLoader(train_set, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True) + + # use accelerator for multi-gpu training + accelerator = Accelerator(mixed_precision=args.amp) + + # vc_unet, hifigan, _, logmel, vc_scheduler = load_diffvc_models(args.vc_unet_path, + # args.vocoder_path, + # args.speaker_path, + # args.vc_config_path, + # accelerator.device) + freevc_24, cmodel, _, hps = get_freevc_models(args.vc_unet_path, args.speaker_path, accelerator.device) + # speaker + # spk_encoder.load_model(Path(args.speaker_path), accelerator.device) + + # text encoder + tokenizer = T5Tokenizer.from_pretrained(params.text_encoder.model) + text_encoder = T5EncoderModel.from_pretrained(params.text_encoder.model).to(accelerator.device) + text_encoder.eval() + + # main U-Net + model = P2E_Cross(config['diffwrap']).to(accelerator.device) + model.load_state_dict(torch.load('../ckpts/Plugin_freevc/49.pt')['model']) + + total_params = sum([param.nelement() for param in model.parameters()]) + print("Number of parameter: %.2fM" % (total_params / 1e6)) + + if params.diff.v_prediction: + print('v prediction') + noise_scheduler = DDIMScheduler(num_train_timesteps=params.diff.num_train_steps, + beta_start=params.diff.beta_start, beta_end=params.diff.beta_end, + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + else: + print('noise prediction') + noise_scheduler = DDIMScheduler(num_train_timesteps=args.num_train_steps, + beta_start=args.beta_start, beta_end=args.beta_end, + clip_sample=False, + prediction_type='epsilon') + + optimizer = torch.optim.AdamW(model.parameters(), + lr=params.opt.learning_rate, + betas=(params.opt.beta1, params.opt.beta2), + weight_decay=params.opt.weight_decay, + eps=params.opt.adam_epsilon, + ) + loss_func = torch.nn.MSELoss() + + 
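# --- Illustrative annotation (not part of the patch): the DDIM scheduler configured above is
# used in the loop below via add_noise() and get_velocity(). A minimal sketch, with illustrative
# beta values, of what the v-prediction regression target is:
import torch
from diffusers import DDIMScheduler

sched = DDIMScheduler(num_train_timesteps=1000, beta_start=1e-4, beta_end=0.02,
                      clip_sample=False, prediction_type='v_prediction')
x0 = torch.randn(2, 256, 1)               # stands in for the scaled speaker embedding
noise = torch.randn_like(x0)
t = torch.tensor([500, 500])

noisy = sched.add_noise(x0, noise, t)     # sqrt(a_t) * x0 + sqrt(1 - a_t) * noise
v = sched.get_velocity(x0, noise, t)      # sqrt(a_t) * noise - sqrt(1 - a_t) * x0
a_t = sched.alphas_cumprod[t].view(-1, 1, 1)
assert torch.allclose(v, a_t.sqrt() * noise - (1 - a_t).sqrt() * x0, atol=1e-6)
# The U-Net is trained with MSE between its prediction and v (or against the raw noise in epsilon mode).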
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + global_step = 0 + losses = 0 + + if accelerator.is_main_process: + eval_plugin(freevc_24, cmodel, [tokenizer, text_encoder], + model, noise_scheduler, (1, 256, 1), + val_meta='../prepare/val_meta.csv', + val_folder='/home/jerry/Projects/Dataset/Speech/vctk_libritts/', + guidance_scale=3.0, guidance_rescale=0.0, + ddim_steps=100, eta=1, random_seed=None, + device=accelerator.device, + epoch='test', save_path=args.log_dir + 'output/', val_num=10) + accelerator.wait_for_everyone() + + for epoch in range(args.epochs): + model.train() + for step, batch in enumerate(tqdm(train_loader)): + spk_embed, prompt = batch + spk_embed = spk_embed.unsqueeze(-1) + + with torch.no_grad(): + text_batch = tokenizer(prompt, + max_length=32, + padding='max_length', truncation=True, return_tensors="pt") + text, text_mask = text_batch.input_ids.to(spk_embed.device), \ + text_batch.attention_mask.to(spk_embed.device) + text = text_encoder(input_ids=text, attention_mask=text_mask)[0] + + spk_embed = scale_shift(spk_embed, 20, -0.035) + # spk_embed = minmax_norm_diff(spk_embed, vmax=0.5, vmin=0.0) + # content_clip = align_seq(content_clip, audio_clip.shape[-1]) + # f0_clip = align_seq(f0_clip, audio_clip.shape[-1]) + + # adding noise + noise = torch.randn(spk_embed.shape).to(accelerator.device) + timesteps = torch.randint(0, params.diff.num_train_steps, (noise.shape[0],), + device=accelerator.device, ).long() + noisy_target = noise_scheduler.add_noise(spk_embed, noise, timesteps) + # v prediction - model output + velocity = noise_scheduler.get_velocity(spk_embed, noise, timesteps) + + # inference + pred = model(noisy_target, timesteps, text, text_mask, train_cfg=True, cfg_prob=0.25) + # backward + if params.diff.v_prediction: + loss = loss_func(pred, velocity) + else: + loss = loss_func(pred, noise) + + accelerator.backward(loss) + optimizer.step() + optimizer.zero_grad() + + global_step += 1 + losses += loss.item() + + if accelerator.is_main_process: + if global_step % args.log_step == 0: + n = open(args.log_dir + 'diff_vc.txt', mode='a') + n.write(time.asctime(time.localtime(time.time()))) + n.write('\n') + n.write('Epoch: [{}][{}] Batch: [{}][{}] Loss: {:.6f}\n'.format( + epoch + 1, args.epochs, step + 1, len(train_loader), losses / args.log_step)) + n.close() + losses = 0.0 + + accelerator.wait_for_everyone() + + if (epoch + 1) % args.save_every == 0: + if accelerator.is_main_process: + eval_plugin(freevc_24, cmodel, [tokenizer, text_encoder], + model, noise_scheduler, (1, 256, 1), + val_meta='../prepare/val_meta.csv', + val_folder='/home/jerry/Projects/Dataset/Speech/vctk_libritts/', + guidance_scale=3, guidance_rescale=0.0, + ddim_steps=50, eta=1, random_seed=2024, + device=accelerator.device, + epoch=epoch, save_path=args.log_dir + 'output/', val_num=10) + + unwrapped_unet = accelerator.unwrap_model(model) + accelerator.save({ + "model": unwrapped_unet.state_dict(), + }, args.save_dir + args.config_name + '/' + str(epoch) + '.pt') diff --git a/dreamvoice/train_utils/src/train.py b/dreamvoice/train_utils/src/train_openvoice.py similarity index 95% rename from dreamvoice/train_utils/src/train.py rename to dreamvoice/train_utils/src/train_openvoice.py index a5fb5ac6226f985925fa2fbaf417fcfdd6782443..0bfd7d2ae77310eb645a2cfdf613ed6ecd6dc9b9 100644 --- a/dreamvoice/train_utils/src/train.py +++ b/dreamvoice/train_utils/src/train_openvoice.py @@ -25,7 +25,7 @@ from utils import minmax_norm_diff, reverse_minmax_norm_diff parser = 
argparse.ArgumentParser() # config settings -parser.add_argument('--config-name', type=str, default='Plugin_base') +parser.add_argument('--config-name', type=str, default='Plugin_freevc') # training settings parser.add_argument("--amp", type=str, default='fp16') @@ -73,7 +73,7 @@ if __name__ == '__main__': else: args.device = 'cpu' - train_set = DreamData(data_dir='../prepare/spk/', meta_dir='../prepare/plugin_meta.csv', + train_set = DreamData(data_dir='../prepare_freevc/spk/', meta_dir='../prepare/plugin_meta.csv', subset='train', prompt_dir='../prepare/prompts.csv',) train_loader = DataLoader(train_set, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True) diff --git a/dreamvoice/train_utils/src/utils.py b/dreamvoice/train_utils/src/utils.py index 0fe1dcba21ce183e2e7c26a711c702ca089813d3..6dc3e7629165253801ceb946e6d9ac80a89a25f2 100644 --- a/dreamvoice/train_utils/src/utils.py +++ b/dreamvoice/train_utils/src/utils.py @@ -15,3 +15,23 @@ def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: floa tensor = (tensor + 1) / 2 tensor = tensor * (vmax - vmin) + vmin return tensor + + +def scale_shift(x, scale, shift): + return (x+shift) * scale + + +def scale_shift_re(x, scale, shift): + return (x/scale) - shift + + +def align_seq(source, target_length, mapping_method='hard'): + source_len = source.shape[1] + if mapping_method == 'hard': + mapping_idx = np.round(np.arange(target_length) * source_len / target_length) + output = source[:, mapping_idx] + else: + # TBD + raise NotImplementedError + + return output \ No newline at end of file diff --git a/freevc_example.py b/freevc_example.py new file mode 100644 index 0000000000000000000000000000000000000000..0228cdaf097068b714c511890a78e1e08196989e --- /dev/null +++ b/freevc_example.py @@ -0,0 +1,25 @@ +import torch +import librosa +import soundfile as sf +from dreamvoice import DreamVoice_Plugin +from dreamvoice.freevc_wrapper import get_freevc_models, convert + +freevc, cmodel, hps = get_freevc_models('ckpts_freevc/', 'dreamvoice/', 'cuda') +device = 'cuda' + +# init dreamvoice +dreamvoice = DreamVoice_Plugin(config='plugin_freevc.yaml', device=device) + +# generate speaker +prompt = "old female's voice, deep and dark" +target_se = dreamvoice.gen_spk(prompt) + +# content source +source_path = 'examples/test1.wav' +audio_clip = librosa.load(source_path, sr=16000)[0] +audio_clip = torch.tensor(audio_clip).unsqueeze(0).to(device) + +content = cmodel(audio_clip).last_hidden_state.transpose(1, 2).to(device) + +output, out_sr = convert(freevc, content, target_se) +sf.write('output.wav', output, out_sr) \ No newline at end of file diff --git a/examples/openvoice_example.py b/openvoice_example.py similarity index 92% rename from examples/openvoice_example.py rename to openvoice_example.py index dccbe4b6688e717b551979409cb778dd856f6a1d..4d20e44399cb7d8c87399bc99622d9965b128295 100644 --- a/examples/openvoice_example.py +++ b/openvoice_example.py @@ -14,12 +14,12 @@ openvoice = ToneColorConverter(f'{ckpt_converter}/config.json', device=device) openvoice.load_ckpt(f'{ckpt_converter}/checkpoint.pth') # generate speaker -prompt = 'rough boy voice, young' +prompt = 'female voice, bright and cute' target_se = dreamvoice.gen_spk(prompt) target_se = target_se.unsqueeze(-1) # content source -source_path = 'examples/test2.wav' +source_path = 'segment_1.mp3' source_se = se_extractor(source_path, openvoice).to(device) # voice conversion