diff --git a/ckpts_freevc/freevc.json b/ckpts_freevc/freevc.json
new file mode 100644
index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1
--- /dev/null
+++ b/ckpts_freevc/freevc.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8960,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8001"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": true
+ }
+}
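For reference, a minimal usage sketch of how this config and the LFS checkpoint below are consumed by `dreamvoice/freevc_wrapper.py` (added later in this change). The relative path, CPU device, and working directory are assumptions, and the `freevc.pth` LFS object must already be fetched:

```python
# Minimal sketch: load the FreeVC synthesizer plus the WavLM content model via the wrapper.
# Assumes the repo root is the working directory and git-lfs has pulled ckpts_freevc/freevc.pth;
# microsoft/wavlm-large is downloaded from the Hugging Face Hub on first run.
from dreamvoice.freevc_wrapper import get_freevc_models

freevc, cmodel, hps = get_freevc_models(path='ckpts_freevc', device='cpu')
print(hps.data.sampling_rate, hps.model.ssl_dim)  # 16000 1024
```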
diff --git a/ckpts_freevc/freevc.pth b/ckpts_freevc/freevc.pth
new file mode 100644
index 0000000000000000000000000000000000000000..976143bef5d846836704a38f7ad57cb0535d40b8
--- /dev/null
+++ b/ckpts_freevc/freevc.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2cc2d047f63b80d1d6780e37611cec11a01d597560393b1fe6118158b3bd47f
+size 472644351
diff --git a/dreamvoice/freevc/.gitattributes b/dreamvoice/freevc/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..717eda91d34e790b2de5140dd1c46748bdddef26
--- /dev/null
+++ b/dreamvoice/freevc/.gitattributes
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/dreamvoice/freevc/.gitignore b/dreamvoice/freevc/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e4008401fb75eb82773c4bdb3f4b886e2e6d34c4
--- /dev/null
+++ b/dreamvoice/freevc/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+flagged
\ No newline at end of file
diff --git a/dreamvoice/freevc/README.md b/dreamvoice/freevc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9
--- /dev/null
+++ b/dreamvoice/freevc/README.md
@@ -0,0 +1,13 @@
+---
+title: FreeVC
+emoji: 🚀
+colorFrom: gray
+colorTo: red
+sdk: gradio
+sdk_version: 3.13.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/dreamvoice/freevc/app.py b/dreamvoice/freevc/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..982821f01caea503d8451f6c8e99096918705d79
--- /dev/null
+++ b/dreamvoice/freevc/app.py
@@ -0,0 +1,92 @@
+import os
+import torch
+import librosa
+import gradio as gr
+from scipy.io.wavfile import write
+from transformers import WavLMModel
+
+import utils
+from models import SynthesizerTrn
+from mel_processing import mel_spectrogram_torch
+from speaker_encoder.voice_encoder import SpeakerEncoder
+
+'''
+def get_wavlm():
+ os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
+ shutil.move('WavLM-Large.pt', 'wavlm')
+'''
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# print("Loading FreeVC...")
+# hps = utils.get_hparams_from_file("configs/freevc.json")
+# freevc = SynthesizerTrn(
+# hps.data.filter_length // 2 + 1,
+# hps.train.segment_size // hps.data.hop_length,
+# **hps.model).to(device)
+# _ = freevc.eval()
+# _ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+
+print("Loading FreeVC(24k)...")
+hps = utils.get_hparams_from_file("configs/freevc-24.json")
+freevc_24 = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+_ = freevc_24.eval()
+_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
+
+# print("Loading FreeVC-s...")
+# hps = utils.get_hparams_from_file("configs/freevc-s.json")
+# freevc_s = SynthesizerTrn(
+# hps.data.filter_length // 2 + 1,
+# hps.train.segment_size // hps.data.hop_length,
+# **hps.model).to(device)
+# _ = freevc_s.eval()
+# _ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None)
+#
+# print("Loading WavLM for content...")
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+
+def convert(model, cmodel, src, tgt):
+ with torch.no_grad():
+ # tgt
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+ g_tgt = smodel.embed_utterance(wav_tgt)
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+
+ # src
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+ c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+        # infer: this trimmed script only loads FreeVC (24kHz), so the call at the bottom
+        # passes freevc_24 and falls through to the final branch; the "FreeVC" and
+        # "FreeVC-s" branches need the models and mel_tgt set up in the commented-out code above
+ if model == "FreeVC":
+ audio = freevc.infer(c, g=g_tgt)
+ elif model == "FreeVC-s":
+ audio = freevc_s.infer(c, mel=mel_tgt)
+ else:
+ audio = freevc_24.infer(c, g=g_tgt)
+ audio = audio[0][0].data.cpu().float().numpy()
+ if model == "FreeVC" or model == "FreeVC-s":
+ write("out.wav", hps.data.sampling_rate, audio)
+ else:
+ write("out.wav", 24000, audio)
+ out = "out.wav"
+ return out
+
+# model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model")
+# audio1 = gr.inputs.Audio(label="Source Audio", type='filepath')
+# audio2 = gr.inputs.Audio(label="Reference Audio", type='filepath')
+# inputs = [model, audio1, audio2]
+# outputs = gr.outputs.Audio(label="Output Audio", type='filepath')
+#
+# title = "FreeVC"
+# description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there is too much silence in the reference audio, so please trim it before submitting."
+# article = "Paper | Github Repo"
+#
+# examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']]
+#
+# gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch()
+convert(freevc_24, cmodel, 'p225_001.wav', 'p226_002.wav')
\ No newline at end of file
diff --git a/dreamvoice/freevc/commons.py b/dreamvoice/freevc/commons.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1
--- /dev/null
+++ b/dreamvoice/freevc/commons.py
@@ -0,0 +1,171 @@
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+def init_weights(m, mean=0.0, std=0.01):
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size*dilation - dilation)/2)
+
+
+def convert_pad_shape(pad_shape):
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def intersperse(lst, item):
+ result = [item] * (len(lst) * 2 + 1)
+ result[1::2] = lst
+ return result
+
+
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+ """KL(P||Q)"""
+ kl = (logs_q - logs_p) - 0.5
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
+ return kl
+
+
+def rand_gumbel(shape):
+ """Sample from the Gumbel distribution, protect from overflows."""
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+ return -torch.log(-torch.log(uniform_samples))
+
+
+def rand_gumbel_like(x):
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+ return g
+
+
+def slice_segments(x, ids_str, segment_size=4):
+ ret = torch.zeros_like(x[:, :, :segment_size])
+ for i in range(x.size(0)):
+ idx_str = ids_str[i]
+ idx_end = idx_str + segment_size
+ ret[i] = x[i, :, idx_str:idx_end]
+ return ret
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size + 1
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ return ret, ids_str
+
+
+def rand_spec_segments(x, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ return ret, ids_str
+
+
+def get_timing_signal_1d(
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
+ position = torch.arange(length, dtype=torch.float)
+ num_timescales = channels // 2
+ log_timescale_increment = (
+ math.log(float(max_timescale) / float(min_timescale)) /
+ (num_timescales - 1))
+ inv_timescales = min_timescale * torch.exp(
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
+ signal = signal.view(1, channels, length)
+ return signal
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+def subsequent_mask(length):
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+ return mask
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+def convert_pad_shape(pad_shape):
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def shift_1d(x):
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+ return x
+
+
+def sequence_mask(length, max_length=None):
+ if max_length is None:
+ max_length = length.max()
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+ return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def generate_path(duration, mask):
+ """
+ duration: [b, 1, t_x]
+ mask: [b, 1, t_y, t_x]
+ """
+ device = duration.device
+
+ b, _, t_y, t_x = mask.shape
+ cum_duration = torch.cumsum(duration, -1)
+
+ cum_duration_flat = cum_duration.view(b * t_x)
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+ path = path.view(b, t_x, t_y)
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+ path = path.unsqueeze(1).transpose(2,3) * mask
+ return path
+
+
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
+ norm_type = float(norm_type)
+ if clip_value is not None:
+ clip_value = float(clip_value)
+
+ total_norm = 0
+ for p in parameters:
+ param_norm = p.grad.data.norm(norm_type)
+ total_norm += param_norm.item() ** norm_type
+ if clip_value is not None:
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
+ total_norm = total_norm ** (1. / norm_type)
+ return total_norm
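For reference, a short sketch of the two `commons.py` helpers that `models.py` relies on; shapes are illustrative and assume the `dreamvoice` package is importable:

```python
import torch
from dreamvoice.freevc.commons import sequence_mask, rand_slice_segments

lengths = torch.tensor([6, 3])
mask = sequence_mask(lengths, max_length=6)        # [2, 6] boolean mask, True inside each length
x = torch.randn(2, 192, 6)                         # [batch, channels, frames]
seg, ids = rand_slice_segments(x, lengths, segment_size=3)
print(mask.shape, seg.shape, ids.shape)            # [2, 6], [2, 192, 3], [2]
```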
diff --git a/dreamvoice/freevc/configs/freevc-24.json b/dreamvoice/freevc/configs/freevc-24.json
new file mode 100644
index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2
--- /dev/null
+++ b/dreamvoice/freevc/configs/freevc-24.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8640,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8008"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,6,4,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": true
+ }
+}
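The functional differences between `freevc-24.json` and `freevc.json` are `segment_size` (8640 vs. 8960) and `upsample_rates`: with a 320-sample hop on 16 kHz input, the generator emits `prod(upsample_rates)` samples per frame, so 10·8·2·2 = 320 keeps 16 kHz output while 10·6·4·2 = 480 yields 24 kHz. A quick sanity check, assuming this product rule for the HiFi-GAN-style `Generator`:

```python
import math

hop, sr_in = 320, 16000
for name, rates in {"freevc": [10, 8, 2, 2], "freevc-24": [10, 6, 4, 2]}.items():
    # output rate = input rate * (samples emitted per frame) / (samples consumed per frame)
    print(name, sr_in * math.prod(rates) // hop)   # freevc 16000, freevc-24 24000
```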
diff --git a/dreamvoice/freevc/configs/freevc-s.json b/dreamvoice/freevc/configs/freevc-s.json
new file mode 100644
index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1
--- /dev/null
+++ b/dreamvoice/freevc/configs/freevc-s.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8960,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8001"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": false
+ }
+}
diff --git a/dreamvoice/freevc/configs/freevc.json b/dreamvoice/freevc/configs/freevc.json
new file mode 100644
index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1
--- /dev/null
+++ b/dreamvoice/freevc/configs/freevc.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8960,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8001"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": true
+ }
+}
diff --git a/dreamvoice/freevc/mel_processing.py b/dreamvoice/freevc/mel_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54
--- /dev/null
+++ b/dreamvoice/freevc/mel_processing.py
@@ -0,0 +1,112 @@
+import math
+import os
+import random
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.data
+import numpy as np
+import librosa
+import librosa.util as librosa_util
+from librosa.util import normalize, pad_center, tiny
+from scipy.signal import get_window
+from scipy.io.wavfile import read
+from librosa.filters import mel as librosa_mel_fn
+
+MAX_WAV_VALUE = 32768.0
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+ """
+ PARAMS
+ ------
+ C: compression factor
+ """
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+ """
+ PARAMS
+ ------
+ C: compression factor used to compress
+ """
+ return torch.exp(x) / C
+
+
+def spectral_normalize_torch(magnitudes):
+ output = dynamic_range_compression_torch(magnitudes)
+ return output
+
+
+def spectral_de_normalize_torch(magnitudes):
+ output = dynamic_range_decompression_torch(magnitudes)
+ return output
+
+
+mel_basis = {}
+hann_window = {}
+
+
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+ if torch.min(y) < -1.:
+ print('min value is ', torch.min(y))
+ if torch.max(y) > 1.:
+ print('max value is ', torch.max(y))
+
+ global hann_window
+ dtype_device = str(y.dtype) + '_' + str(y.device)
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+ y = y.squeeze(1)
+
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+ return spec
+
+
+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
+ global mel_basis
+ dtype_device = str(spec.dtype) + '_' + str(spec.device)
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ spec = spectral_normalize_torch(spec)
+ return spec
+
+
+def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
+ if torch.min(y) < -1.:
+ print('min value is ', torch.min(y))
+ if torch.max(y) > 1.:
+ print('max value is ', torch.max(y))
+
+ global mel_basis, hann_window
+ dtype_device = str(y.dtype) + '_' + str(y.device)
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+ y = y.squeeze(1)
+
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ spec = spectral_normalize_torch(spec)
+
+ return spec
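For reference, a small sketch of calling these spectrogram helpers with the hyperparameters from `freevc.json` (filter_length 1280, hop 320, win 1280, 80 mels, 16 kHz); the random input and import path are assumptions:

```python
import torch
from dreamvoice.freevc.mel_processing import spectrogram_torch, mel_spectrogram_torch

wav = torch.randn(1, 16000).clamp(-1, 1)                # 1 s of fake 16 kHz audio in [-1, 1]
spec = spectrogram_torch(wav, 1280, 16000, 320, 1280)   # linear magnitude: [1, 641, 50]
mel = mel_spectrogram_torch(wav, 1280, 80, 16000, 320, 1280, 0.0, None)
print(spec.shape, mel.shape)                            # [1, 641, 50] [1, 80, 50]
```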
diff --git a/dreamvoice/freevc/models.py b/dreamvoice/freevc/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d3247337c6cd49351490c7f17cb33cea52e361
--- /dev/null
+++ b/dreamvoice/freevc/models.py
@@ -0,0 +1,351 @@
+import copy
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .commons import sequence_mask, rand_slice_segments
+from .modules import ResidualCouplingLayer, WN, Flip, ResBlock1, ResBlock2, LRELU_SLOPE
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from .commons import init_weights, get_padding
+
+
+class ResidualCouplingBlock(nn.Module):
+ def __init__(self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ n_flows=4,
+ gin_channels=0):
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.flows = nn.ModuleList()
+ for i in range(n_flows):
+ self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
+ self.flows.append(Flip())
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ else:
+ for flow in reversed(self.flows):
+ x = flow(x, x_mask, g=g, reverse=reverse)
+ return x
+
+
+class Encoder(nn.Module):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths, g=None):
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+ stats = self.proj(x) * x_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+ return z, m, logs, x_mask
+
+
+class Generator(torch.nn.Module):
+ def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
+ super(Generator, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+ resblock = ResBlock1 if resblock == '1' else ResBlock2
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(weight_norm(
+ ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
+ k, u, padding=(k-u)//2)))
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel//(2**(i+1))
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+ self.resblocks.append(resblock(ch, k, d))
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ def forward(self, x, g=None):
+ x = self.conv_pre(x)
+ if g is not None:
+ x = x + self.cond(g)
+
+ for i in range(self.num_upsamples):
+ x = F.leaky_relu(x, LRELU_SLOPE)
+ x = self.ups[i](x)
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i*self.num_kernels+j](x)
+ else:
+ xs += self.resblocks[i*self.num_kernels+j](x)
+ x = xs / self.num_kernels
+ x = F.leaky_relu(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+
+ return x
+
+ def remove_weight_norm(self):
+ print('Removing weight norm...')
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+
+
+class DiscriminatorP(torch.nn.Module):
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+ super(DiscriminatorP, self).__init__()
+ self.period = period
+ self.use_spectral_norm = use_spectral_norm
+        norm_f = spectral_norm if use_spectral_norm else weight_norm
+ self.convs = nn.ModuleList([
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+ ])
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+ def forward(self, x):
+ fmap = []
+
+ # 1d to 2d
+ b, c, t = x.shape
+ if t % self.period != 0: # pad first
+ n_pad = self.period - (t % self.period)
+ x = F.pad(x, (0, n_pad), "reflect")
+ t = t + n_pad
+ x = x.view(b, c, t // self.period, self.period)
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class DiscriminatorS(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(DiscriminatorS, self).__init__()
+        norm_f = spectral_norm if use_spectral_norm else weight_norm
+ self.convs = nn.ModuleList([
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+ ])
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+ def forward(self, x):
+ fmap = []
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(MultiPeriodDiscriminator, self).__init__()
+ periods = [2,3,5,7,11]
+
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+ self.discriminators = nn.ModuleList(discs)
+
+ def forward(self, y, y_hat):
+ y_d_rs = []
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ y_d_rs.append(y_d_r)
+ y_d_gs.append(y_d_g)
+ fmap_rs.append(fmap_r)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class SpeakerEncoder(torch.nn.Module):
+ def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
+ super(SpeakerEncoder, self).__init__()
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+ self.relu = nn.ReLU()
+
+ def forward(self, mels):
+ self.lstm.flatten_parameters()
+ _, (hidden, _) = self.lstm(mels)
+ embeds_raw = self.relu(self.linear(hidden[-1]))
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+ def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
+ mel_slices = []
+ for i in range(0, total_frames-partial_frames, partial_hop):
+ mel_range = torch.arange(i, i+partial_frames)
+ mel_slices.append(mel_range)
+
+ return mel_slices
+
+ def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
+ mel_len = mel.size(1)
+ last_mel = mel[:,-partial_frames:]
+
+ if mel_len > partial_frames:
+ mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
+ mels = list(mel[:,s] for s in mel_slices)
+ mels.append(last_mel)
+ mels = torch.stack(tuple(mels), 0).squeeze(1)
+
+ with torch.no_grad():
+ partial_embeds = self(mels)
+ embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
+ #embed = embed / torch.linalg.norm(embed, 2)
+ else:
+ with torch.no_grad():
+ embed = self(last_mel)
+
+ return embed
+
+
+class SynthesizerTrn(nn.Module):
+ """
+ Synthesizer for Training
+ """
+
+ def __init__(self,
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels,
+ ssl_dim,
+ use_spk,
+ **kwargs):
+
+ super().__init__()
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.gin_channels = gin_channels
+ self.ssl_dim = ssl_dim
+ self.use_spk = use_spk
+
+ self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
+ self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+ self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+
+ if not self.use_spk:
+ self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)
+
+    def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
+        if c_lengths is None:
+            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+        if spec_lengths is None:
+            spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
+
+        if not self.use_spk:
+            g = self.enc_spk(mel.transpose(1,2))
+        if g is not None and g.dim() == 2:
+            # speaker embeddings arrive as [B, gin_channels]; the flow/decoder conditioning
+            # convolutions expect [B, gin_channels, 1]
+            g = g.unsqueeze(-1)
+
+ _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
+ z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
+ z_p = self.flow(z, spec_mask, g=g)
+
+ z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size)
+ o = self.dec(z_slice, g=g)
+
+ return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+    def infer(self, c, g=None, mel=None, c_lengths=None):
+        if c_lengths is None:
+            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+        if not self.use_spk:
+            g = self.enc_spk.embed_utterance(mel.transpose(1,2))
+        if g is not None and g.dim() == 2:
+            # externally supplied speaker embeddings are [B, gin_channels]; add the
+            # time dimension expected by the flow and decoder
+            g = g.unsqueeze(-1)
+
+ z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
+ o = self.dec(z * c_mask, g=g)
+
+ return o
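For reference, a shape sanity check for `SynthesizerTrn.infer` with randomly initialized weights, using the `freevc.json` added above; the config path, import path, and tensor sizes are assumptions:

```python
import torch
from dreamvoice.freevc.utils import get_hparams_from_file
from dreamvoice.freevc.models import SynthesizerTrn

hps = get_hparams_from_file("ckpts_freevc/freevc.json")
net = SynthesizerTrn(hps.data.filter_length // 2 + 1,
                     hps.train.segment_size // hps.data.hop_length,
                     **hps.model).eval()
c = torch.randn(1, hps.model.ssl_dim, 100)       # WavLM content features [B, 1024, T]
g = torch.randn(1, hps.model.gin_channels, 1)    # speaker embedding [B, 256, 1]
with torch.no_grad():
    wav = net.infer(c, g=g)
print(wav.shape)   # [1, 1, 100 * prod(upsample_rates)] = [1, 1, 32000]
```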
diff --git a/dreamvoice/freevc/modules.py b/dreamvoice/freevc/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..53a51558f78899cb0e77c595fe2ca9b3d3c762f5
--- /dev/null
+++ b/dreamvoice/freevc/modules.py
@@ -0,0 +1,341 @@
+import copy
+import math
+import numpy as np
+import scipy
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
+
+
+LRELU_SLOPE = 0.1
+
+
+class LayerNorm(nn.Module):
+ def __init__(self, channels, eps=1e-5):
+ super().__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = nn.Parameter(torch.ones(channels))
+ self.beta = nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ x = x.transpose(1, -1)
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+ return x.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+ super().__init__()
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 1."
+
+ self.conv_layers = nn.ModuleList()
+ self.norm_layers = nn.ModuleList()
+ self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+ self.norm_layers.append(LayerNorm(hidden_channels))
+ self.relu_drop = nn.Sequential(
+ nn.ReLU(),
+ nn.Dropout(p_dropout))
+ for _ in range(n_layers-1):
+ self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+ self.norm_layers.append(LayerNorm(hidden_channels))
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+ self.proj.weight.data.zero_()
+ self.proj.bias.data.zero_()
+
+ def forward(self, x, x_mask):
+ x_org = x
+ for i in range(self.n_layers):
+ x = self.conv_layers[i](x * x_mask)
+ x = self.norm_layers[i](x)
+ x = self.relu_drop(x)
+ x = x_org + self.proj(x)
+ return x * x_mask
+
+
+class DDSConv(nn.Module):
+ """
+    Dilated and Depth-Separable Convolution
+ """
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+ super().__init__()
+ self.channels = channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+
+ self.drop = nn.Dropout(p_dropout)
+ self.convs_sep = nn.ModuleList()
+ self.convs_1x1 = nn.ModuleList()
+ self.norms_1 = nn.ModuleList()
+ self.norms_2 = nn.ModuleList()
+ for i in range(n_layers):
+ dilation = kernel_size ** i
+ padding = (kernel_size * dilation - dilation) // 2
+ self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+ groups=channels, dilation=dilation, padding=padding
+ ))
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+ self.norms_1.append(LayerNorm(channels))
+ self.norms_2.append(LayerNorm(channels))
+
+ def forward(self, x, x_mask, g=None):
+ if g is not None:
+ x = x + g
+ for i in range(self.n_layers):
+ y = self.convs_sep[i](x * x_mask)
+ y = self.norms_1[i](y)
+ y = F.gelu(y)
+ y = self.convs_1x1[i](y)
+ y = self.norms_2[i](y)
+ y = F.gelu(y)
+ y = self.drop(y)
+ x = x + y
+ return x * x_mask
+
+
+class WN(torch.nn.Module):
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+ super(WN, self).__init__()
+ assert(kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+ self.p_dropout = p_dropout
+
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.drop = nn.Dropout(p_dropout)
+
+ if gin_channels != 0:
+ cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
+ for i in range(n_layers):
+ dilation = dilation_rate ** i
+ padding = int((kernel_size * dilation - dilation) / 2)
+ in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
+ dilation=dilation, padding=padding)
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+ self.in_layers.append(in_layer)
+
+ # last one is not necessary
+ if i < n_layers - 1:
+ res_skip_channels = 2 * hidden_channels
+ else:
+ res_skip_channels = hidden_channels
+
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+ self.res_skip_layers.append(res_skip_layer)
+
+ def forward(self, x, x_mask, g=None, **kwargs):
+ output = torch.zeros_like(x)
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+ if g is not None:
+ g = self.cond_layer(g)
+
+ for i in range(self.n_layers):
+ x_in = self.in_layers[i](x)
+ if g is not None:
+ cond_offset = i * 2 * self.hidden_channels
+ g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
+ else:
+ g_l = torch.zeros_like(x_in)
+
+ acts = fused_add_tanh_sigmoid_multiply(
+ x_in,
+ g_l,
+ n_channels_tensor)
+ acts = self.drop(acts)
+
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.n_layers - 1:
+ res_acts = res_skip_acts[:,:self.hidden_channels,:]
+ x = (x + res_acts) * x_mask
+ output = output + res_skip_acts[:,self.hidden_channels:,:]
+ else:
+ output = output + res_skip_acts
+ return output * x_mask
+
+ def remove_weight_norm(self):
+ if self.gin_channels != 0:
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
+ for l in self.in_layers:
+ torch.nn.utils.remove_weight_norm(l)
+ for l in self.res_skip_layers:
+ torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+ super(ResBlock1, self).__init__()
+ self.convs1 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+ padding=get_padding(kernel_size, dilation[2])))
+ ])
+ self.convs1.apply(init_weights)
+
+ self.convs2 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1)))
+ ])
+ self.convs2.apply(init_weights)
+
+ def forward(self, x, x_mask=None):
+ for c1, c2 in zip(self.convs1, self.convs2):
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c1(xt)
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c2(xt)
+ x = xt + x
+ if x_mask is not None:
+ x = x * x_mask
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs1:
+ remove_weight_norm(l)
+ for l in self.convs2:
+ remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+ super(ResBlock2, self).__init__()
+ self.convs = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1])))
+ ])
+ self.convs.apply(init_weights)
+
+ def forward(self, x, x_mask=None):
+ for c in self.convs:
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c(xt)
+ x = xt + x
+ if x_mask is not None:
+ x = x * x_mask
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs:
+ remove_weight_norm(l)
+
+
+class Log(nn.Module):
+ def forward(self, x, x_mask, reverse=False, **kwargs):
+ if not reverse:
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+ logdet = torch.sum(-y, [1, 2])
+ return y, logdet
+ else:
+ x = torch.exp(x) * x_mask
+ return x
+
+
+class Flip(nn.Module):
+ def forward(self, x, *args, reverse=False, **kwargs):
+ x = torch.flip(x, [1])
+ if not reverse:
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+ return x, logdet
+ else:
+ return x
+
+
+class ElementwiseAffine(nn.Module):
+ def __init__(self, channels):
+ super().__init__()
+ self.channels = channels
+ self.m = nn.Parameter(torch.zeros(channels,1))
+ self.logs = nn.Parameter(torch.zeros(channels,1))
+
+ def forward(self, x, x_mask, reverse=False, **kwargs):
+ if not reverse:
+ y = self.m + torch.exp(self.logs) * x
+ y = y * x_mask
+ logdet = torch.sum(self.logs * x_mask, [1,2])
+ return y, logdet
+ else:
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
+ return x
+
+
+class ResidualCouplingLayer(nn.Module):
+ def __init__(self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ p_dropout=0,
+ gin_channels=0,
+ mean_only=False):
+ assert channels % 2 == 0, "channels should be divisible by 2"
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.half_channels = channels // 2
+ self.mean_only = mean_only
+
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+ self.post.weight.data.zero_()
+ self.post.bias.data.zero_()
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ x0, x1 = torch.split(x, [self.half_channels]*2, 1)
+ h = self.pre(x0) * x_mask
+ h = self.enc(h, x_mask, g=g)
+ stats = self.post(h) * x_mask
+ if not self.mean_only:
+ m, logs = torch.split(stats, [self.half_channels]*2, 1)
+ else:
+ m = stats
+ logs = torch.zeros_like(m)
+
+ if not reverse:
+ x1 = m + x1 * torch.exp(logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ logdet = torch.sum(logs, [1,2])
+ return x, logdet
+ else:
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ return x
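For reference, a small sketch showing that `ResidualCouplingLayer` is invertible: running it forward and then with `reverse=True` recovers the input. Channel and length sizes are illustrative and assume the `dreamvoice` package is importable:

```python
import torch
from dreamvoice.freevc.modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(192, 192, 5, 1, 4, mean_only=True).eval()
torch.nn.init.normal_(layer.post.weight, std=0.1)   # perturb the zero-initialized projection
x = torch.randn(1, 192, 50)
mask = torch.ones(1, 1, 50)
with torch.no_grad():
    y, _ = layer(x, mask)                 # forward returns (output, logdet)
    x_rec = layer(y, mask, reverse=True)  # reverse returns only the output
print(torch.allclose(x, x_rec, atol=1e-5))  # True
```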
diff --git a/dreamvoice/freevc/requirements.txt b/dreamvoice/freevc/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c
--- /dev/null
+++ b/dreamvoice/freevc/requirements.txt
@@ -0,0 +1,8 @@
+altair
+httpx==0.24.1
+numpy
+scipy
+torch
+transformers
+librosa
+webrtcvad==2.0.10
diff --git a/dreamvoice/freevc/utils.py b/dreamvoice/freevc/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e931b1f56a976674425c5637b0767d3485c51f69
--- /dev/null
+++ b/dreamvoice/freevc/utils.py
@@ -0,0 +1,305 @@
+import os
+import sys
+import argparse
+import logging
+import json
+import subprocess
+import glob
+import numpy as np
+from scipy.io.wavfile import read
+import torch
+from torch.nn import functional as F
+from .commons import sequence_mask
+
+MATPLOTLIB_FLAG = False
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logger = logging
+
+
+def get_cmodel(rank):
+    # note: needs WavLM / WavLMConfig from the original FreeVC repo's `wavlm` package,
+    # which is not bundled in this directory; unused by freevc_wrapper.py
+    checkpoint = torch.load('wavlm/WavLM-Large.pt')
+ cfg = WavLMConfig(checkpoint['cfg'])
+ cmodel = WavLM(cfg).cuda(rank)
+ cmodel.load_state_dict(checkpoint['model'])
+ cmodel.eval()
+ return cmodel
+
+
+def get_content(cmodel, y):
+ with torch.no_grad():
+ c = cmodel.extract_features(y.squeeze(1))[0]
+ c = c.transpose(1, 2)
+ return c
+
+
+def get_vocoder(rank):
+    # note: needs the `hifigan` package and "hifigan/generator_v1" checkpoint from the
+    # original FreeVC repo, neither of which is bundled in this directory
+    with open("hifigan/config.json", "r") as f:
+ config = json.load(f)
+ config = hifigan.AttrDict(config)
+ vocoder = hifigan.Generator(config)
+ ckpt = torch.load("hifigan/generator_v1")
+ vocoder.load_state_dict(ckpt["generator"])
+ vocoder.eval()
+ vocoder.remove_weight_norm()
+ vocoder.cuda(rank)
+ return vocoder
+
+
+def transform(mel, height):  # 68-92
+    # note: this and stretch() below need torchvision, which is not imported in this file
+ #r = np.random.random()
+ #rate = r * 0.3 + 0.85 # 0.85-1.15
+ #height = int(mel.size(-2) * rate)
+ tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
+ if height >= mel.size(-2):
+ return tgt[:, :mel.size(-2), :]
+ else:
+ silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1)
+ silence += torch.randn_like(silence) / 10
+ return torch.cat((tgt, silence), 1)
+
+
+def stretch(mel, width): # 0.5-2
+ return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))
+
+
+def load_checkpoint(checkpoint_path, model, optimizer=None):
+ assert os.path.isfile(checkpoint_path)
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+ iteration = checkpoint_dict['iteration']
+ learning_rate = checkpoint_dict['learning_rate']
+ if optimizer is not None:
+ optimizer.load_state_dict(checkpoint_dict['optimizer'])
+ saved_state_dict = checkpoint_dict['model']
+ if hasattr(model, 'module'):
+ state_dict = model.module.state_dict()
+ else:
+ state_dict = model.state_dict()
+    new_state_dict = {}
+ for k, v in state_dict.items():
+ try:
+ new_state_dict[k] = saved_state_dict[k]
+ except:
+ logger.info("%s is not in the checkpoint" % k)
+ new_state_dict[k] = v
+ if hasattr(model, 'module'):
+ model.module.load_state_dict(new_state_dict)
+ else:
+ model.load_state_dict(new_state_dict)
+ logger.info("Loaded checkpoint '{}' (iteration {})" .format(
+ checkpoint_path, iteration))
+ return model, optimizer, learning_rate, iteration
+
+
+def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
+ logger.info("Saving model and optimizer state at iteration {} to {}".format(
+ iteration, checkpoint_path))
+ if hasattr(model, 'module'):
+ state_dict = model.module.state_dict()
+ else:
+ state_dict = model.state_dict()
+ torch.save({'model': state_dict,
+ 'iteration': iteration,
+ 'optimizer': optimizer.state_dict(),
+ 'learning_rate': learning_rate}, checkpoint_path)
+
+
+def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
+ for k, v in scalars.items():
+ writer.add_scalar(k, v, global_step)
+ for k, v in histograms.items():
+ writer.add_histogram(k, v, global_step)
+ for k, v in images.items():
+ writer.add_image(k, v, global_step, dataformats='HWC')
+ for k, v in audios.items():
+ writer.add_audio(k, v, global_step, audio_sampling_rate)
+
+
+def latest_checkpoint_path(dir_path, regex="G_*.pth"):
+ f_list = glob.glob(os.path.join(dir_path, regex))
+ f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
+ x = f_list[-1]
+ print(x)
+ return x
+
+
+def plot_spectrogram_to_numpy(spectrogram):
+ global MATPLOTLIB_FLAG
+ if not MATPLOTLIB_FLAG:
+ import matplotlib
+ matplotlib.use("Agg")
+ MATPLOTLIB_FLAG = True
+ mpl_logger = logging.getLogger('matplotlib')
+ mpl_logger.setLevel(logging.WARNING)
+ import matplotlib.pylab as plt
+ import numpy as np
+
+ fig, ax = plt.subplots(figsize=(10,2))
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+ interpolation='none')
+ plt.colorbar(im, ax=ax)
+ plt.xlabel("Frames")
+ plt.ylabel("Channels")
+ plt.tight_layout()
+
+ fig.canvas.draw()
+    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ plt.close()
+ return data
+
+
+def plot_alignment_to_numpy(alignment, info=None):
+ global MATPLOTLIB_FLAG
+ if not MATPLOTLIB_FLAG:
+ import matplotlib
+ matplotlib.use("Agg")
+ MATPLOTLIB_FLAG = True
+ mpl_logger = logging.getLogger('matplotlib')
+ mpl_logger.setLevel(logging.WARNING)
+ import matplotlib.pylab as plt
+ import numpy as np
+
+ fig, ax = plt.subplots(figsize=(6, 4))
+ im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
+ interpolation='none')
+ fig.colorbar(im, ax=ax)
+ xlabel = 'Decoder timestep'
+ if info is not None:
+ xlabel += '\n\n' + info
+ plt.xlabel(xlabel)
+ plt.ylabel('Encoder timestep')
+ plt.tight_layout()
+
+ fig.canvas.draw()
+    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ plt.close()
+ return data
+
+
+def load_wav_to_torch(full_path):
+ sampling_rate, data = read(full_path)
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+
+
+def load_filepaths_and_text(filename, split="|"):
+ with open(filename, encoding='utf-8') as f:
+ filepaths_and_text = [line.strip().split(split) for line in f]
+ return filepaths_and_text
+
+
+def get_hparams(init=True):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
+ help='JSON file for configuration')
+ parser.add_argument('-m', '--model', type=str, required=True,
+ help='Model name')
+
+ args = parser.parse_args()
+ model_dir = os.path.join("./logs", args.model)
+
+ if not os.path.exists(model_dir):
+ os.makedirs(model_dir)
+
+ config_path = args.config
+ config_save_path = os.path.join(model_dir, "config.json")
+ if init:
+ with open(config_path, "r") as f:
+ data = f.read()
+ with open(config_save_path, "w") as f:
+ f.write(data)
+ else:
+ with open(config_save_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+ hparams = HParams(**config)
+ hparams.model_dir = model_dir
+ return hparams
+
+
+def get_hparams_from_dir(model_dir):
+ config_save_path = os.path.join(model_dir, "config.json")
+ with open(config_save_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+    hparams = HParams(**config)
+ hparams.model_dir = model_dir
+ return hparams
+
+
+def get_hparams_from_file(config_path):
+ with open(config_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+    hparams = HParams(**config)
+ return hparams
+
+
+def check_git_hash(model_dir):
+ source_dir = os.path.dirname(os.path.realpath(__file__))
+ if not os.path.exists(os.path.join(source_dir, ".git")):
+        logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
+ source_dir
+ ))
+ return
+
+ cur_hash = subprocess.getoutput("git rev-parse HEAD")
+
+ path = os.path.join(model_dir, "githash")
+ if os.path.exists(path):
+ saved_hash = open(path).read()
+ if saved_hash != cur_hash:
+            logger.warning("git hash values are different. {}(saved) != {}(current)".format(
+ saved_hash[:8], cur_hash[:8]))
+ else:
+ open(path, "w").write(cur_hash)
+
+
+def get_logger(model_dir, filename="train.log"):
+ global logger
+ logger = logging.getLogger(os.path.basename(model_dir))
+ logger.setLevel(logging.DEBUG)
+
+ formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
+ if not os.path.exists(model_dir):
+ os.makedirs(model_dir)
+ h = logging.FileHandler(os.path.join(model_dir, filename))
+ h.setLevel(logging.DEBUG)
+ h.setFormatter(formatter)
+ logger.addHandler(h)
+ return logger
+
+
+class HParams():
+ def __init__(self, **kwargs):
+ for k, v in kwargs.items():
+ if type(v) == dict:
+ v = HParams(**v)
+ self[k] = v
+
+ def keys(self):
+ return self.__dict__.keys()
+
+ def items(self):
+ return self.__dict__.items()
+
+ def values(self):
+ return self.__dict__.values()
+
+ def __len__(self):
+ return len(self.__dict__)
+
+ def __getitem__(self, key):
+ return getattr(self, key)
+
+ def __setitem__(self, key, value):
+ return setattr(self, key, value)
+
+ def __contains__(self, key):
+ return key in self.__dict__
+
+ def __repr__(self):
+ return self.__dict__.__repr__()
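For reference, a minimal sketch of `HParams`, which exposes the nested JSON config as attributes; the config path is an assumption:

```python
from dreamvoice.freevc.utils import get_hparams_from_file

hps = get_hparams_from_file("ckpts_freevc/freevc.json")
print(hps.data.sampling_rate)                          # 16000
print(hps.train.segment_size // hps.data.hop_length)   # 8960 // 320 = 28 frames per training segment
print("use_spk" in hps.model)                          # True
```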
diff --git a/dreamvoice/freevc_wrapper.py b/dreamvoice/freevc_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..958cd74a44e119cbafb2365ca1ebb4a7eac66c7e
--- /dev/null
+++ b/dreamvoice/freevc_wrapper.py
@@ -0,0 +1,63 @@
+import os
+import torch
+import librosa
+import soundfile as sf
+from pathlib import Path
+
+from transformers import WavLMModel
+from .freevc.utils import load_checkpoint, get_hparams_from_file
+from .freevc.models import SynthesizerTrn
+# from mel_processing import mel_spectrogram_torch
+# from free_vc.speaker_encoder.voice_encoder import SpeakerEncoder
+# from speaker_encoder.voice_encoder import SpeakerEncoder
+
+
+def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'):
+ hps = get_hparams_from_file(f"{path}/freevc.json")
+ freevc = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+ freevc.eval()
+ load_checkpoint(f"{path}/freevc.pth", freevc, None)
+
+ cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+ cmodel.eval()
+
+ # smodel = spk_encoder.load_model(Path(speaker_path), device)
+ # smodel = spk_encoder.load_model(Path(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt"), 'cuda')
+ # smodel = SpeakerEncoder(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt", device)
+
+ return freevc, cmodel, hps
+
+
+@torch.no_grad()
+def convert(freevc, content, speaker):
+ audio = freevc.infer(content, g=speaker)
+ audio = audio[0][0].data.cpu().float().numpy()
+ return audio, 16000
+
+
+if __name__ == '__main__':
+    # note: get_freevc_models() returns (freevc, cmodel, hps) and does not load a speaker
+    # encoder; `smodel` (see the commented-out SpeakerEncoder lines above) must be provided
+    # separately for the embedding step below
+    freevc_24, cmodel, hps = get_freevc_models()
+
+ tgt = 'p226_002.wav'
+ # src = 'p226_002.wav'
+ src = 'p225_001.wav'
+ device = 'cuda'
+
+ # tgt
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+ g_tgt = smodel.embed_utterance(wav_tgt)
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+ # g_tgt = spk_encoder.embed_utterance_batch(torch.tensor(wav_tgt).unsqueeze(0).cuda())
+
+ # src
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+ content = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+
+ output, sr = convert(freevc_24, content, g_tgt)
+
+ sf.write('output.wav', output, sr)
\ No newline at end of file
diff --git a/dreamvoice/plugin.py b/dreamvoice/plugin.py
index 12243ecb47d63270aef13fd44c4dbd040198879d..aeef16a90eaa8851293ca2090bdcecb5544dde02 100644
--- a/dreamvoice/plugin.py
+++ b/dreamvoice/plugin.py
@@ -108,7 +108,6 @@ class DreamVoice_Plugin:
self.spk_encoder = spk_encoder
self.spk_embed_cache = None
-
@torch.no_grad()
def gen_spk(self, prompt,
prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
diff --git a/dreamvoice/plugin_ckpts/freevc.pt b/dreamvoice/plugin_ckpts/freevc.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e5676c4bbc95085ed5a7da8b7d1d479849b1bd39
--- /dev/null
+++ b/dreamvoice/plugin_ckpts/freevc.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0589fd38d965a7f8aab6eb3bedae5d1c007acb0f305e04bbe0fd4a771fff717d
+size 104892189
diff --git a/dreamvoice/plugin_freevc.yaml b/dreamvoice/plugin_freevc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e67b8c26e8d4c0eb36d0650639e8a547f6e90691
--- /dev/null
+++ b/dreamvoice/plugin_freevc.yaml
@@ -0,0 +1,8 @@
+version: 1.1
+
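+# lm_path: text encoder used to embed the voice prompt (loaded elsewhere in this PR
+# via T5Tokenizer / T5EncoderModel).
+# dreamvg: config, local checkpoint and download URL for the plugin that maps prompt
+# embeddings to a FreeVC speaker embedding.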
+lm_path: 'google/flan-t5-base'
+
+dreamvg:
+ config_path: 'src/configs/plugin_cross_freevc.yaml'
+ ckpt_path: 'plugin_ckpts/freevc.pt'
+ ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/plugin_ckpts/freevc.pt'
\ No newline at end of file
diff --git a/dreamvoice/src/configs/plugin_cross.yaml b/dreamvoice/src/configs/plugin_cross_freevc.yaml
similarity index 100%
rename from dreamvoice/src/configs/plugin_cross.yaml
rename to dreamvoice/src/configs/plugin_cross_freevc.yaml
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/README.md
@@ -0,0 +1,13 @@
+---
+title: FreeVC
+emoji: 🚀
+colorFrom: gray
+colorTo: red
+sdk: gradio
+sdk_version: 3.13.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/app.py b/dreamvoice/train_utils/prepare_freevc/freevc/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..040c13a7f789e9edf88565c756d1059c2a3f1e01
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/app.py
@@ -0,0 +1,103 @@
+import os
+import torch
+import librosa
+import gradio as gr
+from scipy.io.wavfile import write
+from transformers import WavLMModel
+
+import utils
+from models import SynthesizerTrn
+from mel_processing import mel_spectrogram_torch
+from speaker_encoder.voice_encoder import SpeakerEncoder
+
+'''
+def get_wavlm():
+ os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
+ shutil.move('WavLM-Large.pt', 'wavlm')
+'''
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+print("Loading FreeVC...")
+hps = utils.get_hparams_from_file("configs/freevc.json")
+freevc = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+_ = freevc.eval()
+_ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+
+print("Loading FreeVC(24k)...")
+hps = utils.get_hparams_from_file("configs/freevc-24.json")
+freevc_24 = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+_ = freevc_24.eval()
+_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
+
+print("Loading FreeVC-s...")
+hps = utils.get_hparams_from_file("configs/freevc-s.json")
+freevc_s = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+_ = freevc_s.eval()
+_ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None)
+
+print("Loading WavLM for content...")
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+
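+# NOTE: all three configs above share the same 16 kHz "data" settings, so reusing the
+# last-loaded `hps` (freevc-s) for the mel/STFT parameters inside convert() below
+# works for every model.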
+def convert(model, src, tgt):
+ with torch.no_grad():
+ # tgt
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+ if model == "FreeVC" or model == "FreeVC (24kHz)":
+ g_tgt = smodel.embed_utterance(wav_tgt)
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+ else:
+ wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
+ mel_tgt = mel_spectrogram_torch(
+ wav_tgt,
+ hps.data.filter_length,
+ hps.data.n_mel_channels,
+ hps.data.sampling_rate,
+ hps.data.hop_length,
+ hps.data.win_length,
+ hps.data.mel_fmin,
+ hps.data.mel_fmax
+ )
+ # src
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+ c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+ # infer
+ if model == "FreeVC":
+ audio = freevc.infer(c, g=g_tgt)
+ elif model == "FreeVC-s":
+ audio = freevc_s.infer(c, mel=mel_tgt)
+ else:
+ audio = freevc_24.infer(c, g=g_tgt)
+ audio = audio[0][0].data.cpu().float().numpy()
+ if model == "FreeVC" or model == "FreeVC-s":
+ write("out.wav", hps.data.sampling_rate, audio)
+ else:
+ write("out.wav", 24000, audio)
+ out = "out.wav"
+ return out
+
+model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC", type="value", label="Model")
+audio1 = gr.Audio(label="Source Audio", type='filepath')
+audio2 = gr.Audio(label="Reference Audio", type='filepath')
+inputs = [model, audio1, audio2]
+outputs = gr.Audio(label="Output Audio", type='filepath')
+
+title = "FreeVC"
+description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please trim it before submitting."
+article = "Paper | Github Repo
"
+
+examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']]
+
+gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch()
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/commons.py b/dreamvoice/train_utils/prepare_freevc/freevc/commons.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/commons.py
@@ -0,0 +1,171 @@
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+def init_weights(m, mean=0.0, std=0.01):
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size*dilation - dilation)/2)
+
+
+def convert_pad_shape(pad_shape):
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def intersperse(lst, item):
+ result = [item] * (len(lst) * 2 + 1)
+ result[1::2] = lst
+ return result
+
+
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+ """KL(P||Q)"""
+ kl = (logs_q - logs_p) - 0.5
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
+ return kl
+
+
+def rand_gumbel(shape):
+ """Sample from the Gumbel distribution, protect from overflows."""
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+ return -torch.log(-torch.log(uniform_samples))
+
+
+def rand_gumbel_like(x):
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+ return g
+
+
+def slice_segments(x, ids_str, segment_size=4):
+ ret = torch.zeros_like(x[:, :, :segment_size])
+ for i in range(x.size(0)):
+ idx_str = ids_str[i]
+ idx_end = idx_str + segment_size
+ ret[i] = x[i, :, idx_str:idx_end]
+ return ret
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size + 1
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ return ret, ids_str
+
+
+def rand_spec_segments(x, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ return ret, ids_str
+
+
+def get_timing_signal_1d(
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
+ position = torch.arange(length, dtype=torch.float)
+ num_timescales = channels // 2
+ log_timescale_increment = (
+ math.log(float(max_timescale) / float(min_timescale)) /
+ (num_timescales - 1))
+ inv_timescales = min_timescale * torch.exp(
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
+ signal = signal.view(1, channels, length)
+ return signal
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+def subsequent_mask(length):
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+ return mask
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
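+    # Gated activation used by the WN (WaveNet-style) module: tanh over the first
+    # n_channels channels, sigmoid over the remaining ones, multiplied elementwise.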
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+def shift_1d(x):
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+ return x
+
+
+def sequence_mask(length, max_length=None):
+ if max_length is None:
+ max_length = length.max()
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+ return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def generate_path(duration, mask):
+ """
+ duration: [b, 1, t_x]
+ mask: [b, 1, t_y, t_x]
+ """
+ device = duration.device
+
+ b, _, t_y, t_x = mask.shape
+ cum_duration = torch.cumsum(duration, -1)
+
+ cum_duration_flat = cum_duration.view(b * t_x)
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+ path = path.view(b, t_x, t_y)
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+ path = path.unsqueeze(1).transpose(2,3) * mask
+ return path
+
+
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
+ norm_type = float(norm_type)
+ if clip_value is not None:
+ clip_value = float(clip_value)
+
+ total_norm = 0
+ for p in parameters:
+ param_norm = p.grad.data.norm(norm_type)
+ total_norm += param_norm.item() ** norm_type
+ if clip_value is not None:
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
+ total_norm = total_norm ** (1. / norm_type)
+ return total_norm
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json
new file mode 100644
index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8640,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8008"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,6,4,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": true
+ }
+}
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json
new file mode 100644
index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8960,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8001"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": false
+ }
+}
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json
new file mode 100644
index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8960,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8001"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": true
+ }
+}
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py b/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..e16a7adcabb167ddc2c95e6d4bc722542f5fb716
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py
@@ -0,0 +1,69 @@
+import os
+import torch
+import torch.nn.functional as F
+import librosa
+import sounddevice as sd
+from transformers import WavLMModel
+from scipy.io.wavfile import write
+from models import SynthesizerTrn
+from speaker_encoder.voice_encoder import SpeakerEncoder
+import utils
+import numpy as np
+from transformers import T5Tokenizer, T5EncoderModel
+from src.plugin_wrapper import DreamVG
+import soundfile as sf
+
+
+# Load configurations and models
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+print("Loading FreeVC...")
+hps = utils.get_hparams_from_file("configs/freevc.json")
+freevc = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+freevc.eval()
+utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
+
+print("Loading Speaker Encoder...")
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+
+print("Loading WavLM for content...")
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+
+lm_path = 'google/flan-t5-base'
+tokenizer = T5Tokenizer.from_pretrained(lm_path)
+text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()
+
+dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
+ ckpt_path='checkpoints/dreamvc_plugin.pt',
+ device=device)
+
+
+prompt = "girl's voice, very young and cute"
+prompt_guidance_scale = 3.0
+
+text_batch = tokenizer(prompt, max_length=32,
+ padding='max_length', truncation=True, return_tensors="pt")
+text, text_mask = text_batch.input_ids.to(device), \
+ text_batch.attention_mask.to(device)
+text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
+target_embedding = dreamvg.inference([text, text_mask],
+ guidance_scale=prompt_guidance_scale,
+ guidance_rescale=0.0,
+ ddim_steps=100, eta=1,
+ random_seed=None)
+
+# Convert to tensor and pad
+audio, sr = librosa.load('segment_1.mp3', sr=16000)
+audio = torch.from_numpy(audio).unsqueeze(0).to(device).float()
+audio = F.pad(audio, (40, 40))
+
+# Extract content features using WavLM
+c = cmodel(audio).last_hidden_state.transpose(1, 2).to(device)
+
+audio = freevc.infer(c, g=target_embedding)
+audio = audio[0][0].data.cpu().float().numpy()
+
+sf.write('freevc_out.wav', audio, 16000)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py b/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py
@@ -0,0 +1,112 @@
+import math
+import os
+import random
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.data
+import numpy as np
+import librosa
+import librosa.util as librosa_util
+from librosa.util import normalize, pad_center, tiny
+from scipy.signal import get_window
+from scipy.io.wavfile import read
+from librosa.filters import mel as librosa_mel_fn
+
+MAX_WAV_VALUE = 32768.0
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+ """
+ PARAMS
+ ------
+ C: compression factor
+ """
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+ """
+ PARAMS
+ ------
+ C: compression factor used to compress
+ """
+ return torch.exp(x) / C
+
+
+def spectral_normalize_torch(magnitudes):
+ output = dynamic_range_compression_torch(magnitudes)
+ return output
+
+
+def spectral_de_normalize_torch(magnitudes):
+ output = dynamic_range_decompression_torch(magnitudes)
+ return output
+
+
+mel_basis = {}
+hann_window = {}
+
+
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+ if torch.min(y) < -1.:
+ print('min value is ', torch.min(y))
+ if torch.max(y) > 1.:
+ print('max value is ', torch.max(y))
+
+ global hann_window
+ dtype_device = str(y.dtype) + '_' + str(y.device)
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+ y = y.squeeze(1)
+
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+ return spec
+
+
+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
+ global mel_basis
+ dtype_device = str(spec.dtype) + '_' + str(spec.device)
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ spec = spectral_normalize_torch(spec)
+ return spec
+
+
+def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
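+    # Magnitude STFT -> cached mel filterbank -> log compression; the mel basis and
+    # Hann window are cached per (fmax, dtype, device) and (win_size, dtype, device).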
+ if torch.min(y) < -1.:
+ print('min value is ', torch.min(y))
+ if torch.max(y) > 1.:
+ print('max value is ', torch.max(y))
+
+ global mel_basis, hann_window
+ dtype_device = str(y.dtype) + '_' + str(y.device)
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+ y = y.squeeze(1)
+
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ spec = spectral_normalize_torch(spec)
+
+ return spec
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/models.py b/dreamvoice/train_utils/prepare_freevc/freevc/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..f732af47416bc0ed884a821e063fed5b7eab7957
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/models.py
@@ -0,0 +1,351 @@
+import copy
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+import commons
+import modules
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from commons import init_weights, get_padding
+
+
+class ResidualCouplingBlock(nn.Module):
+ def __init__(self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ n_flows=4,
+ gin_channels=0):
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.flows = nn.ModuleList()
+ for i in range(n_flows):
+ self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
+ self.flows.append(modules.Flip())
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ else:
+ for flow in reversed(self.flows):
+ x = flow(x, x_mask, g=g, reverse=reverse)
+ return x
+
+
+class Encoder(nn.Module):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths, g=None):
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+ stats = self.proj(x) * x_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+ return z, m, logs, x_mask
+
+
+class Generator(torch.nn.Module):
+ def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
+ super(Generator, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+ resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(weight_norm(
+ ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
+ k, u, padding=(k-u)//2)))
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel//(2**(i+1))
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+ self.resblocks.append(resblock(ch, k, d))
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ def forward(self, x, g=None):
+ x = self.conv_pre(x)
+ if g is not None:
+ x = x + self.cond(g)
+
+ for i in range(self.num_upsamples):
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ x = self.ups[i](x)
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i*self.num_kernels+j](x)
+ else:
+ xs += self.resblocks[i*self.num_kernels+j](x)
+ x = xs / self.num_kernels
+ x = F.leaky_relu(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+
+ return x
+
+ def remove_weight_norm(self):
+ print('Removing weight norm...')
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+
+
+class DiscriminatorP(torch.nn.Module):
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+ super(DiscriminatorP, self).__init__()
+ self.period = period
+ self.use_spectral_norm = use_spectral_norm
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ self.convs = nn.ModuleList([
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+ ])
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+ def forward(self, x):
+ fmap = []
+
+ # 1d to 2d
+ b, c, t = x.shape
+ if t % self.period != 0: # pad first
+ n_pad = self.period - (t % self.period)
+ x = F.pad(x, (0, n_pad), "reflect")
+ t = t + n_pad
+ x = x.view(b, c, t // self.period, self.period)
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class DiscriminatorS(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(DiscriminatorS, self).__init__()
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ self.convs = nn.ModuleList([
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+ ])
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+ def forward(self, x):
+ fmap = []
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(MultiPeriodDiscriminator, self).__init__()
+ periods = [2,3,5,7,11]
+
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+ self.discriminators = nn.ModuleList(discs)
+
+ def forward(self, y, y_hat):
+ y_d_rs = []
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ y_d_rs.append(y_d_r)
+ y_d_gs.append(y_d_g)
+ fmap_rs.append(fmap_r)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class SpeakerEncoder(torch.nn.Module):
+ def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
+ super(SpeakerEncoder, self).__init__()
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+ self.relu = nn.ReLU()
+
+ def forward(self, mels):
+ self.lstm.flatten_parameters()
+ _, (hidden, _) = self.lstm(mels)
+ embeds_raw = self.relu(self.linear(hidden[-1]))
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+ def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
+ mel_slices = []
+ for i in range(0, total_frames-partial_frames, partial_hop):
+ mel_range = torch.arange(i, i+partial_frames)
+ mel_slices.append(mel_range)
+
+ return mel_slices
+
+ def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
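+        # Embed overlapping windows of `partial_frames` mel frames (hop `partial_hop`)
+        # and average the per-window embeddings; short inputs use only the last window.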
+ mel_len = mel.size(1)
+ last_mel = mel[:,-partial_frames:]
+
+ if mel_len > partial_frames:
+ mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
+ mels = list(mel[:,s] for s in mel_slices)
+ mels.append(last_mel)
+ mels = torch.stack(tuple(mels), 0).squeeze(1)
+
+ with torch.no_grad():
+ partial_embeds = self(mels)
+ embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
+ #embed = embed / torch.linalg.norm(embed, 2)
+ else:
+ with torch.no_grad():
+ embed = self(last_mel)
+
+ return embed
+
+
+class SynthesizerTrn(nn.Module):
+ """
+ Synthesizer for Training
+ """
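+    # Component map (as assembled in __init__): enc_p encodes SSL/WavLM content into
+    # the prior, enc_q encodes the linear spectrogram into the posterior, flow bridges
+    # the two, dec is the HiFi-GAN-style generator, and enc_spk is only created when
+    # use_spk is false (otherwise an external speaker embedding is passed in as g).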
+
+ def __init__(self,
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels,
+ ssl_dim,
+ use_spk,
+ **kwargs):
+
+ super().__init__()
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.gin_channels = gin_channels
+ self.ssl_dim = ssl_dim
+ self.use_spk = use_spk
+
+ self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
+ self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+ self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+
+ if not self.use_spk:
+ self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)
+
+ def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
+        if c_lengths is None:
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+        if spec_lengths is None:
+ spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
+
+ if not self.use_spk:
+ g = self.enc_spk(mel.transpose(1,2))
+ g = g.unsqueeze(-1)
+
+ _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
+ z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
+ z_p = self.flow(z, spec_mask, g=g)
+
+ z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size)
+ o = self.dec(z_slice, g=g)
+
+ return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+ def infer(self, c, g=None, mel=None, c_lengths=None):
+        if c_lengths is None:
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+ if not self.use_spk:
+ g = self.enc_spk.embed_utterance(mel.transpose(1,2))
+ g = g.unsqueeze(-1)
+
+ z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
+ o = self.dec(z * c_mask, g=g)
+
+ return o
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/modules.py b/dreamvoice/train_utils/prepare_freevc/freevc/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eeb47c190cdc4d42d5de5fa47f94ecc1b931c5d
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/modules.py
@@ -0,0 +1,342 @@
+import copy
+import math
+import numpy as np
+import scipy
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+import commons
+from commons import init_weights, get_padding
+
+
+LRELU_SLOPE = 0.1
+
+
+class LayerNorm(nn.Module):
+ def __init__(self, channels, eps=1e-5):
+ super().__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = nn.Parameter(torch.ones(channels))
+ self.beta = nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ x = x.transpose(1, -1)
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+ return x.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+ super().__init__()
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 1."
+
+ self.conv_layers = nn.ModuleList()
+ self.norm_layers = nn.ModuleList()
+ self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+ self.norm_layers.append(LayerNorm(hidden_channels))
+ self.relu_drop = nn.Sequential(
+ nn.ReLU(),
+ nn.Dropout(p_dropout))
+ for _ in range(n_layers-1):
+ self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+ self.norm_layers.append(LayerNorm(hidden_channels))
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+ self.proj.weight.data.zero_()
+ self.proj.bias.data.zero_()
+
+ def forward(self, x, x_mask):
+ x_org = x
+ for i in range(self.n_layers):
+ x = self.conv_layers[i](x * x_mask)
+ x = self.norm_layers[i](x)
+ x = self.relu_drop(x)
+ x = x_org + self.proj(x)
+ return x * x_mask
+
+
+class DDSConv(nn.Module):
+ """
+    Dilated and Depth-Separable Convolution
+ """
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+ super().__init__()
+ self.channels = channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+
+ self.drop = nn.Dropout(p_dropout)
+ self.convs_sep = nn.ModuleList()
+ self.convs_1x1 = nn.ModuleList()
+ self.norms_1 = nn.ModuleList()
+ self.norms_2 = nn.ModuleList()
+ for i in range(n_layers):
+ dilation = kernel_size ** i
+ padding = (kernel_size * dilation - dilation) // 2
+ self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+ groups=channels, dilation=dilation, padding=padding
+ ))
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+ self.norms_1.append(LayerNorm(channels))
+ self.norms_2.append(LayerNorm(channels))
+
+ def forward(self, x, x_mask, g=None):
+ if g is not None:
+ x = x + g
+ for i in range(self.n_layers):
+ y = self.convs_sep[i](x * x_mask)
+ y = self.norms_1[i](y)
+ y = F.gelu(y)
+ y = self.convs_1x1[i](y)
+ y = self.norms_2[i](y)
+ y = F.gelu(y)
+ y = self.drop(y)
+ x = x + y
+ return x * x_mask
+
+
+class WN(torch.nn.Module):
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+ super(WN, self).__init__()
+ assert(kernel_size % 2 == 1)
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+ self.p_dropout = p_dropout
+
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.drop = nn.Dropout(p_dropout)
+
+ if gin_channels != 0:
+ cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
+ for i in range(n_layers):
+ dilation = dilation_rate ** i
+ padding = int((kernel_size * dilation - dilation) / 2)
+ in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
+ dilation=dilation, padding=padding)
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+ self.in_layers.append(in_layer)
+
+ # last one is not necessary
+ if i < n_layers - 1:
+ res_skip_channels = 2 * hidden_channels
+ else:
+ res_skip_channels = hidden_channels
+
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+ self.res_skip_layers.append(res_skip_layer)
+
+ def forward(self, x, x_mask, g=None, **kwargs):
+ output = torch.zeros_like(x)
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+ if g is not None:
+ g = self.cond_layer(g)
+
+ for i in range(self.n_layers):
+ x_in = self.in_layers[i](x)
+ if g is not None:
+ cond_offset = i * 2 * self.hidden_channels
+ g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
+ else:
+ g_l = torch.zeros_like(x_in)
+
+ acts = commons.fused_add_tanh_sigmoid_multiply(
+ x_in,
+ g_l,
+ n_channels_tensor)
+ acts = self.drop(acts)
+
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.n_layers - 1:
+ res_acts = res_skip_acts[:,:self.hidden_channels,:]
+ x = (x + res_acts) * x_mask
+ output = output + res_skip_acts[:,self.hidden_channels:,:]
+ else:
+ output = output + res_skip_acts
+ return output * x_mask
+
+ def remove_weight_norm(self):
+ if self.gin_channels != 0:
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
+ for l in self.in_layers:
+ torch.nn.utils.remove_weight_norm(l)
+ for l in self.res_skip_layers:
+ torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+ super(ResBlock1, self).__init__()
+ self.convs1 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+ padding=get_padding(kernel_size, dilation[2])))
+ ])
+ self.convs1.apply(init_weights)
+
+ self.convs2 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1)))
+ ])
+ self.convs2.apply(init_weights)
+
+ def forward(self, x, x_mask=None):
+ for c1, c2 in zip(self.convs1, self.convs2):
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c1(xt)
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c2(xt)
+ x = xt + x
+ if x_mask is not None:
+ x = x * x_mask
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs1:
+ remove_weight_norm(l)
+ for l in self.convs2:
+ remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+ super(ResBlock2, self).__init__()
+ self.convs = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1])))
+ ])
+ self.convs.apply(init_weights)
+
+ def forward(self, x, x_mask=None):
+ for c in self.convs:
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c(xt)
+ x = xt + x
+ if x_mask is not None:
+ x = x * x_mask
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs:
+ remove_weight_norm(l)
+
+
+class Log(nn.Module):
+ def forward(self, x, x_mask, reverse=False, **kwargs):
+ if not reverse:
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+ logdet = torch.sum(-y, [1, 2])
+ return y, logdet
+ else:
+ x = torch.exp(x) * x_mask
+ return x
+
+
+class Flip(nn.Module):
+ def forward(self, x, *args, reverse=False, **kwargs):
+ x = torch.flip(x, [1])
+ if not reverse:
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+ return x, logdet
+ else:
+ return x
+
+
+class ElementwiseAffine(nn.Module):
+ def __init__(self, channels):
+ super().__init__()
+ self.channels = channels
+ self.m = nn.Parameter(torch.zeros(channels,1))
+ self.logs = nn.Parameter(torch.zeros(channels,1))
+
+ def forward(self, x, x_mask, reverse=False, **kwargs):
+ if not reverse:
+ y = self.m + torch.exp(self.logs) * x
+ y = y * x_mask
+ logdet = torch.sum(self.logs * x_mask, [1,2])
+ return y, logdet
+ else:
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
+ return x
+
+
+class ResidualCouplingLayer(nn.Module):
+ def __init__(self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ p_dropout=0,
+ gin_channels=0,
+ mean_only=False):
+ assert channels % 2 == 0, "channels should be divisible by 2"
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.half_channels = channels // 2
+ self.mean_only = mean_only
+
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+ self.post.weight.data.zero_()
+ self.post.bias.data.zero_()
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ x0, x1 = torch.split(x, [self.half_channels]*2, 1)
+ h = self.pre(x0) * x_mask
+ h = self.enc(h, x_mask, g=g)
+ stats = self.post(h) * x_mask
+ if not self.mean_only:
+ m, logs = torch.split(stats, [self.half_channels]*2, 1)
+ else:
+ m = stats
+ logs = torch.zeros_like(m)
+
+ if not reverse:
+ x1 = m + x1 * torch.exp(logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ logdet = torch.sum(logs, [1,2])
+ return x, logdet
+ else:
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ return x
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt b/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt
@@ -0,0 +1,8 @@
+altair
+httpx==0.24.1
+numpy
+scipy
+torch
+transformers
+librosa
+webrtcvad==2.0.10
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e084cf69514c429559d9a086b97f3721bd7a8b23
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml
@@ -0,0 +1,47 @@
+version: 1.0
+
+system: "base"
+
+model:
+ cls_embedding:
+ speaker_dim: 256
+ feature_dim: 512
+ content_dim: 768
+ content_hidden: 256
+ use_pitch: false
+
+ unet:
+ sample_size: [128, 256]
+ in_channels: 257
+ out_channels: 1
+ layers_per_block: 2
+ block_out_channels: [128, 256, 256, 512]
+ down_block_types:
+ [
+ "DownBlock2D",
+ "DownBlock2D",
+ "AttnDownBlock2D",
+ "AttnDownBlock2D",
+ ]
+ up_block_types:
+ [
+ "AttnUpBlock2D",
+ "AttnUpBlock2D",
+ "UpBlock2D",
+ "UpBlock2D"
+ ]
+ attention_head_dim: 32
+ class_embed_type: 'identity'
+
+scheduler:
+ num_train_steps: 1000
+ beta_schedule: 'linear'
+ beta_start: 0.0001
+ beta_end: 0.02
+ num_infer_steps: 50
+ rescale_betas_zero_snr: true
+ timestep_spacing: "trailing"
+ clip_sample: false
+ prediction_type: 'v_prediction'
+ scale: 2.75
+ shift: 5.80
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3d8b894cd095accdcb9eab7788e8088d0430eae1
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml
@@ -0,0 +1,34 @@
+version: 1.0
+
+system: "base"
+
+diffwrap:
+ cls_embedding:
+ speaker_dim: 256
+ feature_dim: 512
+ content_dim: 768
+ content_hidden: 256
+ use_pitch: true
+ pitch_dim: 1
+ pitch_hidden: 128
+
+ unet:
+ sample_size: [128, 256]
+ in_channels: 385
+ out_channels: 1
+ layers_per_block: 2
+ block_out_channels: [128, 256, 512]
+ down_block_types:
+ [
+ "DownBlock2D",
+ "AttnDownBlock2D",
+ "AttnDownBlock2D",
+ ]
+ up_block_types:
+ [
+ "AttnUpBlock2D",
+ "AttnUpBlock2D",
+ "UpBlock2D"
+ ]
+ attention_head_dim: 32
+ class_embed_type: 'identity'
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c41681e2b762ad7d037780e560f706eba443fd66
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml
@@ -0,0 +1,45 @@
+version: 1.0
+
+system: "cross"
+
+model:
+ cls_embedding:
+ content_dim: 768
+ content_hidden: 256
+ use_pitch: false
+
+ unet:
+ sample_size: [128, 256]
+ in_channels: 257
+ out_channels: 1
+ layers_per_block: 2
+ block_out_channels: [128, 256, 256, 512]
+ down_block_types:
+ [
+ "DownBlock2D",
+ "DownBlock2D",
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ ]
+ up_block_types:
+ [
+ "CrossAttnUpBlock2D",
+ "CrossAttnUpBlock2D",
+ "UpBlock2D",
+ "UpBlock2D",
+ ]
+ attention_head_dim: 32
+ cross_attention_dim: 768
+
+scheduler:
+ num_train_steps: 1000
+ beta_schedule: 'linear'
+ beta_start: 0.0001
+ beta_end: 0.02
+ num_infer_steps: 50
+ rescale_betas_zero_snr: true
+ timestep_spacing: "trailing"
+ clip_sample: false
+ prediction_type: 'v_prediction'
+ scale: 2.75
+ shift: 5.80
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..af34723cf72c0cdbb079f0d8797a39527c04f0ff
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml
@@ -0,0 +1,33 @@
+version: 1.0
+
+system: "cross"
+
+diffwrap:
+ cls_embedding:
+ content_dim: 768
+ content_hidden: 256
+ use_pitch: true
+ pitch_dim: 1
+ pitch_hidden: 128
+
+ unet:
+ sample_size: [100, 256]
+ in_channels: 385
+ out_channels: 1
+ layers_per_block: 2
+ block_out_channels: [128, 256, 512]
+ down_block_types:
+ [
+ "DownBlock2D",
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ ]
+ up_block_types:
+ [
+ "CrossAttnUpBlock2D",
+ "CrossAttnUpBlock2D",
+ "UpBlock2D",
+ ]
+ attention_head_dim: 32
+ cross_attention_dim: 768
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7189aa2355830ed46a97fcb3f29b94b2e423198e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml
@@ -0,0 +1,39 @@
+version: 1.0
+
+system: "cross"
+
+model:
+ cls_embedding:
+ content_dim: 768
+ content_hidden: 256
+
+ unet:
+ sample_size: [1, 1]
+ in_channels: 256
+ out_channels: 256
+ layers_per_block: 2
+ block_out_channels: [256]
+ down_block_types:
+ [
+ "CrossAttnDownBlock2D",
+ ]
+ up_block_types:
+ [
+ "CrossAttnUpBlock2D",
+ ]
+ attention_head_dim: 32
+ cross_attention_dim: 768
+
+scheduler:
+ num_train_steps: 1000
+ beta_schedule: 'linear'
+ beta_start: 0.0001
+ beta_end: 0.02
+ num_infer_steps: 50
+ rescale_betas_zero_snr: true
+ timestep_spacing: "trailing"
+ clip_sample: false
+ prediction_type: 'v_prediction'
+ scale: 0.05
+ shift: -0.035
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/debug.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/debug.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5e1e827b1e8f82be63a40ce6204d1d83c10afc3
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py
@@ -0,0 +1,103 @@
+import os
+import torch
+import librosa
+import numpy as np
+import soundfile as sf
+import pandas as pd
+# from feats.hubert_model import get_soft_model, get_hubert_soft_content
+from feats.contentvec_hf import get_content_model, get_content
+# from modules.speaker_encoder.encoder import inference as spk_encoder
+# from pathlib import Path
+from tqdm import tqdm
+from multiprocessing import Process
+import pyworld as pw
+
+
+def resample_save(infolder, audio_path, model,
+ audio_sr=24000, content_sr=16000, min_length=1.92,
+ content_resolution=50,
+ save_path='features'):
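+    # What this helper does (inferred from the code below): load the clip at 16 kHz,
+    # pad or trim it to a whole number of 320-sample content frames (and at least
+    # min_length seconds), cache the content-model features under save_path/content,
+    # and save matching 16 kHz and 24 kHz copies of the padded audio.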
+ if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False:
+ audio, sr = librosa.load(infolder + audio_path, sr=content_sr)
+ final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution)
+ # final_length = final_length / content_sr
+
+ length = max(round(min_length*content_sr), round(final_length))
+ assert length % 10 == 0
+ audio = audio[:length]
+ audio_save = np.zeros(length, dtype=audio.dtype)
+ audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
+
+ # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0))
+ content = get_content(model, torch.tensor(audio_save).unsqueeze(0))
+ content = content.cpu()
+ os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True)
+ torch.save(content, save_path + '/' + 'content/' + audio_path+'.pt')
+ # print(audio_save.shape)
+ # print(content.shape)
+ os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True)
+ sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr))
+ # print(save_path + '/' + 'audio_16k/' + audio_path)
+
+ audio, sr = librosa.load(infolder + audio_path, sr=audio_sr)
+ length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr))
+ assert length % 10 == 0
+ audio = audio[:length]
+ audio_save = np.zeros(length, dtype=audio.dtype)
+ audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
+ # print(audio_save.shape)
+ os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True)
+ sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr))
+
+
+def extract_f0(in_folder, audio_path, save_path):
+ audio, sr = librosa.load(in_folder + audio_path, sr=None)
+ assert sr == 16000
+ if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False:
+ # wav = audio
+ # wav = np.pad(wav, int((1024-320)/2), mode='reflect')
+ # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr,
+ # fmin=librosa.note_to_hz('C2'),
+ # fmax=librosa.note_to_hz('C6'))
+
+ _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000)
+ f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1]
+
+ f0 = np.nan_to_num(f0)
+ os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True)
+ # print(save_path + '/' + 'f0/' + audio_path + '.pt')
+ torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt')
+
+
+def chunks(arr, m):
+ result = [[] for i in range(m)]
+ for i in range(len(arr)):
+ result[i%m].append(arr[i])
+ return result
+
+
+def extract_f0_main(in_folder, audio_paths, save_path):
+ for audio_path in tqdm(audio_paths):
+ extract_f0(in_folder, audio_path, save_path)
+
+
+if __name__ == '__main__':
+ df = pd.read_csv('../test_data/vc_meta.csv')
+ # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda')
+ model = get_content_model().to('cuda')
+ # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda")
+ for i in tqdm(range(len(df))):
+ row = df.iloc[i]
+ in_path = row['path']
+ resample_save('../test_data/', in_path, model, save_path='../features/')
+
+ in_folder = '../features/audio_16k/'
+ audio_files = list(df['path'])
+ save_path = '../features/'
+ cores = 6
+
+ subsets = chunks(audio_files, cores)
+
+ for subset in subsets:
+ t = Process(target=extract_f0_main, args=(in_folder, subset, save_path))
+ t.start()
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py
new file mode 100644
index 0000000000000000000000000000000000000000..099f5888a5f0e1eb5e9cf3c68814a0365ff75c30
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py
@@ -0,0 +1,42 @@
+import torch
+import librosa
+from fairseq import checkpoint_utils
+import torch.nn.functional as F
+
+
+def get_model(vec_path):
+ print("load model(s) from {}".format(vec_path))
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+ [vec_path],
+ suffix="",
+ )
+ model = models[0]
+ model.eval()
+ return model
+
+
+@torch.no_grad()
+def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12):
+ # print(layer)
+ wav_16k_tensor = wav_16k_tensor.to(device)
+ # so that the output shape will be len(audio//320)
+ wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
+ feats = wav_16k_tensor
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+ inputs = {
+ "source": feats.to(wav_16k_tensor.device),
+ "padding_mask": padding_mask.to(wav_16k_tensor.device),
+ "output_layer": layer
+ }
+ logits = hmodel.extract_features(**inputs)[0]
+ # feats = hmodel.final_proj(logits[0])
+ return logits
+
+
+if __name__ == '__main__':
+ audio, sr = librosa.load('test.wav', sr=16000)
+ audio = audio[:100*320]
+ model = get_model('../../ckpts/checkpoint_best_legacy_500.pt')
+ model = model.cuda()
+ content = get_content(model, torch.tensor([audio]))
+ print(content)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dad4889234a27fd1631d9265684af14560c2638
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py
@@ -0,0 +1,40 @@
+from transformers import HubertModel
+import torch.nn as nn
+import torch
+import torch.nn.functional as F
+import librosa
+
+
+class HubertModelWithFinalProj(HubertModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+    # The final projection layer is defined only so that the pretrained checkpoint loads cleanly;
+    # it is not applied when extracting features.
+    # See https://github.com/auspicious3000/contentvec/issues/6
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+def get_content_model(config='lengyue233/content-vec-best'):
+ model = HubertModelWithFinalProj.from_pretrained(config)
+ model.eval()
+ return model
+
+
+@torch.no_grad()
+def get_content(model, wav_16k_tensor, device='cuda'):
+    wav_16k_tensor = wav_16k_tensor.to(device)
+    # pad so that the output length is len(audio) // 320 frames
+ wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
+ logits = model(wav_16k_tensor)['last_hidden_state']
+ return logits
+
+
+if __name__ == '__main__':
+ model = get_content_model().cuda()
+ audio, sr = librosa.load('test.wav', sr=16000)
+ audio = audio[:100*320]
+ audio = torch.tensor([audio])
+ content = get_content(model, audio, 'cuda')
+ print(content)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..0202868f93e8b1be2f925f2ec6b22f3df691e8c3
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore
@@ -0,0 +1,132 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# VSCode project settings
+.vscode
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..6eb2af050447968cc32481fcfe67b5a4c6cdc69e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Benjamin van Niekerk
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..68602858ed726acd4f99ce9fecca008f3511dc90
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md
@@ -0,0 +1,161 @@
+# HuBERT
+
+[arXiv](https://arxiv.org/abs/2111.02392)
+[demo page](https://bshall.github.io/soft-vc/)
+[colab notebook](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb)
+
+Training and inference scripts for the HuBERT content encoders in [A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion](https://ieeexplore.ieee.org/abstract/document/9746484).
+For more details see [soft-vc](https://github.com/bshall/soft-vc). Audio samples can be found [here](https://bshall.github.io/soft-vc/). Colab demo can be found [here](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb).
+
+![HuBERT content encoder](content-encoder.png)
+
+**Fig 1:** Architecture of the voice conversion system. a) The discrete content encoder clusters audio features to produce a sequence of discrete speech units. b) The soft content encoder is trained to predict the discrete units. The acoustic model transforms the discrete/soft speech units into a target spectrogram. The vocoder converts the spectrogram into an audio waveform.
+
+
+## Example Usage
+
+### Programmatic Usage
+
+```python
+import torch, torchaudio
+
+# Load checkpoint (either hubert_soft or hubert_discrete)
+hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda()
+
+# Load audio
+wav, sr = torchaudio.load("path/to/wav")
+assert sr == 16000
+wav = wav.unsqueeze(0).cuda()
+
+# Extract speech units
+units = hubert.units(wav)
+```
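+
+The same hub entry point also exposes `hubert_discrete` (defined in `hubconf.py`). A minimal sketch of extracting discrete units instead, reusing the `wav` tensor prepared above:
+
+```python
+# Discrete variant: returns a LongTensor of cluster indices of shape (N,)
+hubert_discrete = torch.hub.load("bshall/hubert:main", "hubert_discrete", trust_repo=True).cuda()
+with torch.inference_mode():
+    discrete_units = hubert_discrete.units(wav)
+```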
+
+### Script-Based Usage
+
+```
+usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir
+
+Encode an audio dataset.
+
+positional arguments:
+ {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete)
+ in-dir path to the dataset directory.
+ out-dir path to the output directory.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --extension EXTENSION
+ extension of the audio files (defaults to .flac).
+```
+
+## Training
+
+### Step 1: Dataset Preparation
+
+Download and extract the [LibriSpeech](https://www.openslr.org/12) corpus. The training script expects the following tree structure for the dataset directory:
+
+```
+│ lengths.json
+│
+└───wavs
+ ├───dev-*
+ │ ├───84
+ │ ├───...
+ │ └───8842
+ └───train-*
+ ├───19
+ ├───...
+ └───8975
+```
+
+The `train-*` and `dev-*` directories should contain the training and validation splits, respectively. Note that there can be multiple `train` and `dev` folders, e.g. `train-clean-100`, `train-other-500`, etc. Finally, the `lengths.json` file should contain key-value pairs mapping each file path (without extension) to its length in samples:
+
+```json
+{
+ "dev-clean/1272/128104/1272-128104-0000": 93680,
+ "dev-clean/1272/128104/1272-128104-0001": 77040,
+}
+```
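+
+A minimal sketch of one way to generate `lengths.json` (this helper is not part of the repo; the script name and the use of `torchaudio.info` are assumptions):
+
+```python
+# build_lengths.py -- hypothetical helper for creating lengths.json
+import json
+from pathlib import Path
+
+import torchaudio
+
+
+def build_lengths(wavs_dir: Path, out_path: Path, extension: str = ".flac"):
+    lengths = {}
+    for path in sorted(wavs_dir.rglob(f"*{extension}")):
+        # keys are relative paths without extension, matching what the dataset loader expects
+        key = path.relative_to(wavs_dir).with_suffix("").as_posix()
+        lengths[key] = torchaudio.info(str(path)).num_frames
+    out_path.write_text(json.dumps(lengths, indent=2))
+
+
+build_lengths(Path("path/to/LibriSpeech/wavs"), Path("path/to/LibriSpeech/lengths.json"))
+```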
+
+### Step 2: Extract Discrete Speech Units
+
+Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script:
+
+```
+usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir
+
+Encode an audio dataset.
+
+positional arguments:
+ {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete)
+ in-dir path to the dataset directory.
+ out-dir path to the output directory.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --extension EXTENSION
+ extension of the audio files (defaults to .flac).
+```
+
+for example:
+
+```
+python encode.py discrete path/to/LibriSpeech/wavs path/to/LibriSpeech/discrete
+```
+
+At this point the directory tree should look like:
+
+```
+│ lengths.json
+│
+├───discrete
+│ ├───...
+└───wavs
+ ├───...
+```
+
+### Step 3: Train the HuBERT-Soft Content Encoder
+
+```
+usage: train.py [-h] [--resume RESUME] [--warmstart] [--mask] [--alpha ALPHA] dataset-dir checkpoint-dir
+
+Train HuBERT soft content encoder.
+
+positional arguments:
+ dataset-dir path to the data directory.
+ checkpoint-dir path to the checkpoint directory.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --resume RESUME path to the checkpoint to resume from.
+ --warmstart whether to initialize from the fairseq HuBERT checkpoint.
+ --mask whether to use input masking.
+ --alpha ALPHA weight for the masked loss.
+```
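+
+for example (the paths below are placeholders):
+
+```
+python train.py --warmstart --mask path/to/LibriSpeech path/to/checkpoints
+```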
+
+## Links
+
+- [Soft-VC repo](https://github.com/bshall/soft-vc)
+- [Soft-VC paper](https://ieeexplore.ieee.org/abstract/document/9746484)
+- [Official HuBERT repo](https://github.com/pytorch/fairseq)
+- [HuBERT paper](https://arxiv.org/abs/2106.07447)
+
+## Citation
+
+If you found this work helpful please consider citing our paper:
+
+```
+@inproceedings{
+ soft-vc-2022,
+ author={van Niekerk, Benjamin and Carbonneau, Marc-André and Zaïdi, Julian and Baas, Matthew and Seuté, Hugo and Kamper, Herman},
+ booktitle={ICASSP},
+ title={A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion},
+ year={2022}
+}
+```
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b754c73c63b79e943d51e76414f0056f05589f
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py
@@ -0,0 +1,66 @@
+from pathlib import Path
+import logging
+import argparse
+
+import torch
+import numpy as np
+from sklearn.cluster import KMeans
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def cluster(args):
+ with open(args.subset) as file:
+ subset = [line.strip() for line in file]
+
+ logger.info(f"Loading features from {args.in_dir}")
+ features = []
+ for path in subset:
+ in_path = args.in_dir / path
+ features.append(np.load(in_path.with_suffix(".npy")))
+ features = np.concatenate(features, axis=0)
+
+ logger.info(f"Clustering features of shape: {features.shape}")
+ kmeans = KMeans(n_clusters=args.n_clusters).fit(features)
+
+ checkpoint_path = args.checkpoint_dir / f"kmeans_{args.n_clusters}.pt"
+ checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
+    # torch.save takes the object first, then the destination path
+    torch.save(
+        {
+            "n_features_in_": kmeans.n_features_in_,
+            "_n_threads": kmeans._n_threads,
+            "cluster_centers_": kmeans.cluster_centers_,
+        },
+        checkpoint_path,
+    )
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Cluster speech features features.")
+ parser.add_argument(
+ "in_dir",
+ metavar="in-dir",
+ help="path to the encoded dataset",
+ type=Path,
+ )
+ parser.add_argument(
+ "subset",
+        metavar="subset",
+ help="path to the .txt file containing the list of files to cluster",
+ type=Path,
+ )
+ parser.add_argument(
+ "checkpoint_dir",
+ metavar="checkpoint-dir",
+ help="path to the checkpoint directory",
+ type=Path,
+ )
+ parser.add_argument(
+ "--n-clusters",
+ help="number of clusters",
+ type=int,
+ default=100,
+ )
+ args = parser.parse_args()
+ cluster(args)
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc59d538a9383896cf0c36e1d4a3f5030fce38fe
Binary files /dev/null and b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png differ
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py
new file mode 100644
index 0000000000000000000000000000000000000000..14246e985fb0e9dc157d290853af6dcf6036f61c
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py
@@ -0,0 +1,60 @@
+import argparse
+import logging
+import numpy as np
+from pathlib import Path
+from tqdm import tqdm
+
+import torch
+import torchaudio
+from torchaudio.functional import resample
+
+
+def encode_dataset(args):
+ print(f"Loading hubert checkpoint")
+ hubert = torch.hub.load(
+ "bshall/hubert:main",
+ f"hubert_{args.model}",
+ trust_repo=True,
+ ).cuda()
+
+ print(f"Encoding dataset at {args.in_dir}")
+ for in_path in tqdm(list(args.in_dir.rglob(f"*{args.extension}"))):
+ wav, sr = torchaudio.load(in_path)
+ wav = resample(wav, sr, 16000)
+ wav = wav.unsqueeze(0).cuda()
+
+ with torch.inference_mode():
+ units = hubert.units(wav)
+
+ out_path = args.out_dir / in_path.relative_to(args.in_dir)
+ out_path.parent.mkdir(parents=True, exist_ok=True)
+ np.save(out_path.with_suffix(".npy"), units.squeeze().cpu().numpy())
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Encode an audio dataset.")
+ parser.add_argument(
+ "model",
+ help="available models (HuBERT-Soft or HuBERT-Discrete)",
+ choices=["soft", "discrete"],
+ )
+ parser.add_argument(
+ "in_dir",
+ metavar="in-dir",
+ help="path to the dataset directory.",
+ type=Path,
+ )
+ parser.add_argument(
+ "out_dir",
+ metavar="out-dir",
+ help="path to the output directory.",
+ type=Path,
+ )
+ parser.add_argument(
+ "--extension",
+ help="extension of the audio files (defaults to .flac).",
+ default=".flac",
+ type=str,
+ )
+ args = parser.parse_args()
+ encode_dataset(args)
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py
new file mode 100644
index 0000000000000000000000000000000000000000..b58749e4a40b29eab470686b27e06a97bfecb321
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py
@@ -0,0 +1,80 @@
+dependencies = ["torch", "torchaudio", "sklearn"]
+
+URLS = {
+ "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-discrete-96b248c5.pt",
+ "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-soft-35d9f29f.pt",
+ "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.2/kmeans100-50f36a95.pt",
+}
+
+import torch
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+
+from sklearn.cluster import KMeans
+
+from hubert import HubertDiscrete, HubertSoft
+
+
+def hubert_discrete(
+ pretrained: bool = True,
+ progress: bool = True,
+) -> HubertDiscrete:
+ r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+ Args:
+ pretrained (bool): load pretrained weights into the model
+ progress (bool): show progress bar when downloading model
+ """
+ kmeans = kmeans100(pretrained=pretrained, progress=progress)
+ hubert = HubertDiscrete(kmeans)
+ if pretrained:
+ checkpoint = torch.hub.load_state_dict_from_url(
+ URLS["hubert-discrete"], progress=progress
+ )
+ consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
+ hubert.load_state_dict(checkpoint["hubert"])
+ hubert.eval()
+ return hubert
+
+
+def hubert_soft(
+ pretrained: bool = True,
+ progress: bool = True,
+) -> HubertSoft:
+ r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+ Args:
+ pretrained (bool): load pretrained weights into the model.
+ progress (bool): show progress bar when downloading model.
+ """
+ hubert = HubertSoft()
+ if pretrained:
+ checkpoint = torch.hub.load_state_dict_from_url(
+ URLS["hubert-soft"],
+ progress=progress,
+ )
+ consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
+ hubert.load_state_dict(checkpoint["hubert"])
+ hubert.eval()
+ return hubert
+
+
+def _kmeans(
+ num_clusters: int, pretrained: bool = True, progress: bool = True
+) -> KMeans:
+ kmeans = KMeans(num_clusters)
+ if pretrained:
+ checkpoint = torch.hub.load_state_dict_from_url(
+ URLS[f"kmeans{num_clusters}"], progress=progress
+ )
+ kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
+ kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
+ kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
+ return kmeans
+
+
+def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans:
+ r"""
+ k-means checkpoint for HuBERT-Discrete with 100 clusters.
+ Args:
+ pretrained (bool): load pretrained weights into the model
+ progress (bool): show progress bar when downloading model
+ """
+ return _kmeans(100, pretrained, progress)
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e07f859e99f51dcf35639f26a3eb53a81c993f3
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py
@@ -0,0 +1,5 @@
+from .model import (
+ Hubert,
+ HubertDiscrete,
+ HubertSoft,
+)
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ac2b84f95340e088913e06db8e5db0a68e83c2e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py
@@ -0,0 +1,91 @@
+import random
+from pathlib import Path
+import numpy as np
+import json
+
+import torch
+import torch.nn.functional as F
+from torch.utils.data import Dataset
+import torchaudio
+
+
+class AcousticUnitsDataset(Dataset):
+ def __init__(
+ self,
+ root: Path,
+ sample_rate: int = 16000,
+ label_rate: int = 50,
+ min_samples: int = 32000,
+ max_samples: int = 250000,
+ train: bool = True,
+ ):
+ self.wavs_dir = root / "wavs"
+ self.units_dir = root / "discrete"
+
+ with open(root / "lengths.json") as file:
+            self.lengths = json.load(file)
+
+ pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac"
+ metadata = (
+ (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix())
+ for path in self.wavs_dir.rglob(pattern)
+ )
+        metadata = ((path, key) for path, key in metadata if key in self.lengths)
+ self.metadata = [
+            path for path, key in metadata if self.lengths[key] > min_samples
+ ]
+
+ self.sample_rate = sample_rate
+ self.label_rate = label_rate
+ self.min_samples = min_samples
+ self.max_samples = max_samples
+ self.train = train
+
+ def __len__(self):
+ return len(self.metadata)
+
+ def __getitem__(self, index):
+ wav_path = self.metadata[index]
+ units_path = self.units_dir / wav_path.relative_to(self.wavs_dir)
+
+ wav, _ = torchaudio.load(wav_path)
+ wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+ codes = np.load(units_path.with_suffix(".npy"))
+
+ return wav, torch.from_numpy(codes).long()
+
+ def collate(self, batch):
+ wavs, codes = zip(*batch)
+ wavs, codes = list(wavs), list(codes)
+
+ wav_lengths = [wav.size(-1) for wav in wavs]
+ code_lengths = [code.size(-1) for code in codes]
+
+ wav_frames = min(self.max_samples, *wav_lengths)
+
+ collated_wavs, wav_offsets = [], []
+ for wav in wavs:
+ wav_diff = wav.size(-1) - wav_frames
+ wav_offset = random.randint(0, wav_diff)
+ wav = wav[:, wav_offset : wav_offset + wav_frames]
+
+ collated_wavs.append(wav)
+ wav_offsets.append(wav_offset)
+
+ rate = self.label_rate / self.sample_rate
+ code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets]
+ code_frames = round(wav_frames * rate)
+ remaining_code_frames = [
+ length - offset for length, offset in zip(code_lengths, code_offsets)
+ ]
+ code_frames = min(code_frames, *remaining_code_frames)
+
+ collated_codes = []
+ for code, code_offset in zip(codes, code_offsets):
+ code = code[code_offset : code_offset + code_frames]
+ collated_codes.append(code)
+
+ wavs = torch.stack(collated_wavs, dim=0)
+ codes = torch.stack(collated_codes, dim=0)
+
+ return wavs, codes
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..523dd95633ba73babff8b6836324ae0a7c2d267f
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py
@@ -0,0 +1,241 @@
+import copy
+from typing import Optional, Tuple
+import random
+
+from sklearn.cluster import KMeans
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Hubert(nn.Module):
+ def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+ super().__init__()
+ self._mask = mask
+ self.feature_extractor = FeatureExtractor()
+ self.feature_projection = FeatureProjection()
+ self.positional_embedding = PositionalConvEmbedding()
+ self.norm = nn.LayerNorm(768)
+ self.dropout = nn.Dropout(0.1)
+ self.encoder = TransformerEncoder(
+ nn.TransformerEncoderLayer(
+ 768, 12, 3072, activation="gelu", batch_first=True
+ ),
+ 12,
+ )
+ self.proj = nn.Linear(768, 256)
+
+ self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+ self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+
+ def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ mask = None
+ if self.training and self._mask:
+ mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+ x[mask] = self.masked_spec_embed.to(x.dtype)
+ return x, mask
+
+ def encode(
+ self, x: torch.Tensor, layer: Optional[int] = None
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ x = self.feature_extractor(x)
+ x = self.feature_projection(x.transpose(1, 2))
+ x, mask = self.mask(x)
+ x = x + self.positional_embedding(x)
+ x = self.dropout(self.norm(x))
+ x = self.encoder(x, output_layer=layer)
+ return x, mask
+
+ def logits(self, x: torch.Tensor) -> torch.Tensor:
+ logits = torch.cosine_similarity(
+ x.unsqueeze(2),
+ self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+ dim=-1,
+ )
+ return logits / 0.1
+
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ x, mask = self.encode(x)
+ x = self.proj(x)
+ logits = self.logits(x)
+ return logits, mask
+
+
+class HubertSoft(Hubert):
+ """HuBERT-Soft content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`."""
+
+ def __init__(self):
+ super().__init__()
+
+ @torch.inference_mode()
+ def units(self, wav: torch.Tensor) -> torch.Tensor:
+ """Extract soft speech units.
+
+ Args:
+ wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples.
+
+ Returns:
+ Tensor: soft speech units of shape (1, N, D), where N is the number of frames and D is the unit dimensions.
+ """
+ wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+ x, _ = self.encode(wav)
+ return self.proj(x)
+
+
+class HubertDiscrete(Hubert):
+ """HuBERT-Discrete content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`."""
+
+ def __init__(self, kmeans: KMeans):
+ super().__init__(504)
+ self.kmeans = kmeans
+
+ @torch.inference_mode()
+ def units(self, wav: torch.Tensor) -> torch.LongTensor:
+ """Extract discrete speech units.
+
+ Args:
+ wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples.
+
+ Returns:
+            LongTensor: discrete speech units of shape (N,), where N is the number of frames.
+ """
+ wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+ x, _ = self.encode(wav, layer=7)
+ x = self.kmeans.predict(x.squeeze().cpu().numpy())
+ return torch.tensor(x, dtype=torch.long, device=wav.device)
+
+
+class FeatureExtractor(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+ self.norm0 = nn.GroupNorm(512, 512)
+ self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+ self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+ self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+ self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+ self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+ self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = F.gelu(self.norm0(self.conv0(x)))
+ x = F.gelu(self.conv1(x))
+ x = F.gelu(self.conv2(x))
+ x = F.gelu(self.conv3(x))
+ x = F.gelu(self.conv4(x))
+ x = F.gelu(self.conv5(x))
+ x = F.gelu(self.conv6(x))
+ return x
+
+
+class FeatureProjection(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.norm = nn.LayerNorm(512)
+ self.projection = nn.Linear(512, 768)
+ self.dropout = nn.Dropout(0.1)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.norm(x)
+ x = self.projection(x)
+ x = self.dropout(x)
+ return x
+
+
+class PositionalConvEmbedding(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.conv = nn.Conv1d(
+ 768,
+ 768,
+ kernel_size=128,
+ padding=128 // 2,
+ groups=16,
+ )
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.conv(x.transpose(1, 2))
+ x = F.gelu(x[:, :, :-1])
+ return x.transpose(1, 2)
+
+
+class TransformerEncoder(nn.Module):
+ def __init__(
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+ ) -> None:
+ super(TransformerEncoder, self).__init__()
+ self.layers = nn.ModuleList(
+ [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+ )
+ self.num_layers = num_layers
+
+ def forward(
+ self,
+ src: torch.Tensor,
+ mask: torch.Tensor = None,
+ src_key_padding_mask: torch.Tensor = None,
+ output_layer: Optional[int] = None,
+ ) -> torch.Tensor:
+ output = src
+ for layer in self.layers[:output_layer]:
+ output = layer(
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+ )
+ return output
+
+
+def _compute_mask(
+ shape: Tuple[int, int],
+ mask_prob: float,
+ mask_length: int,
+ device: torch.device,
+ min_masks: int = 0,
+) -> torch.Tensor:
+ batch_size, sequence_length = shape
+
+ if mask_length < 1:
+ raise ValueError("`mask_length` has to be bigger than 0.")
+
+ if mask_length > sequence_length:
+ raise ValueError(
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
+ )
+
+ # compute number of masked spans in batch
+ num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+ num_masked_spans = max(num_masked_spans, min_masks)
+
+ # make sure num masked indices <= sequence_length
+ if num_masked_spans * mask_length > sequence_length:
+ num_masked_spans = sequence_length // mask_length
+
+ # SpecAugment mask to fill
+ mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+
+ # uniform distribution to sample from, make sure that offset samples are < sequence_length
+ uniform_dist = torch.ones(
+ (batch_size, sequence_length - (mask_length - 1)), device=device
+ )
+
+ # get random indices to mask
+ mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+
+ # expand masked indices to masked spans
+ mask_indices = (
+ mask_indices.unsqueeze(dim=-1)
+ .expand((batch_size, num_masked_spans, mask_length))
+ .reshape(batch_size, num_masked_spans * mask_length)
+ )
+ offsets = (
+ torch.arange(mask_length, device=device)[None, None, :]
+ .expand((batch_size, num_masked_spans, mask_length))
+ .reshape(batch_size, num_masked_spans * mask_length)
+ )
+ mask_idxs = mask_indices + offsets
+
+ # scatter indices to mask
+ mask = mask.scatter(1, mask_idxs, True)
+
+ return mask
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d42ba3acb822938f246dba27b3de81ec51aa72b0
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py
@@ -0,0 +1,61 @@
+import torch
+
+
+class Metric:
+ def __init__(self):
+ self.steps = 0
+ self.value = 0
+
+ def update(self, value):
+ self.steps += 1
+ self.value += (value - self.value) / self.steps
+ return self.value
+
+ def reset(self):
+ self.steps = 0
+ self.value = 0
+
+
+def save_checkpoint(
+ checkpoint_dir,
+ hubert,
+ optimizer,
+ scaler,
+ step,
+ loss,
+ best,
+ logger,
+):
+ state = {
+ "hubert": hubert.state_dict(),
+ "optimizer": optimizer.state_dict(),
+ "scaler": scaler.state_dict(),
+ "step": step,
+ "loss": loss,
+ }
+ checkpoint_dir.mkdir(exist_ok=True, parents=True)
+ checkpoint_path = checkpoint_dir / f"model-{step}.pt"
+ torch.save(state, checkpoint_path)
+ if best:
+ best_path = checkpoint_dir / "model-best.pt"
+ torch.save(state, best_path)
+ logger.info(f"Saved checkpoint: {checkpoint_path.stem}")
+
+
+def load_checkpoint(
+ load_path,
+ hubert,
+ optimizer,
+ scaler,
+ rank,
+ logger,
+):
+ logger.info(f"Loading checkpoint from {load_path}")
+ checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
+ hubert.load_state_dict(checkpoint["hubert"])
+ if "scaler" in checkpoint:
+ scaler.load_state_dict(checkpoint["scaler"])
+ if "optimizer" in checkpoint:
+ optimizer.load_state_dict(checkpoint["optimizer"])
+ step, loss = checkpoint.get("step", 0), checkpoint.get("loss", float("inf"))
+ return step, loss
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff5ca9de087f72e343ffb4e5ef00cdbb90765097
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py
@@ -0,0 +1,459 @@
+import argparse
+import logging
+from pathlib import Path
+
+import torch
+import torch.cuda.amp as amp
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+import torch.distributed as dist
+from torch.utils.data.distributed import DistributedSampler
+import torch.multiprocessing as mp
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+
+from hubert.model import Hubert, URLS
+from hubert.dataset import AcousticUnitsDataset
+from hubert.utils import Metric, save_checkpoint, load_checkpoint
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+########################################################################################
+# Define hyperparameters for training:
+########################################################################################
+
+BATCH_SIZE = 32
+LEARNING_RATE = 2e-5
+BETAS = (0.9, 0.98)
+EPS = 1e-06
+WEIGHT_DECAY = 1e-2
+MAX_NORM = 10
+STEPS = 25000
+LOG_INTERVAL = 5
+VALIDATION_INTERVAL = 1000
+CHECKPOINT_INTERVAL = 5000
+BACKEND = "nccl"
+INIT_METHOD = "tcp://localhost:54321"
+
+
+def train(rank, world_size, args):
+ dist.init_process_group(
+ BACKEND,
+ rank=rank,
+ world_size=world_size,
+ init_method=INIT_METHOD,
+ )
+
+ ####################################################################################
+ # Setup logging utilities:
+ ####################################################################################
+
+ log_dir = args.checkpoint_dir / "logs"
+ log_dir.mkdir(exist_ok=True, parents=True)
+
+ if rank == 0:
+ logger.setLevel(logging.INFO)
+ handler = logging.FileHandler(log_dir / f"{args.checkpoint_dir.stem}.log")
+ handler.setLevel(logging.INFO)
+ formatter = logging.Formatter(
+ "%(asctime)s [%(levelname)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S"
+ )
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+ else:
+ logger.setLevel(logging.ERROR)
+
+ writer = SummaryWriter(log_dir) if rank == 0 else None
+
+ ####################################################################################
+ # Initialize models
+ ####################################################################################
+
+ hubert = Hubert(mask=args.mask).to(rank)
+
+ if args.warmstart:
+ checkpoint = torch.hub.load_state_dict_from_url(
+ URLS["hubert-discrete"], map_location={"cuda:0": f"cuda:{rank}"}
+ )
+ consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
+
+ # don't use warmstart weights for label embeddings and proj layer
+ del checkpoint["hubert"]["label_embedding.weight"]
+ del checkpoint["hubert"]["proj.weight"]
+ del checkpoint["hubert"]["proj.bias"]
+
+ hubert.load_state_dict(checkpoint["hubert"], strict=False)
+
+ hubert = DDP(hubert, device_ids=[rank])
+
+ ####################################################################################
+    # Initialize optimizer and grad scaler
+ ####################################################################################
+
+ optimizer = optim.AdamW(
+ hubert.parameters(),
+ lr=LEARNING_RATE,
+ betas=BETAS,
+ eps=EPS,
+ weight_decay=WEIGHT_DECAY,
+ )
+ scaler = amp.GradScaler()
+
+ ####################################################################################
+ # Initialize datasets and dataloaders
+ ####################################################################################
+
+ train_dataset = AcousticUnitsDataset(
+ root=args.dataset_dir,
+ train=True,
+ )
+ train_sampler = DistributedSampler(train_dataset, drop_last=True)
+ train_loader = DataLoader(
+ train_dataset,
+ collate_fn=train_dataset.collate,
+ batch_size=BATCH_SIZE,
+ sampler=train_sampler,
+ num_workers=8,
+ pin_memory=True,
+ shuffle=False,
+ drop_last=True,
+ )
+
+ validation_dataset = AcousticUnitsDataset(
+ root=args.dataset_dir,
+ train=False,
+ )
+ validation_loader = DataLoader(
+ validation_dataset,
+ batch_size=1,
+ shuffle=False,
+ num_workers=8,
+ pin_memory=True,
+ )
+
+ ####################################################################################
+ # Load checkpoint if args.resume is set
+ ####################################################################################
+
+ if args.resume is not None:
+ global_step, best_loss = load_checkpoint(
+ load_path=args.resume,
+ hubert=hubert,
+ optimizer=optimizer,
+ scaler=scaler,
+ rank=rank,
+ logger=logger,
+ )
+ else:
+ global_step, best_loss = 0, float("inf")
+
+ # =================================================================================#
+ # Start training loop
+ # =================================================================================#
+
+ n_epochs = STEPS // len(train_loader) + 1
+ start_epoch = global_step // len(train_loader) + 1
+
+ logger.info("**" * 40)
+ logger.info(f"PyTorch version: {torch.__version__}")
+ logger.info(f"CUDA version: {torch.version.cuda}")
+ logger.info(f"CUDNN version: {torch.backends.cudnn.version()}")
+ logger.info(f"CUDNN enabled: {torch.backends.cudnn.enabled}")
+ logger.info(f"CUDNN deterministic: {torch.backends.cudnn.deterministic}")
+ logger.info(f"CUDNN benchmark: {torch.backends.cudnn.benchmark}")
+ logger.info(f"# of GPUS: {torch.cuda.device_count()}")
+ logger.info(f"batch size: {BATCH_SIZE}")
+ logger.info(f"iterations per epoch: {len(train_loader)}")
+ logger.info(f"# of epochs: {n_epochs}")
+ logger.info(f"started at epoch: {start_epoch}")
+ logger.info("**" * 40 + "\n")
+
+ if args.mask:
+ average_masked_loss = Metric()
+ average_unmasked_loss = Metric()
+ average_masked_accuracy = Metric()
+ average_unmasked_accuracy = Metric()
+
+ epoch_masked_loss = Metric()
+ epoch_unmasked_loss = Metric()
+ epoch_masked_accuracy = Metric()
+ epoch_unmasked_accuracy = Metric()
+ else:
+ average_loss = Metric()
+ average_accuracy = Metric()
+
+ epoch_loss = Metric()
+ epoch_accuracy = Metric()
+
+ validation_loss = Metric()
+ validation_accuracy = Metric()
+
+ for epoch in range(start_epoch, n_epochs + 1):
+ train_sampler.set_epoch(epoch)
+
+ hubert.train()
+ if args.mask:
+ epoch_masked_loss.reset()
+ epoch_unmasked_loss.reset()
+ epoch_masked_accuracy.reset()
+ epoch_unmasked_accuracy.reset()
+ else:
+ epoch_loss.reset()
+ epoch_accuracy.reset()
+
+ for wavs, codes in train_loader:
+ global_step += 1
+ wavs, codes = wavs.to(rank), codes.to(rank)
+
+ ############################################################################
+ # Compute training loss
+ ############################################################################
+
+ optimizer.zero_grad()
+
+ with amp.autocast():
+ logits, mask = hubert(wavs)
+ length = min(
+ mask.size(-1) if args.mask else float("inf"), codes.size(-1)
+ )
+ logits = logits[:, :length, :]
+ codes = codes[:, :length]
+ if args.mask:
+ mask = mask[:, :length]
+
+ if args.mask:
+ masked_loss = F.cross_entropy(logits[mask], codes[mask])
+ unmasked_loss = F.cross_entropy(logits[~mask], codes[~mask])
+ loss = args.alpha * masked_loss + (1 - args.alpha) * unmasked_loss
+ else:
+ loss = F.cross_entropy(logits.transpose(1, 2), codes)
+
+ scaler.scale(loss).backward()
+ scaler.unscale_(optimizer)
+
+ nn.utils.clip_grad_norm_(hubert.parameters(), MAX_NORM)
+
+ scaler.step(optimizer)
+ scaler.update()
+
+ if args.mask:
+ masked_accuracy = logits[mask].argmax(dim=-1) == codes[mask]
+ masked_accuracy = torch.mean(masked_accuracy.float())
+
+ unmasked_accuracy = logits[~mask].argmax(dim=-1) == codes[~mask]
+ unmasked_accuracy = torch.mean(unmasked_accuracy.float())
+ else:
+ accuracy = logits.argmax(dim=-1) == codes
+ accuracy = torch.mean(accuracy.float())
+
+ ############################################################################
+ # Update and log training metrics
+ ############################################################################
+
+ if args.mask:
+ average_masked_loss.update(masked_loss.item())
+ average_unmasked_loss.update(unmasked_loss.item())
+ average_masked_accuracy.update(masked_accuracy.item())
+ average_unmasked_accuracy.update(unmasked_accuracy.item())
+
+ epoch_masked_loss.update(masked_loss.item())
+ epoch_unmasked_loss.update(unmasked_loss.item())
+ epoch_masked_accuracy.update(masked_accuracy.item())
+ epoch_unmasked_accuracy.update(unmasked_accuracy.item())
+ else:
+ average_loss.update(loss.item())
+ average_accuracy.update(accuracy.item())
+
+ epoch_loss.update(loss.item())
+ epoch_accuracy.update(accuracy.item())
+
+ if rank == 0 and global_step % LOG_INTERVAL == 0:
+ if args.mask:
+ writer.add_scalar(
+ "train/masked_loss",
+ average_masked_loss.value,
+ global_step,
+ )
+ writer.add_scalar(
+ "train/unmasked_loss",
+ average_unmasked_loss.value,
+ global_step,
+ )
+ writer.add_scalar(
+ "train/masked_accuracy",
+ average_masked_accuracy.value * 100,
+ global_step,
+ )
+ writer.add_scalar(
+ "train/unmasked_accuracy",
+ average_unmasked_accuracy.value * 100,
+ global_step,
+ )
+ average_masked_loss.reset()
+ average_unmasked_loss.reset()
+ average_masked_accuracy.reset()
+ average_unmasked_accuracy.reset()
+ else:
+ writer.add_scalar(
+ "train/loss",
+ average_loss.value,
+ global_step,
+ )
+ writer.add_scalar(
+ "train/accuracy",
+ average_accuracy.value,
+ global_step,
+ )
+ average_loss.reset()
+ average_accuracy.reset()
+
+ # --------------------------------------------------------------------------#
+ # Start validation loop
+ # --------------------------------------------------------------------------#
+
+ if global_step % VALIDATION_INTERVAL == 0:
+ hubert.eval()
+ validation_loss.reset()
+ validation_accuracy.reset()
+ for wavs, codes in validation_loader:
+ wavs, codes = wavs.to(rank), codes.to(rank)
+
+ with torch.no_grad():
+ logits, _ = hubert(wavs)
+ logits = logits.transpose(1, 2)
+
+ loss = F.cross_entropy(logits, codes)
+
+ accuracy = logits.argmax(dim=1) == codes
+ accuracy = torch.mean(accuracy.float())
+
+ ####################################################################
+ # Update validation metrics
+ ####################################################################
+
+ validation_loss.update(loss.item())
+ validation_accuracy.update(accuracy.item())
+
+ hubert.train()
+
+ ############################################################################
+ # Log validation metrics
+ ############################################################################
+
+ if rank == 0:
+ writer.add_scalar(
+ "validation/unit_loss",
+ validation_loss.value,
+ global_step,
+ )
+ writer.add_scalar(
+ "validation/unit_accuracy",
+ validation_accuracy.value * 100,
+ global_step,
+ )
+ logger.info(
+ f"valid -- epoch: {epoch}, loss: {validation_loss.value:.4f}, accuracy: {validation_accuracy.value * 100:.2f}"
+ )
+
+ ############################################################################
+ # Save model checkpoint
+ ############################################################################
+
+ new_best = best_loss > validation_loss.value
+ if new_best or global_step % CHECKPOINT_INTERVAL == 0:
+ if new_best:
+ logger.info("-------- new best model found!")
+ best_loss = validation_loss.value
+
+ if rank == 0:
+ save_checkpoint(
+ checkpoint_dir=args.checkpoint_dir,
+ hubert=hubert,
+ optimizer=optimizer,
+ scaler=scaler,
+ step=global_step,
+ loss=validation_loss.value,
+ best=new_best,
+ logger=logger,
+ )
+
+ # -----------------------------------------------------------------------------#
+ # End validation loop
+ # -----------------------------------------------------------------------------#
+
+ ####################################################################################
+ # Log training metrics
+ ####################################################################################
+
+        # the masked/unmasked metrics only exist when --mask is set
+        if args.mask:
+            logger.info(
+                f"""
+                train -- epoch: {epoch}, masked loss: {epoch_masked_loss.value:.4f}, unmasked loss: {epoch_unmasked_loss.value:.4f},
+                masked accuracy: {epoch_masked_accuracy.value * 100:.2f}, unmasked accuracy: {epoch_unmasked_accuracy.value * 100:.2f}
+                """
+            )
+        else:
+            logger.info(
+                f"train -- epoch: {epoch}, loss: {epoch_loss.value:.4f}, accuracy: {epoch_accuracy.value * 100:.2f}"
+            )
+
+ # ==================================================================================#
+ # End training loop
+ # ==================================================================================#
+
+ dist.destroy_process_group()
+
+
+def train_hubert(args):
+ world_size = torch.cuda.device_count()
+ mp.spawn(
+ train,
+ args=(world_size, args),
+ nprocs=world_size,
+ join=True,
+ )
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Train HuBERT soft content encoder.")
+ parser.add_argument(
+ "dataset_dir",
+ metavar="dataset-dir",
+ help="path to the data directory.",
+ type=Path,
+ )
+ parser.add_argument(
+ "checkpoint_dir",
+ metavar="checkpoint-dir",
+ help="path to the checkpoint directory.",
+ type=Path,
+ )
+ parser.add_argument(
+ "--resume",
+ help="path to the checkpoint to resume from.",
+ type=Path,
+ )
+ parser.add_argument(
+ "--warmstart",
+ help="whether to initialize from the fairseq HuBERT checkpoint.",
+ action="store_true",
+ )
+ parser.add_argument(
+ "--mask",
+ help="whether to use input masking.",
+ action="store_true",
+ )
+ parser.add_argument(
+ "--alpha",
+ help="weight for the masked loss.",
+ default=1,
+ type=float,
+ )
+ args = parser.parse_args()
+
+    train_hubert(args)
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..a385090553f7106d30d530ea319f82c66a788ffd
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py
@@ -0,0 +1,24 @@
+import torch, torchaudio
+from .hubert.hubert import HubertSoft
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+import librosa
+
+
+def get_soft_model(model_path):
+ hubert = HubertSoft()
+ # Load checkpoint (either hubert_soft or hubert_discrete)
+ # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
+ checkpoint = torch.load(model_path)
+ consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
+ hubert.load_state_dict(checkpoint["hubert"])
+ hubert.eval()
+ return hubert
+
+
+@torch.no_grad()
+def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'):
+ wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1)
+ # print(wav_16k_tensor.shape)
+ units = hmodel.units(wav_16k_tensor)
+ # print(units.shape)
+ return units.cpu()
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8fea82f9f64f7ae37aee38d799f703f11812ff2
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py
@@ -0,0 +1,98 @@
+import torch
+import torch.nn as nn
+from diffusers import UNet2DModel, UNet2DConditionModel
+import yaml
+from einops import repeat, rearrange
+
+from typing import Any
+from torch import Tensor
+
+
+def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
+ if proba == 1:
+ return torch.ones(shape, device=device, dtype=torch.bool)
+ elif proba == 0:
+ return torch.zeros(shape, device=device, dtype=torch.bool)
+ else:
+ return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
+
+
+class DiffVC(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.unet = UNet2DModel(**self.config['unet'])
+ self.unet.set_use_memory_efficient_attention_xformers(True)
+ self.speaker_embedding = nn.Sequential(
+ nn.Linear(self.config['cls_embedding']['speaker_dim'], self.config['cls_embedding']['feature_dim']),
+ nn.SiLU(),
+ nn.Linear(self.config['cls_embedding']['feature_dim'], self.config['cls_embedding']['feature_dim']))
+ self.uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['speaker_dim']) /
+ self.config['cls_embedding']['speaker_dim'] ** 0.5)
+ self.content_embedding = nn.Sequential(
+ nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']),
+ nn.SiLU(),
+ nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden']))
+
+ if self.config['cls_embedding']['use_pitch']:
+ self.pitch_control = True
+ self.pitch_embedding = nn.Sequential(
+ nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']),
+ nn.SiLU(),
+ nn.Linear(self.config['cls_embedding']['pitch_hidden'],
+ self.config['cls_embedding']['pitch_hidden']))
+ self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) /
+ self.config['cls_embedding']['pitch_hidden'] ** 0.5)
+ else:
+ print('no pitch module')
+ self.pitch_control = False
+
+ def forward(self, target, t, content, speaker, pitch,
+ train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0):
+ B, C, M, L = target.shape
+ content = self.content_embedding(content)
+ content = repeat(content, "b t c-> b c m t", m=M)
+ target = target.to(content.dtype)
+ x = torch.cat([target, content], dim=1)
+
+ if self.pitch_control:
+ if pitch is not None:
+ pitch = self.pitch_embedding(pitch)
+ else:
+ pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype)
+
+ if train_cfg:
+ uncond = repeat(self.uncond, "c-> b c", b=B).to(target.dtype)
+ batch_mask = rand_bool(shape=(B, 1), proba=speaker_cfg, device=target.device)
+ speaker = torch.where(batch_mask, uncond, speaker)
+
+ if self.pitch_control:
+ batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device)
+ pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype)
+ pitch = torch.where(batch_mask, pitch_uncond, pitch)
+
+ speaker = self.speaker_embedding(speaker)
+
+ if self.pitch_control:
+ pitch = repeat(pitch, "b t c-> b c m t", m=M)
+ x = torch.cat([x, pitch], dim=1)
+
+ output = self.unet(sample=x, timestep=t, class_labels=speaker)['sample']
+
+ return output
+
+
+if __name__ == "__main__":
+ with open('diffvc_base_pitch.yaml', 'r') as fp:
+ config = yaml.safe_load(fp)
+ device = 'cuda'
+
+ model = DiffVC(config['diffwrap']).to(device)
+
+ x = torch.rand((2, 1, 100, 256)).to(device)
+ y = torch.rand((2, 256, 768)).to(device)
+ p = torch.rand(2, 256, 1).to(device)
+ t = torch.randint(0, 1000, (2,)).long().to(device)
+ spk = torch.rand(2, 256).to(device)
+
+    output = model(x, t, y, spk, pitch=p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py
new file mode 100644
index 0000000000000000000000000000000000000000..774d3481fd23105e6f161e2b64ed2a757acba9c2
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py
@@ -0,0 +1,116 @@
+import torch
+import torch.nn as nn
+from diffusers import UNet2DModel, UNet2DConditionModel
+import yaml
+from einops import repeat, rearrange
+
+from typing import Any
+from torch import Tensor
+
+
+def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
+ if proba == 1:
+ return torch.ones(shape, device=device, dtype=torch.bool)
+ elif proba == 0:
+ return torch.zeros(shape, device=device, dtype=torch.bool)
+ else:
+ return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
+
+
+class FixedEmbedding(nn.Module):
+ def __init__(self, features=128):
+ super().__init__()
+ self.embedding = nn.Embedding(1, features)
+
+ def forward(self, y):
+ B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device
+ embed = self.embedding(torch.zeros(B, device=device).long())
+ fixed_embedding = repeat(embed, "b c -> b l c", l=L)
+ return fixed_embedding
+
+
+class DiffVC_Cross(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.unet = UNet2DConditionModel(**self.config['unet'])
+ self.unet.set_use_memory_efficient_attention_xformers(True)
+ self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim'])
+
+ self.context_embedding = nn.Sequential(
+ nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']),
+ nn.SiLU(),
+ nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']))
+
+ self.content_embedding = nn.Sequential(
+ nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']),
+ nn.SiLU(),
+ nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden']))
+
+ if self.config['cls_embedding']['use_pitch']:
+ self.pitch_control = True
+ self.pitch_embedding = nn.Sequential(
+ nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']),
+ nn.SiLU(),
+ nn.Linear(self.config['cls_embedding']['pitch_hidden'],
+ self.config['cls_embedding']['pitch_hidden']))
+
+ self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) /
+ self.config['cls_embedding']['pitch_hidden'] ** 0.5)
+ else:
+ print('no pitch module')
+ self.pitch_control = False
+
+ def forward(self, target, t, content, prompt, prompt_mask=None, pitch=None,
+ train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0):
+ B, C, M, L = target.shape
+ content = self.content_embedding(content)
+ content = repeat(content, "b t c-> b c m t", m=M)
+ target = target.to(content.dtype)
+ x = torch.cat([target, content], dim=1)
+
+ if self.pitch_control:
+ if pitch is not None:
+ pitch = self.pitch_embedding(pitch)
+ else:
+ pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype)
+
+ if train_cfg:
+ # Randomly mask embedding
+ batch_mask = rand_bool(shape=(B, 1, 1), proba=speaker_cfg, device=target.device)
+ fixed_embedding = self.cfg_embedding(prompt).to(target.dtype)
+ prompt = torch.where(batch_mask, fixed_embedding, prompt)
+
+ if self.pitch_control:
+ batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device)
+ pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype)
+ pitch = torch.where(batch_mask, pitch_uncond, pitch)
+
+ prompt = self.context_embedding(prompt)
+
+ if self.pitch_control:
+ pitch = repeat(pitch, "b t c-> b c m t", m=M)
+ x = torch.cat([x, pitch], dim=1)
+
+ output = self.unet(sample=x, timestep=t,
+ encoder_hidden_states=prompt,
+ encoder_attention_mask=prompt_mask)['sample']
+
+ return output
+
+
+if __name__ == "__main__":
+ with open('diffvc_cross_pitch.yaml', 'r') as fp:
+ config = yaml.safe_load(fp)
+ device = 'cuda'
+
+ model = DiffVC_Cross(config['diffwrap']).to(device)
+
+ x = torch.rand((2, 1, 100, 256)).to(device)
+ y = torch.rand((2, 256, 768)).to(device)
+ t = torch.randint(0, 1000, (2,)).long().to(device)
+ prompt = torch.rand(2, 64, 768).to(device)
+ prompt_mask = torch.ones(2, 64).to(device)
+ p = torch.rand(2, 256, 1).to(device)
+
+ output = model(x, t, y, prompt, prompt_mask, p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py
new file mode 100644
index 0000000000000000000000000000000000000000..23e878e4daa06309e7ca9b6d970f333bcf9d4524
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py
@@ -0,0 +1,80 @@
+import torch
+import torch.nn as nn
+from diffusers import UNet2DModel, UNet2DConditionModel
+import yaml
+from einops import repeat, rearrange
+
+from typing import Any
+from torch import Tensor
+
+
+def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
+ if proba == 1:
+ return torch.ones(shape, device=device, dtype=torch.bool)
+ elif proba == 0:
+ return torch.zeros(shape, device=device, dtype=torch.bool)
+ else:
+ return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
+
+
+class FixedEmbedding(nn.Module):
+ def __init__(self, features=128):
+ super().__init__()
+ self.embedding = nn.Embedding(1, features)
+
+ def forward(self, y):
+ B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device
+ embed = self.embedding(torch.zeros(B, device=device).long())
+ fixed_embedding = repeat(embed, "b c -> b l c", l=L)
+ return fixed_embedding
+
+
+class P2E_Cross(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.unet = UNet2DConditionModel(**self.config['unet'])
+ # self.unet.set_use_memory_efficient_attention_xformers(True)
+ self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim'])
+
+ self.context_embedding = nn.Sequential(
+ nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']),
+ nn.SiLU(),
+ nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']))
+
+ def forward(self, target, t, prompt, prompt_mask=None,
+ train_cfg=False, cfg_prob=0.0):
+ B, C = target.shape
+ target = target.unsqueeze(-1).unsqueeze(-1)
+
+ if train_cfg:
+ if cfg_prob > 0.0:
+ # Randomly mask embedding
+ batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device)
+ fixed_embedding = self.cfg_embedding(prompt).to(target.dtype)
+ prompt = torch.where(batch_mask, fixed_embedding, prompt)
+
+ prompt = self.context_embedding(prompt)
+        # work around a diffusers quirk where the prompt dtype would otherwise be copied from the target
+ target = target.to(prompt.dtype)
+
+ output = self.unet(sample=target, timestep=t,
+ encoder_hidden_states=prompt,
+ encoder_attention_mask=prompt_mask)['sample']
+
+ return output.squeeze(-1).squeeze(-1)
+
+
+if __name__ == "__main__":
+ with open('p2e_cross.yaml', 'r') as fp:
+ config = yaml.safe_load(fp)
+ device = 'cuda'
+
+ model = P2E_Cross(config['diffwrap']).to(device)
+
+ x = torch.rand((2, 256)).to(device)
+ t = torch.randint(0, 1000, (2,)).long().to(device)
+ prompt = torch.rand(2, 64, 768).to(device)
+ prompt_mask = torch.ones(2, 64).to(device)
+
+ output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..e9663595cc28938f88d6299acd3ba791542e4c0c
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 NVIDIA CORPORATION.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a6cff37786a486deb55bc070254027aa492c2e92
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md
@@ -0,0 +1,95 @@
+## BigVGAN: A Universal Neural Vocoder with Large-Scale Training
+#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon
+
+
+
+
+### [Paper](https://arxiv.org/abs/2206.04658)
+### [Audio demo](https://bigvgan-demo.github.io/)
+
+## Installation
+Clone the repository and install dependencies.
+```shell
+# the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries
+git clone https://github.com/NVIDIA/BigVGAN
+pip install -r requirements.txt
+```
+
+Create symbolic links to the root of the dataset. The codebase uses filelists with paths relative to the dataset root. Below are example commands for the LibriTTS dataset.
+``` shell
+cd LibriTTS && \
+ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \
+ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \
+ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \
+ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \
+ln -s /path/to/your/LibriTTS/dev-other dev-other && \
+ln -s /path/to/your/LibriTTS/test-clean test-clean && \
+ln -s /path/to/your/LibriTTS/test-other test-other && \
+cd ..
+```
+
+## Training
+Train the BigVGAN model. Below is an example command for training BigVGAN on the LibriTTS dataset at 24 kHz with a full 100-band mel spectrogram as input.
+```shell
+python train.py \
+--config configs/bigvgan_24khz_100band.json \
+--input_wavs_dir LibriTTS \
+--input_training_file LibriTTS/train-full.txt \
+--input_validation_file LibriTTS/val-full.txt \
+--list_input_unseen_wavs_dir LibriTTS LibriTTS \
+--list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \
+--checkpoint_path exp/bigvgan
+```
+
+## Synthesis
+Synthesize audio from a trained BigVGAN model. Below is an example command for generating audio from the model.
+It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`.
+```shell
+python inference.py \
+--checkpoint_file exp/bigvgan/g_05000000 \
+--input_wavs_dir /path/to/your/input_wav \
+--output_dir /path/to/your/output_wav
+```
+
+`inference_e2e.py` supports synthesis directly from a mel spectrogram saved in `.npy` format, with shape `[1, channel, frame]` or `[channel, frame]`.
+It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`.
+
+Make sure that the STFT hyperparameters used to compute the mel spectrogram match those of the model, as defined in the corresponding model's `config.json`.
+```shell
+python inference_e2e.py \
+--checkpoint_file exp/bigvgan/g_05000000 \
+--input_mels_dir /path/to/your/input_mel \
+--output_dir /path/to/your/output_wav
+```
+
+## Pretrained Models
+We provide [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq).
+You can download the generator (e.g., g_05000000) and discriminator (e.g., do_05000000) checkpoints from the listed folders.
+
+|Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned|
+|------|---|---|---|---|------|---|
+|bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No|
+|bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No|
+|bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No|
+|bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No|
+
+The paper results are based on the 24 kHz BigVGAN models trained on the LibriTTS dataset.
+We also provide 22 kHz BigVGAN models with a band-limited setup (i.e., fmax=8000) for TTS applications.
+Note that the latest checkpoints use the ``snakebeta`` activation with log-scale parameterization, which gives the best overall quality.
+
+
+## TODO
+
+The current codebase only provides a plain PyTorch implementation of the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future.
+
+
+## References
+* [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator)
+
+* [Snake](https://github.com/EdwardDixon/snake) (for periodic activation)
+
+* [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing)
+
+* [Julius](https://github.com/adefossez/julius) (for low-pass filter)
+
+* [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..61f2808a5466b3cf4d041059700993af5527dd29
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py
@@ -0,0 +1,120 @@
+# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
+# LICENSE is in incl_licenses directory.
+
+import torch
+from torch import nn, sin, pow
+from torch.nn import Parameter
+
+
+class Snake(nn.Module):
+ '''
+ Implementation of a sine-based periodic activation function
+ Shape:
+ - Input: (B, C, T)
+ - Output: (B, C, T), same shape as the input
+ Parameters:
+ - alpha - trainable parameter
+ References:
+ - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+ https://arxiv.org/abs/2006.08195
+ Examples:
+        >>> a1 = Snake(256)
+ >>> x = torch.randn(256)
+ >>> x = a1(x)
+ '''
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+ '''
+ Initialization.
+ INPUT:
+ - in_features: shape of the input
+ - alpha: trainable parameter
+ alpha is initialized to 1 by default, higher values = higher-frequency.
+ alpha will be trained along with the rest of your model.
+ '''
+ super(Snake, self).__init__()
+ self.in_features = in_features
+
+ # initialize alpha
+ self.alpha_logscale = alpha_logscale
+ if self.alpha_logscale: # log scale alphas initialized to zeros
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
+ else: # linear scale alphas initialized to ones
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
+
+ self.alpha.requires_grad = alpha_trainable
+
+ self.no_div_by_zero = 0.000000001
+
+ def forward(self, x):
+ '''
+ Forward pass of the function.
+ Applies the function to the input elementwise.
+ Snake ∶= x + 1/a * sin^2 (xa)
+ '''
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
+ if self.alpha_logscale:
+ alpha = torch.exp(alpha)
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+
+ return x
+
+
+class SnakeBeta(nn.Module):
+ '''
+ A modified Snake function which uses separate parameters for the magnitude of the periodic components
+ Shape:
+ - Input: (B, C, T)
+ - Output: (B, C, T), same shape as the input
+ Parameters:
+ - alpha - trainable parameter that controls frequency
+ - beta - trainable parameter that controls magnitude
+ References:
+ - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+ https://arxiv.org/abs/2006.08195
+ Examples:
+        >>> a1 = SnakeBeta(256)
+ >>> x = torch.randn(256)
+ >>> x = a1(x)
+ '''
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+ '''
+ Initialization.
+ INPUT:
+ - in_features: shape of the input
+ - alpha - trainable parameter that controls frequency
+ - beta - trainable parameter that controls magnitude
+ alpha is initialized to 1 by default, higher values = higher-frequency.
+ beta is initialized to 1 by default, higher values = higher-magnitude.
+ alpha will be trained along with the rest of your model.
+ '''
+ super(SnakeBeta, self).__init__()
+ self.in_features = in_features
+
+ # initialize alpha
+ self.alpha_logscale = alpha_logscale
+ if self.alpha_logscale: # log scale alphas initialized to zeros
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
+ self.beta = Parameter(torch.zeros(in_features) * alpha)
+ else: # linear scale alphas initialized to ones
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
+ self.beta = Parameter(torch.ones(in_features) * alpha)
+
+ self.alpha.requires_grad = alpha_trainable
+ self.beta.requires_grad = alpha_trainable
+
+ self.no_div_by_zero = 0.000000001
+
+ def forward(self, x):
+ '''
+ Forward pass of the function.
+ Applies the function to the input elementwise.
+ SnakeBeta ∶= x + 1/b * sin^2 (xa)
+ '''
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
+ beta = self.beta.unsqueeze(0).unsqueeze(-1)
+ if self.alpha_logscale:
+ alpha = torch.exp(alpha)
+ beta = torch.exp(beta)
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+
+ return x
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2318b63198250856809c0cb46210a4147b829bc
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py
@@ -0,0 +1,6 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+from .filter import *
+from .resample import *
+from .act import *
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py
new file mode 100644
index 0000000000000000000000000000000000000000..028debd697dd60458aae75010057df038bd3518a
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py
@@ -0,0 +1,28 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from .resample import UpSample1d, DownSample1d
+
+
+class Activation1d(nn.Module):
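+    # Applies the wrapped activation at a higher sample rate (upsample -> activation -> downsample) to suppress aliasing introduced by the nonlinearity.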
+ def __init__(self,
+ activation,
+ up_ratio: int = 2,
+ down_ratio: int = 2,
+ up_kernel_size: int = 12,
+ down_kernel_size: int = 12):
+ super().__init__()
+ self.up_ratio = up_ratio
+ self.down_ratio = down_ratio
+ self.act = activation
+ self.upsample = UpSample1d(up_ratio, up_kernel_size)
+ self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+ # x: [B,C,T]
+ def forward(self, x):
+ x = self.upsample(x)
+ x = self.act(x)
+ x = self.downsample(x)
+
+ return x
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ad6ea87c1f10ddd94c544037791d7a4634d5ae1
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py
@@ -0,0 +1,95 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+if 'sinc' in dir(torch):
+ sinc = torch.sinc
+else:
+ # This code is adopted from adefossez's julius.core.sinc under the MIT License
+ # https://adefossez.github.io/julius/julius/core.html
+ # LICENSE is in incl_licenses directory.
+ def sinc(x: torch.Tensor):
+ """
+ Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+ __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+ """
+ return torch.where(x == 0,
+ torch.tensor(1., device=x.device, dtype=x.dtype),
+ torch.sin(math.pi * x) / math.pi / x)
+
+
+# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+# LICENSE is in incl_licenses directory.
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
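+    # Builds a windowed-sinc low-pass kernel: a Kaiser window (beta derived from the transition width) applied to a sinc at `cutoff`; cutoff and half_width are in normalized frequency units.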
+ even = (kernel_size % 2 == 0)
+ half_size = kernel_size // 2
+
+ #For kaiser window
+ delta_f = 4 * half_width
+ A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+ if A > 50.:
+ beta = 0.1102 * (A - 8.7)
+ elif A >= 21.:
+ beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
+ else:
+ beta = 0.
+ window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+
+ # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+ if even:
+ time = (torch.arange(-half_size, half_size) + 0.5)
+ else:
+ time = torch.arange(kernel_size) - half_size
+ if cutoff == 0:
+ filter_ = torch.zeros_like(time)
+ else:
+ filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+ # Normalize filter to have sum = 1, otherwise we will have a small leakage
+ # of the constant component in the input signal.
+ filter_ /= filter_.sum()
+ filter = filter_.view(1, 1, kernel_size)
+
+ return filter
+
+
+class LowPassFilter1d(nn.Module):
+ def __init__(self,
+ cutoff=0.5,
+ half_width=0.6,
+ stride: int = 1,
+ padding: bool = True,
+ padding_mode: str = 'replicate',
+ kernel_size: int = 12):
+        # kernel_size should be an even number for the StyleGAN3 setup;
+        # in this implementation, an odd number is also possible.
+ super().__init__()
+ if cutoff < -0.:
+ raise ValueError("Minimum cutoff must be larger than zero.")
+ if cutoff > 0.5:
+ raise ValueError("A cutoff above 0.5 does not make sense.")
+ self.kernel_size = kernel_size
+ self.even = (kernel_size % 2 == 0)
+ self.pad_left = kernel_size // 2 - int(self.even)
+ self.pad_right = kernel_size // 2
+ self.stride = stride
+ self.padding = padding
+ self.padding_mode = padding_mode
+ filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+ self.register_buffer("filter", filter)
+
+ #input [B, C, T]
+ def forward(self, x):
+ _, C, _ = x.shape
+
+ if self.padding:
+ x = F.pad(x, (self.pad_left, self.pad_right),
+ mode=self.padding_mode)
+ out = F.conv1d(x, self.filter.expand(C, -1, -1),
+ stride=self.stride, groups=C)
+
+ return out
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py
new file mode 100644
index 0000000000000000000000000000000000000000..750e6c3402cc5ac939c4b9d075246562e0e1d1a7
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py
@@ -0,0 +1,49 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from torch.nn import functional as F
+from .filter import LowPassFilter1d
+from .filter import kaiser_sinc_filter1d
+
+
+class UpSample1d(nn.Module):
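+    # Anti-aliased upsampling: transposed convolution with a kaiser-sinc low-pass kernel, scaled by `ratio` to preserve amplitude.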
+ def __init__(self, ratio=2, kernel_size=None):
+ super().__init__()
+ self.ratio = ratio
+ self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+ self.stride = ratio
+ self.pad = self.kernel_size // ratio - 1
+ self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+ self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+ filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
+ half_width=0.6 / ratio,
+ kernel_size=self.kernel_size)
+ self.register_buffer("filter", filter)
+
+ # x: [B, C, T]
+ def forward(self, x):
+ _, C, _ = x.shape
+
+ x = F.pad(x, (self.pad, self.pad), mode='replicate')
+ x = self.ratio * F.conv_transpose1d(
+ x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+ x = x[..., self.pad_left:-self.pad_right]
+
+ return x
+
+
+class DownSample1d(nn.Module):
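+    # Anti-aliased downsampling: kaiser-sinc low-pass filtering applied with stride=ratio.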
+ def __init__(self, ratio=2, kernel_size=None):
+ super().__init__()
+ self.ratio = ratio
+ self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+ self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
+ half_width=0.6 / ratio,
+ stride=ratio,
+ kernel_size=self.kernel_size)
+
+ def forward(self, x):
+ xx = self.lowpass(x)
+
+ return xx
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8be238d4db710c8c9a338d336baea0138f18d1f
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py
@@ -0,0 +1,18 @@
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+# LICENSE is in incl_licenses directory.
+
+import os
+import shutil
+
+
+class AttrDict(dict):
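+    # dict whose keys are also accessible as attributes (e.g. h.num_mels instead of h['num_mels']).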
+ def __init__(self, *args, **kwargs):
+ super(AttrDict, self).__init__(*args, **kwargs)
+ self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+ t_path = os.path.join(path, config_name)
+ if config != t_path:
+ os.makedirs(path, exist_ok=True)
+ shutil.copyfile(config, os.path.join(path, config_name))
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..a739344db3ec9ae08560e5477a394cca32d4a6d9
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py
@@ -0,0 +1,36 @@
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+# LICENSE is in incl_licenses directory.
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import glob
+import os
+import argparse
+import json
+import torch
+from scipy.io.wavfile import write
+from .env import AttrDict
+from .utils import MAX_WAV_VALUE
+from .models import BigVGAN as Generator
+import librosa
+
+
+def load_model(model_path, device='cuda'):
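+    # Reads config.json next to the checkpoint, builds the BigVGAN generator, restores its weights and removes weight norm for inference.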
+ config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
+ with open(config_file) as f:
+ data = f.read()
+
+ global h
+ json_config = json.loads(data)
+
+ h = AttrDict(json_config)
+
+ generator = Generator(h).to(device)
+
+ cp_dict = torch.load(model_path, map_location=device)
+ generator.load_state_dict(cp_dict['generator'])
+ generator.eval()
+ generator.remove_weight_norm()
+ del cp_dict
+ return generator, h
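+
+# Minimal usage sketch (illustrative only; assumes a checkpoint directory that also
+# contains config.json, and a mel tensor of shape [B, num_mels, frames]):
+#
+#     generator, h = load_model('exp/bigvgan/g_05000000', device='cuda')
+#     with torch.no_grad():
+#         audio = generator(mel).squeeze()  # waveform in [-1, 1]
+#     write('out.wav', h.sampling_rate, (audio.cpu().numpy() * MAX_WAV_VALUE).astype('int16'))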
+
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bb40e0cff7819dcbe69555520253afd64580720
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.
+# Licensed under the MIT license.
+
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+# LICENSE is in incl_licenses directory.
+
+
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+
+from .activations import activations
+from .utils import init_weights, get_padding
+from .alias_free_torch import *
+
+LRELU_SLOPE = 0.1
+
+
+class AMPBlock1(torch.nn.Module):
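+    # Anti-aliased multi-periodicity (AMP) residual block: dilated convs (convs1) followed by plain convs (convs2), each preceded by an anti-aliased Snake/SnakeBeta activation, with residual connections.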
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None):
+ super(AMPBlock1, self).__init__()
+ self.h = h
+
+ self.convs1 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+ padding=get_padding(kernel_size, dilation[2])))
+ ])
+ self.convs1.apply(init_weights)
+
+ self.convs2 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1)))
+ ])
+ self.convs2.apply(init_weights)
+
+ self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers
+
+ if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing
+ self.activations = nn.ModuleList([
+ Activation1d(
+ activation=activations.Snake(channels, alpha_logscale=h.snake_logscale))
+ for _ in range(self.num_layers)
+ ])
+ elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing
+ self.activations = nn.ModuleList([
+ Activation1d(
+ activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale))
+ for _ in range(self.num_layers)
+ ])
+ else:
+ raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
+
+ def forward(self, x):
+ acts1, acts2 = self.activations[::2], self.activations[1::2]
+ for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
+ xt = a1(x)
+ xt = c1(xt)
+ xt = a2(xt)
+ xt = c2(xt)
+ x = xt + x
+
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs1:
+ remove_weight_norm(l)
+ for l in self.convs2:
+ remove_weight_norm(l)
+
+
+class AMPBlock2(torch.nn.Module):
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None):
+ super(AMPBlock2, self).__init__()
+ self.h = h
+
+ self.convs = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1])))
+ ])
+ self.convs.apply(init_weights)
+
+ self.num_layers = len(self.convs) # total number of conv layers
+
+ if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing
+ self.activations = nn.ModuleList([
+ Activation1d(
+ activation=activations.Snake(channels, alpha_logscale=h.snake_logscale))
+ for _ in range(self.num_layers)
+ ])
+ elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing
+ self.activations = nn.ModuleList([
+ Activation1d(
+ activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale))
+ for _ in range(self.num_layers)
+ ])
+ else:
+ raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
+
+ def forward(self, x):
+        for c, a in zip(self.convs, self.activations):
+ xt = a(x)
+ xt = c(xt)
+ x = xt + x
+
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs:
+ remove_weight_norm(l)
+
+
+class BigVGAN(torch.nn.Module):
+ # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
+ def __init__(self, h):
+ super(BigVGAN, self).__init__()
+ self.h = h
+
+ self.num_kernels = len(h.resblock_kernel_sizes)
+ self.num_upsamples = len(h.upsample_rates)
+
+ # pre conv
+ self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
+
+ # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
+ resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2
+
+ # transposed conv-based upsamplers. does not apply anti-aliasing
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+ self.ups.append(nn.ModuleList([
+ weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i),
+ h.upsample_initial_channel // (2 ** (i + 1)),
+ k, u, padding=(k - u) // 2))
+ ]))
+
+ # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = h.upsample_initial_channel // (2 ** (i + 1))
+ for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
+ self.resblocks.append(resblock(h, ch, k, d, activation=h.activation))
+
+ # post conv
+ if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing
+ activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale)
+ self.activation_post = Activation1d(activation=activation_post)
+ elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing
+ activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale)
+ self.activation_post = Activation1d(activation=activation_post)
+ else:
+ raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.")
+
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+
+ # weight initialization
+ for i in range(len(self.ups)):
+ self.ups[i].apply(init_weights)
+ self.conv_post.apply(init_weights)
+
+ def forward(self, x):
+ # pre conv
+ x = self.conv_pre(x)
+
+ for i in range(self.num_upsamples):
+ # upsampling
+ for i_up in range(len(self.ups[i])):
+ x = self.ups[i][i_up](x)
+ # AMP blocks
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i * self.num_kernels + j](x)
+ else:
+ xs += self.resblocks[i * self.num_kernels + j](x)
+ x = xs / self.num_kernels
+
+ # post conv
+ x = self.activation_post(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+
+ return x
+
+ def remove_weight_norm(self):
+ print('Removing weight norm...')
+ for l in self.ups:
+ for l_i in l:
+ remove_weight_norm(l_i)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+ remove_weight_norm(self.conv_pre)
+ remove_weight_norm(self.conv_post)
+
+
+class DiscriminatorP(torch.nn.Module):
+ def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False):
+ super(DiscriminatorP, self).__init__()
+ self.period = period
+ self.d_mult = h.discriminator_channel_mult
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ self.convs = nn.ModuleList([
+ norm_f(Conv2d(1, int(32*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+ norm_f(Conv2d(int(32*self.d_mult), int(128*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+ norm_f(Conv2d(int(128*self.d_mult), int(512*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+ norm_f(Conv2d(int(512*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+ norm_f(Conv2d(int(1024*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), 1, padding=(2, 0))),
+ ])
+ self.conv_post = norm_f(Conv2d(int(1024*self.d_mult), 1, (3, 1), 1, padding=(1, 0)))
+
+ def forward(self, x):
+ fmap = []
+
+ # 1d to 2d
+ b, c, t = x.shape
+ if t % self.period != 0: # pad first
+ n_pad = self.period - (t % self.period)
+ x = F.pad(x, (0, n_pad), "reflect")
+ t = t + n_pad
+ x = x.view(b, c, t // self.period, self.period)
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+ def __init__(self, h):
+ super(MultiPeriodDiscriminator, self).__init__()
+ self.mpd_reshapes = h.mpd_reshapes
+ print("mpd_reshapes: {}".format(self.mpd_reshapes))
+ discriminators = [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes]
+ self.discriminators = nn.ModuleList(discriminators)
+
+ def forward(self, y, y_hat):
+ y_d_rs = []
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ y_d_rs.append(y_d_r)
+ fmap_rs.append(fmap_r)
+ y_d_gs.append(y_d_g)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class DiscriminatorR(nn.Module):
+ def __init__(self, cfg, resolution):
+ super().__init__()
+
+ self.resolution = resolution
+ assert len(self.resolution) == 3, \
+ "MRD layer requires list with len=3, got {}".format(self.resolution)
+ self.lrelu_slope = LRELU_SLOPE
+
+ norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm
+ if hasattr(cfg, "mrd_use_spectral_norm"):
+ print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm))
+ norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm
+ self.d_mult = cfg.discriminator_channel_mult
+ if hasattr(cfg, "mrd_channel_mult"):
+ print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult))
+ self.d_mult = cfg.mrd_channel_mult
+
+ self.convs = nn.ModuleList([
+ norm_f(nn.Conv2d(1, int(32*self.d_mult), (3, 9), padding=(1, 4))),
+ norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))),
+ norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))),
+ norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))),
+ norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 3), padding=(1, 1))),
+ ])
+ self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1)))
+
+ def forward(self, x):
+ fmap = []
+
+ x = self.spectrogram(x)
+ x = x.unsqueeze(1)
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, self.lrelu_slope)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+ def spectrogram(self, x):
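+        # Linear-magnitude STFT at this discriminator's (n_fft, hop_length, win_length) resolution.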
+ n_fft, hop_length, win_length = self.resolution
+ x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect')
+ x = x.squeeze(1)
+ x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True)
+ x = torch.view_as_real(x) # [B, F, TT, 2]
+        mag = torch.norm(x, p=2, dim=-1)  # [B, F, TT]
+
+ return mag
+
+
+class MultiResolutionDiscriminator(nn.Module):
+ def __init__(self, cfg, debug=False):
+ super().__init__()
+ self.resolutions = cfg.resolutions
+ assert len(self.resolutions) == 3,\
+ "MRD requires list of list with len=3, each element having a list with len=3. got {}".\
+ format(self.resolutions)
+ self.discriminators = nn.ModuleList(
+ [DiscriminatorR(cfg, resolution) for resolution in self.resolutions]
+ )
+
+ def forward(self, y, y_hat):
+ y_d_rs = []
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(x=y)
+ y_d_g, fmap_g = d(x=y_hat)
+ y_d_rs.append(y_d_r)
+ fmap_rs.append(fmap_r)
+ y_d_gs.append(y_d_g)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+def feature_loss(fmap_r, fmap_g):
+ loss = 0
+ for dr, dg in zip(fmap_r, fmap_g):
+ for rl, gl in zip(dr, dg):
+ loss += torch.mean(torch.abs(rl - gl))
+
+ return loss*2
+
+
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+ loss = 0
+ r_losses = []
+ g_losses = []
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+ r_loss = torch.mean((1-dr)**2)
+ g_loss = torch.mean(dg**2)
+ loss += (r_loss + g_loss)
+ r_losses.append(r_loss.item())
+ g_losses.append(g_loss.item())
+
+ return loss, r_losses, g_losses
+
+
+def generator_loss(disc_outputs):
+ loss = 0
+ gen_losses = []
+ for dg in disc_outputs:
+ l = torch.mean((1-dg)**2)
+ gen_losses.append(l)
+ loss += l
+
+ return loss, gen_losses
+
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed67f356aef6ce3af01b43d97d8aafb31c57b017
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py
@@ -0,0 +1,81 @@
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+# LICENSE is in incl_licenses directory.
+
+import glob
+import os
+import matplotlib
+import torch
+from torch.nn.utils import weight_norm
+matplotlib.use("Agg")
+import matplotlib.pylab as plt
+from scipy.io.wavfile import write
+
+MAX_WAV_VALUE = 32768.0
+
+
+def plot_spectrogram(spectrogram):
+ fig, ax = plt.subplots(figsize=(10, 2))
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+ interpolation='none')
+ plt.colorbar(im, ax=ax)
+
+ fig.canvas.draw()
+ plt.close()
+
+ return fig
+
+
+def plot_spectrogram_clipped(spectrogram, clip_max=2.):
+ fig, ax = plt.subplots(figsize=(10, 2))
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+ interpolation='none', vmin=1e-6, vmax=clip_max)
+ plt.colorbar(im, ax=ax)
+
+ fig.canvas.draw()
+ plt.close()
+
+ return fig
+
+
+def init_weights(m, mean=0.0, std=0.01):
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ m.weight.data.normal_(mean, std)
+
+
+def apply_weight_norm(m):
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ weight_norm(m)
+
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size*dilation - dilation)/2)
+
+
+def load_checkpoint(filepath, device):
+ assert os.path.isfile(filepath)
+ print("Loading '{}'".format(filepath))
+ checkpoint_dict = torch.load(filepath, map_location=device)
+ print("Complete.")
+ return checkpoint_dict
+
+
+def save_checkpoint(filepath, obj):
+ print("Saving checkpoint to {}".format(filepath))
+ torch.save(obj, filepath)
+ print("Complete.")
+
+
+def scan_checkpoint(cp_dir, prefix):
+ pattern = os.path.join(cp_dir, prefix + '????????')
+ cp_list = glob.glob(pattern)
+ if len(cp_list) == 0:
+ return None
+ return sorted(cp_list)[-1]
+
+def save_audio(audio, path, sr):
+ # wav: torch with 1d shape
+ audio = audio * MAX_WAV_VALUE
+ audio = audio.cpu().numpy().astype('int16')
+ write(path, sr, audio)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py
new file mode 100644
index 0000000000000000000000000000000000000000..e550b871f5cd9564f4cf043ec4aa649a48b0b41f
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn.functional as F
+import torchaudio
+import torchaudio.transforms as transforms
+
+
+class LogMelSpectrogram(torch.nn.Module):
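+    # Slaney-scale mel spectrogram (power=1.0) of a reflect-padded waveform, zero-padded to `target_length` frames and log-compressed with a 1e-5 floor.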
+ def __init__(self, sr=24000, frame_length=1920, hop_length=480, n_mel=128, f_min=0, f_max=12000,):
+ super().__init__()
+ self.frame_length = frame_length
+ self.hop_length = hop_length
+ self.mel = transforms.MelSpectrogram(
+ sample_rate=sr,
+ n_fft=frame_length,
+ win_length=frame_length,
+ hop_length=hop_length,
+ center=False,
+ power=1.0,
+ norm="slaney",
+ n_mels=n_mel,
+ mel_scale="slaney",
+ f_min=f_min,
+ f_max=f_max
+ )
+
+ @torch.no_grad()
+ def forward(self, x, target_length=None):
+ x = F.pad(x, ((self.frame_length - self.hop_length) // 2,
+ (self.frame_length - self.hop_length) // 2), "reflect")
+ mel = self.mel(x)
+
+ target_length = mel.shape[-1] if target_length is None else target_length
+ logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device)
+ logmel[:, :, :mel.shape[2]] = mel
+
+ logmel = torch.log(torch.clamp(logmel, min=1e-5))
+ return logmel
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..5ed721bf8f29f5c8d947c2d333cc371021135fb0
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE
@@ -0,0 +1,24 @@
+MIT License
+
+Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
+Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
+Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
+Original work Copyright (c) 2015 braindead (https://github.com/braindead)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..95663cf5b29be905a8422176f661a8f7745b5cb0
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md
@@ -0,0 +1,64 @@
+# Real-Time Voice Cloning
+This repository is an implementation of [Transfer Learning from Speaker Verification to
+Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801).
+
+SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text.
+
+**Video demonstration** (click the picture):
+
+[](https://www.youtube.com/watch?v=-O_hYhToKoA)
+
+
+
+### Papers implemented
+| URL | Designation | Title | Implementation source |
+| --- | ----------- | ----- | --------------------- |
+|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
+|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
+
+## News
+**10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion.
+
+**28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below.
+
+**14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish.
+
+**13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this:
+- **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get better voice quality and fewer prosody errors.
+- **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info.
+
+**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it.
+
+
+## Setup
+
+### 1. Install Requirements
+1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
+2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
+3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
+4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
+5. Install the remaining requirements with `pip install -r requirements.txt`
+
+### 2. (Optional) Download Pretrained Models
+Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
+
+### 3. (Optional) Test Configuration
+Before you download any dataset, you can begin by testing your configuration with:
+
+`python demo_cli.py`
+
+If all tests pass, you're good to go.
+
+### 4. (Optional) Download Datasets
+For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100`, where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
+
+### 5. Launch the Toolbox
+You can then try the toolbox:
+
+`python demo_toolbox.py -d <datasets_root>`
+or
+`python demo_toolbox.py`
+
+depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py
@@ -0,0 +1 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..de650b972fc7a4f3f8a698c128ee4642a373a6d6
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py
@@ -0,0 +1,157 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from scipy.ndimage.morphology import binary_dilation
+from .params_data import *
+from pathlib import Path
+from typing import Optional, Union
+import numpy as np
+import webrtcvad
+import librosa
+import struct
+
+import torch
+from torchaudio.transforms import Resample
+from librosa.filters import mel as librosa_mel_fn
+
+
+int16_max = (2 ** 15) - 1
+
+
+def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
+ source_sr: Optional[int] = None):
+ """
+ Applies the preprocessing operations used in training the Speaker Encoder to a waveform
+ either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
+
+ :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+ just .wav), either the waveform as a numpy array of floats.
+ :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+ preprocessing. After preprocessing, the waveform's sampling rate will match the data
+ hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+ this argument will be ignored.
+ """
+ # Load the wav from disk if needed
+ if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+ wav, source_sr = librosa.load(fpath_or_wav, sr=None)
+ else:
+ wav = fpath_or_wav
+
+ # Resample the wav if needed
+ if source_sr is not None and source_sr != sampling_rate:
+ wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
+ # Apply the preprocessing: normalize volume and shorten long silences
+ wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+ wav = trim_long_silences(wav)
+
+ return wav
+
+
+def preprocess_wav_batch(wavs, source_sr=22050):
+    # This torch version is designed to cope with a batch of wavs of the same length
+ if sampling_rate != source_sr:
+ resample = Resample(source_sr, sampling_rate)
+ wavs = resample(wavs)
+ wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS,
+ increase_only=True)
+ # Trimming silence is not implemented in this version yet!
+ return wavs_preprocessed
+
+
+def wav_to_mel_spectrogram(wav):
+ """
+ Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
+    Note: this is not a log-mel spectrogram.
+ """
+ frames = librosa.feature.melspectrogram(
+ y=wav,
+ sr=sampling_rate,
+ n_fft=int(sampling_rate * mel_window_length / 1000),
+ hop_length=int(sampling_rate * mel_window_step / 1000),
+ n_mels=mel_n_channels
+ )
+ return frames.astype(np.float32).T
+
+
+def wav_to_mel_spectrogram_batch(wavs):
+    # This torch version is designed to cope with a batch of wavs of the same length
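+    # Computes the power spectrogram with torch.stft and projects it through a librosa mel filterbank; returns [B, frames, n_mels].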
+ n_fft = int(sampling_rate * mel_window_length / 1000)
+ hop_length = int(sampling_rate * mel_window_step / 1000)
+ win_length = int(sampling_rate * mel_window_length / 1000)
+ window = torch.hann_window(n_fft).to(wavs)
+ mel_basis = torch.from_numpy(librosa_mel_fn(sr=sampling_rate, n_fft=n_fft,
+ n_mels=mel_n_channels)).to(wavs)
+ s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length,
+ win_length=win_length, window=window, center=True, return_complex=False)
+ real_part, imag_part = s.unbind(-1)
+ stftm = real_part**2 + imag_part**2
+ mels = torch.matmul(mel_basis, stftm)
+ return torch.transpose(mels, 1, 2)
+
+
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+ if increase_only and decrease_only:
+ raise ValueError("Both increase only and decrease only are set")
+ dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
+ if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
+ return wav
+ return wav * (10 ** (dBFS_change / 20))
+
+
+def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False):
+    # This torch version is designed to cope with a batch of wavs of the same length
+ if increase_only and decrease_only:
+ raise ValueError("Both increase only and decrease only are set")
+ dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1))
+ scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype)
+ if increase_only:
+ mask = (dBFS_change > 0).to(scales)
+ elif decrease_only:
+ mask = (dBFS_change < 0).to(scales)
+ else:
+ mask = torch.zeros_like(scales)
+ scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0)
+ return wavs * scales.unsqueeze(-1)
+
+
+def trim_long_silences(wav):
+ """
+ Ensures that segments without voice in the waveform remain no longer than a
+ threshold determined by the VAD parameters in params.py.
+
+ :param wav: the raw waveform as a numpy array of floats
+ :return: the same waveform with silences trimmed away (length <= original wav length)
+ """
+ # Compute the voice detection window size
+ samples_per_window = (vad_window_length * sampling_rate) // 1000
+
+ # Trim the end of the audio to have a multiple of the window size
+ wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+
+ # Convert the float waveform to 16-bit mono PCM
+ pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+
+ # Perform voice activation detection
+ voice_flags = []
+ vad = webrtcvad.Vad(mode=3)
+ for window_start in range(0, len(wav), samples_per_window):
+ window_end = window_start + samples_per_window
+ voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+ sample_rate=sampling_rate))
+ voice_flags = np.array(voice_flags)
+
+ # Smooth the voice detection with a moving average
+ def moving_average(array, width):
+ array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+ ret = np.cumsum(array_padded, dtype=float)
+ ret[width:] = ret[width:] - ret[:-width]
+ return ret[width - 1:] / width
+
+ audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(bool)  # np.bool was removed in newer NumPy versions
+
+ # Dilate the voiced regions
+ audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+ audio_mask = np.repeat(audio_mask, samples_per_window)
+
+ return wav[audio_mask == True]
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce1f5aab0d3899c5e5045b40d4cecee1a11d844c
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py
@@ -0,0 +1,47 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+librispeech_datasets = {
+ "train": {
+ "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
+ "other": ["LibriSpeech/train-other-500"]
+ },
+ "test": {
+ "clean": ["LibriSpeech/test-clean"],
+ "other": ["LibriSpeech/test-other"]
+ },
+ "dev": {
+ "clean": ["LibriSpeech/dev-clean"],
+ "other": ["LibriSpeech/dev-other"]
+ },
+}
+libritts_datasets = {
+ "train": {
+ "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
+ "other": ["LibriTTS/train-other-500"]
+ },
+ "test": {
+ "clean": ["LibriTTS/test-clean"],
+ "other": ["LibriTTS/test-other"]
+ },
+ "dev": {
+ "clean": ["LibriTTS/dev-clean"],
+ "other": ["LibriTTS/dev-other"]
+ },
+}
+voxceleb_datasets = {
+ "voxceleb1" : {
+ "train": ["VoxCeleb1/wav"],
+ "test": ["VoxCeleb1/test_wav"]
+ },
+ "voxceleb2" : {
+ "train": ["VoxCeleb2/dev/aac"],
+ "test": ["VoxCeleb2/test_wav"]
+ }
+}
+
+other_datasets = [
+ "LJSpeech-1.1",
+ "VCTK-Corpus/wav48",
+]
+
+anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9af30b406f2a8debe81a8275cb2682cbd896245a
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py
@@ -0,0 +1,4 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from .speaker_verification_dataset import SpeakerVerificationDataset
+from .speaker_verification_dataset import SpeakerVerificationDataLoader
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fd5bb005923852327581e2dcaa03fec7dbce5b8
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py
@@ -0,0 +1,39 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+import random
+
+class RandomCycler:
+ """
+ Creates an internal copy of a sequence and allows access to its items in a constrained random
+ order. For a source sequence of n items and one or several consecutive queries of a total
+ of m items, the following guarantees hold (one implies the other):
+ - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
+ - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
+ """
+
+ def __init__(self, source):
+ if len(source) == 0:
+ raise Exception("Can't create RandomCycler from an empty collection")
+ self.all_items = list(source)
+ self.next_items = []
+
+ def sample(self, count: int):
+ shuffle = lambda l: random.sample(l, len(l))
+
+ out = []
+ while count > 0:
+ if count >= len(self.all_items):
+ out.extend(shuffle(list(self.all_items)))
+ count -= len(self.all_items)
+ continue
+ n = min(count, len(self.next_items))
+ out.extend(self.next_items[:n])
+ count -= n
+ self.next_items = self.next_items[n:]
+ if len(self.next_items) == 0:
+ self.next_items = shuffle(list(self.all_items))
+ return out
+
+ def __next__(self):
+ return self.sample(1)[0]
+
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7d189c835859efefa686d49b53f4e79aa444d96
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py
@@ -0,0 +1,42 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from .random_cycler import RandomCycler
+from .utterance import Utterance
+from pathlib import Path
+
+# Contains the set of utterances of a single speaker
+class Speaker:
+ def __init__(self, root: Path):
+ self.root = root
+ self.name = root.name
+ self.utterances = None
+ self.utterance_cycler = None
+
+ def _load_utterances(self):
+ with self.root.joinpath("_sources.txt").open("r") as sources_file:
+ sources = [l.split(",") for l in sources_file]
+ sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
+ self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
+ self.utterance_cycler = RandomCycler(self.utterances)
+
+ def random_partial(self, count, n_frames):
+ """
+ Samples a batch of unique partial utterances from the disk in a way that all
+ utterances come up at least once every two cycles and in a random order every time.
+
+ :param count: The number of partial utterances to sample from the set of utterances from
+        that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
+ the number of utterances available.
+ :param n_frames: The number of frames in the partial utterance.
+ :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
+ frames are the frames of the partial utterances and range is the range of the partial
+ utterance with regard to the complete utterance.
+ """
+ if self.utterances is None:
+ self._load_utterances()
+
+ utterances = self.utterance_cycler.sample(count)
+
+ a = [(u,) + u.random_partial(n_frames) for u in utterances]
+
+ return a
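+
+# Editor's note: each element of the list returned by random_partial() is a
+# (Utterance, partial_frames, (start, end)) tuple, where partial_frames is an
+# ndarray of shape (n_frames, mel_n_channels) cropped by Utterance.random_partial
+# (see utterance.py).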
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..4080d636338bedcb8d1b8fc77945057027fd0ac1
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py
@@ -0,0 +1,14 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+import numpy as np
+from typing import List
+from .speaker import Speaker
+
+class SpeakerBatch:
+ def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
+ self.speakers = speakers
+ self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
+
+ # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
+ # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
+ self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
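+
+# Editor's note: with the default params (partials_n_frames=160, mel_n_channels=40,
+# speakers_per_batch=64, utterances_per_speaker=10) self.data has shape
+# (640, 160, 40), which is exactly the (batch, n_frames, n_channels) layout
+# SpeakerEncoder.forward expects.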
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dc31fee9e0d62545caa2599aebc22decfb50aa0
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py
@@ -0,0 +1,58 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from .random_cycler import RandomCycler
+from .speaker_batch import SpeakerBatch
+from .speaker import Speaker
+from ..params_data import partials_n_frames
+from torch.utils.data import Dataset, DataLoader
+from pathlib import Path
+
+# TODO: improve with a pool of speakers for data efficiency
+
+class SpeakerVerificationDataset(Dataset):
+ def __init__(self, datasets_root: Path):
+ self.root = datasets_root
+ speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
+ if len(speaker_dirs) == 0:
+ raise Exception("No speakers found. Make sure you are pointing to the directory "
+ "containing all preprocessed speaker directories.")
+ self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
+ self.speaker_cycler = RandomCycler(self.speakers)
+
+ def __len__(self):
+ return int(1e10)
+
+ def __getitem__(self, index):
+ return next(self.speaker_cycler)
+
+ def get_logs(self):
+ log_string = ""
+ for log_fpath in self.root.glob("*.txt"):
+ with log_fpath.open("r") as log_file:
+ log_string += "".join(log_file.readlines())
+ return log_string
+
+
+class SpeakerVerificationDataLoader(DataLoader):
+ def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
+ batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
+ worker_init_fn=None):
+ self.utterances_per_speaker = utterances_per_speaker
+
+ super().__init__(
+ dataset=dataset,
+ batch_size=speakers_per_batch,
+ shuffle=False,
+ sampler=sampler,
+ batch_sampler=batch_sampler,
+ num_workers=num_workers,
+ collate_fn=self.collate,
+ pin_memory=pin_memory,
+ drop_last=False,
+ timeout=timeout,
+ worker_init_fn=worker_init_fn
+ )
+
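+ # Editor's note: batch_size counts speakers, not utterances -- the Dataset
+ # yields Speaker objects and collate() expands each one into
+ # utterances_per_speaker partial utterances via SpeakerBatch.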
+ def collate(self, speakers):
+ return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b878c58fd7d70d3ba0b33def66912adc1c1a45d
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py
@@ -0,0 +1,28 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+import numpy as np
+
+
+class Utterance:
+ def __init__(self, frames_fpath, wave_fpath):
+ self.frames_fpath = frames_fpath
+ self.wave_fpath = wave_fpath
+
+ def get_frames(self):
+ return np.load(self.frames_fpath)
+
+ def random_partial(self, n_frames):
+ """
+ Crops the frames into a partial utterance of n_frames
+
+ :param n_frames: The number of frames of the partial utterance
+ :return: the partial utterance frames and a tuple indicating the start and end of the
+ partial utterance in the complete utterance.
+ """
+ frames = self.get_frames()
+ if frames.shape[0] == n_frames:
+ start = 0
+ else:
+ start = np.random.randint(0, frames.shape[0] - n_frames)
+ end = start + n_frames
+ return frames[start:end], (start, end)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..37f1dc4fb86bbab07892e5e94464cc3e377f9b64
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py
@@ -0,0 +1,211 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from .params_data import *
+from .model import SpeakerEncoder
+from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram
+from matplotlib import cm
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+
+_model = None # type: SpeakerEncoder
+_device = None # type: torch.device
+
+
+def load_model(weights_fpath: Path, device="cpu"):
+ """
+ Loads the model in memory. If this function is not explicitly called, it will be run on the
+ first call to embed_frames() with the default weights file.
+
+ :param weights_fpath: the path to saved model weights.
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
+ model will be loaded and will run on this device. Outputs will however always be on the cpu.
+ If None, will default to your GPU if it's available, otherwise your CPU.
+ """
+ # TODO: I think the slow loading of the encoder might have something to do with the device it
+ # was saved on. Worth investigating.
+ global _model, _device
+ if device is None:
+ _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ elif isinstance(device, str):
+ _device = torch.device(device)
+ else:  # a torch.device instance was passed in
+ _device = device
+ _model = SpeakerEncoder(_device, torch.device("cpu"))
+ checkpoint = torch.load(weights_fpath, map_location="cpu")
+ _model.load_state_dict(checkpoint["model_state"])
+ _model.eval()
+ _model = _model.to(_device)
+ print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
+
+
+def is_loaded():
+ return _model is not None
+
+
+@torch.no_grad()
+def embed_frames_batch(frames, use_torch=False):
+ if _model is None:
+ raise Exception("Model was not loaded. Call load_model() before inference.")
+
+ if not use_torch:
+ frames = torch.from_numpy(frames)
+ frames = frames.to(_device)
+
+ embeds = _model.forward(frames)
+ if not use_torch:
+ embeds = embeds.detach().cpu().numpy()
+ return embeds
+
+
+def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
+ min_pad_coverage=0.75, overlap=0.5):
+ """
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
+ partial utterances of each. Both the waveform and the mel
+ spectrogram slices are returned, so as to make each partial utterance waveform correspond to
+ its spectrogram. This function assumes that the mel spectrogram parameters used are those
+ defined in params_data.py.
+
+ The returned ranges may be indexing further than the length of the waveform. It is
+ recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
+
+ :param n_samples: the number of samples in the waveform
+ :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
+ utterance
+ :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
+ enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
+ then the last partial utterance will be considered, as if we padded the audio. Otherwise,
+ it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
+ utterance, this parameter is ignored so that the function always returns at least 1 slice.
+ :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
+ utterances are entirely disjoint.
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
+ utterances.
+ """
+ assert 0 <= overlap < 1
+ assert 0 < min_pad_coverage <= 1
+
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+ frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
+
+ # Compute the slices
+ wav_slices, mel_slices = [], []
+ steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
+ for i in range(0, steps, frame_step):
+ mel_range = np.array([i, i + partial_utterance_n_frames])
+ wav_range = mel_range * samples_per_frame
+ mel_slices.append(slice(*mel_range))
+ wav_slices.append(slice(*wav_range))
+
+ # Evaluate whether extra padding is warranted or not
+ last_wav_range = wav_slices[-1]
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+ if coverage < min_pad_coverage and len(mel_slices) > 1:
+ mel_slices = mel_slices[:-1]
+ wav_slices = wav_slices[:-1]
+
+ return wav_slices, mel_slices
+
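+# Editor's worked example (not in the upstream file): with sampling_rate=16000,
+# mel_window_step=10 ms and the defaults (partial_utterance_n_frames=160,
+# overlap=0.5), a 2 s wav (32000 samples, 201 frames) yields mel slices
+# [0:160] and [80:240]; the second wav slice ends at 240 * 160 = 38400 samples,
+# so the caller zero-pads the waveform up to wav_slices[-1].stop.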
+
+@torch.no_grad()
+def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
+ """
+ Computes an embedding for a single utterance.
+
+ # TODO: handle multiple wavs to benefit from batching on GPU
+ :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
+ :param using_partials: if True, then the utterance is split in partial utterances of
+ <partial_utterance_n_frames> frames and the utterance embedding is computed from their
+ normalized average. If False, the utterance is instead computed from feeding the entire
+ spectrogram to the network.
+ :param return_partials: if True, the partial embeddings will also be returned along with the
+ wav slices that correspond to the partial embeddings.
+ :param kwargs: additional arguments to compute_partial_splits()
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+ <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+ (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+ returned. If <using_partials> is simultaneously set to False, both these values will be None
+ instead.
+ """
+ # Process the entire utterance if not using partials
+ if not using_partials:
+ frames = wav_to_mel_spectrogram(wav)
+ embed = embed_frames_batch(frames[None, ...])[0]
+ if return_partials:
+ return embed, None, None
+ return embed
+
+ # Compute where to split the utterance into partials and pad if necessary
+ wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
+ max_wave_length = wave_slices[-1].stop
+ if max_wave_length >= len(wav):
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+
+ # Split the utterance into partials
+ frames = wav_to_mel_spectrogram(wav)
+ frames_batch = np.array([frames[s] for s in mel_slices])
+ partial_embeds = embed_frames_batch(frames_batch)
+
+ # Compute the utterance embedding from the partial embeddings
+ raw_embed = np.mean(partial_embeds, axis=0)
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
+
+ if return_partials:
+ return embed, partial_embeds, wave_slices
+ return embed
+
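+# Editor's usage sketch (hypothetical paths; assumes the vendored
+# audio.preprocess_wav accepts a file path, as in the upstream repo):
+#   load_model(Path("encoder.pt"), device="cpu")
+#   wav = preprocess_wav("speech.wav")   # resample to 16 kHz, normalize, trim silence
+#   embed = embed_utterance(wav)         # (256,) L2-normalized float32 vector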
+
+@torch.no_grad()
+def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs):
+ # This torch version is designed to cope with a batch of wavs of the same length
+ if not using_partials:
+ frames = wav_to_mel_spectrogram_batch(wavs)
+ embeds = embed_frames_batch(frames)
+ if return_partials:
+ return embeds, None, None
+ return embeds
+
+ wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs)
+ max_wave_length = wave_slices[-1].stop
+ if max_wave_length >= wavs.shape[-1]:
+ wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]),
+ dtype=wavs.dtype, device=wavs.device)], 1)
+
+ frames = wav_to_mel_spectrogram_batch(wavs)
+ frames_batch = []
+ for i in range(len(frames)):
+ frames_batch += [frames[i][s] for s in mel_slices]
+ frames_batch = torch.stack(frames_batch, 0)
+ partial_embeds = embed_frames_batch(frames_batch, use_torch=True)
+ partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1)
+
+ raw_embeds = torch.mean(partial_embeds, dim=1, keepdim=False)
+ embeds = raw_embeds / torch.linalg.norm(raw_embeds, dim=-1, keepdim=True)
+
+ if return_partials:
+ return embeds, partial_embeds, wave_slices
+ return embeds
+
+
+def embed_speaker(wavs, **kwargs):
+ raise NotImplementedError()
+
+
+def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
+ if ax is None:
+ ax = plt.gca()
+
+ if shape is None:
+ height = int(np.sqrt(len(embed)))
+ shape = (height, -1)
+ embed = embed.reshape(shape)
+
+ cmap = cm.get_cmap()
+ mappable = ax.imshow(embed, cmap=cmap)
+ cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
+ mappable.set_clim(*color_range)  # Colorbar.set_clim was removed in newer matplotlib
+
+ ax.set_xticks([]), ax.set_yticks([])
+ ax.set_title(title)
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d246bc359ce1ffc6229ba8a4ced24d07b77e703
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py
@@ -0,0 +1,137 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from .params_model import *
+from .params_data import *
+from scipy.interpolate import interp1d
+from sklearn.metrics import roc_curve
+from torch.nn.utils import clip_grad_norm_
+from scipy.optimize import brentq
+from torch import nn
+import numpy as np
+import torch
+
+
+class SpeakerEncoder(nn.Module):
+ def __init__(self, device, loss_device):
+ super().__init__()
+ self.loss_device = loss_device
+
+ # Network definition
+ self.lstm = nn.LSTM(input_size=mel_n_channels,
+ hidden_size=model_hidden_size,
+ num_layers=model_num_layers,
+ batch_first=True).to(device)
+ self.linear = nn.Linear(in_features=model_hidden_size,
+ out_features=model_embedding_size).to(device)
+ self.relu = torch.nn.ReLU().to(device)
+
+ # Cosine similarity scaling (with fixed initial parameter values)
+ self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
+ self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
+
+ # Loss
+ self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
+
+ def do_gradient_ops(self):
+ # Gradient scale
+ self.similarity_weight.grad *= 0.01
+ self.similarity_bias.grad *= 0.01
+
+ # Gradient clipping
+ clip_grad_norm_(self.parameters(), 3, norm_type=2)
+
+ def forward(self, utterances, hidden_init=None):
+ """
+ Computes the embeddings of a batch of utterance spectrograms.
+
+ :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
+ (batch_size, n_frames, n_channels)
+ :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
+ batch_size, hidden_size). Will default to a tensor of zeros if None.
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size)
+ """
+ # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
+ # and the final cell state.
+ out, (hidden, cell) = self.lstm(utterances, hidden_init)
+
+ # We take only the hidden state of the last layer
+ embeds_raw = self.relu(self.linear(hidden[-1]))
+
+ # L2-normalize it
+ embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+ return embeds
+
+ def similarity_matrix(self, embeds):
+ """
+ Computes the similarity matrix according to section 2.1 of GE2E.
+
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, embedding_size)
+ :return: the similarity matrix as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, speakers_per_batch)
+ """
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
+
+ # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
+ centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
+ centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True)
+
+ # Exclusive centroids (1 per utterance)
+ centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
+ centroids_excl /= (utterances_per_speaker - 1)
+ centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True)
+
+ # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
+ # product of these vectors (which is just an element-wise multiplication reduced by a sum).
+ # We vectorize the computation for efficiency.
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
+ speakers_per_batch).to(self.loss_device)
+ mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)  # np.int was removed from recent NumPy
+ for j in range(speakers_per_batch):
+ mask = np.where(mask_matrix[j])[0]
+ sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
+ sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
+
+ ## Even more vectorized version (slower maybe because of transpose)
+ # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
+ # ).to(self.loss_device)
+ # eye = np.eye(speakers_per_batch, dtype=np.int)
+ # mask = np.where(1 - eye)
+ # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
+ # mask = np.where(eye)
+ # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
+ # sim_matrix2 = sim_matrix2.transpose(1, 2)
+
+ sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
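+ # Editor's note: with the default batch (64 speakers x 10 utterances) the
+ # matrix has shape (64, 10, 64); entry [j, i, k] compares utterance i of
+ # speaker j with speaker k's centroid, using the exclusive (leave-one-out)
+ # centroid when k == j, as prescribed by GE2E section 2.1.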
+ return sim_matrix
+
+ def loss(self, embeds):
+ """
+ Computes the softmax loss according to section 2.1 of GE2E.
+
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, embedding_size)
+ :return: the loss and the EER for this batch of embeddings.
+ """
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
+
+ # Loss
+ sim_matrix = self.similarity_matrix(embeds)
+ sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
+ speakers_per_batch))
+ ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
+ target = torch.from_numpy(ground_truth).long().to(self.loss_device)
+ loss = self.loss_fn(sim_matrix, target)
+
+ # EER (not backpropagated)
+ with torch.no_grad():
+ inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
+ labels = np.array([inv_argmax(i) for i in ground_truth])
+ preds = sim_matrix.detach().cpu().numpy()
+
+ # Snippet from https://yangcha.github.io/EER-ROC/
+ fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
+ eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
+
+ return loss, eer
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..62d04121aed3d7862889ad6c771055db9b74ab6e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py
@@ -0,0 +1,30 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+## Mel-filterbank
+mel_window_length = 25 # In milliseconds
+mel_window_step = 10 # In milliseconds
+mel_n_channels = 40
+
+
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160 # 1600 ms
+# Number of spectrogram frames at inference
+inference_n_frames = 80 # 800 ms
+
+
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30 # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+
+
+## Audio volume normalization
+audio_norm_target_dBFS = -30
+
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c535205028bfec75ba7c58ea7e750ba3fff1633
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py
@@ -0,0 +1,12 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+## Model parameters
+model_hidden_size = 256
+model_embedding_size = 256
+model_num_layers = 3
+
+
+## Training parameters
+learning_rate_init = 1e-4
+speakers_per_batch = 64
+utterances_per_speaker = 10
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..c59165a54e509fa63793fb1503bc6d6e346c741e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py
@@ -0,0 +1,177 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from multiprocess.pool import ThreadPool
+from .params_data import *
+from .config import librispeech_datasets, anglophone_nationalites
+from datetime import datetime
+from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch
+from pathlib import Path
+from tqdm import tqdm
+import numpy as np
+
+
+class DatasetLog:
+ """
+ Registers metadata about the dataset in a text file.
+ """
+ def __init__(self, root, name):
+ self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
+ self.sample_data = dict()
+
+ start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+ self.write_line("Creating dataset %s on %s" % (name, start_time))
+ self.write_line("-----")
+ self._log_params()
+
+ def _log_params(self):
+ from . import params_data  # relative import; this file lives inside the encoder package
+ self.write_line("Parameter values:")
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+ value = getattr(params_data, param_name)
+ self.write_line("\t%s: %s" % (param_name, value))
+ self.write_line("-----")
+
+ def write_line(self, line):
+ self.text_file.write("%s\n" % line)
+
+ def add_sample(self, **kwargs):
+ for param_name, value in kwargs.items():
+ if not param_name in self.sample_data:
+ self.sample_data[param_name] = []
+ self.sample_data[param_name].append(value)
+
+ def finalize(self):
+ self.write_line("Statistics:")
+ for param_name, values in self.sample_data.items():
+ self.write_line("\t%s:" % param_name)
+ self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
+ self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
+ self.write_line("-----")
+ end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+ self.write_line("Finished on %s" % end_time)
+ self.text_file.close()
+
+
+def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
+ dataset_root = datasets_root.joinpath(dataset_name)
+ if not dataset_root.exists():
+ print("Couldn\'t find %s, skipping this dataset." % dataset_root)
+ return None, None
+ return dataset_root, DatasetLog(out_dir, dataset_name)
+
+
+def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
+ skip_existing, logger):
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
+
+ # Function to preprocess utterances for one speaker
+ def preprocess_speaker(speaker_dir: Path):
+ # Give a name to the speaker that includes its dataset
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+
+ # Create an output directory with that name, as well as a txt file containing a
+ # reference to each source file.
+ speaker_out_dir = out_dir.joinpath(speaker_name)
+ speaker_out_dir.mkdir(exist_ok=True)
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+
+ # There's a possibility that the preprocessing was interrupted earlier, check if
+ # there already is a sources file.
+ if sources_fpath.exists():
+ try:
+ with sources_fpath.open("r") as sources_file:
+ existing_fnames = {line.split(",")[0] for line in sources_file}
+ except OSError:
+ existing_fnames = set()
+ else:
+ existing_fnames = set()
+
+ # Gather all audio files for that speaker recursively
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+ # Check if the target output file already exists
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
+ if skip_existing and out_fname in existing_fnames:
+ continue
+
+ # Load and preprocess the waveform
+ wav = preprocess_wav(in_fpath)
+ if len(wav) == 0:
+ continue
+
+ # Create the mel spectrogram, discard those that are too short
+ frames = wav_to_mel_spectrogram(wav)
+ if len(frames) < partials_n_frames:
+ continue
+
+ out_fpath = speaker_out_dir.joinpath(out_fname)
+ np.save(out_fpath, frames)
+ logger.add_sample(duration=len(wav) / sampling_rate)
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+
+ sources_file.close()
+
+ # Process the utterances for each speaker
+ with ThreadPool(8) as pool:
+ list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
+ unit="speakers"))
+ logger.finalize()
+ print("Done preprocessing %s.\n" % dataset_name)
+
+
+def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
+ for dataset_name in librispeech_datasets["train"]["other"]:
+ # Initialize the preprocessing
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Preprocess all speakers
+ speaker_dirs = list(dataset_root.glob("*"))
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
+ skip_existing, logger)
+
+
+def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
+ # Initialize the preprocessing
+ dataset_name = "VoxCeleb1"
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Get the contents of the meta file
+ with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
+ metadata = [line.split("\t") for line in metafile][1:]
+
+ # Select the ID and the nationality, filter out non-anglophone speakers
+ nationalities = {line[0]: line[3] for line in metadata}
+ keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
+ nationality.lower() in anglophone_nationalites]
+ print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
+ (len(keep_speaker_ids), len(nationalities)))
+
+ # Get the speaker directories for anglophone speakers only
+ speaker_dirs = dataset_root.joinpath("wav").glob("*")
+ speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
+ speaker_dir.name in keep_speaker_ids]
+ print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
+ (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
+
+ # Preprocess all speakers
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
+ skip_existing, logger)
+
+
+def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
+ # Initialize the preprocessing
+ dataset_name = "VoxCeleb2"
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Get the speaker directories
+ # Preprocess all speakers
+ speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
+ skip_existing, logger)
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..250d038a33b72d09dfe67811c917708aa0ea6714
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py
@@ -0,0 +1,127 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from .visualizations import Visualizations
+from .data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
+from .params_model import *
+from .model import SpeakerEncoder
+from .utils.profiler import Profiler
+from pathlib import Path
+import torch
+
+def sync(device: torch.device):
+ # FIXME
+ return
+ # For correct profiling (cuda operations are async)
+ if device.type == "cuda":
+ torch.cuda.synchronize(device)
+
+def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
+ backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
+ no_visdom: bool):
+ # Create a dataset and a dataloader
+ dataset = SpeakerVerificationDataset(clean_data_root)
+ loader = SpeakerVerificationDataLoader(
+ dataset,
+ speakers_per_batch,
+ utterances_per_speaker,
+ num_workers=8,
+ )
+
+ # Setup the device on which to run the forward pass and the loss. These can be different,
+ # because the forward pass is faster on the GPU whereas the loss is often (depending on your
+ # hyperparameters) faster on the CPU.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # FIXME: currently, the gradient is None if loss_device is cuda
+ loss_device = torch.device("cpu")
+
+ # Create the model and the optimizer
+ model = SpeakerEncoder(device, loss_device)
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
+ init_step = 1
+
+ # Configure file path for the model
+ state_fpath = models_dir.joinpath(run_id + ".pt")
+ backup_dir = models_dir.joinpath(run_id + "_backups")
+
+ # Load any existing model
+ if not force_restart:
+ if state_fpath.exists():
+ print("Found existing model \"%s\", loading it and resuming training." % run_id)
+ checkpoint = torch.load(state_fpath)
+ init_step = checkpoint["step"]
+ model.load_state_dict(checkpoint["model_state"])
+ optimizer.load_state_dict(checkpoint["optimizer_state"])
+ optimizer.param_groups[0]["lr"] = learning_rate_init
+ else:
+ print("No model \"%s\" found, starting training from scratch." % run_id)
+ else:
+ print("Starting the training from scratch.")
+ model.train()
+
+ # Initialize the visualization environment
+ vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
+ vis.log_dataset(dataset)
+ vis.log_params()
+ device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
+ vis.log_implementation({"Device": device_name})
+
+ # Training loop
+ profiler = Profiler(summarize_every=10, disabled=False)
+ for step, speaker_batch in enumerate(loader, init_step):
+ profiler.tick("Blocking, waiting for batch (threaded)")
+
+ # Forward pass
+ inputs = torch.from_numpy(speaker_batch.data).to(device)
+ sync(device)
+ profiler.tick("Data to %s" % device)
+ embeds = model(inputs)
+ sync(device)
+ profiler.tick("Forward pass")
+ embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
+ loss, eer = model.loss(embeds_loss)
+ sync(loss_device)
+ profiler.tick("Loss")
+
+ # Backward pass
+ model.zero_grad()
+ loss.backward()
+ profiler.tick("Backward pass")
+ model.do_gradient_ops()
+ optimizer.step()
+ profiler.tick("Parameter update")
+
+ # Update visualizations
+ # learning_rate = optimizer.param_groups[0]["lr"]
+ vis.update(loss.item(), eer, step)
+
+ # Draw projections and save them to the backup folder
+ if umap_every != 0 and step % umap_every == 0:
+ print("Drawing and saving projections (step %d)" % step)
+ backup_dir.mkdir(exist_ok=True)
+ projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
+ embeds = embeds.detach().cpu().numpy()
+ vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
+ vis.save()
+
+ # Overwrite the latest version of the model
+ if save_every != 0 and step % save_every == 0:
+ print("Saving the model (step %d)" % step)
+ torch.save({
+ "step": step + 1,
+ "model_state": model.state_dict(),
+ "optimizer_state": optimizer.state_dict(),
+ }, state_fpath)
+
+ # Make a backup
+ if backup_every != 0 and step % backup_every == 0:
+ print("Making a backup (step %d)" % step)
+ backup_dir.mkdir(exist_ok=True)
+ backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
+ torch.save({
+ "step": step + 1,
+ "model_state": model.state_dict(),
+ "optimizer_state": optimizer.state_dict(),
+ }, backup_fpath)
+
+ profiler.tick("Extras (visualizations, saving)")
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py
@@ -0,0 +1 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6de50f3ec61f6b61798299726b13a1caa1638abb
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py
@@ -0,0 +1,42 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from pathlib import Path
+import numpy as np
+import argparse
+
+_type_priorities = [ # In decreasing order
+ Path,
+ str,
+ int,
+ float,
+ bool,
+]
+
+def _priority(o):
+ p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None)
+ if p is not None:
+ return p
+ p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None)
+ if p is not None:
+ return p
+ return len(_type_priorities)
+
+def print_args(args: argparse.Namespace, parser=None):
+ args = vars(args)
+ if parser is None:
+ priorities = list(map(_priority, args.values()))
+ else:
+ all_params = [a.dest for g in parser._action_groups for a in g._group_actions ]
+ priority = lambda p: all_params.index(p) if p in all_params else len(all_params)
+ priorities = list(map(priority, args.keys()))
+
+ pad = max(map(len, args.keys())) + 3
+ indices = np.lexsort((list(args.keys()), priorities))
+ items = list(args.items())
+
+ print("Arguments:")
+ for i in indices:
+ param, value = items[i]
+ print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value))
+ print("")
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py
new file mode 100644
index 0000000000000000000000000000000000000000..43de43e4c29821df5d20d8303ce491101a041a86
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py
@@ -0,0 +1,222 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+import numpy as np
+import math
+from scipy.special import expn
+from collections import namedtuple
+
+NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2")
+
+
+def profile_noise(noise, sampling_rate, window_size=0):
+ """
+ Creates a profile of the noise in a given waveform.
+
+ :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints.
+ :param sampling_rate: the sampling rate of the audio
+ :param window_size: the size of the window the logmmse algorithm operates on. A default value
+ will be picked if left as 0.
+ :return: a NoiseProfile object
+ """
+ noise, dtype = to_float(noise)
+ noise += np.finfo(np.float64).eps
+
+ if window_size == 0:
+ window_size = int(math.floor(0.02 * sampling_rate))
+
+ if window_size % 2 == 1:
+ window_size = window_size + 1
+
+ perc = 50
+ len1 = int(math.floor(window_size * perc / 100))
+ len2 = int(window_size - len1)
+
+ win = np.hanning(window_size)
+ win = win * len2 / np.sum(win)
+ n_fft = 2 * window_size
+
+ noise_mean = np.zeros(n_fft)
+ n_frames = len(noise) // window_size
+ for j in range(0, window_size * n_frames, window_size):
+ noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0))
+ noise_mu2 = (noise_mean / n_frames) ** 2
+
+ return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2)
+
+
+def denoise(wav, noise_profile: NoiseProfile, eta=0.15):
+ """
+ Cleans the noise from a speech waveform given a noise profile. The waveform must have the
+ same sampling rate as the one used to create the noise profile.
+
+ :param wav: a speech waveform as a numpy array of floats or ints.
+ :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of
+ the same) waveform.
+ :param eta: voice threshold for noise update. While the voice activation detection value is
+ below this threshold, the noise profile will be continuously updated throughout the audio.
+ Set to 0 to disable updating the noise profile.
+ :return: the clean wav as a numpy array of floats or ints of the same length.
+ """
+ wav, dtype = to_float(wav)
+ wav += np.finfo(np.float64).eps
+ p = noise_profile
+
+ nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2))
+ x_final = np.zeros(nframes * p.len2)
+
+ aa = 0.98
+ mu = 0.98
+ ksi_min = 10 ** (-25 / 10)
+
+ x_old = np.zeros(p.len1)
+ xk_prev = np.zeros(p.len1)
+ noise_mu2 = p.noise_mu2
+ for k in range(0, nframes * p.len2, p.len2):
+ insign = p.win * wav[k:k + p.window_size]
+
+ spec = np.fft.fft(insign, p.n_fft, axis=0)
+ sig = np.absolute(spec)
+ sig2 = sig ** 2
+
+ gammak = np.minimum(sig2 / noise_mu2, 40)
+
+ if xk_prev.all() == 0:
+ ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
+ else:
+ ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
+ ksi = np.maximum(ksi_min, ksi)
+
+ log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi)
+ vad_decision = np.sum(log_sigma_k) / p.window_size
+ if vad_decision < eta:
+ noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
+
+ a = ksi / (1 + ksi)
+ vk = a * gammak
+ ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
+ hw = a * np.exp(ei_vk)
+ sig = sig * hw
+ xk_prev = sig ** 2
+ xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0)
+ xi_w = np.real(xi_w)
+
+ x_final[k:k + p.len2] = x_old + xi_w[0:p.len1]
+ x_old = xi_w[p.len1:p.window_size]
+
+ output = from_float(x_final, dtype)
+ output = np.pad(output, (0, len(wav) - len(output)), mode="constant")
+ return output
+
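+# Editor's usage sketch (hypothetical variable names, not in the upstream file;
+# assumes the first second of the recording contains noise only):
+#   profile = profile_noise(wav[:sampling_rate], sampling_rate)
+#   clean = denoise(wav, profile)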
+
+## Alternative VAD algorithm to webrtcvad. It has the advantage of not requiring installation of
+## that darn package and it also works for any sampling rate. Maybe I'll eventually use it instead
+## of webrtcvad
+# def vad(wav, sampling_rate, eta=0.15, window_size=0):
+# """
+# TODO: fix doc
+# Creates a profile of the noise in a given waveform.
+#
+# :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints.
+# :param sampling_rate: the sampling rate of the audio
+# :param window_size: the size of the window the logmmse algorithm operates on. A default value
+# will be picked if left as 0.
+# :param eta: voice threshold for noise update. While the voice activation detection value is
+# below this threshold, the noise profile will be continuously updated throughout the audio.
+# Set to 0 to disable updating the noise profile.
+# """
+# wav, dtype = to_float(wav)
+# wav += np.finfo(np.float64).eps
+#
+# if window_size == 0:
+# window_size = int(math.floor(0.02 * sampling_rate))
+#
+# if window_size % 2 == 1:
+# window_size = window_size + 1
+#
+# perc = 50
+# len1 = int(math.floor(window_size * perc / 100))
+# len2 = int(window_size - len1)
+#
+# win = np.hanning(window_size)
+# win = win * len2 / np.sum(win)
+# n_fft = 2 * window_size
+#
+# wav_mean = np.zeros(n_fft)
+# n_frames = len(wav) // window_size
+# for j in range(0, window_size * n_frames, window_size):
+# wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0))
+# noise_mu2 = (wav_mean / n_frames) ** 2
+#
+# wav, dtype = to_float(wav)
+# wav += np.finfo(np.float64).eps
+#
+# nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2))
+# vad = np.zeros(nframes * len2, dtype=np.bool)
+#
+# aa = 0.98
+# mu = 0.98
+# ksi_min = 10 ** (-25 / 10)
+#
+# xk_prev = np.zeros(len1)
+# noise_mu2 = noise_mu2
+# for k in range(0, nframes * len2, len2):
+# insign = win * wav[k:k + window_size]
+#
+# spec = np.fft.fft(insign, n_fft, axis=0)
+# sig = np.absolute(spec)
+# sig2 = sig ** 2
+#
+# gammak = np.minimum(sig2 / noise_mu2, 40)
+#
+# if xk_prev.all() == 0:
+# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
+# else:
+# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
+# ksi = np.maximum(ksi_min, ksi)
+#
+# log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi)
+# vad_decision = np.sum(log_sigma_k) / window_size
+# if vad_decision < eta:
+# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
+# print(vad_decision)
+#
+# a = ksi / (1 + ksi)
+# vk = a * gammak
+# ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
+# hw = a * np.exp(ei_vk)
+# sig = sig * hw
+# xk_prev = sig ** 2
+#
+# vad[k:k + len2] = vad_decision >= eta
+#
+# vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant")
+# return vad
+
+
+def to_float(_input):
+ if _input.dtype == np.float64:
+ return _input, _input.dtype
+ elif _input.dtype == np.float32:
+ return _input.astype(np.float64), _input.dtype
+ elif _input.dtype == np.uint8:
+ return (_input - 128) / 128., _input.dtype
+ elif _input.dtype == np.int16:
+ return _input / 32768., _input.dtype
+ elif _input.dtype == np.int32:
+ return _input / 2147483648., _input.dtype
+ raise ValueError('Unsupported wave file format')
+
+
+def from_float(_input, dtype):
+ if dtype == np.float64:
+ return _input
+ elif dtype == np.float32:
+ return _input.astype(np.float32)
+ elif dtype == np.uint8:
+ return ((_input * 128) + 128).astype(np.uint8)
+ elif dtype == np.int16:
+ return (_input * 32768).astype(np.int16)
+ elif dtype == np.int32:
+ return (_input * 2147483648).astype(np.int32)
+ raise ValueError('Unsupported wave file format')
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0176f632b58dfde15e31c04e79543b629bd4499
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py
@@ -0,0 +1,47 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from time import perf_counter as timer
+from collections import OrderedDict
+import numpy as np
+
+
+class Profiler:
+ def __init__(self, summarize_every=5, disabled=False):
+ self.last_tick = timer()
+ self.logs = OrderedDict()
+ self.summarize_every = summarize_every
+ self.disabled = disabled
+
+ def tick(self, name):
+ if self.disabled:
+ return
+
+ # Log the time needed to execute that function
+ if not name in self.logs:
+ self.logs[name] = []
+ if len(self.logs[name]) >= self.summarize_every:
+ self.summarize()
+ self.purge_logs()
+ self.logs[name].append(timer() - self.last_tick)
+
+ self.reset_timer()
+
+ def purge_logs(self):
+ for name in self.logs:
+ self.logs[name].clear()
+
+ def reset_timer(self):
+ self.last_tick = timer()
+
+ def summarize(self):
+ n = max(map(len, self.logs.values()))
+ assert n == self.summarize_every
+ print("\nAverage execution time over %d steps:" % n)
+
+ name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()]
+ pad = max(map(len, name_msgs))
+ for name_msg, deltas in zip(name_msgs, self.logs.values()):
+ print(" %s mean: %4.0fms std: %4.0fms" %
+ (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000))
+ print("", flush=True)
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8b0ffc1f3c54d85158521cac6d09f05dd21de6d
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py
@@ -0,0 +1,180 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from .data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from datetime import datetime
+from time import perf_counter as timer
+import matplotlib.pyplot as plt
+import numpy as np
+# import webbrowser
+import visdom
+import umap
+
+colormap = np.array([
+ [76, 255, 0],
+ [0, 127, 70],
+ [255, 0, 0],
+ [255, 217, 38],
+ [0, 135, 255],
+ [165, 0, 165],
+ [255, 167, 255],
+ [0, 255, 255],
+ [255, 96, 38],
+ [142, 76, 0],
+ [33, 0, 127],
+ [0, 0, 0],
+ [183, 183, 183],
+], dtype=float) / 255  # np.float was removed from recent NumPy
+
+
+class Visualizations:
+ def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
+ # Tracking data
+ self.last_update_timestamp = timer()
+ self.update_every = update_every
+ self.step_times = []
+ self.losses = []
+ self.eers = []
+ print("Updating the visualizations every %d steps." % update_every)
+
+ # If visdom is disabled TODO: use a better paradigm for that
+ self.disabled = disabled
+ if self.disabled:
+ return
+
+ # Set the environment name
+ now = str(datetime.now().strftime("%d-%m %Hh%M"))
+ if env_name is None:
+ self.env_name = now
+ else:
+ self.env_name = "%s (%s)" % (env_name, now)
+
+ # Connect to visdom and open the corresponding window in the browser
+ try:
+ self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
+ except ConnectionError:
+ raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
+ "start it.")
+ # webbrowser.open("http://localhost:8097/env/" + self.env_name)
+
+ # Create the windows
+ self.loss_win = None
+ self.eer_win = None
+ # self.lr_win = None
+ self.implementation_win = None
+ self.projection_win = None
+ self.implementation_string = ""
+
+ def log_params(self):
+ if self.disabled:
+ return
+ from . import params_data
+ from . import params_model
+ param_string = "Model parameters:
"
+ for param_name in (p for p in dir(params_model) if not p.startswith("__")):
+ value = getattr(params_model, param_name)
+ param_string += "\t%s: %s
" % (param_name, value)
+ param_string += "Data parameters:
"
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+ value = getattr(params_data, param_name)
+ param_string += "\t%s: %s
" % (param_name, value)
+ self.vis.text(param_string, opts={"title": "Parameters"})
+
+ def log_dataset(self, dataset: SpeakerVerificationDataset):
+ if self.disabled:
+ return
+ dataset_string = ""
+ dataset_string += "Speakers: %s\n" % len(dataset.speakers)
+ dataset_string += "\n" + dataset.get_logs()
+ dataset_string = dataset_string.replace("\n", "<br>")
+ self.vis.text(dataset_string, opts={"title": "Dataset"})
+
+ def log_implementation(self, params):
+ if self.disabled:
+ return
+ implementation_string = ""
+ for param, value in params.items():
+ implementation_string += "%s: %s\n" % (param, value)
+ implementation_string = implementation_string.replace("\n", "<br>")
+ self.implementation_string = implementation_string
+ self.implementation_win = self.vis.text(
+ implementation_string,
+ opts={"title": "Training implementation"}
+ )
+
+ def update(self, loss, eer, step):
+ # Update the tracking data
+ now = timer()
+ self.step_times.append(1000 * (now - self.last_update_timestamp))
+ self.last_update_timestamp = now
+ self.losses.append(loss)
+ self.eers.append(eer)
+ print(".", end="")
+
+ # Update the plots every <update_every> steps
+ if step % self.update_every != 0:
+ return
+ time_string = "Step time: mean: %5dms std: %5dms" % \
+ (int(np.mean(self.step_times)), int(np.std(self.step_times)))
+ print("\nStep %6d Loss: %.4f EER: %.4f %s" %
+ (step, np.mean(self.losses), np.mean(self.eers), time_string))
+ if not self.disabled:
+ self.loss_win = self.vis.line(
+ [np.mean(self.losses)],
+ [step],
+ win=self.loss_win,
+ update="append" if self.loss_win else None,
+ opts=dict(
+ legend=["Avg. loss"],
+ xlabel="Step",
+ ylabel="Loss",
+ title="Loss",
+ )
+ )
+ self.eer_win = self.vis.line(
+ [np.mean(self.eers)],
+ [step],
+ win=self.eer_win,
+ update="append" if self.eer_win else None,
+ opts=dict(
+ legend=["Avg. EER"],
+ xlabel="Step",
+ ylabel="EER",
+ title="Equal error rate"
+ )
+ )
+ if self.implementation_win is not None:
+ self.vis.text(
+ self.implementation_string + ("%s" % time_string),
+ win=self.implementation_win,
+ opts={"title": "Training implementation"},
+ )
+
+ # Reset the tracking
+ self.losses.clear()
+ self.eers.clear()
+ self.step_times.clear()
+
+ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
+ max_speakers=10):
+ max_speakers = min(max_speakers, len(colormap))
+ embeds = embeds[:max_speakers * utterances_per_speaker]
+
+ n_speakers = len(embeds) // utterances_per_speaker
+ ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
+ colors = [colormap[i] for i in ground_truth]
+
+ reducer = umap.UMAP()
+ projected = reducer.fit_transform(embeds)
+ plt.scatter(projected[:, 0], projected[:, 1], c=colors)
+ plt.gca().set_aspect("equal", "datalim")
+ plt.title("UMAP projection (step %d)" % step)
+ if not self.disabled:
+ self.projection_win = self.vis.matplot(plt, win=self.projection_win)
+ if out_fpath is not None:
+ plt.savefig(out_fpath)
+ plt.clf()
+
+ def save(self):
+ if not self.disabled:
+ self.vis.save([self.env_name])
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..1878ce622f8077b5a50d950e6a25cfad13b84fb5
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py
@@ -0,0 +1,76 @@
+import yaml
+import torch
+from diffusers import DDIMScheduler
+from .model.p2e_cross import P2E_Cross
+from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
+
+
+class DreamVG(object):
+ def __init__(self,
+ config_path='configs/plugin_cross.yaml',
+ ckpt_path='../ckpts/dreamvc_plugin.pt',
+ device='cpu'):
+
+ with open(config_path, 'r') as fp:
+ config = yaml.safe_load(fp)
+
+ self.device = device
+ self.model = P2E_Cross(config['model']).to(device)
+ self.model.load_state_dict(torch.load(ckpt_path)['model'])
+ self.model.eval()
+
+ noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
+ beta_start=config['scheduler']['beta_start'],
+ beta_end=config['scheduler']['beta_end'],
+ rescale_betas_zero_snr=True,
+ timestep_spacing="trailing",
+ clip_sample=False,
+ prediction_type='v_prediction')
+ self.noise_scheduler = noise_scheduler
+ self.scale = config['scheduler']['scale']
+ self.shift = config['scheduler']['shift']
+ self.spk_shape = config['model']['unet']['in_channels']
+
+ @torch.no_grad()
+ def inference(self, text,
+ guidance_scale=5, guidance_rescale=0.7,
+ ddim_steps=50, eta=1, random_seed=2023,
+ ):
+ text, text_mask = text
+ self.model.eval()
+
+ gen_shape = (1, self.spk_shape)
+
+ if random_seed is not None:
+ generator = torch.Generator(device=self.device).manual_seed(random_seed)
+ else:
+ generator = torch.Generator(device=self.device)
+ generator.seed()
+
+ self.noise_scheduler.set_timesteps(ddim_steps)
+
+ # init noise
+ noise = torch.randn(gen_shape, generator=generator, device=self.device)
+ latents = noise
+
+ for t in self.noise_scheduler.timesteps:
+ latents = self.noise_scheduler.scale_model_input(latents, t)
+
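+ # Editor's note: standard classifier-free guidance -- one pass conditioned
+ # on the text prompt and one pass with the condition dropped (train_cfg=True,
+ # cfg_prob=1.0 presumably zeroes the text branch in P2E_Cross); the two are
+ # mixed by guidance_scale and optionally rescaled via rescale_noise_cfg
+ # (arXiv:2305.08891).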
+ if guidance_scale:
+ output_text = self.model(latents, t, text, text_mask, train_cfg=False)
+ output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)
+
+ output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+ if guidance_rescale > 0.0:
+ output_pred = rescale_noise_cfg(output_pred, output_text,
+ guidance_rescale=guidance_rescale)
+ else:
+ output_pred = self.model(latents, t, text, text_mask, train_cfg=False)
+
+ latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
+ eta=eta, generator=generator).prev_sample
+
+ # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5)
+ pred = scale_shift_re(latents, 1/self.scale, self.shift)
+ # pred = torch.clip(pred, min=0.0, max=0.5)
+ return pred
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/train_plugin.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/train_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/train_vc.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/train_vc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..90f60fdd89ad8575faafe45188bd1d968852fc67
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py
@@ -0,0 +1 @@
+from .utils import *
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e1c10f81868cda758c332b8abe826634a13610a
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py
@@ -0,0 +1,76 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.io import wavfile
+import torch
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+ """
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+ """
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+ # rescale the results from guidance (fixes overexposure)
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+ return noise_cfg
+
+
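+# Example (illustrative, variable names are placeholders): combine conditional and
+# unconditional predictions with CFG, then pull the result back toward the
+# conditional branch's statistics:
+#   pred = uncond + guidance_scale * (cond - uncond)
+#   pred = rescale_noise_cfg(pred, cond, guidance_rescale=0.7)
+
+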
+def scale_shift(x, scale, shift):
+ return (x+shift) * scale
+
+
+def scale_shift_re(x, scale, shift):
+ return (x/scale) - shift
+
+
+def align_seq(source, target_length, mapping_method='hard'):
+    source_len = source.shape[1]
+    if mapping_method == 'hard':
+        # nearest-neighbour frame mapping; indices must be integers and stay in range
+        mapping_idx = np.round(np.arange(target_length) * source_len / target_length).astype(int)
+        mapping_idx = np.clip(mapping_idx, 0, source_len - 1)
+        output = source[:, mapping_idx]
+    else:
+        # TBD
+        raise NotImplementedError
+
+    return output
+
+
+def save_plot(tensor, savepath):
+ tensor = tensor.squeeze().cpu()
+ plt.style.use('default')
+ fig, ax = plt.subplots(figsize=(12, 3))
+ im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none')
+ plt.colorbar(im, ax=ax)
+ plt.tight_layout()
+ fig.canvas.draw()
+ plt.savefig(savepath)
+ plt.close()
+
+
+def save_audio(file_path, sampling_rate, audio):
+ audio = np.clip(audio.cpu().squeeze().numpy(), -0.999, 0.999)
+ wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16"))
+
+
+def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor:
+ tensor = torch.clip(tensor, vmin, vmax)
+ tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1
+ return tensor
+
+
+def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor:
+ tensor = torch.clip(tensor, -1.0, 1.0)
+ tensor = (tensor + 1) / 2
+ tensor = tensor * (vmax - vmin) + vmin
+ return tensor
+
+
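+# Note: minmax_norm_diff maps values in [vmin, vmax] to [-1, 1];
+# reverse_minmax_norm_diff is its (clipped) inverse.
+
+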
+if __name__ == "__main__":
+
+ a = torch.rand(2, 10)
+ target_len = 15
+
+ b = align_seq(a, target_len)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd3b7f73ffaf1fb97edd55bce29850a2cc21cfd3
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py
@@ -0,0 +1,144 @@
+import yaml
+import torch
+from diffusers import DDIMScheduler
+from .model.model import DiffVC
+from .model.model_cross import DiffVC_Cross
+from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
+
+
+class ReDiffVC(object):
+ def __init__(self,
+ config_path='configs/diffvc_base.yaml',
+ ckpt_path='../ckpts/dreamvc_base.pt',
+ device='cpu'):
+
+ with open(config_path, 'r') as fp:
+ config = yaml.safe_load(fp)
+
+ self.device = device
+ self.model = DiffVC(config['model']).to(device)
+ self.model.load_state_dict(torch.load(ckpt_path)['model'])
+ self.model.eval()
+
+ noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
+ beta_start=config['scheduler']['beta_start'],
+ beta_end=config['scheduler']['beta_end'],
+ rescale_betas_zero_snr=True,
+ timestep_spacing="trailing",
+ clip_sample=False,
+ prediction_type='v_prediction')
+ self.noise_scheduler = noise_scheduler
+ self.scale = config['scheduler']['scale']
+ self.shift = config['scheduler']['shift']
+ self.melshape = config['model']['unet']['sample_size'][0]
+
+ @torch.no_grad()
+ def inference(self,
+ spk_embed, content_clip, f0_clip=None,
+ guidance_scale=3, guidance_rescale=0.7,
+ ddim_steps=50, eta=1, random_seed=2023):
+
+ self.model.eval()
+ if random_seed is not None:
+ generator = torch.Generator(device=self.device).manual_seed(random_seed)
+ else:
+ generator = torch.Generator(device=self.device)
+ generator.seed()
+
+ self.noise_scheduler.set_timesteps(ddim_steps)
+
+ # init noise
+ gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
+ noise = torch.randn(gen_shape, generator=generator, device=self.device)
+ latents = noise
+
+ for t in self.noise_scheduler.timesteps:
+ latents = self.noise_scheduler.scale_model_input(latents, t)
+
+ if guidance_scale:
+ output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
+ output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True,
+ speaker_cfg=1.0, pitch_cfg=0.0)
+
+ output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+ if guidance_rescale > 0.0:
+ output_pred = rescale_noise_cfg(output_pred, output_text,
+ guidance_rescale=guidance_rescale)
+ else:
+ output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
+
+ latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
+ eta=eta, generator=generator).prev_sample
+
+ pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
+ return pred
+
+
+class DreamVC(object):
+ def __init__(self,
+ config_path='configs/diffvc_cross.yaml',
+ ckpt_path='../ckpts/dreamvc_cross.pt',
+ device='cpu'):
+
+ with open(config_path, 'r') as fp:
+ config = yaml.safe_load(fp)
+
+ self.device = device
+ self.model = DiffVC_Cross(config['model']).to(device)
+ self.model.load_state_dict(torch.load(ckpt_path)['model'])
+ self.model.eval()
+
+ noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
+ beta_start=config['scheduler']['beta_start'],
+ beta_end=config['scheduler']['beta_end'],
+ rescale_betas_zero_snr=True,
+ timestep_spacing="trailing",
+ clip_sample=False,
+ prediction_type='v_prediction')
+ self.noise_scheduler = noise_scheduler
+ self.scale = config['scheduler']['scale']
+ self.shift = config['scheduler']['shift']
+ self.melshape = config['model']['unet']['sample_size'][0]
+
+ @torch.no_grad()
+ def inference(self,
+ text, content_clip, f0_clip=None,
+ guidance_scale=3, guidance_rescale=0.7,
+ ddim_steps=50, eta=1, random_seed=2023):
+
+ text, text_mask = text
+ self.model.eval()
+ if random_seed is not None:
+ generator = torch.Generator(device=self.device).manual_seed(random_seed)
+ else:
+ generator = torch.Generator(device=self.device)
+ generator.seed()
+
+ self.noise_scheduler.set_timesteps(ddim_steps)
+
+ # init noise
+ gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
+ noise = torch.randn(gen_shape, generator=generator, device=self.device)
+ latents = noise
+
+ for t in self.noise_scheduler.timesteps:
+ latents = self.noise_scheduler.scale_model_input(latents, t)
+
+ if guidance_scale:
+ output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
+ output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True,
+ speaker_cfg=1.0, pitch_cfg=0.0)
+
+ output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+ if guidance_rescale > 0.0:
+ output_pred = rescale_noise_cfg(output_pred, output_text,
+ guidance_rescale=guidance_rescale)
+ else:
+ output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
+
+ latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
+ eta=eta, generator=generator).prev_sample
+
+ pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
+ return pred
+
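+
+# Minimal usage sketch (paths and tensors below are illustrative placeholders):
+#   vc = ReDiffVC(config_path='configs/diffvc_base.yaml',
+#                 ckpt_path='../ckpts/dreamvc_base.pt', device='cuda')
+#   mel = vc.inference(spk_embed, content_clip, f0_clip=None,
+#                      guidance_scale=3, ddim_steps=50)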
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/stream.py b/dreamvoice/train_utils/prepare_freevc/freevc/stream.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e83847c3ed3e2db37c1adcef4c635b4ea30ebd0
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/stream.py
@@ -0,0 +1,158 @@
+import os
+import torch
+import torch.nn.functional as F
+import librosa
+import sounddevice as sd
+from transformers import WavLMModel
+from scipy.io.wavfile import write
+from models import SynthesizerTrn
+from speaker_encoder.voice_encoder import SpeakerEncoder
+import utils
+import numpy as np
+from transformers import T5Tokenizer, T5EncoderModel
+from src.plugin_wrapper import DreamVG
+
+
+# Load configurations and models
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+print("Loading FreeVC...")
+hps = utils.get_hparams_from_file("configs/freevc.json")
+freevc = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+freevc.eval()
+utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
+
+print("Loading Speaker Encoder...")
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+
+print("Loading WavLM for content...")
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+
+lm_path = 'google/flan-t5-base'
+tokenizer = T5Tokenizer.from_pretrained(lm_path)
+text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()
+
+dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
+ ckpt_path='checkpoints/dreamvc_plugin.pt',
+ device=device)
+
+
+# Constants for overlap-add
+CHUNK_SIZE = 47040
+OVERLAP = 960
+BUFFER_SIZE = OVERLAP + CHUNK_SIZE
+fade_size = OVERLAP
+HANN_WINDOW = np.ones(BUFFER_SIZE)
+HANN_WINDOW[:fade_size] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))
+HANN_WINDOW[-fade_size:] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))[::-1]
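+# The raised-cosine fade-in/fade-out above is used to crossfade consecutive chunks in
+# the overlap-add loop below, so that chunk boundaries do not click.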
+
+# Initialize buffers
+input_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
+output_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
+
+
+@torch.no_grad()
+def convert_realtime_with_buffers(audio_chunk, tgt_embedding, freevc, cmodel):
+ """Process audio in chunks with overlap and manage input/output buffers."""
+ global input_buffer, output_buffer, HANN_WINDOW, BUFFER_SIZE, CHUNK_SIZE
+
+ # Add incoming audio chunk to input buffer
+ input_buffer[:OVERLAP] = input_buffer[-OVERLAP:]
+ input_buffer[OVERLAP:] = audio_chunk
+
+ # Downsample to 16,000 Hz
+ chunk = input_buffer
+ chunk = librosa.resample(chunk, orig_sr=48000, target_sr=16000)
+
+ # Convert to tensor and pad
+ chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device).float()
+ chunk_tensor = F.pad(chunk_tensor, (40, 40))
+
+ # Extract content features using WavLM
+ c = cmodel(chunk_tensor).last_hidden_state.transpose(1, 2).to(device)
+
+ # Generate converted audio using FreeVC
+ audio = freevc.infer(c, g=tgt_embedding)
+ audio = audio[0][0].data.cpu().float().numpy()
+
+ # Upsample back to 48,000 Hz
+ audio = librosa.resample(audio, orig_sr=16000, target_sr=48000)
+
+ # Apply Hann window to the output
+ windowed_output = audio * HANN_WINDOW
+
+ # Add the new processed audio to the output buffer with overlap
+ output_buffer[:OVERLAP] = output_buffer[-OVERLAP:]
+ output_buffer[OVERLAP:] = 0
+ output_buffer += windowed_output
+
+ normalization_factors = np.zeros(BUFFER_SIZE)
+ normalization_factors[:OVERLAP] += HANN_WINDOW[-OVERLAP:]
+ normalization_factors += HANN_WINDOW
+ normalization_factors = np.clip(normalization_factors, 1e-6, None)
+ # output_buffer[:CHUNK_SIZE] = output_buffer[:CHUNK_SIZE] / normalization_factors[:CHUNK_SIZE]
+
+ return output_buffer[:CHUNK_SIZE]
+
+
+def prepare_target_embedding(tgt_audio_path):
+ """Preprocess target audio and get speaker embedding."""
+ wav_tgt, _ = librosa.load(tgt_audio_path, sr=16000)
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+ g_tgt = smodel.embed_utterance(wav_tgt)
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+ return g_tgt
+
+
+# Prepare the target speaker embedding
+# target_audio = "p225_001.wav" # Target speaker audio
+# target_embedding = prepare_target_embedding(target_audio)
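+# Instead of a reference recording, derive the target speaker embedding from a text
+# prompt: encode the prompt with T5, then sample an embedding with the DreamVG plugin.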
+prompt = "A young girl voice, very cute"
+prompt_guidance_scale = 3.0
+
+text_batch = tokenizer(prompt, max_length=32,
+ padding='max_length', truncation=True, return_tensors="pt")
+text, text_mask = text_batch.input_ids.to(device), \
+ text_batch.attention_mask.to(device)
+text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
+target_embedding = dreamvg.inference([text, text_mask],
+ guidance_scale=prompt_guidance_scale,
+ guidance_rescale=0.0,
+ ddim_steps=100, eta=1,
+ random_seed=None)
+
+# Stream settings
+SAMPLING_RATE = 48000
+INPUT_DEVICE = 69
+OUTPUT_DEVICE = 58
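+# NOTE: the device indices above are machine-specific; list available devices with
+# sd.query_devices() and adjust INPUT_DEVICE / OUTPUT_DEVICE accordingly.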
+
+
+def audio_callback(indata, outdata, frames, time, status):
+ """Callback function for real-time audio processing with input and output buffers."""
+ global input_buffer, output_buffer
+
+ if status:
+ print(f"Status: {status}")
+ # Reshape and process input audio
+ indata = indata[:, 0] # Mono input
+ converted_audio = convert_realtime_with_buffers(indata, target_embedding, freevc, cmodel)
+ # Write the converted audio to the output stream
+ outdata[:] = converted_audio.reshape(-1, 1)
+
+
+# Start the audio stream with the updated callback
+with sd.Stream(
+ samplerate=SAMPLING_RATE,
+ blocksize=CHUNK_SIZE,
+ channels=1,
+ dtype='float32',
+ latency='low',
+ device=(INPUT_DEVICE, OUTPUT_DEVICE),
+ callback=audio_callback):
+ try:
+ sd.sleep(1000000)
+ except KeyboardInterrupt:
+ print("Voice conversion stopped.")
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff09995743b34dc0c96c81a5fc0ae72c3eda5843
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/freevc/utils.py
@@ -0,0 +1,305 @@
+import os
+import sys
+import glob
+import argparse
+import logging
+import json
+import subprocess
+import numpy as np
+import torch
+import torchvision
+from scipy.io.wavfile import read
+from torch.nn import functional as F
+from commons import sequence_mask
+
+MATPLOTLIB_FLAG = False
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logger = logging
+
+
+def get_cmodel(rank):
+  # WavLM and WavLMConfig are expected to come from the local `wavlm` package
+  # shipped with FreeVC.
+  from wavlm import WavLM, WavLMConfig
+  checkpoint = torch.load('wavlm/WavLM-Large.pt')
+  cfg = WavLMConfig(checkpoint['cfg'])
+  cmodel = WavLM(cfg).cuda(rank)
+  cmodel.load_state_dict(checkpoint['model'])
+  cmodel.eval()
+  return cmodel
+
+
+def get_content(cmodel, y):
+ with torch.no_grad():
+ c = cmodel.extract_features(y.squeeze(1))[0]
+ c = c.transpose(1, 2)
+ return c
+
+
+def get_vocoder(rank):
+  # Requires the local `hifigan` package (HiFi-GAN generator) used by FreeVC.
+  import hifigan
+  with open("hifigan/config.json", "r") as f:
+    config = json.load(f)
+  config = hifigan.AttrDict(config)
+  vocoder = hifigan.Generator(config)
+  ckpt = torch.load("hifigan/generator_v1")
+  vocoder.load_state_dict(ckpt["generator"])
+  vocoder.eval()
+  vocoder.remove_weight_norm()
+  vocoder.cuda(rank)
+  return vocoder
+
+
+def transform(mel, height): # 68-92
+ #r = np.random.random()
+ #rate = r * 0.3 + 0.85 # 0.85-1.15
+ #height = int(mel.size(-2) * rate)
+ tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
+ if height >= mel.size(-2):
+ return tgt[:, :mel.size(-2), :]
+ else:
+ silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1)
+ silence += torch.randn_like(silence) / 10
+ return torch.cat((tgt, silence), 1)
+
+
+def stretch(mel, width): # 0.5-2
+ return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))
+
+
+def load_checkpoint(checkpoint_path, model, optimizer=None):
+ assert os.path.isfile(checkpoint_path)
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+ iteration = checkpoint_dict['iteration']
+ learning_rate = checkpoint_dict['learning_rate']
+ if optimizer is not None:
+ optimizer.load_state_dict(checkpoint_dict['optimizer'])
+ saved_state_dict = checkpoint_dict['model']
+ if hasattr(model, 'module'):
+ state_dict = model.module.state_dict()
+ else:
+ state_dict = model.state_dict()
+  new_state_dict = {}
+  for k, v in state_dict.items():
+    try:
+      new_state_dict[k] = saved_state_dict[k]
+    except KeyError:
+      logger.info("%s is not in the checkpoint" % k)
+      new_state_dict[k] = v
+ if hasattr(model, 'module'):
+ model.module.load_state_dict(new_state_dict)
+ else:
+ model.load_state_dict(new_state_dict)
+ logger.info("Loaded checkpoint '{}' (iteration {})" .format(
+ checkpoint_path, iteration))
+ return model, optimizer, learning_rate, iteration
+
+
+def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
+ logger.info("Saving model and optimizer state at iteration {} to {}".format(
+ iteration, checkpoint_path))
+ if hasattr(model, 'module'):
+ state_dict = model.module.state_dict()
+ else:
+ state_dict = model.state_dict()
+ torch.save({'model': state_dict,
+ 'iteration': iteration,
+ 'optimizer': optimizer.state_dict(),
+ 'learning_rate': learning_rate}, checkpoint_path)
+
+
+def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
+ for k, v in scalars.items():
+ writer.add_scalar(k, v, global_step)
+ for k, v in histograms.items():
+ writer.add_histogram(k, v, global_step)
+ for k, v in images.items():
+ writer.add_image(k, v, global_step, dataformats='HWC')
+ for k, v in audios.items():
+ writer.add_audio(k, v, global_step, audio_sampling_rate)
+
+
+def latest_checkpoint_path(dir_path, regex="G_*.pth"):
+ f_list = glob.glob(os.path.join(dir_path, regex))
+ f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
+ x = f_list[-1]
+ print(x)
+ return x
+
+
+def plot_spectrogram_to_numpy(spectrogram):
+ global MATPLOTLIB_FLAG
+ if not MATPLOTLIB_FLAG:
+ import matplotlib
+ matplotlib.use("Agg")
+ MATPLOTLIB_FLAG = True
+ mpl_logger = logging.getLogger('matplotlib')
+ mpl_logger.setLevel(logging.WARNING)
+ import matplotlib.pylab as plt
+ import numpy as np
+
+ fig, ax = plt.subplots(figsize=(10,2))
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+ interpolation='none')
+ plt.colorbar(im, ax=ax)
+ plt.xlabel("Frames")
+ plt.ylabel("Channels")
+ plt.tight_layout()
+
+ fig.canvas.draw()
+  data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ plt.close()
+ return data
+
+
+def plot_alignment_to_numpy(alignment, info=None):
+ global MATPLOTLIB_FLAG
+ if not MATPLOTLIB_FLAG:
+ import matplotlib
+ matplotlib.use("Agg")
+ MATPLOTLIB_FLAG = True
+ mpl_logger = logging.getLogger('matplotlib')
+ mpl_logger.setLevel(logging.WARNING)
+ import matplotlib.pylab as plt
+ import numpy as np
+
+ fig, ax = plt.subplots(figsize=(6, 4))
+ im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
+ interpolation='none')
+ fig.colorbar(im, ax=ax)
+ xlabel = 'Decoder timestep'
+ if info is not None:
+ xlabel += '\n\n' + info
+ plt.xlabel(xlabel)
+ plt.ylabel('Encoder timestep')
+ plt.tight_layout()
+
+ fig.canvas.draw()
+  data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ plt.close()
+ return data
+
+
+def load_wav_to_torch(full_path):
+ sampling_rate, data = read(full_path)
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+
+
+def load_filepaths_and_text(filename, split="|"):
+ with open(filename, encoding='utf-8') as f:
+ filepaths_and_text = [line.strip().split(split) for line in f]
+ return filepaths_and_text
+
+
+def get_hparams(init=True):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
+ help='JSON file for configuration')
+ parser.add_argument('-m', '--model', type=str, required=True,
+ help='Model name')
+
+ args = parser.parse_args()
+ model_dir = os.path.join("./logs", args.model)
+
+ if not os.path.exists(model_dir):
+ os.makedirs(model_dir)
+
+ config_path = args.config
+ config_save_path = os.path.join(model_dir, "config.json")
+ if init:
+ with open(config_path, "r") as f:
+ data = f.read()
+ with open(config_save_path, "w") as f:
+ f.write(data)
+ else:
+ with open(config_save_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+ hparams = HParams(**config)
+ hparams.model_dir = model_dir
+ return hparams
+
+
+def get_hparams_from_dir(model_dir):
+ config_save_path = os.path.join(model_dir, "config.json")
+ with open(config_save_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+  hparams = HParams(**config)
+ hparams.model_dir = model_dir
+ return hparams
+
+
+def get_hparams_from_file(config_path):
+ with open(config_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+  hparams = HParams(**config)
+ return hparams
+
+
+def check_git_hash(model_dir):
+ source_dir = os.path.dirname(os.path.realpath(__file__))
+ if not os.path.exists(os.path.join(source_dir, ".git")):
+    logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
+ source_dir
+ ))
+ return
+
+ cur_hash = subprocess.getoutput("git rev-parse HEAD")
+
+ path = os.path.join(model_dir, "githash")
+ if os.path.exists(path):
+ saved_hash = open(path).read()
+ if saved_hash != cur_hash:
+      logger.warning("git hash values are different. {}(saved) != {}(current)".format(
+ saved_hash[:8], cur_hash[:8]))
+ else:
+ open(path, "w").write(cur_hash)
+
+
+def get_logger(model_dir, filename="train.log"):
+ global logger
+ logger = logging.getLogger(os.path.basename(model_dir))
+ logger.setLevel(logging.DEBUG)
+
+ formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
+ if not os.path.exists(model_dir):
+ os.makedirs(model_dir)
+ h = logging.FileHandler(os.path.join(model_dir, filename))
+ h.setLevel(logging.DEBUG)
+ h.setFormatter(formatter)
+ logger.addHandler(h)
+ return logger
+
+
+class HParams():
+ def __init__(self, **kwargs):
+ for k, v in kwargs.items():
+ if type(v) == dict:
+ v = HParams(**v)
+ self[k] = v
+
+ def keys(self):
+ return self.__dict__.keys()
+
+ def items(self):
+ return self.__dict__.items()
+
+ def values(self):
+ return self.__dict__.values()
+
+ def __len__(self):
+ return len(self.__dict__)
+
+ def __getitem__(self, key):
+ return getattr(self, key)
+
+ def __setitem__(self, key, value):
+ return setattr(self, key, value)
+
+ def __contains__(self, key):
+ return key in self.__dict__
+
+ def __repr__(self):
+ return self.__dict__.__repr__()
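+
+
+# Usage sketch (illustrative): nested dicts become attribute-accessible HParams, e.g.
+#   hps = get_hparams_from_file("configs/freevc.json")
+#   n_fft, hop = hps.data.filter_length, hps.data.hop_length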
diff --git a/dreamvoice/train_utils/prepare_freevc/get_dist.py b/dreamvoice/train_utils/prepare_freevc/get_dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9ad1dcbbc5a83c38ceb9101c5ae6cd744959f6e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/get_dist.py
@@ -0,0 +1,49 @@
+import os
+import torch
+import random
+import numpy as np
+
+
+# Function to recursively find all .pt files in a directory
+def find_pt_files(root_dir):
+ pt_files = []
+ for dirpath, _, filenames in os.walk(root_dir):
+ for file in filenames:
+ if file.endswith('.pt'):
+ pt_files.append(os.path.join(dirpath, file))
+ return pt_files
+
+
+# Function to compute statistics for a given tensor list
+def compute_statistics(tensor_list):
+ all_data = torch.cat(tensor_list)
+ mean = torch.mean(all_data).item()
+ std = torch.std(all_data).item()
+ max_val = torch.max(all_data).item()
+ min_val = torch.min(all_data).item()
+ return mean, std, max_val, min_val
+
+
+# Root directory containing .pt files in subfolders
+root_dir = "spk"
+
+# Find all .pt files
+pt_files = find_pt_files(root_dir)
+
+# Randomly sample 1000 .pt files (or fewer if less than 1000 files are available)
+sampled_files = random.sample(pt_files, min(1000, len(pt_files)))
+
+# Load tensors from sampled files
+tensor_list = []
+for file in sampled_files:
+ tensor = torch.load(file)
+ tensor_list.append(tensor.view(-1)) # Flatten the tensor
+
+# Compute statistics
+mean, std, max_val, min_val = compute_statistics(tensor_list)
+
+# Print the results
+print(f"Mean: {mean}")
+print(f"Std: {std}")
+print(f"Max: {max_val}")
+print(f"Min: {min_val}")
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/__init__.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb47c9e72f3364d8317b79a80ce62030d2403fd
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py
@@ -0,0 +1,107 @@
+from scipy.ndimage import binary_dilation
+from speaker_encoder.params_data import *
+from pathlib import Path
+from typing import Optional, Union
+import numpy as np
+import webrtcvad
+import librosa
+import struct
+
+int16_max = (2 ** 15) - 1
+
+
+def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
+ source_sr: Optional[int] = None):
+ """
+ Applies the preprocessing operations used in training the Speaker Encoder to a waveform
+ either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
+
+ :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+ just .wav), either the waveform as a numpy array of floats.
+ :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+ preprocessing. After preprocessing, the waveform's sampling rate will match the data
+ hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+ this argument will be ignored.
+ """
+ # Load the wav from disk if needed
+ if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+ wav, source_sr = librosa.load(fpath_or_wav, sr=None)
+ else:
+ wav = fpath_or_wav
+
+ # Resample the wav if needed
+ if source_sr is not None and source_sr != sampling_rate:
+        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
+ # Apply the preprocessing: normalize volume and shorten long silences
+ wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+ wav = trim_long_silences(wav)
+
+ return wav
+
+
+def wav_to_mel_spectrogram(wav):
+ """
+ Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
+ Note: this not a log-mel spectrogram.
+ """
+ frames = librosa.feature.melspectrogram(
+ y=wav,
+ sr=sampling_rate,
+ n_fft=int(sampling_rate * mel_window_length / 1000),
+ hop_length=int(sampling_rate * mel_window_step / 1000),
+ n_mels=mel_n_channels
+ )
+ return frames.astype(np.float32).T
+
+
+def trim_long_silences(wav):
+ """
+ Ensures that segments without voice in the waveform remain no longer than a
+ threshold determined by the VAD parameters in params.py.
+
+ :param wav: the raw waveform as a numpy array of floats
+ :return: the same waveform with silences trimmed away (length <= original wav length)
+ """
+ # Compute the voice detection window size
+ samples_per_window = (vad_window_length * sampling_rate) // 1000
+
+ # Trim the end of the audio to have a multiple of the window size
+ wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+
+ # Convert the float waveform to 16-bit mono PCM
+ pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+
+ # Perform voice activation detection
+ voice_flags = []
+ vad = webrtcvad.Vad(mode=3)
+ for window_start in range(0, len(wav), samples_per_window):
+ window_end = window_start + samples_per_window
+ voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+ sample_rate=sampling_rate))
+ voice_flags = np.array(voice_flags)
+
+ # Smooth the voice detection with a moving average
+ def moving_average(array, width):
+ array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+ ret = np.cumsum(array_padded, dtype=float)
+ ret[width:] = ret[width:] - ret[:-width]
+ return ret[width - 1:] / width
+
+ audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(bool)
+
+ # Dilate the voiced regions
+ audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+ audio_mask = np.repeat(audio_mask, samples_per_window)
+
+    return wav[audio_mask]
+
+
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+ if increase_only and decrease_only:
+ raise ValueError("Both increase only and decrease only are set")
+ dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
+ if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
+ return wav
+ return wav * (10 ** (dBFS_change / 20))
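+
+
+# Typical use (illustrative): wav = preprocess_wav("speaker.wav") loads or accepts a
+# waveform, resamples it to `sampling_rate`, normalizes volume and trims long silences.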
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..662d22b686114b4b6124330a688007d9495d22c8
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca
+size 17090379
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..e45430c7d03d160dc64d450c1af81180f419eb51
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py
@@ -0,0 +1,40 @@
+from speaker_encoder import inference as encoder
+from multiprocessing.pool import Pool
+from functools import partial
+from pathlib import Path
+# from utils import logmmse
+from tqdm import tqdm
+import numpy as np
+# import librosa
+
+
+def embed_utterance(fpaths, encoder_model_fpath):
+ if not encoder.is_loaded():
+ encoder.load_model(encoder_model_fpath)
+
+ # Compute the speaker embedding of the utterance
+ wav_fpath, embed_fpath = fpaths
+ wav = np.load(wav_fpath)
+ wav = encoder.preprocess_wav(wav)
+ embed = encoder.embed_utterance(wav)
+ np.save(embed_fpath, embed, allow_pickle=False)
+
+
+def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int):
+    # `outdir_root` is treated as the synthesizer root directory that contains
+    # "audio/", "train.txt" and the output "embeds/" folder.
+    wav_dir = outdir_root.joinpath("audio")
+    metadata_fpath = outdir_root.joinpath("train.txt")
+    assert wav_dir.exists() and metadata_fpath.exists()
+    embed_dir = outdir_root.joinpath("embeds")
+    embed_dir.mkdir(exist_ok=True)
+
+ # Gather the input wave filepath and the target output embed filepath
+ with metadata_fpath.open("r") as metadata_file:
+ metadata = [line.split("|") for line in metadata_file]
+ fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
+
+ # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
+ # Embed the utterances in separate threads
+ func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
+ job = Pool(n_processes).imap(func, fpaths)
+ list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d12228c81152487da24a6090e5a736f9de0755b0
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py
@@ -0,0 +1,45 @@
+librispeech_datasets = {
+ "train": {
+ "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
+ "other": ["LibriSpeech/train-other-500"]
+ },
+ "test": {
+ "clean": ["LibriSpeech/test-clean"],
+ "other": ["LibriSpeech/test-other"]
+ },
+ "dev": {
+ "clean": ["LibriSpeech/dev-clean"],
+ "other": ["LibriSpeech/dev-other"]
+ },
+}
+libritts_datasets = {
+ "train": {
+ "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
+ "other": ["LibriTTS/train-other-500"]
+ },
+ "test": {
+ "clean": ["LibriTTS/test-clean"],
+ "other": ["LibriTTS/test-other"]
+ },
+ "dev": {
+ "clean": ["LibriTTS/dev-clean"],
+ "other": ["LibriTTS/dev-other"]
+ },
+}
+voxceleb_datasets = {
+ "voxceleb1" : {
+ "train": ["VoxCeleb1/wav"],
+ "test": ["VoxCeleb1/test_wav"]
+ },
+ "voxceleb2" : {
+ "train": ["VoxCeleb2/dev/aac"],
+ "test": ["VoxCeleb2/test_wav"]
+ }
+}
+
+other_datasets = [
+ "LJSpeech-1.1",
+ "VCTK-Corpus/wav48",
+]
+
+anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..740f750a9746e5ace34f1bf875d9ac07677e1ed6
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py
@@ -0,0 +1,2 @@
+from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e5cf738d3ca5214034ce3babdedf6eaea64c469
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py
@@ -0,0 +1,37 @@
+import random
+
+class RandomCycler:
+ """
+ Creates an internal copy of a sequence and allows access to its items in a constrained random
+ order. For a source sequence of n items and one or several consecutive queries of a total
+ of m items, the following guarantees hold (one implies the other):
+ - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
+ - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
+ """
+
+ def __init__(self, source):
+ if len(source) == 0:
+ raise Exception("Can't create RandomCycler from an empty collection")
+ self.all_items = list(source)
+ self.next_items = []
+
+ def sample(self, count: int):
+ shuffle = lambda l: random.sample(l, len(l))
+
+ out = []
+ while count > 0:
+ if count >= len(self.all_items):
+ out.extend(shuffle(list(self.all_items)))
+ count -= len(self.all_items)
+ continue
+ n = min(count, len(self.next_items))
+ out.extend(self.next_items[:n])
+ count -= n
+ self.next_items = self.next_items[n:]
+ if len(self.next_items) == 0:
+ self.next_items = shuffle(list(self.all_items))
+ return out
+
+ def __next__(self):
+ return self.sample(1)[0]
+
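+# Example (illustrative): RandomCycler(range(3)).sample(7) returns 7 items in which
+# each of the 3 source items appears either 2 or 3 times, in shuffled order.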
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb320b211f0de5b3a6fbb83380d8a8b9677151b2
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py
@@ -0,0 +1,40 @@
+from speaker_encoder.data_objects.random_cycler import RandomCycler
+from speaker_encoder.data_objects.utterance import Utterance
+from pathlib import Path
+
+# Contains the set of utterances of a single speaker
+class Speaker:
+ def __init__(self, root: Path):
+ self.root = root
+ self.name = root.name
+ self.utterances = None
+ self.utterance_cycler = None
+
+ def _load_utterances(self):
+ with self.root.joinpath("_sources.txt").open("r") as sources_file:
+ sources = [l.split(",") for l in sources_file]
+ sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
+ self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
+ self.utterance_cycler = RandomCycler(self.utterances)
+
+ def random_partial(self, count, n_frames):
+ """
+ Samples a batch of unique partial utterances from the disk in a way that all
+ utterances come up at least once every two cycles and in a random order every time.
+
+ :param count: The number of partial utterances to sample from the set of utterances from
+ that speaker. Utterances are guaranteed not to be repeated if is not larger than
+ the number of utterances available.
+ :param n_frames: The number of frames in the partial utterance.
+ :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
+ frames are the frames of the partial utterances and range is the range of the partial
+ utterance with regard to the complete utterance.
+ """
+ if self.utterances is None:
+ self._load_utterances()
+
+ utterances = self.utterance_cycler.sample(count)
+
+ a = [(u,) + u.random_partial(n_frames) for u in utterances]
+
+ return a
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2dd5493a599e74cea594510af94015464072cb3
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py
@@ -0,0 +1,12 @@
+import numpy as np
+from typing import List
+from speaker_encoder.data_objects.speaker import Speaker
+
+class SpeakerBatch:
+ def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
+ self.speakers = speakers
+ self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
+
+ # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
+ # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
+ self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..be4568923a21e8f28a229899e137d0186e0b1250
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py
@@ -0,0 +1,56 @@
+from speaker_encoder.data_objects.random_cycler import RandomCycler
+from speaker_encoder.data_objects.speaker_batch import SpeakerBatch
+from speaker_encoder.data_objects.speaker import Speaker
+from speaker_encoder.params_data import partials_n_frames
+from torch.utils.data import Dataset, DataLoader
+from pathlib import Path
+
+# TODO: improve with a pool of speakers for data efficiency
+
+class SpeakerVerificationDataset(Dataset):
+ def __init__(self, datasets_root: Path):
+ self.root = datasets_root
+ speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
+ if len(speaker_dirs) == 0:
+ raise Exception("No speakers found. Make sure you are pointing to the directory "
+ "containing all preprocessed speaker directories.")
+ self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
+ self.speaker_cycler = RandomCycler(self.speakers)
+
+ def __len__(self):
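+        # Effectively infinite: training length is controlled by the number of
+        # steps rather than by epochs.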
+ return int(1e10)
+
+ def __getitem__(self, index):
+ return next(self.speaker_cycler)
+
+ def get_logs(self):
+ log_string = ""
+ for log_fpath in self.root.glob("*.txt"):
+ with log_fpath.open("r") as log_file:
+ log_string += "".join(log_file.readlines())
+ return log_string
+
+
+class SpeakerVerificationDataLoader(DataLoader):
+ def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
+ batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
+ worker_init_fn=None):
+ self.utterances_per_speaker = utterances_per_speaker
+
+ super().__init__(
+ dataset=dataset,
+ batch_size=speakers_per_batch,
+ shuffle=False,
+ sampler=sampler,
+ batch_sampler=batch_sampler,
+ num_workers=num_workers,
+ collate_fn=self.collate,
+ pin_memory=pin_memory,
+ drop_last=False,
+ timeout=timeout,
+ worker_init_fn=worker_init_fn
+ )
+
+ def collate(self, speakers):
+ return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff3185ec781eaf5be2a58d61c22b32586d366126
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py
@@ -0,0 +1,26 @@
+import numpy as np
+
+
+class Utterance:
+ def __init__(self, frames_fpath, wave_fpath):
+ self.frames_fpath = frames_fpath
+ self.wave_fpath = wave_fpath
+
+ def get_frames(self):
+ return np.load(self.frames_fpath)
+
+ def random_partial(self, n_frames):
+ """
+ Crops the frames into a partial utterance of n_frames
+
+ :param n_frames: The number of frames of the partial utterance
+ :return: the partial utterance frames and a tuple indicating the start and end of the
+ partial utterance in the complete utterance.
+ """
+ frames = self.get_frames()
+ if frames.shape[0] == n_frames:
+ start = 0
+ else:
+ start = np.random.randint(0, frames.shape[0] - n_frames)
+ end = start + n_frames
+ return frames[start:end], (start, end)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac64bcc3bd9ec490e988ac894de93921ba20f607
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py
@@ -0,0 +1,31 @@
+## Mel-filterbank
+mel_window_length = 25 # In milliseconds
+mel_window_step = 10 # In milliseconds
+mel_n_channels = 40
+
+
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160 # 1600 ms
+
+
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30 # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+
+
+## Audio volume normalization
+audio_norm_target_dBFS = -30
+
+
+## Model parameters
+model_hidden_size = 256
+model_embedding_size = 256
+model_num_layers = 3
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5662912a7cc0eb8818732d0b1d233ba1b195ec7
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py
@@ -0,0 +1,177 @@
+from speaker_encoder.params_data import *
+from speaker_encoder.model import SpeakerEncoder
+from speaker_encoder.audio import preprocess_wav # We want to expose this function from here
+from matplotlib import cm
+from speaker_encoder import audio
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+
+_model = None # type: SpeakerEncoder
+_device = None # type: torch.device
+
+
+def load_model(weights_fpath: Path, device=None):
+ """
+ Loads the model in memory. If this function is not explicitely called, it will be run on the
+ first call to embed_frames() with the default weights file.
+
+ :param weights_fpath: the path to saved model weights.
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
+ model will be loaded and will run on this device. Outputs will however always be on the cpu.
+    If None, will default to your GPU if it's available, otherwise your CPU.
+ """
+ # TODO: I think the slow loading of the encoder might have something to do with the device it
+ # was saved on. Worth investigating.
+ global _model, _device
+    if device is None:
+        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    elif isinstance(device, str):
+        _device = torch.device(device)
+    else:
+        _device = device
+ _model = SpeakerEncoder(_device, torch.device("cpu"))
+ checkpoint = torch.load(weights_fpath)
+ _model.load_state_dict(checkpoint["model_state"])
+ _model.eval()
+ print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
+
+
+def is_loaded():
+ return _model is not None
+
+
+def embed_frames_batch(frames_batch):
+ """
+ Computes embeddings for a batch of mel spectrogram.
+
+ :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape
+ (batch_size, n_frames, n_channels)
+ :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
+ """
+ if _model is None:
+ raise Exception("Model was not loaded. Call load_model() before inference.")
+
+ frames = torch.from_numpy(frames_batch).to(_device)
+ embed = _model.forward(frames).detach().cpu().numpy()
+ return embed
+
+
+def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
+ min_pad_coverage=0.75, overlap=0.5):
+ """
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
+    partial utterances of partial_utterance_n_frames frames each. Both the waveform and the mel
+ spectrogram slices are returned, so as to make each partial utterance waveform correspond to
+ its spectrogram. This function assumes that the mel spectrogram parameters used are those
+ defined in params_data.py.
+
+ The returned ranges may be indexing further than the length of the waveform. It is
+ recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
+
+ :param n_samples: the number of samples in the waveform
+ :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
+ utterance
+ :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
+    enough frames. If at least min_pad_coverage of partial_utterance_n_frames frames are present,
+ then the last partial utterance will be considered, as if we padded the audio. Otherwise,
+ it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
+ utterance, this parameter is ignored so that the function always returns at least 1 slice.
+ :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
+ utterances are entirely disjoint.
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
+ utterances.
+ """
+ assert 0 <= overlap < 1
+ assert 0 < min_pad_coverage <= 1
+
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+ frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
+
+ # Compute the slices
+ wav_slices, mel_slices = [], []
+ steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
+ for i in range(0, steps, frame_step):
+ mel_range = np.array([i, i + partial_utterance_n_frames])
+ wav_range = mel_range * samples_per_frame
+ mel_slices.append(slice(*mel_range))
+ wav_slices.append(slice(*wav_range))
+
+ # Evaluate whether extra padding is warranted or not
+ last_wav_range = wav_slices[-1]
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+ if coverage < min_pad_coverage and len(mel_slices) > 1:
+ mel_slices = mel_slices[:-1]
+ wav_slices = wav_slices[:-1]
+
+ return wav_slices, mel_slices
+
+
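+# Note: wav_slices[-1].stop may exceed the waveform length; embed_utterance() below
+# pads the waveform with zeros before slicing, as recommended in the docstring above.
+
+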
+def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
+ """
+ Computes an embedding for a single utterance.
+
+ # TODO: handle multiple wavs to benefit from batching on GPU
+ :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
+    :param using_partials: if True, then the utterance is split in partial utterances of
+    partial_utterance_n_frames frames and the utterance embedding is computed from their
+    normalized average. If False, the utterance embedding is instead computed from feeding the entire
+    spectrogram to the network.
+    :param return_partials: if True, the partial embeddings will also be returned along with the
+    wav slices that correspond to the partial embeddings.
+    :param kwargs: additional arguments to compute_partial_slices()
+    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+    return_partials is True, the partial utterances as a numpy array of float32 of shape
+    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+    returned. If using_partials is simultaneously set to False, both these values will be None
+    instead.
+ """
+ # Process the entire utterance if not using partials
+ if not using_partials:
+ frames = audio.wav_to_mel_spectrogram(wav)
+ embed = embed_frames_batch(frames[None, ...])[0]
+ if return_partials:
+ return embed, None, None
+ return embed
+
+ # Compute where to split the utterance into partials and pad if necessary
+ wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
+ max_wave_length = wave_slices[-1].stop
+ if max_wave_length >= len(wav):
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+
+ # Split the utterance into partials
+ frames = audio.wav_to_mel_spectrogram(wav)
+ frames_batch = np.array([frames[s] for s in mel_slices])
+ partial_embeds = embed_frames_batch(frames_batch)
+
+ # Compute the utterance embedding from the partial embeddings
+ raw_embed = np.mean(partial_embeds, axis=0)
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
+
+ if return_partials:
+ return embed, partial_embeds, wave_slices
+ return embed
+
+
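+# Typical use (illustrative), after load_model(...) has been called:
+#   wav = preprocess_wav("speaker.wav")   # path is a placeholder
+#   embed = embed_utterance(wav)          # numpy array of shape (model_embedding_size,)
+
+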
+def embed_speaker(wavs, **kwargs):
+    raise NotImplementedError()
+
+
+def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
+ if ax is None:
+ ax = plt.gca()
+
+ if shape is None:
+ height = int(np.sqrt(len(embed)))
+ shape = (height, -1)
+ embed = embed.reshape(shape)
+
+ cmap = cm.get_cmap()
+ mappable = ax.imshow(embed, cmap=cmap)
+    plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
+    mappable.set_clim(*color_range)
+
+ ax.set_xticks([]), ax.set_yticks([])
+ ax.set_title(title)
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4493a98b217e4bd082940cbe4d31b8169f18b5d9
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py
@@ -0,0 +1,135 @@
+from speaker_encoder.params_model import *
+from speaker_encoder.params_data import *
+from scipy.interpolate import interp1d
+from sklearn.metrics import roc_curve
+from torch.nn.utils import clip_grad_norm_
+from scipy.optimize import brentq
+from torch import nn
+import numpy as np
+import torch
+
+
+class SpeakerEncoder(nn.Module):
+ def __init__(self, device, loss_device):
+ super().__init__()
+ self.loss_device = loss_device
+
+ # Network defition
+ self.lstm = nn.LSTM(input_size=mel_n_channels, # 40
+ hidden_size=model_hidden_size, # 256
+ num_layers=model_num_layers, # 3
+ batch_first=True).to(device)
+ self.linear = nn.Linear(in_features=model_hidden_size,
+ out_features=model_embedding_size).to(device)
+ self.relu = torch.nn.ReLU().to(device)
+
+ # Cosine similarity scaling (with fixed initial parameter values)
+ self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
+ self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
+
+ # Loss
+ self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
+
+ def do_gradient_ops(self):
+ # Gradient scale
+ self.similarity_weight.grad *= 0.01
+ self.similarity_bias.grad *= 0.01
+
+ # Gradient clipping
+ clip_grad_norm_(self.parameters(), 3, norm_type=2)
+
+ def forward(self, utterances, hidden_init=None):
+ """
+ Computes the embeddings of a batch of utterance spectrograms.
+
+ :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
+ (batch_size, n_frames, n_channels)
+ :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
+ batch_size, hidden_size). Will default to a tensor of zeros if None.
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size)
+ """
+ # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
+ # and the final cell state.
+ out, (hidden, cell) = self.lstm(utterances, hidden_init)
+
+ # We take only the hidden state of the last layer
+ embeds_raw = self.relu(self.linear(hidden[-1]))
+
+ # L2-normalize it
+ embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+ return embeds
+
+ def similarity_matrix(self, embeds):
+ """
+ Computes the similarity matrix according the section 2.1 of GE2E.
+
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, embedding_size)
+ :return: the similarity matrix as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, speakers_per_batch)
+ """
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
+
+ # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
+ centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
+ centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True)
+
+ # Exclusive centroids (1 per utterance)
+ centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
+ centroids_excl /= (utterances_per_speaker - 1)
+ centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True)
+
+ # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
+ # product of these vectors (which is just an element-wise multiplication reduced by a sum).
+ # We vectorize the computation for efficiency.
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
+ speakers_per_batch).to(self.loss_device)
+        mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)
+ for j in range(speakers_per_batch):
+ mask = np.where(mask_matrix[j])[0]
+ sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
+ sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
+
+ ## Even more vectorized version (slower maybe because of transpose)
+ # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
+ # ).to(self.loss_device)
+ # eye = np.eye(speakers_per_batch, dtype=np.int)
+ # mask = np.where(1 - eye)
+ # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
+ # mask = np.where(eye)
+ # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
+ # sim_matrix2 = sim_matrix2.transpose(1, 2)
+
+ sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
+ return sim_matrix
+
+ def loss(self, embeds):
+ """
+ Computes the softmax loss according the section 2.1 of GE2E.
+
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, embedding_size)
+ :return: the loss and the EER for this batch of embeddings.
+ """
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
+
+ # Loss
+ sim_matrix = self.similarity_matrix(embeds)
+ sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
+ speakers_per_batch))
+ ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
+ target = torch.from_numpy(ground_truth).long().to(self.loss_device)
+ loss = self.loss_fn(sim_matrix, target)
+
+ # EER (not backpropagated)
+ with torch.no_grad():
+            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
+ labels = np.array([inv_argmax(i) for i in ground_truth])
+ preds = sim_matrix.detach().cpu().numpy()
+
+ # Snippet from https://yangcha.github.io/EER-ROC/
+ fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
+ eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
+
+ return loss, eer
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..676e6dc197faf01648de7a830140172d5594b999
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py
@@ -0,0 +1,29 @@
+
+## Mel-filterbank
+mel_window_length = 25 # In milliseconds
+mel_window_step = 10 # In milliseconds
+mel_n_channels = 40
+
+
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160 # 1600 ms
+# Number of spectrogram frames at inference
+inference_n_frames = 80 # 800 ms
+
+
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30 # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+
+
+## Audio volume normalization
+audio_norm_target_dBFS = -30
+
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..32731f295b3b26e9e38bb9f9047d5c784649e127
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py
@@ -0,0 +1,11 @@
+
+## Model parameters
+model_hidden_size = 256
+model_embedding_size = 256
+model_num_layers = 3
+
+
+## Training parameters
+learning_rate_init = 1e-4
+speakers_per_batch = 64
+utterances_per_speaker = 10
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecb9041551270629a27baab6d1f1525e380c5378
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py
@@ -0,0 +1,285 @@
+from multiprocess.pool import ThreadPool
+from speaker_encoder.params_data import *
+from speaker_encoder.config import librispeech_datasets, anglophone_nationalites
+from datetime import datetime
+from speaker_encoder import audio
+from pathlib import Path
+from tqdm import tqdm
+import numpy as np
+
+
+class DatasetLog:
+ """
+ Registers metadata about the dataset in a text file.
+ """
+ def __init__(self, root, name):
+ self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
+ self.sample_data = dict()
+
+ start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+ self.write_line("Creating dataset %s on %s" % (name, start_time))
+ self.write_line("-----")
+ self._log_params()
+
+ def _log_params(self):
+ from speaker_encoder import params_data
+ self.write_line("Parameter values:")
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+ value = getattr(params_data, param_name)
+ self.write_line("\t%s: %s" % (param_name, value))
+ self.write_line("-----")
+
+ def write_line(self, line):
+ self.text_file.write("%s\n" % line)
+
+ def add_sample(self, **kwargs):
+ for param_name, value in kwargs.items():
+ if not param_name in self.sample_data:
+ self.sample_data[param_name] = []
+ self.sample_data[param_name].append(value)
+
+ def finalize(self):
+ self.write_line("Statistics:")
+ for param_name, values in self.sample_data.items():
+ self.write_line("\t%s:" % param_name)
+ self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
+ self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
+ self.write_line("-----")
+ end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+ self.write_line("Finished on %s" % end_time)
+ self.text_file.close()
+
+
+def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
+ dataset_root = datasets_root.joinpath(dataset_name)
+ if not dataset_root.exists():
+ print("Couldn\'t find %s, skipping this dataset." % dataset_root)
+ return None, None
+ return dataset_root, DatasetLog(out_dir, dataset_name)
+
+
+def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
+ skip_existing, logger):
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
+
+ # Function to preprocess utterances for one speaker
+ def preprocess_speaker(speaker_dir: Path):
+ # Give a name to the speaker that includes its dataset
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+
+ # Create an output directory with that name, as well as a txt file containing a
+ # reference to each source file.
+ speaker_out_dir = out_dir.joinpath(speaker_name)
+ speaker_out_dir.mkdir(exist_ok=True)
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+
+ # There's a possibility that the preprocessing was interrupted earlier, check if
+ # there already is a sources file.
+ if sources_fpath.exists():
+ try:
+ with sources_fpath.open("r") as sources_file:
+ existing_fnames = {line.split(",")[0] for line in sources_file}
+ except:
+ existing_fnames = {}
+ else:
+ existing_fnames = {}
+
+ # Gather all audio files for that speaker recursively
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+ # Check if the target output file already exists
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
+ if skip_existing and out_fname in existing_fnames:
+ continue
+
+ # Load and preprocess the waveform
+ wav = audio.preprocess_wav(in_fpath)
+ if len(wav) == 0:
+ continue
+
+ # Create the mel spectrogram, discard those that are too short
+ frames = audio.wav_to_mel_spectrogram(wav)
+ if len(frames) < partials_n_frames:
+ continue
+
+ out_fpath = speaker_out_dir.joinpath(out_fname)
+ np.save(out_fpath, frames)
+ logger.add_sample(duration=len(wav) / sampling_rate)
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+
+ sources_file.close()
+
+ # Process the utterances for each speaker
+ with ThreadPool(8) as pool:
+ list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
+ unit="speakers"))
+ logger.finalize()
+ print("Done preprocessing %s.\n" % dataset_name)
+
+
+# Function to preprocess utterances for one speaker
+def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool):
+ # Give a name to the speaker that includes its dataset
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+
+ # Create an output directory with that name, as well as a txt file containing a
+ # reference to each source file.
+ speaker_out_dir = out_dir.joinpath(speaker_name)
+ speaker_out_dir.mkdir(exist_ok=True)
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+
+ # There's a possibility that the preprocessing was interrupted earlier, check if
+ # there already is a sources file.
+ # if sources_fpath.exists():
+ # try:
+ # with sources_fpath.open("r") as sources_file:
+ # existing_fnames = {line.split(",")[0] for line in sources_file}
+ # except:
+ # existing_fnames = {}
+ # else:
+ # existing_fnames = {}
+ existing_fnames = {}
+ # Gather all audio files for that speaker recursively
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
+
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+ # Check if the target output file already exists
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
+ if skip_existing and out_fname in existing_fnames:
+ continue
+
+ # Load and preprocess the waveform
+ wav = audio.preprocess_wav(in_fpath)
+ if len(wav) == 0:
+ continue
+
+ # Create the mel spectrogram, discard those that are too short
+ frames = audio.wav_to_mel_spectrogram(wav)
+ if len(frames) < partials_n_frames:
+ continue
+
+ out_fpath = speaker_out_dir.joinpath(out_fname)
+ np.save(out_fpath, frames)
+ # logger.add_sample(duration=len(wav) / sampling_rate)
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+
+ sources_file.close()
+ return len(wav)
+
+def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
+ skip_existing, logger):
+ # from multiprocessing import Pool, cpu_count
+ from pathos.multiprocessing import ProcessingPool as Pool
+ # Function to preprocess utterances for one speaker
+ def __preprocess_speaker(speaker_dir: Path):
+ # Give a name to the speaker that includes its dataset
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+
+ # Create an output directory with that name, as well as a txt file containing a
+ # reference to each source file.
+ speaker_out_dir = out_dir.joinpath(speaker_name)
+ speaker_out_dir.mkdir(exist_ok=True)
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+
+ existing_fnames = {}
+ # Gather all audio files for that speaker recursively
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
+ wav_lens = []
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+ # Check if the target output file already exists
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
+ if skip_existing and out_fname in existing_fnames:
+ continue
+
+ # Load and preprocess the waveform
+ wav = audio.preprocess_wav(in_fpath)
+ if len(wav) == 0:
+ continue
+
+ # Create the mel spectrogram, discard those that are too short
+ frames = audio.wav_to_mel_spectrogram(wav)
+ if len(frames) < partials_n_frames:
+ continue
+
+ out_fpath = speaker_out_dir.joinpath(out_fname)
+ np.save(out_fpath, frames)
+ # logger.add_sample(duration=len(wav) / sampling_rate)
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+ wav_lens.append(len(wav))
+ sources_file.close()
+ return wav_lens
+
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
+ # Process the utterances for each speaker
+ # with ThreadPool(8) as pool:
+ # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
+ # unit="speakers"))
+ pool = Pool(processes=20)
+ for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1):
+ for wav_len in wav_lens:
+ logger.add_sample(duration=wav_len / sampling_rate)
+ print(f'{i}/{len(speaker_dirs)} \r')
+
+ logger.finalize()
+ print("Done preprocessing %s.\n" % dataset_name)
+
+
+def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
+ for dataset_name in librispeech_datasets["train"]["other"]:
+ # Initialize the preprocessing
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Preprocess all speakers
+ speaker_dirs = list(dataset_root.glob("*"))
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
+ skip_existing, logger)
+
+
+def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
+ # Initialize the preprocessing
+ dataset_name = "VoxCeleb1"
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Get the contents of the meta file
+ with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
+ metadata = [line.split("\t") for line in metafile][1:]
+
+ # Select the ID and the nationality, filter out non-anglophone speakers
+ nationalities = {line[0]: line[3] for line in metadata}
+ # keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
+ # nationality.lower() in anglophone_nationalites]
+ keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items()]
+ print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
+ (len(keep_speaker_ids), len(nationalities)))
+
+ # Get the speaker directories for anglophone speakers only
+ speaker_dirs = dataset_root.joinpath("wav").glob("*")
+ speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
+ speaker_dir.name in keep_speaker_ids]
+ print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
+ (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
+
+ # Preprocess all speakers
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
+ skip_existing, logger)
+
+
+def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
+ # Initialize the preprocessing
+ dataset_name = "VoxCeleb2"
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Get the speaker directories
+ # Preprocess all speakers
+ speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
+ _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
+ skip_existing, logger)
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c2e7fa1b08b75de40adc0e05fa3b104cb02660b
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py
@@ -0,0 +1,125 @@
+from speaker_encoder.visualizations import Visualizations
+from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
+from speaker_encoder.params_model import *
+from speaker_encoder.model import SpeakerEncoder
+from utils.profiler import Profiler
+from pathlib import Path
+import torch
+
+def sync(device: torch.device):
+ # FIXME
+ return
+ # For correct profiling (cuda operations are async)
+ if device.type == "cuda":
+ torch.cuda.synchronize(device)
+
+def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
+ backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
+ no_visdom: bool):
+ # Create a dataset and a dataloader
+ dataset = SpeakerVerificationDataset(clean_data_root)
+ loader = SpeakerVerificationDataLoader(
+ dataset,
+ speakers_per_batch, # 64
+ utterances_per_speaker, # 10
+ num_workers=8,
+ )
+
+ # Setup the device on which to run the forward pass and the loss. These can be different,
+ # because the forward pass is faster on the GPU whereas the loss is often (depending on your
+ # hyperparameters) faster on the CPU.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # FIXME: currently, the gradient is None if loss_device is cuda
+ loss_device = torch.device("cpu")
+
+ # Create the model and the optimizer
+ model = SpeakerEncoder(device, loss_device)
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
+ init_step = 1
+
+ # Configure file path for the model
+ state_fpath = models_dir.joinpath(run_id + ".pt")
+ backup_dir = models_dir.joinpath(run_id + "_backups")
+
+ # Load any existing model
+ if not force_restart:
+ if state_fpath.exists():
+ print("Found existing model \"%s\", loading it and resuming training." % run_id)
+ checkpoint = torch.load(state_fpath)
+ init_step = checkpoint["step"]
+ model.load_state_dict(checkpoint["model_state"])
+ optimizer.load_state_dict(checkpoint["optimizer_state"])
+ optimizer.param_groups[0]["lr"] = learning_rate_init
+ else:
+ print("No model \"%s\" found, starting training from scratch." % run_id)
+ else:
+ print("Starting the training from scratch.")
+ model.train()
+
+ # Initialize the visualization environment
+ vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
+ vis.log_dataset(dataset)
+ vis.log_params()
+ device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
+ vis.log_implementation({"Device": device_name})
+
+ # Training loop
+ profiler = Profiler(summarize_every=10, disabled=False)
+ for step, speaker_batch in enumerate(loader, init_step):
+ profiler.tick("Blocking, waiting for batch (threaded)")
+
+ # Forward pass
+ inputs = torch.from_numpy(speaker_batch.data).to(device)
+ sync(device)
+ profiler.tick("Data to %s" % device)
+ embeds = model(inputs)
+ sync(device)
+ profiler.tick("Forward pass")
+ embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
+ loss, eer = model.loss(embeds_loss)
+ sync(loss_device)
+ profiler.tick("Loss")
+
+ # Backward pass
+ model.zero_grad()
+ loss.backward()
+ profiler.tick("Backward pass")
+ model.do_gradient_ops()
+ optimizer.step()
+ profiler.tick("Parameter update")
+
+ # Update visualizations
+ # learning_rate = optimizer.param_groups[0]["lr"]
+ vis.update(loss.item(), eer, step)
+
+ # Draw projections and save them to the backup folder
+ if umap_every != 0 and step % umap_every == 0:
+ print("Drawing and saving projections (step %d)" % step)
+ backup_dir.mkdir(exist_ok=True)
+ projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
+ embeds = embeds.detach().cpu().numpy()
+ vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
+ vis.save()
+
+ # Overwrite the latest version of the model
+ if save_every != 0 and step % save_every == 0:
+ print("Saving the model (step %d)" % step)
+ torch.save({
+ "step": step + 1,
+ "model_state": model.state_dict(),
+ "optimizer_state": optimizer.state_dict(),
+ }, state_fpath)
+
+ # Make a backup
+ if backup_every != 0 and step % backup_every == 0:
+ print("Making a backup (step %d)" % step)
+ backup_dir.mkdir(exist_ok=True)
+ backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
+ torch.save({
+ "step": step + 1,
+ "model_state": model.state_dict(),
+ "optimizer_state": optimizer.state_dict(),
+ }, backup_fpath)
+
+ profiler.tick("Extras (visualizations, saving)")
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d2c4c073c933d38970a83798f2d0ee37a85c48e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py
@@ -0,0 +1,178 @@
+from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from datetime import datetime
+from time import perf_counter as timer
+import matplotlib.pyplot as plt
+import numpy as np
+# import webbrowser
+import visdom
+import umap
+
+colormap = np.array([
+ [76, 255, 0],
+ [0, 127, 70],
+ [255, 0, 0],
+ [255, 217, 38],
+ [0, 135, 255],
+ [165, 0, 165],
+ [255, 167, 255],
+ [0, 255, 255],
+ [255, 96, 38],
+ [142, 76, 0],
+ [33, 0, 127],
+ [0, 0, 0],
+ [183, 183, 183],
+], dtype=float) / 255
+
+
+class Visualizations:
+ def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
+ # Tracking data
+ self.last_update_timestamp = timer()
+ self.update_every = update_every
+ self.step_times = []
+ self.losses = []
+ self.eers = []
+ print("Updating the visualizations every %d steps." % update_every)
+
+ # If visdom is disabled TODO: use a better paradigm for that
+ self.disabled = disabled
+ if self.disabled:
+ return
+
+ # Set the environment name
+ now = str(datetime.now().strftime("%d-%m %Hh%M"))
+ if env_name is None:
+ self.env_name = now
+ else:
+ self.env_name = "%s (%s)" % (env_name, now)
+
+ # Connect to visdom and open the corresponding window in the browser
+ try:
+ self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
+ except ConnectionError:
+ raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
+ "start it.")
+ # webbrowser.open("http://localhost:8097/env/" + self.env_name)
+
+ # Create the windows
+ self.loss_win = None
+ self.eer_win = None
+ # self.lr_win = None
+ self.implementation_win = None
+ self.projection_win = None
+ self.implementation_string = ""
+
+ def log_params(self):
+ if self.disabled:
+ return
+ from speaker_encoder import params_data
+ from speaker_encoder import params_model
+ param_string = "Model parameters:
"
+ for param_name in (p for p in dir(params_model) if not p.startswith("__")):
+ value = getattr(params_model, param_name)
+ param_string += "\t%s: %s
" % (param_name, value)
+ param_string += "Data parameters:
"
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+ value = getattr(params_data, param_name)
+ param_string += "\t%s: %s
" % (param_name, value)
+ self.vis.text(param_string, opts={"title": "Parameters"})
+
+ def log_dataset(self, dataset: SpeakerVerificationDataset):
+ if self.disabled:
+ return
+ dataset_string = ""
+ dataset_string += "Speakers: %s\n" % len(dataset.speakers)
+ dataset_string += "\n" + dataset.get_logs()
+ dataset_string = dataset_string.replace("\n", "<br>")
+ self.vis.text(dataset_string, opts={"title": "Dataset"})
+
+ def log_implementation(self, params):
+ if self.disabled:
+ return
+ implementation_string = ""
+ for param, value in params.items():
+ implementation_string += "%s: %s\n" % (param, value)
+ implementation_string = implementation_string.replace("\n", "<br>")
+ self.implementation_string = implementation_string
+ self.implementation_win = self.vis.text(
+ implementation_string,
+ opts={"title": "Training implementation"}
+ )
+
+ def update(self, loss, eer, step):
+ # Update the tracking data
+ now = timer()
+ self.step_times.append(1000 * (now - self.last_update_timestamp))
+ self.last_update_timestamp = now
+ self.losses.append(loss)
+ self.eers.append(eer)
+ print(".", end="")
+
+ # Update the plots every update_every steps
+ if step % self.update_every != 0:
+ return
+ time_string = "Step time: mean: %5dms std: %5dms" % \
+ (int(np.mean(self.step_times)), int(np.std(self.step_times)))
+ print("\nStep %6d Loss: %.4f EER: %.4f %s" %
+ (step, np.mean(self.losses), np.mean(self.eers), time_string))
+ if not self.disabled:
+ self.loss_win = self.vis.line(
+ [np.mean(self.losses)],
+ [step],
+ win=self.loss_win,
+ update="append" if self.loss_win else None,
+ opts=dict(
+ legend=["Avg. loss"],
+ xlabel="Step",
+ ylabel="Loss",
+ title="Loss",
+ )
+ )
+ self.eer_win = self.vis.line(
+ [np.mean(self.eers)],
+ [step],
+ win=self.eer_win,
+ update="append" if self.eer_win else None,
+ opts=dict(
+ legend=["Avg. EER"],
+ xlabel="Step",
+ ylabel="EER",
+ title="Equal error rate"
+ )
+ )
+ if self.implementation_win is not None:
+ self.vis.text(
+ self.implementation_string + ("%s" % time_string),
+ win=self.implementation_win,
+ opts={"title": "Training implementation"},
+ )
+
+ # Reset the tracking
+ self.losses.clear()
+ self.eers.clear()
+ self.step_times.clear()
+
+ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
+ max_speakers=10):
+ max_speakers = min(max_speakers, len(colormap))
+ embeds = embeds[:max_speakers * utterances_per_speaker]
+
+ n_speakers = len(embeds) // utterances_per_speaker
+ ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
+ colors = [colormap[i] for i in ground_truth]
+
+ reducer = umap.UMAP()
+ projected = reducer.fit_transform(embeds)
+ plt.scatter(projected[:, 0], projected[:, 1], c=colors)
+ plt.gca().set_aspect("equal", "datalim")
+ plt.title("UMAP projection (step %d)" % step)
+ if not self.disabled:
+ self.projection_win = self.vis.matplot(plt, win=self.projection_win)
+ if out_fpath is not None:
+ plt.savefig(out_fpath)
+ plt.clf()
+
+ def save(self):
+ if not self.disabled:
+ self.vis.save([self.env_name])
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f69320ec75315ff9ce2efa158a53b1a823edd2e
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py
@@ -0,0 +1,173 @@
+from speaker_encoder.hparams import *
+from speaker_encoder import audio
+from pathlib import Path
+from typing import Union, List
+from torch import nn
+from time import perf_counter as timer
+import numpy as np
+import torch
+
+
+class SpeakerEncoder(nn.Module):
+ def __init__(self, weights_fpath, device: Union[str, torch.device]=None, verbose=True):
+ """
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
+ If None, defaults to cuda if it is available on your machine, otherwise the model will
+ run on cpu. Outputs are always returned on the cpu, as numpy arrays.
+ """
+ super().__init__()
+
+ # Define the network
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+ self.relu = nn.ReLU()
+
+ # Get the target device
+ if device is None:
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ elif isinstance(device, str):
+ device = torch.device(device)
+ self.device = device
+
+ # Load the pretrained model's weights
+ # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt")
+ # if not weights_fpath.exists():
+ # raise Exception("Couldn't find the voice encoder pretrained model at %s." %
+ # weights_fpath)
+
+ start = timer()
+ checkpoint = torch.load(weights_fpath, map_location="cpu")
+
+ self.load_state_dict(checkpoint["model_state"], strict=False)
+ self.to(device)
+
+ if verbose:
+ print("Loaded the voice encoder model on %s in %.2f seconds." %
+ (device.type, timer() - start))
+
+ def forward(self, mels: torch.FloatTensor):
+ """
+ Computes the embeddings of a batch of utterance spectrograms.
+ :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape
+ (batch_size, n_frames, n_channels)
+ :return: the embeddings as a float32 tensor of shape (batch_size, embedding_size).
+ Embeddings are positive and L2-normed, thus they lie in the range [0, 1].
+ """
+ # Pass the input through the LSTM layers and retrieve the final hidden state of the last
+ # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings.
+ _, (hidden, _) = self.lstm(mels)
+ embeds_raw = self.relu(self.linear(hidden[-1]))
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+ @staticmethod
+ def compute_partial_slices(n_samples: int, rate, min_coverage):
+ """
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to
+ obtain partial utterances of partials_n_frames frames each. Both the waveform and the
+ mel spectrogram slices are returned, so as to make each partial utterance waveform
+ correspond to its spectrogram.
+
+ The returned ranges may be indexing further than the length of the waveform. It is
+ recommended that you pad the waveform with zeros up to wav_slices[-1].stop.
+
+ :param n_samples: the number of samples in the waveform
+ :param rate: how many partial utterances should occur per second. Partial utterances must
+ cover the span of the entire utterance, thus the rate should not be lower than the inverse
+ of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+ the minimum rate is thus 0.625.
+ :param min_coverage: when reaching the last partial utterance, it may or may not have
+ enough frames. If at least min_coverage of a partial utterance is present,
+ then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+ it will be discarded. If there aren't enough frames for one partial utterance,
+ this parameter is ignored so that the function always returns at least one slice.
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
+ utterances.
+ """
+ assert 0 < min_coverage <= 1
+
+ # Compute how many frames separate two partial utterances
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+ frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
+ assert 0 < frame_step, "The rate is too high"
+ assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
+ (sampling_rate / (samples_per_frame * partials_n_frames))
+
+ # Compute the slices
+ wav_slices, mel_slices = [], []
+ steps = max(1, n_frames - partials_n_frames + frame_step + 1)
+ for i in range(0, steps, frame_step):
+ mel_range = np.array([i, i + partials_n_frames])
+ wav_range = mel_range * samples_per_frame
+ mel_slices.append(slice(*mel_range))
+ wav_slices.append(slice(*wav_range))
+
+ # Evaluate whether extra padding is warranted or not
+ last_wav_range = wav_slices[-1]
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+ if coverage < min_coverage and len(mel_slices) > 1:
+ mel_slices = mel_slices[:-1]
+ wav_slices = wav_slices[:-1]
+
+ return wav_slices, mel_slices
+
+ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
+ """
+ Computes an embedding for a single utterance. The utterance is divided in partial
+ utterances and an embedding is computed for each. The complete utterance embedding is the
+ L2-normed average embedding of the partial utterances.
+
+ TODO: independent batched version of this function
+
+ :param wav: a preprocessed utterance waveform as a numpy array of float32
+ :param return_partials: if True, the partial embeddings will also be returned along with
+ the wav slices corresponding to each partial utterance.
+ :param rate: how many partial utterances should occur per second. Partial utterances must
+ cover the span of the entire utterance, thus the rate should not be lower than the inverse
+ of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+ the minimum rate is thus 0.625.
+ :param min_coverage: when reaching the last partial utterance, it may or may not have
+ enough frames. If at least min_coverage of a partial utterance is present,
+ then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+ it will be discarded. If there aren't enough frames for one partial utterance,
+ this parameter is ignored so that the function always returns at least one slice.
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+ return_partials is True, the partial utterances as a numpy array of float32 of shape
+ (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+ returned.
+ """
+ # Compute where to split the utterance into partials and pad the waveform with zeros if
+ # the partial utterances cover a larger range.
+ wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
+ max_wave_length = wav_slices[-1].stop
+ if max_wave_length >= len(wav):
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+
+ # Split the utterance into partials and forward them through the model
+ mel = audio.wav_to_mel_spectrogram(wav)
+ mels = np.array([mel[s] for s in mel_slices])
+ with torch.no_grad():
+ mels = torch.from_numpy(mels).to(self.device)
+ partial_embeds = self(mels).cpu().numpy()
+
+ # Compute the utterance embedding from the partial embeddings
+ raw_embed = np.mean(partial_embeds, axis=0)
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
+
+ if return_partials:
+ return embed, partial_embeds, wav_slices
+ return embed
+
+ def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
+ """
+ Compute the embedding of a collection of wavs (presumably from the same speaker) by
+ averaging their embedding and L2-normalizing it.
+
+ :param wavs: list of wavs as numpy arrays of float32.
+ :param kwargs: extra arguments to embed_utterance()
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,).
+ """
+ raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \
+ for wav in wavs], axis=0)
+ return raw_embed / np.linalg.norm(raw_embed, 2)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/prepare_freevc/spk_ext.py b/dreamvoice/train_utils/prepare_freevc/spk_ext.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6a71ad6a6131fb67729f1cc6f161dd3fcf276b0
--- /dev/null
+++ b/dreamvoice/train_utils/prepare_freevc/spk_ext.py
@@ -0,0 +1,90 @@
+import os
+import torch
+import librosa
+from tqdm import tqdm
+from speaker_encoder.voice_encoder import SpeakerEncoder
+from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
+
+
+@torch.no_grad()
+def se_extractor(audio_path, smodel):
+ # vad
+ SAMPLE_RATE = 16000
+ audio_vad = get_audio_tensor(audio_path)
+ segments = get_vad_segments(
+ audio_vad,
+ output_sample=True,
+ min_speech_duration=0.1,
+ min_silence_duration=1,
+ method="silero",
+ )
+ segments = [(seg["start"], seg["end"]) for seg in segments]
+ segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
+
+ if len(segments) == 0:
+ segments = [(0, len(audio_vad)/SAMPLE_RATE)]
+ print(segments)
+
+ # spk
+ gs = []
+
+ audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
+ # audio = torch.tensor(audio).float().to(device)
+
+ for s, e in segments:
+ y = audio[int(SAMPLE_RATE*s):int(SAMPLE_RATE*e)]
+ g = smodel.embed_utterance(y)
+ g = torch.from_numpy(g).unsqueeze(0)
+ gs.append(g)
+
+ gs = torch.stack(gs).mean(0)
+ return gs.cpu()
+
+
+def process_audio_folder(input_folder, output_folder, model, device):
+ """
+ Process all audio files in a folder and its subfolders,
+ save the extracted features as .pt files in the output folder with the same structure.
+
+ Args:
+ input_folder (str): Path to the input folder containing audio files.
+ output_folder (str): Path to the output folder to save .pt files.
+ model: Pre-trained model for feature extraction.
+ device: Torch device (e.g., 'cpu' or 'cuda').
+ """
+ # Collect all audio file paths
+ audio_files = []
+ for root, _, files in os.walk(input_folder):
+ for file in files:
+ if file.endswith(('.wav', '.mp3', '.flac')): # Adjust for the audio formats you want to process
+ audio_files.append(os.path.join(root, file))
+
+ # Process each audio file with tqdm for progress
+ for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
+ # Construct output path
+ relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
+ output_dir = os.path.join(output_folder, relative_path)
+ os.makedirs(output_dir, exist_ok=True)
+ output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')
+
+ # Check if the .pt file already exists
+ if os.path.exists(output_path):
+ # print(f"Skipped (already exists): {output_path}")
+ continue # Skip processing this file
+ # Extract features
+ target_se = se_extractor(audio_path, model).to(device)
+ # Save the feature as .pt
+ torch.save(target_se, output_path)
+ # print(f"Processed and saved: {output_path}")
+
+
+if __name__ == '__main__':
+ smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+ device = 'cuda'
+ # input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
+ # output_folder = 'spk/LibriTTS-R/train-clean-360/'
+ # process_audio_folder(input_folder, output_folder, smodel, device)
+
+ input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
+ output_folder = 'spk/VCTK/VCTK-Corpus/'
+ process_audio_folder(input_folder, output_folder, smodel, device)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/configs/plugin.py b/dreamvoice/train_utils/src/configs/plugin.py
index 5e9a409af86ef67361bae0b7c3ee3b747ee907eb..70a55a8f5edd7d95486b9d28b076fab96b916fb7 100644
--- a/dreamvoice/train_utils/src/configs/plugin.py
+++ b/dreamvoice/train_utils/src/configs/plugin.py
@@ -18,7 +18,7 @@ class AttrDict(dict):
all_params = {
- 'Plugin_base': AttrDict(
+ 'Plugin_freevc': AttrDict(
# Diff params
diff=AttrDict(
num_train_steps=1000,
diff --git a/dreamvoice/train_utils/src/dataset/vcdata.py b/dreamvoice/train_utils/src/dataset/vcdata.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7a1d99a243869794900eed6189a32bad930aea4
--- /dev/null
+++ b/dreamvoice/train_utils/src/dataset/vcdata.py
@@ -0,0 +1,146 @@
+import pandas as pd
+import os
+import random
+import ast
+import numpy as np
+import torch
+from einops import repeat, rearrange
+import librosa
+
+from torch.utils.data import Dataset
+import torchaudio
+
+
+def log_f0(f0, f0_min=librosa.note_to_hz('C2'), scales=4):
+ f0[f0 < f0_min] = 0.0
+ f0_log = torch.zeros_like(f0)
+ f0_log[f0 != 0] = 12*np.log2(f0[f0 != 0]/f0_min) + 1
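+ # 12*log2(f0/f0_min) is the pitch in semitones above f0_min (C2); the +1 offset keeps
+ # voiced frames positive, and dividing by scales*12 below maps pitches up to `scales`
+ # octaves above f0_min into roughly (0, 1]. Unvoiced frames stay at 0.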
+ # f0_mel_min = 12*np.log2(f0_min/f0_min) + 1
+ # f0_mel_max = 12*np.log2(f0_max/f0_min) + 1
+ f0_log /= (scales*12)
+ return f0_log
+
+
+class VCData(Dataset):
+ def __init__(self,
+ data_dir, meta_dir, subset, prompt_dir,
+ seg_length=1.92, speaker_length=4,
+ sr=24000, content_sr=50, speaker_sr=16000,
+ plugin_mode=False
+ ):
+ self.datadir = data_dir
+ meta = pd.read_csv(meta_dir)
+ self.meta = meta[meta['subset'] == subset]
+ self.subset = subset
+ self.prompts = pd.read_csv(prompt_dir)
+ self.seg_len = seg_length
+ self.speaker_length = speaker_length
+ self.sr = sr
+ self.content_sr = content_sr
+ self.speaker_sr = speaker_sr
+ self.plugin_mode = plugin_mode
+
+ def get_audio_content(self, audio_path, content_path, f0_path):
+ audio_path = self.datadir + audio_path
+ audio, sr = torchaudio.load(audio_path)
+ assert sr == self.sr
+
+ # 1, T, C
+ content = torch.load(self.datadir + content_path)
+
+ total_length = content.shape[1]
+ if int(total_length - int(self.content_sr * self.seg_len)) > 0:
+ start = np.random.randint(0, int(total_length - self.content_sr * self.seg_len) + 1)
+ else:
+ start = 0
+ end = min(start + int(self.seg_len * self.content_sr), content.shape[1])
+
+ # use last frame for padding
+ content_clip = repeat(content[:, -1, :], "b c-> b t c", t=int(self.content_sr * self.seg_len)).clone()
+ content_clip[:, :end - start, :] = content[:, start: end, :]
+
+ audio_clip = torch.zeros(int(self.seg_len * self.sr))
+ # print(start)
+ # print(end)
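+ # With the default rates (24 kHz audio, 50 Hz content features), each content frame
+ # corresponds to sr / content_sr = 480 audio samples.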
+ audio_start = round(start * self.sr / self.content_sr)
+ audio_end = round(end * self.sr / self.content_sr)
+ # print(audio_start)
+ # print(audio_end)
+ # print(audio.shape)
+
+ audio_clip[:audio_end - audio_start] = audio[0, audio_start: audio_end].clone()
+
+ if f0_path:
+ f0 = torch.load(self.datadir + f0_path).float()
+ f0_clip = torch.zeros(int(self.content_sr * self.seg_len))
+ f0_clip[:end-start] = f0[start:end]
+ f0_clip = log_f0(f0_clip)
+ f0_clip = f0_clip.unsqueeze(-1)
+ else:
+ f0_clip = None
+
+ return audio_clip, content_clip[0], f0_clip
+
+ def get_speaker(self, speaker_path):
+ audio_path = self.datadir + speaker_path
+ audio, sr = torchaudio.load(audio_path)
+ assert sr == self.speaker_sr
+ # if sr != self.speaker_sr:
+ # resampler = torchaudio.transforms.Resample(sr, self.speaker_sr, dtype=audio.dtype)
+ # audio = resampler(audio)
+
+ audio_clip = torch.zeros(self.speaker_length * self.speaker_sr)
+
+ total_length = audio.shape[1]
+ if int(total_length - self.speaker_sr * self.speaker_length) > 0:
+ start = np.random.randint(0, int(total_length - self.speaker_sr * self.speaker_length) + 1)
+ else:
+ start = 0
+ end = min(start + self.speaker_sr * self.speaker_length, total_length)
+
+ audio_clip[:end-start] = audio[0, start: end]
+
+ return audio_clip
+
+ def __getitem__(self, index):
+ row = self.meta.iloc[index]
+
+ if self.plugin_mode:
+ audio_clip, content_clip, f0_clip = [''], [''], ['']
+ else:
+ # load current audio
+ audio_path = row['audio_path']
+ content_path = row['content_path']
+ f0_path = row['f0_path']
+ audio_clip, content_clip, f0_clip = self.get_audio_content(audio_path, content_path, f0_path)
+
+ # get speaker
+ if self.subset == 'train':
+ speaker = row['speaker']
+ else:
+ speaker = row['speaker_val']
+
+ speaker_row = self.meta[self.meta['speaker'] == speaker].sample(1)
+ speaker_path = speaker_row.iloc[0]['speaker_path']
+ speaker_clip = self.get_speaker(speaker_path)
+ # print(speaker_clip.shape)
+ # print(speaker_path)
+ # print(speaker)
+
+ # get prompt
+ prompts = self.prompts[self.prompts['ID'] == speaker]['prompts'].iloc[0]
+ prompts = ast.literal_eval(prompts)
+ prompt = random.choice(prompts)
+
+ return audio_clip, content_clip, f0_clip, speaker_clip, prompt
+
+ def __len__(self):
+ return len(self.meta)
+
+
+if __name__ == '__main__':
+ from tqdm import tqdm
+ data = VCData('../../features/', '../../data/meta_val.csv', 'val', '../../data/speaker_gender.csv')
+ for i in tqdm(range(len(data))):
+ x = data[i]
+ # print(x[-1])
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/freevc/.gitattributes b/dreamvoice/train_utils/src/freevc/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..717eda91d34e790b2de5140dd1c46748bdddef26
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/.gitattributes
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/dreamvoice/train_utils/src/freevc/.gitignore b/dreamvoice/train_utils/src/freevc/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e4008401fb75eb82773c4bdb3f4b886e2e6d34c4
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+flagged
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/freevc/README.md b/dreamvoice/train_utils/src/freevc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/README.md
@@ -0,0 +1,13 @@
+---
+title: FreeVC
+emoji: 🚀
+colorFrom: gray
+colorTo: red
+sdk: gradio
+sdk_version: 3.13.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/dreamvoice/train_utils/src/freevc/app.py b/dreamvoice/train_utils/src/freevc/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..982821f01caea503d8451f6c8e99096918705d79
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/app.py
@@ -0,0 +1,92 @@
+import os
+import torch
+import librosa
+import gradio as gr
+from scipy.io.wavfile import write
+from transformers import WavLMModel
+
+import utils
+from models import SynthesizerTrn
+from mel_processing import mel_spectrogram_torch
+from speaker_encoder.voice_encoder import SpeakerEncoder
+
+'''
+def get_wavlm():
+ os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
+ shutil.move('WavLM-Large.pt', 'wavlm')
+'''
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# print("Loading FreeVC...")
+# hps = utils.get_hparams_from_file("configs/freevc.json")
+# freevc = SynthesizerTrn(
+# hps.data.filter_length // 2 + 1,
+# hps.train.segment_size // hps.data.hop_length,
+# **hps.model).to(device)
+# _ = freevc.eval()
+# _ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+
+print("Loading FreeVC(24k)...")
+hps = utils.get_hparams_from_file("configs/freevc-24.json")
+freevc_24 = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+_ = freevc_24.eval()
+_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
+
+# print("Loading FreeVC-s...")
+# hps = utils.get_hparams_from_file("configs/freevc-s.json")
+# freevc_s = SynthesizerTrn(
+# hps.data.filter_length // 2 + 1,
+# hps.train.segment_size // hps.data.hop_length,
+# **hps.model).to(device)
+# _ = freevc_s.eval()
+# _ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None)
+#
+# print("Loading WavLM for content...")
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+
+def convert(model, cmodel, src, tgt):
+ with torch.no_grad():
+ # tgt
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+ g_tgt = smodel.embed_utterance(wav_tgt)
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+
+ # src
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+ c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+ # infer
+ if model == "FreeVC":
+ audio = freevc.infer(c, g=g_tgt)
+ elif model == "FreeVC-s":
+ audio = freevc_s.infer(c, mel=mel_tgt)
+ else:
+ audio = freevc_24.infer(c, g=g_tgt)
+ audio = audio[0][0].data.cpu().float().numpy()
+ if model == "FreeVC" or model == "FreeVC-s":
+ write("out.wav", hps.data.sampling_rate, audio)
+ else:
+ write("out.wav", 24000, audio)
+ out = "out.wav"
+ return out
+
+# model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model")
+# audio1 = gr.inputs.Audio(label="Source Audio", type='filepath')
+# audio2 = gr.inputs.Audio(label="Reference Audio", type='filepath')
+# inputs = [model, audio1, audio2]
+# outputs = gr.outputs.Audio(label="Output Audio", type='filepath')
+#
+# title = "FreeVC"
+# description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please trim it before submitting."
+# article = "Paper | Github Repo
"
+#
+# examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']]
+#
+# gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch()
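+# Only the FreeVC (24kHz) model is loaded above, so passing the model object here falls
+# through to the freevc_24 branch and writes out.wav at 24 kHz.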
+convert(freevc_24, cmodel, 'p225_001.wav', 'p226_002.wav')
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/freevc/commons.py b/dreamvoice/train_utils/src/freevc/commons.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/commons.py
@@ -0,0 +1,171 @@
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+def init_weights(m, mean=0.0, std=0.01):
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size*dilation - dilation)/2)
+
+
+def convert_pad_shape(pad_shape):
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def intersperse(lst, item):
+ result = [item] * (len(lst) * 2 + 1)
+ result[1::2] = lst
+ return result
+
+
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+ """KL(P||Q)"""
+ kl = (logs_q - logs_p) - 0.5
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
+ return kl
+
+
+def rand_gumbel(shape):
+ """Sample from the Gumbel distribution, protect from overflows."""
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+ return -torch.log(-torch.log(uniform_samples))
+
+
+def rand_gumbel_like(x):
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+ return g
+
+
+def slice_segments(x, ids_str, segment_size=4):
+ ret = torch.zeros_like(x[:, :, :segment_size])
+ for i in range(x.size(0)):
+ idx_str = ids_str[i]
+ idx_end = idx_str + segment_size
+ ret[i] = x[i, :, idx_str:idx_end]
+ return ret
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size + 1
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ return ret, ids_str
+
+
+def rand_spec_segments(x, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ return ret, ids_str
+
+
+def get_timing_signal_1d(
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
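+ # Transformer-style sinusoidal timing signal: the first half of the channels carry sines
+ # and the second half cosines over geometrically spaced timescales.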
+ position = torch.arange(length, dtype=torch.float)
+ num_timescales = channels // 2
+ log_timescale_increment = (
+ math.log(float(max_timescale) / float(min_timescale)) /
+ (num_timescales - 1))
+ inv_timescales = min_timescale * torch.exp(
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
+ signal = signal.view(1, channels, length)
+ return signal
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+def subsequent_mask(length):
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+ return mask
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
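+ # WaveNet-style gated activation: the summed inputs are split channel-wise, the first
+ # n_channels pass through tanh, the rest through sigmoid, and the two gates are multiplied.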
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+def convert_pad_shape(pad_shape):
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def shift_1d(x):
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+ return x
+
+
+def sequence_mask(length, max_length=None):
+ if max_length is None:
+ max_length = length.max()
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+ return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def generate_path(duration, mask):
+ """
+ duration: [b, 1, t_x]
+ mask: [b, 1, t_y, t_x]
+ """
+ device = duration.device
+
+ b, _, t_y, t_x = mask.shape
+ cum_duration = torch.cumsum(duration, -1)
+
+ cum_duration_flat = cum_duration.view(b * t_x)
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+ path = path.view(b, t_x, t_y)
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+ path = path.unsqueeze(1).transpose(2,3) * mask
+ return path
+
+
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
+ norm_type = float(norm_type)
+ if clip_value is not None:
+ clip_value = float(clip_value)
+
+ total_norm = 0
+ for p in parameters:
+ param_norm = p.grad.data.norm(norm_type)
+ total_norm += param_norm.item() ** norm_type
+ if clip_value is not None:
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
+ total_norm = total_norm ** (1. / norm_type)
+ return total_norm
diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc-24.json b/dreamvoice/train_utils/src/freevc/configs/freevc-24.json
new file mode 100644
index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/configs/freevc-24.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8640,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8008"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,6,4,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": true
+ }
+}
diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc-s.json b/dreamvoice/train_utils/src/freevc/configs/freevc-s.json
new file mode 100644
index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/configs/freevc-s.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8960,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8001"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": false
+ }
+}
diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc.json b/dreamvoice/train_utils/src/freevc/configs/freevc.json
new file mode 100644
index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/configs/freevc.json
@@ -0,0 +1,54 @@
+{
+ "train": {
+ "log_interval": 200,
+ "eval_interval": 10000,
+ "seed": 1234,
+ "epochs": 10000,
+ "learning_rate": 2e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 64,
+ "fp16_run": false,
+ "lr_decay": 0.999875,
+ "segment_size": 8960,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "use_sr": true,
+ "max_speclen": 128,
+ "port": "8001"
+ },
+ "data": {
+ "training_files":"filelists/train.txt",
+ "validation_files":"filelists/val.txt",
+ "max_wav_value": 32768.0,
+ "sampling_rate": 16000,
+ "filter_length": 1280,
+ "hop_length": 320,
+ "win_length": 1280,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [16,16,4,4],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "ssl_dim": 1024,
+ "use_spk": true
+ }
+}
diff --git a/dreamvoice/train_utils/src/freevc/mel_processing.py b/dreamvoice/train_utils/src/freevc/mel_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/mel_processing.py
@@ -0,0 +1,112 @@
+import math
+import os
+import random
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.data
+import numpy as np
+import librosa
+import librosa.util as librosa_util
+from librosa.util import normalize, pad_center, tiny
+from scipy.signal import get_window
+from scipy.io.wavfile import read
+from librosa.filters import mel as librosa_mel_fn
+
+MAX_WAV_VALUE = 32768.0
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+ """
+ PARAMS
+ ------
+ C: compression factor
+ """
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+ """
+ PARAMS
+ ------
+ C: compression factor used to compress
+ """
+ return torch.exp(x) / C
+
+
+def spectral_normalize_torch(magnitudes):
+ output = dynamic_range_compression_torch(magnitudes)
+ return output
+
+
+def spectral_de_normalize_torch(magnitudes):
+ output = dynamic_range_decompression_torch(magnitudes)
+ return output
+
+
+mel_basis = {}
+hann_window = {}
+
+
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+ if torch.min(y) < -1.:
+ print('min value is ', torch.min(y))
+ if torch.max(y) > 1.:
+ print('max value is ', torch.max(y))
+
+ global hann_window
+ dtype_device = str(y.dtype) + '_' + str(y.device)
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+ y = y.squeeze(1)
+
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+ return spec
+
+
+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
+ global mel_basis
+ dtype_device = str(spec.dtype) + '_' + str(spec.device)
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ spec = spectral_normalize_torch(spec)
+ return spec
+
+
+def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
+ if torch.min(y) < -1.:
+ print('min value is ', torch.min(y))
+ if torch.max(y) > 1.:
+ print('max value is ', torch.max(y))
+
+ global mel_basis, hann_window
+ dtype_device = str(y.dtype) + '_' + str(y.device)
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+ y = y.squeeze(1)
+
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ spec = spectral_normalize_torch(spec)
+
+ return spec
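A minimal usage sketch for the helpers above, using the analysis settings from freevc.json (n_fft 1280, hop 320, win 1280, 80 mel bands at 16 kHz). The import path mirrors the file location added by this patch and is an assumption about how the package sits on PYTHONPATH:

    import torch
    from dreamvoice.train_utils.src.freevc.mel_processing import (
        spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch)

    wav = torch.rand(1, 16000) * 2 - 1   # one second of placeholder audio in [-1, 1]

    # Linear magnitude spectrogram: (1, 1280 // 2 + 1, frames) = (1, 641, frames)
    spec = spectrogram_torch(wav, n_fft=1280, sampling_rate=16000,
                             hop_size=320, win_size=1280, center=False)

    # Project onto 80 mel bands and apply the log compression defined above
    mel = spec_to_mel_torch(spec, n_fft=1280, num_mels=80,
                            sampling_rate=16000, fmin=0.0, fmax=None)

    # Or go straight from waveform to log-mel in one call
    mel2 = mel_spectrogram_torch(wav, n_fft=1280, num_mels=80, sampling_rate=16000,
                                 hop_size=320, win_size=1280, fmin=0.0, fmax=None)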
diff --git a/dreamvoice/train_utils/src/freevc/models.py b/dreamvoice/train_utils/src/freevc/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d3247337c6cd49351490c7f17cb33cea52e361
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/models.py
@@ -0,0 +1,351 @@
+import copy
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .commons import sequence_mask, rand_slice_segments
+from .modules import ResidualCouplingLayer, WN, Flip, ResBlock1, ResBlock2, LRELU_SLOPE
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from .commons import init_weights, get_padding
+
+
+class ResidualCouplingBlock(nn.Module):
+ def __init__(self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ n_flows=4,
+ gin_channels=0):
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.flows = nn.ModuleList()
+ for i in range(n_flows):
+ self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
+ self.flows.append(Flip())
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ else:
+ for flow in reversed(self.flows):
+ x = flow(x, x_mask, g=g, reverse=reverse)
+ return x
+
+
+class Encoder(nn.Module):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths, g=None):
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+ stats = self.proj(x) * x_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+ return z, m, logs, x_mask
+
+
+class Generator(torch.nn.Module):
+ def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
+ super(Generator, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+ resblock = ResBlock1 if resblock == '1' else ResBlock2
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(weight_norm(
+ ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
+ k, u, padding=(k-u)//2)))
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel//(2**(i+1))
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+ self.resblocks.append(resblock(ch, k, d))
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ def forward(self, x, g=None):
+ x = self.conv_pre(x)
+ if g is not None:
+ x = x + self.cond(g)
+
+ for i in range(self.num_upsamples):
+ x = F.leaky_relu(x, LRELU_SLOPE)
+ x = self.ups[i](x)
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i*self.num_kernels+j](x)
+ else:
+ xs += self.resblocks[i*self.num_kernels+j](x)
+ x = xs / self.num_kernels
+ x = F.leaky_relu(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+
+ return x
+
+ def remove_weight_norm(self):
+ print('Removing weight norm...')
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+
+
+class DiscriminatorP(torch.nn.Module):
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+ super(DiscriminatorP, self).__init__()
+ self.period = period
+ self.use_spectral_norm = use_spectral_norm
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ self.convs = nn.ModuleList([
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+ ])
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+ def forward(self, x):
+ fmap = []
+
+ # 1d to 2d
+ b, c, t = x.shape
+ if t % self.period != 0: # pad first
+ n_pad = self.period - (t % self.period)
+ x = F.pad(x, (0, n_pad), "reflect")
+ t = t + n_pad
+ x = x.view(b, c, t // self.period, self.period)
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class DiscriminatorS(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(DiscriminatorS, self).__init__()
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ self.convs = nn.ModuleList([
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+ ])
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+ def forward(self, x):
+ fmap = []
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(MultiPeriodDiscriminator, self).__init__()
+ periods = [2,3,5,7,11]
+
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+ self.discriminators = nn.ModuleList(discs)
+
+ def forward(self, y, y_hat):
+ y_d_rs = []
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ y_d_rs.append(y_d_r)
+ y_d_gs.append(y_d_g)
+ fmap_rs.append(fmap_r)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class SpeakerEncoder(torch.nn.Module):
+ def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
+ super(SpeakerEncoder, self).__init__()
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+ self.relu = nn.ReLU()
+
+ def forward(self, mels):
+ self.lstm.flatten_parameters()
+ _, (hidden, _) = self.lstm(mels)
+ embeds_raw = self.relu(self.linear(hidden[-1]))
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+ def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
+ mel_slices = []
+ for i in range(0, total_frames-partial_frames, partial_hop):
+ mel_range = torch.arange(i, i+partial_frames)
+ mel_slices.append(mel_range)
+
+ return mel_slices
+
+ def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
+ mel_len = mel.size(1)
+ last_mel = mel[:,-partial_frames:]
+
+ if mel_len > partial_frames:
+ mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
+ mels = list(mel[:,s] for s in mel_slices)
+ mels.append(last_mel)
+ mels = torch.stack(tuple(mels), 0).squeeze(1)
+
+ with torch.no_grad():
+ partial_embeds = self(mels)
+ embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
+ #embed = embed / torch.linalg.norm(embed, 2)
+ else:
+ with torch.no_grad():
+ embed = self(last_mel)
+
+ return embed
+
+
+class SynthesizerTrn(nn.Module):
+ """
+ Synthesizer for Training
+ """
+
+ def __init__(self,
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels,
+ ssl_dim,
+ use_spk,
+ **kwargs):
+
+ super().__init__()
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.gin_channels = gin_channels
+ self.ssl_dim = ssl_dim
+ self.use_spk = use_spk
+
+ self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
+ self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+ self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+
+ if not self.use_spk:
+ self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)
+
+ def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
+ if c_lengths is None:
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+ if spec_lengths is None:
+ spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
+
+ if not self.use_spk:
+ g = self.enc_spk(mel.transpose(1,2))
+ g = g.unsqueeze(-1)
+
+ _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
+ z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
+ z_p = self.flow(z, spec_mask, g=g)
+
+ z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size)
+ o = self.dec(z_slice, g=g)
+
+ return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+ def infer(self, c, g=None, mel=None, c_lengths=None):
+ if c_lengths is None:
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+ if not self.use_spk:
+ g = self.enc_spk.embed_utterance(mel.transpose(1,2))
+ g = g.unsqueeze(-1)
+
+ z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
+ o = self.dec(z * c_mask, g=g)
+
+ return o
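A sketch of how SynthesizerTrn is typically driven at inference time with the use_spk=true configuration (an external speaker embedding instead of the built-in SpeakerEncoder), mirroring the call made in freevc_wrapper.py later in this patch. The config and checkpoint paths below are placeholders:

    import torch
    from dreamvoice.train_utils.src.freevc.utils import get_hparams_from_file, load_checkpoint
    from dreamvoice.train_utils.src.freevc.models import SynthesizerTrn

    hps = get_hparams_from_file("configs/freevc.json")            # placeholder path
    net_g = SynthesizerTrn(
        hps.data.filter_length // 2 + 1,                          # spec_channels = 641
        hps.train.segment_size // hps.data.hop_length,            # segment size in frames
        **hps.model).eval()
    load_checkpoint("ckpts/freevc.pth", net_g, None)              # placeholder path

    c = torch.randn(1, hps.model.ssl_dim, 100)                    # WavLM content features (B, 1024, T)
    g = torch.randn(1, hps.model.gin_channels)                    # 256-dim speaker embedding (B, 256)
    with torch.no_grad():
        audio = net_g.infer(c, g=g)                               # (B, 1, T * 320) waveform at 16 kHz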
diff --git a/dreamvoice/train_utils/src/freevc/modules.py b/dreamvoice/train_utils/src/freevc/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..53a51558f78899cb0e77c595fe2ca9b3d3c762f5
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/modules.py
@@ -0,0 +1,341 @@
+import copy
+import math
+import numpy as np
+import scipy
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
+
+
+LRELU_SLOPE = 0.1
+
+
+class LayerNorm(nn.Module):
+ def __init__(self, channels, eps=1e-5):
+ super().__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = nn.Parameter(torch.ones(channels))
+ self.beta = nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ x = x.transpose(1, -1)
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+ return x.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+ super().__init__()
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+ assert n_layers > 1, "Number of layers should be larger than 1."
+
+ self.conv_layers = nn.ModuleList()
+ self.norm_layers = nn.ModuleList()
+ self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+ self.norm_layers.append(LayerNorm(hidden_channels))
+ self.relu_drop = nn.Sequential(
+ nn.ReLU(),
+ nn.Dropout(p_dropout))
+ for _ in range(n_layers-1):
+ self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+ self.norm_layers.append(LayerNorm(hidden_channels))
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+ self.proj.weight.data.zero_()
+ self.proj.bias.data.zero_()
+
+ def forward(self, x, x_mask):
+ x_org = x
+ for i in range(self.n_layers):
+ x = self.conv_layers[i](x * x_mask)
+ x = self.norm_layers[i](x)
+ x = self.relu_drop(x)
+ x = x_org + self.proj(x)
+ return x * x_mask
+
+
+class DDSConv(nn.Module):
+ """
+ Dilated and Depth-Separable Convolution
+ """
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+ super().__init__()
+ self.channels = channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+
+ self.drop = nn.Dropout(p_dropout)
+ self.convs_sep = nn.ModuleList()
+ self.convs_1x1 = nn.ModuleList()
+ self.norms_1 = nn.ModuleList()
+ self.norms_2 = nn.ModuleList()
+ for i in range(n_layers):
+ dilation = kernel_size ** i
+ padding = (kernel_size * dilation - dilation) // 2
+ self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+ groups=channels, dilation=dilation, padding=padding
+ ))
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+ self.norms_1.append(LayerNorm(channels))
+ self.norms_2.append(LayerNorm(channels))
+
+ def forward(self, x, x_mask, g=None):
+ if g is not None:
+ x = x + g
+ for i in range(self.n_layers):
+ y = self.convs_sep[i](x * x_mask)
+ y = self.norms_1[i](y)
+ y = F.gelu(y)
+ y = self.convs_1x1[i](y)
+ y = self.norms_2[i](y)
+ y = F.gelu(y)
+ y = self.drop(y)
+ x = x + y
+ return x * x_mask
+
+
+class WN(torch.nn.Module):
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+ super(WN, self).__init__()
+ assert(kernel_size % 2 == 1)
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+ self.p_dropout = p_dropout
+
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.drop = nn.Dropout(p_dropout)
+
+ if gin_channels != 0:
+ cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
+ for i in range(n_layers):
+ dilation = dilation_rate ** i
+ padding = int((kernel_size * dilation - dilation) / 2)
+ in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
+ dilation=dilation, padding=padding)
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+ self.in_layers.append(in_layer)
+
+ # last one is not necessary
+ if i < n_layers - 1:
+ res_skip_channels = 2 * hidden_channels
+ else:
+ res_skip_channels = hidden_channels
+
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+ self.res_skip_layers.append(res_skip_layer)
+
+ def forward(self, x, x_mask, g=None, **kwargs):
+ output = torch.zeros_like(x)
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+ if g is not None:
+ g = self.cond_layer(g)
+
+ for i in range(self.n_layers):
+ x_in = self.in_layers[i](x)
+ if g is not None:
+ cond_offset = i * 2 * self.hidden_channels
+ g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
+ else:
+ g_l = torch.zeros_like(x_in)
+
+ acts = fused_add_tanh_sigmoid_multiply(
+ x_in,
+ g_l,
+ n_channels_tensor)
+ acts = self.drop(acts)
+
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.n_layers - 1:
+ res_acts = res_skip_acts[:,:self.hidden_channels,:]
+ x = (x + res_acts) * x_mask
+ output = output + res_skip_acts[:,self.hidden_channels:,:]
+ else:
+ output = output + res_skip_acts
+ return output * x_mask
+
+ def remove_weight_norm(self):
+ if self.gin_channels != 0:
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
+ for l in self.in_layers:
+ torch.nn.utils.remove_weight_norm(l)
+ for l in self.res_skip_layers:
+ torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+ super(ResBlock1, self).__init__()
+ self.convs1 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+ padding=get_padding(kernel_size, dilation[2])))
+ ])
+ self.convs1.apply(init_weights)
+
+ self.convs2 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1)))
+ ])
+ self.convs2.apply(init_weights)
+
+ def forward(self, x, x_mask=None):
+ for c1, c2 in zip(self.convs1, self.convs2):
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c1(xt)
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c2(xt)
+ x = xt + x
+ if x_mask is not None:
+ x = x * x_mask
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs1:
+ remove_weight_norm(l)
+ for l in self.convs2:
+ remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+ super(ResBlock2, self).__init__()
+ self.convs = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1])))
+ ])
+ self.convs.apply(init_weights)
+
+ def forward(self, x, x_mask=None):
+ for c in self.convs:
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c(xt)
+ x = xt + x
+ if x_mask is not None:
+ x = x * x_mask
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs:
+ remove_weight_norm(l)
+
+
+class Log(nn.Module):
+ def forward(self, x, x_mask, reverse=False, **kwargs):
+ if not reverse:
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+ logdet = torch.sum(-y, [1, 2])
+ return y, logdet
+ else:
+ x = torch.exp(x) * x_mask
+ return x
+
+
+class Flip(nn.Module):
+ def forward(self, x, *args, reverse=False, **kwargs):
+ x = torch.flip(x, [1])
+ if not reverse:
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+ return x, logdet
+ else:
+ return x
+
+
+class ElementwiseAffine(nn.Module):
+ def __init__(self, channels):
+ super().__init__()
+ self.channels = channels
+ self.m = nn.Parameter(torch.zeros(channels,1))
+ self.logs = nn.Parameter(torch.zeros(channels,1))
+
+ def forward(self, x, x_mask, reverse=False, **kwargs):
+ if not reverse:
+ y = self.m + torch.exp(self.logs) * x
+ y = y * x_mask
+ logdet = torch.sum(self.logs * x_mask, [1,2])
+ return y, logdet
+ else:
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
+ return x
+
+
+class ResidualCouplingLayer(nn.Module):
+ def __init__(self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ p_dropout=0,
+ gin_channels=0,
+ mean_only=False):
+ assert channels % 2 == 0, "channels should be divisible by 2"
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.half_channels = channels // 2
+ self.mean_only = mean_only
+
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+ self.post.weight.data.zero_()
+ self.post.bias.data.zero_()
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ x0, x1 = torch.split(x, [self.half_channels]*2, 1)
+ h = self.pre(x0) * x_mask
+ h = self.enc(h, x_mask, g=g)
+ stats = self.post(h) * x_mask
+ if not self.mean_only:
+ m, logs = torch.split(stats, [self.half_channels]*2, 1)
+ else:
+ m = stats
+ logs = torch.zeros_like(m)
+
+ if not reverse:
+ x1 = m + x1 * torch.exp(logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ logdet = torch.sum(logs, [1,2])
+ return x, logdet
+ else:
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ return x
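The coupling layers above are what make the flow in SynthesizerTrn invertible: a forward pass followed by a reverse pass should reproduce the input up to floating-point error. A small self-check sketch (import path assumed as in the examples above):

    import torch
    from dreamvoice.train_utils.src.freevc.modules import ResidualCouplingLayer

    layer = ResidualCouplingLayer(channels=192, hidden_channels=192, kernel_size=5,
                                  dilation_rate=1, n_layers=4,
                                  gin_channels=256, mean_only=True).eval()

    x = torch.randn(2, 192, 50)        # (batch, channels, frames)
    x_mask = torch.ones(2, 1, 50)      # no padding
    g = torch.randn(2, 256, 1)         # global speaker conditioning

    with torch.no_grad():
        y, logdet = layer(x, x_mask, g=g)                 # forward: returns (y, logdet)
        x_rec = layer(y, x_mask, g=g, reverse=True)       # reverse: undoes the coupling

    assert torch.allclose(x, x_rec, atol=1e-5)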
diff --git a/dreamvoice/train_utils/src/freevc/requirements.txt b/dreamvoice/train_utils/src/freevc/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/requirements.txt
@@ -0,0 +1,8 @@
+altair
+httpx==0.24.1
+numpy
+scipy
+torch
+transformers
+librosa
+webrtcvad==2.0.10
diff --git a/dreamvoice/train_utils/src/freevc/utils.py b/dreamvoice/train_utils/src/freevc/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e931b1f56a976674425c5637b0767d3485c51f69
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc/utils.py
@@ -0,0 +1,305 @@
+import os
+import sys
+import argparse
+import logging
+import json
+import subprocess
+import glob
+import numpy as np
+from scipy.io.wavfile import read
+import torch
+from torch.nn import functional as F
+from .commons import sequence_mask
+
+MATPLOTLIB_FLAG = False
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logger = logging
+
+
+def get_cmodel(rank):
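+ # NOTE: WavLM and WavLMConfig come from the external wavlm package and are not imported in this file.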
+ checkpoint = torch.load('wavlm/WavLM-Large.pt')
+ cfg = WavLMConfig(checkpoint['cfg'])
+ cmodel = WavLM(cfg).cuda(rank)
+ cmodel.load_state_dict(checkpoint['model'])
+ cmodel.eval()
+ return cmodel
+
+
+def get_content(cmodel, y):
+ with torch.no_grad():
+ c = cmodel.extract_features(y.squeeze(1))[0]
+ c = c.transpose(1, 2)
+ return c
+
+
+def get_vocoder(rank):
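+ # NOTE: expects an external hifigan package providing AttrDict and Generator; it is not imported in this file.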
+ with open("hifigan/config.json", "r") as f:
+ config = json.load(f)
+ config = hifigan.AttrDict(config)
+ vocoder = hifigan.Generator(config)
+ ckpt = torch.load("hifigan/generator_v1")
+ vocoder.load_state_dict(ckpt["generator"])
+ vocoder.eval()
+ vocoder.remove_weight_norm()
+ vocoder.cuda(rank)
+ return vocoder
+
+
+def transform(mel, height): # 68-92
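+ # NOTE: uses torchvision.transforms.functional.resize; torchvision is not imported in this file.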
+ #r = np.random.random()
+ #rate = r * 0.3 + 0.85 # 0.85-1.15
+ #height = int(mel.size(-2) * rate)
+ tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
+ if height >= mel.size(-2):
+ return tgt[:, :mel.size(-2), :]
+ else:
+ silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1)
+ silence += torch.randn_like(silence) / 10
+ return torch.cat((tgt, silence), 1)
+
+
+def stretch(mel, width): # 0.5-2
+ return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))
+
+
+def load_checkpoint(checkpoint_path, model, optimizer=None):
+ assert os.path.isfile(checkpoint_path)
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+ iteration = checkpoint_dict['iteration']
+ learning_rate = checkpoint_dict['learning_rate']
+ if optimizer is not None:
+ optimizer.load_state_dict(checkpoint_dict['optimizer'])
+ saved_state_dict = checkpoint_dict['model']
+ if hasattr(model, 'module'):
+ state_dict = model.module.state_dict()
+ else:
+ state_dict = model.state_dict()
+ new_state_dict= {}
+ for k, v in state_dict.items():
+ try:
+ new_state_dict[k] = saved_state_dict[k]
+ except:
+ logger.info("%s is not in the checkpoint" % k)
+ new_state_dict[k] = v
+ if hasattr(model, 'module'):
+ model.module.load_state_dict(new_state_dict)
+ else:
+ model.load_state_dict(new_state_dict)
+ logger.info("Loaded checkpoint '{}' (iteration {})" .format(
+ checkpoint_path, iteration))
+ return model, optimizer, learning_rate, iteration
+
+
+def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
+ logger.info("Saving model and optimizer state at iteration {} to {}".format(
+ iteration, checkpoint_path))
+ if hasattr(model, 'module'):
+ state_dict = model.module.state_dict()
+ else:
+ state_dict = model.state_dict()
+ torch.save({'model': state_dict,
+ 'iteration': iteration,
+ 'optimizer': optimizer.state_dict(),
+ 'learning_rate': learning_rate}, checkpoint_path)
+
+
+def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
+ for k, v in scalars.items():
+ writer.add_scalar(k, v, global_step)
+ for k, v in histograms.items():
+ writer.add_histogram(k, v, global_step)
+ for k, v in images.items():
+ writer.add_image(k, v, global_step, dataformats='HWC')
+ for k, v in audios.items():
+ writer.add_audio(k, v, global_step, audio_sampling_rate)
+
+
+def latest_checkpoint_path(dir_path, regex="G_*.pth"):
+ f_list = glob.glob(os.path.join(dir_path, regex))
+ f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
+ x = f_list[-1]
+ print(x)
+ return x
+
+
+def plot_spectrogram_to_numpy(spectrogram):
+ global MATPLOTLIB_FLAG
+ if not MATPLOTLIB_FLAG:
+ import matplotlib
+ matplotlib.use("Agg")
+ MATPLOTLIB_FLAG = True
+ mpl_logger = logging.getLogger('matplotlib')
+ mpl_logger.setLevel(logging.WARNING)
+ import matplotlib.pylab as plt
+ import numpy as np
+
+ fig, ax = plt.subplots(figsize=(10,2))
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+ interpolation='none')
+ plt.colorbar(im, ax=ax)
+ plt.xlabel("Frames")
+ plt.ylabel("Channels")
+ plt.tight_layout()
+
+ fig.canvas.draw()
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ plt.close()
+ return data
+
+
+def plot_alignment_to_numpy(alignment, info=None):
+ global MATPLOTLIB_FLAG
+ if not MATPLOTLIB_FLAG:
+ import matplotlib
+ matplotlib.use("Agg")
+ MATPLOTLIB_FLAG = True
+ mpl_logger = logging.getLogger('matplotlib')
+ mpl_logger.setLevel(logging.WARNING)
+ import matplotlib.pylab as plt
+ import numpy as np
+
+ fig, ax = plt.subplots(figsize=(6, 4))
+ im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
+ interpolation='none')
+ fig.colorbar(im, ax=ax)
+ xlabel = 'Decoder timestep'
+ if info is not None:
+ xlabel += '\n\n' + info
+ plt.xlabel(xlabel)
+ plt.ylabel('Encoder timestep')
+ plt.tight_layout()
+
+ fig.canvas.draw()
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+ plt.close()
+ return data
+
+
+def load_wav_to_torch(full_path):
+ sampling_rate, data = read(full_path)
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+
+
+def load_filepaths_and_text(filename, split="|"):
+ with open(filename, encoding='utf-8') as f:
+ filepaths_and_text = [line.strip().split(split) for line in f]
+ return filepaths_and_text
+
+
+def get_hparams(init=True):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
+ help='JSON file for configuration')
+ parser.add_argument('-m', '--model', type=str, required=True,
+ help='Model name')
+
+ args = parser.parse_args()
+ model_dir = os.path.join("./logs", args.model)
+
+ if not os.path.exists(model_dir):
+ os.makedirs(model_dir)
+
+ config_path = args.config
+ config_save_path = os.path.join(model_dir, "config.json")
+ if init:
+ with open(config_path, "r") as f:
+ data = f.read()
+ with open(config_save_path, "w") as f:
+ f.write(data)
+ else:
+ with open(config_save_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+ hparams = HParams(**config)
+ hparams.model_dir = model_dir
+ return hparams
+
+
+def get_hparams_from_dir(model_dir):
+ config_save_path = os.path.join(model_dir, "config.json")
+ with open(config_save_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+ hparams = HParams(**config)
+ hparams.model_dir = model_dir
+ return hparams
+
+
+def get_hparams_from_file(config_path):
+ with open(config_path, "r") as f:
+ data = f.read()
+ config = json.loads(data)
+
+ hparams = HParams(**config)
+ return hparams
+
+
+def check_git_hash(model_dir):
+ source_dir = os.path.dirname(os.path.realpath(__file__))
+ if not os.path.exists(os.path.join(source_dir, ".git")):
+ logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
+ source_dir
+ ))
+ return
+
+ cur_hash = subprocess.getoutput("git rev-parse HEAD")
+
+ path = os.path.join(model_dir, "githash")
+ if os.path.exists(path):
+ saved_hash = open(path).read()
+ if saved_hash != cur_hash:
+ logger.warning("git hash values are different. {}(saved) != {}(current)".format(
+ saved_hash[:8], cur_hash[:8]))
+ else:
+ open(path, "w").write(cur_hash)
+
+
+def get_logger(model_dir, filename="train.log"):
+ global logger
+ logger = logging.getLogger(os.path.basename(model_dir))
+ logger.setLevel(logging.DEBUG)
+
+ formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
+ if not os.path.exists(model_dir):
+ os.makedirs(model_dir)
+ h = logging.FileHandler(os.path.join(model_dir, filename))
+ h.setLevel(logging.DEBUG)
+ h.setFormatter(formatter)
+ logger.addHandler(h)
+ return logger
+
+
+class HParams():
+ def __init__(self, **kwargs):
+ for k, v in kwargs.items():
+ if isinstance(v, dict):
+ v = HParams(**v)
+ self[k] = v
+
+ def keys(self):
+ return self.__dict__.keys()
+
+ def items(self):
+ return self.__dict__.items()
+
+ def values(self):
+ return self.__dict__.values()
+
+ def __len__(self):
+ return len(self.__dict__)
+
+ def __getitem__(self, key):
+ return getattr(self, key)
+
+ def __setitem__(self, key, value):
+ return setattr(self, key, value)
+
+ def __contains__(self, key):
+ return key in self.__dict__
+
+ def __repr__(self):
+ return self.__dict__.__repr__()
diff --git a/dreamvoice/train_utils/src/freevc_wrapper.py b/dreamvoice/train_utils/src/freevc_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..11a46fa184110368939cfc1bf3cc9f47a9c8092d
--- /dev/null
+++ b/dreamvoice/train_utils/src/freevc_wrapper.py
@@ -0,0 +1,63 @@
+import os
+import torch
+import librosa
+import soundfile as sf
+from pathlib import Path
+
+from transformers import WavLMModel
+from freevc.utils import load_checkpoint, get_hparams_from_file
+from freevc.models import SynthesizerTrn
+# from mel_processing import mel_spectrogram_torch
+# from free_vc.speaker_encoder.voice_encoder import SpeakerEncoder
+from speaker_encoder.voice_encoder import SpeakerEncoder
+
+
+def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'):
+ hps = get_hparams_from_file("freevc/configs/freevc.json")
+ freevc = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ **hps.model).to(device)
+ freevc.eval()
+ load_checkpoint("../prepare_freevc/ckpts/freevc.pth", freevc, None)
+
+ cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+ cmodel.eval()
+
+ # smodel = spk_encoder.load_model(Path(speaker_path), device)
+ # smodel = spk_encoder.load_model(Path(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt"), 'cuda')
+ smodel = SpeakerEncoder(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt", device)
+
+ return freevc, cmodel, smodel, hps
+
+
+@torch.no_grad()
+def convert(freevc, content, speaker):
+ audio = freevc.infer(content, g=speaker)
+ audio = audio[0][0].data.cpu().float().numpy()
+ return audio, 24000
+
+
+if __name__ == '__main__':
+ freevc_24, cmodel, smodel, hps = get_freevc_models()
+
+ tgt = 'p226_002.wav'
+ # src = 'p226_002.wav'
+ src = 'p225_001.wav'
+ device = 'cuda'
+
+ # tgt
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+ g_tgt = smodel.embed_utterance(wav_tgt)
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+ # g_tgt = spk_encoder.embed_utterance_batch(torch.tensor(wav_tgt).unsqueeze(0).cuda())
+
+ # src
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+ content = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+
+ output, sr = convert(freevc_24, content, g_tgt)
+
+ sf.write('output.wav', output, sr)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/inference_freevc.py b/dreamvoice/train_utils/src/inference_freevc.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bdf218f87549460a47cb224c509c3f0fc80d6b0
--- /dev/null
+++ b/dreamvoice/train_utils/src/inference_freevc.py
@@ -0,0 +1,124 @@
+import os
+import torch
+import soundfile as sf
+import pandas as pd
+import librosa
+from utils import minmax_norm_diff, reverse_minmax_norm_diff, scale_shift_re
+from freevc_wrapper import convert
+import time
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+ """
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+ """
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+ # rescale the results from guidance (fixes overexposure)
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+ return noise_cfg
+
+
+@torch.no_grad()
+def inference_timbre(gen_shape, text,
+ model, scheduler,
+ guidance_scale=5, guidance_rescale=0.7,
+ ddim_steps=50, eta=1, random_seed=2023,
+ device='cuda',
+ ):
+ text, text_mask = text
+ model.eval()
+
+ if random_seed is not None:
+ generator = torch.Generator(device=device).manual_seed(random_seed)
+ else:
+ generator = torch.Generator(device=device)
+ generator.seed()
+
+ scheduler.set_timesteps(ddim_steps)
+
+ # init noise
+ noise = torch.randn(gen_shape, generator=generator, device=device)
+ latents = noise
+
+ for t in scheduler.timesteps:
+ latents = scheduler.scale_model_input(latents, t)
+
+ if guidance_scale:
+ output_text = model(latents, t, text, text_mask, train_cfg=False)
+ output_uncond = model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)
+
+ output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+ if guidance_rescale > 0.0:
+ output_pred = rescale_noise_cfg(output_pred, output_text,
+ guidance_rescale=guidance_rescale)
+ else:
+ output_pred = model(latents, t, text, text_mask, train_cfg=False)
+
+ latents = scheduler.step(model_output=output_pred, timestep=t, sample=latents,
+ eta=eta, generator=generator).prev_sample
+
+ # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5)
+ pred = scale_shift_re(latents, 20, -0.035)
+ pred = torch.clip(pred, min=0.0, max=0.5)
+ return pred
+
+
+@torch.no_grad()
+def eval_plugin(freevc, cmodel, text_model,
+ timbre_model, timbre_scheduler, timbre_shape,
+ val_meta, val_folder,
+ guidance_scale=3, guidance_rescale=0.7,
+ ddim_steps=50, eta=1, random_seed=2024,
+ device='cuda',
+ epoch=0, save_path='logs/eval/', val_num=10, sr=16000):
+
+ tokenizer, text_encoder = text_model
+
+ df = pd.read_csv(val_meta)
+
+ save_path = save_path + str(epoch) + '/'
+ os.makedirs(save_path, exist_ok=True)
+
+ step = 0
+
+ for i in range(len(df)):
+ row = df.iloc[i]
+
+ source_path = val_folder + row['path']
+ # prompt = [row['prompt']]
+ prompt = ["female's voice"]
+ with torch.no_grad():
+ text_batch = tokenizer(prompt,
+ max_length=32,
+ padding='max_length', truncation=True, return_tensors="pt")
+ text, text_mask = text_batch.input_ids.to(device), \
+ text_batch.attention_mask.to(device)
+ text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
+
+ audio_clip = librosa.load(source_path, sr=16000)[0]
+ audio_clip = torch.tensor(audio_clip).unsqueeze(0).to(device)
+
+ content = cmodel(audio_clip).last_hidden_state.transpose(1, 2).to(device)
+
+ # start_time = time.time()
+ spk_embed = inference_timbre(timbre_shape, [text, text_mask],
+ timbre_model, timbre_scheduler,
+ guidance_scale=guidance_scale, guidance_rescale=guidance_rescale,
+ ddim_steps=ddim_steps, eta=eta, random_seed=random_seed,
+ device=device)
+ spk_embed = spk_embed.squeeze(-1)
+
+ output, out_sr = convert(freevc, content, spk_embed)
+ # end_time = time.time()
+ # print(end_time-start_time)
+ # print(pred.shape)
+ sf.write(save_path + f'{step}_{prompt[0]}' + '.wav', output, samplerate=sr)
+
+ step += 1
+
+ if step >= val_num:
+ break
diff --git a/dreamvoice/train_utils/src/speaker_encoder/__init__.py b/dreamvoice/train_utils/src/speaker_encoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/dreamvoice/train_utils/src/speaker_encoder/audio.py b/dreamvoice/train_utils/src/speaker_encoder/audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb47c9e72f3364d8317b79a80ce62030d2403fd
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/audio.py
@@ -0,0 +1,107 @@
+from scipy.ndimage import binary_dilation
+from speaker_encoder.params_data import *
+from pathlib import Path
+from typing import Optional, Union
+import numpy as np
+import webrtcvad
+import librosa
+import struct
+
+int16_max = (2 ** 15) - 1
+
+
+def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
+ source_sr: Optional[int] = None):
+ """
+ Applies the preprocessing operations used in training the Speaker Encoder to a waveform
+ either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
+
+ :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+ just .wav), or the waveform as a numpy array of floats.
+ :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+ preprocessing. After preprocessing, the waveform's sampling rate will match the data
+ hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+ this argument will be ignored.
+ """
+ # Load the wav from disk if needed
+ if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+ wav, source_sr = librosa.load(fpath_or_wav, sr=None)
+ else:
+ wav = fpath_or_wav
+
+ # Resample the wav if needed
+ if source_sr is not None and source_sr != sampling_rate:
+ wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
+ # Apply the preprocessing: normalize volume and shorten long silences
+ wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+ wav = trim_long_silences(wav)
+
+ return wav
+
+
+def wav_to_mel_spectrogram(wav):
+ """
+ Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
+ Note: this is not a log-mel spectrogram.
+ """
+ frames = librosa.feature.melspectrogram(
+ y=wav,
+ sr=sampling_rate,
+ n_fft=int(sampling_rate * mel_window_length / 1000),
+ hop_length=int(sampling_rate * mel_window_step / 1000),
+ n_mels=mel_n_channels
+ )
+ return frames.astype(np.float32).T
+
+
+def trim_long_silences(wav):
+ """
+ Ensures that segments without voice in the waveform remain no longer than a
+ threshold determined by the VAD parameters in params.py.
+
+ :param wav: the raw waveform as a numpy array of floats
+ :return: the same waveform with silences trimmed away (length <= original wav length)
+ """
+ # Compute the voice detection window size
+ samples_per_window = (vad_window_length * sampling_rate) // 1000
+
+ # Trim the end of the audio to have a multiple of the window size
+ wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+
+ # Convert the float waveform to 16-bit mono PCM
+ pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+
+ # Perform voice activation detection
+ voice_flags = []
+ vad = webrtcvad.Vad(mode=3)
+ for window_start in range(0, len(wav), samples_per_window):
+ window_end = window_start + samples_per_window
+ voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+ sample_rate=sampling_rate))
+ voice_flags = np.array(voice_flags)
+
+ # Smooth the voice detection with a moving average
+ def moving_average(array, width):
+ array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+ ret = np.cumsum(array_padded, dtype=float)
+ ret[width:] = ret[width:] - ret[:-width]
+ return ret[width - 1:] / width
+
+ audio_mask = moving_average(voice_flags, vad_moving_average_width)
+ audio_mask = np.round(audio_mask).astype(bool)
+
+ # Dilate the voiced regions
+ audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+ audio_mask = np.repeat(audio_mask, samples_per_window)
+
+ return wav[audio_mask]
+
+
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+ if increase_only and decrease_only:
+ raise ValueError("Both increase only and decrease only are set")
+ dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
+ if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
+ return wav
+ return wav * (10 ** (dBFS_change / 20))
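Taken together, the helpers above form the speaker-encoder front end: load and resample audio, normalize volume, trim silences with WebRTC VAD, then compute 40-band mel frames. A minimal sketch (the wav path is a placeholder; the import assumes this src directory is on PYTHONPATH, matching the absolute imports used throughout the package):

    from speaker_encoder.audio import preprocess_wav, wav_to_mel_spectrogram

    wav = preprocess_wav("example.wav")      # 16 kHz, volume-normalized, long silences removed
    frames = wav_to_mel_spectrogram(wav)     # (n_frames, 40) mel frames, ~100 frames per second
    print(frames.shape)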
diff --git a/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt b/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt
new file mode 100644
index 0000000000000000000000000000000000000000..662d22b686114b4b6124330a688007d9495d22c8
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca
+size 17090379
diff --git a/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py b/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..e45430c7d03d160dc64d450c1af81180f419eb51
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py
@@ -0,0 +1,40 @@
+from speaker_encoder import inference as encoder
+from multiprocessing.pool import Pool
+from functools import partial
+from pathlib import Path
+# from utils import logmmse
+from tqdm import tqdm
+import numpy as np
+# import librosa
+
+
+def embed_utterance(fpaths, encoder_model_fpath):
+ if not encoder.is_loaded():
+ encoder.load_model(encoder_model_fpath)
+
+ # Compute the speaker embedding of the utterance
+ wav_fpath, embed_fpath = fpaths
+ wav = np.load(wav_fpath)
+ wav = encoder.preprocess_wav(wav)
+ embed = encoder.embed_utterance(wav)
+ np.save(embed_fpath, embed, allow_pickle=False)
+
+
+def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int):
+
+ wav_dir = outdir_root.joinpath("audio")
+ metadata_fpath = outdir_root.joinpath("train.txt")
+ assert wav_dir.exists() and metadata_fpath.exists()
+ embed_dir = outdir_root.joinpath("embeds")
+ embed_dir.mkdir(exist_ok=True)
+
+ # Gather the input wave filepath and the target output embed filepath
+ with metadata_fpath.open("r") as metadata_file:
+ metadata = [line.split("|") for line in metadata_file]
+ fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
+
+ # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
+ # Embed the utterances in separate threads
+ func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
+ job = Pool(n_processes).imap(func, fpaths)
+ list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/speaker_encoder/config.py b/dreamvoice/train_utils/src/speaker_encoder/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d12228c81152487da24a6090e5a736f9de0755b0
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/config.py
@@ -0,0 +1,45 @@
+librispeech_datasets = {
+ "train": {
+ "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
+ "other": ["LibriSpeech/train-other-500"]
+ },
+ "test": {
+ "clean": ["LibriSpeech/test-clean"],
+ "other": ["LibriSpeech/test-other"]
+ },
+ "dev": {
+ "clean": ["LibriSpeech/dev-clean"],
+ "other": ["LibriSpeech/dev-other"]
+ },
+}
+libritts_datasets = {
+ "train": {
+ "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
+ "other": ["LibriTTS/train-other-500"]
+ },
+ "test": {
+ "clean": ["LibriTTS/test-clean"],
+ "other": ["LibriTTS/test-other"]
+ },
+ "dev": {
+ "clean": ["LibriTTS/dev-clean"],
+ "other": ["LibriTTS/dev-other"]
+ },
+}
+voxceleb_datasets = {
+ "voxceleb1" : {
+ "train": ["VoxCeleb1/wav"],
+ "test": ["VoxCeleb1/test_wav"]
+ },
+ "voxceleb2" : {
+ "train": ["VoxCeleb2/dev/aac"],
+ "test": ["VoxCeleb2/test_wav"]
+ }
+}
+
+other_datasets = [
+ "LJSpeech-1.1",
+ "VCTK-Corpus/wav48",
+]
+
+anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..740f750a9746e5ace34f1bf875d9ac07677e1ed6
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py
@@ -0,0 +1,2 @@
+from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e5cf738d3ca5214034ce3babdedf6eaea64c469
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py
@@ -0,0 +1,37 @@
+import random
+
+class RandomCycler:
+ """
+ Creates an internal copy of a sequence and allows access to its items in a constrained random
+ order. For a source sequence of n items and one or several consecutive queries of a total
+ of m items, the following guarantees hold (one implies the other):
+ - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
+ - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
+ """
+
+ def __init__(self, source):
+ if len(source) == 0:
+ raise Exception("Can't create RandomCycler from an empty collection")
+ self.all_items = list(source)
+ self.next_items = []
+
+ def sample(self, count: int):
+ shuffle = lambda l: random.sample(l, len(l))
+
+ out = []
+ while count > 0:
+ if count >= len(self.all_items):
+ out.extend(shuffle(list(self.all_items)))
+ count -= len(self.all_items)
+ continue
+ n = min(count, len(self.next_items))
+ out.extend(self.next_items[:n])
+ count -= n
+ self.next_items = self.next_items[n:]
+ if len(self.next_items) == 0:
+ self.next_items = shuffle(list(self.all_items))
+ return out
+
+ def __next__(self):
+ return self.sample(1)[0]
+
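A tiny usage sketch illustrating the guarantee in the docstring above: sampling m items from a cycler over n items returns each item between m // n and ((m - 1) // n) + 1 times, in a shuffled order.

    from speaker_encoder.data_objects.random_cycler import RandomCycler

    cycler = RandomCycler(["a", "b", "c"])
    batch = cycler.sample(7)     # each of the 3 items appears 2 or 3 times, order is random
    print(batch)
    print(next(cycler))          # the cycler can also be consumed one item at a time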
diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb320b211f0de5b3a6fbb83380d8a8b9677151b2
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py
@@ -0,0 +1,40 @@
+from speaker_encoder.data_objects.random_cycler import RandomCycler
+from speaker_encoder.data_objects.utterance import Utterance
+from pathlib import Path
+
+# Contains the set of utterances of a single speaker
+class Speaker:
+ def __init__(self, root: Path):
+ self.root = root
+ self.name = root.name
+ self.utterances = None
+ self.utterance_cycler = None
+
+ def _load_utterances(self):
+ with self.root.joinpath("_sources.txt").open("r") as sources_file:
+ sources = [l.split(",") for l in sources_file]
+ sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
+ self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
+ self.utterance_cycler = RandomCycler(self.utterances)
+
+ def random_partial(self, count, n_frames):
+ """
+ Samples a batch of unique partial utterances from the disk in a way that all
+ utterances come up at least once every two cycles and in a random order every time.
+
+ :param count: The number of partial utterances to sample from the set of utterances from
+ that speaker. Utterances are guaranteed not to be repeated if is not larger than
+ the number of utterances available.
+ :param n_frames: The number of frames in the partial utterance.
+ :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
+ frames are the frames of the partial utterances and range is the range of the partial
+ utterance with regard to the complete utterance.
+ """
+ if self.utterances is None:
+ self._load_utterances()
+
+ utterances = self.utterance_cycler.sample(count)
+
+ a = [(u,) + u.random_partial(n_frames) for u in utterances]
+
+ return a
diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2dd5493a599e74cea594510af94015464072cb3
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py
@@ -0,0 +1,12 @@
+import numpy as np
+from typing import List
+from speaker_encoder.data_objects.speaker import Speaker
+
+class SpeakerBatch:
+ def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
+ self.speakers = speakers
+ self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
+
+ # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
+ # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
+ self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..be4568923a21e8f28a229899e137d0186e0b1250
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py
@@ -0,0 +1,56 @@
+from speaker_encoder.data_objects.random_cycler import RandomCycler
+from speaker_encoder.data_objects.speaker_batch import SpeakerBatch
+from speaker_encoder.data_objects.speaker import Speaker
+from speaker_encoder.params_data import partials_n_frames
+from torch.utils.data import Dataset, DataLoader
+from pathlib import Path
+
+# TODO: improve with a pool of speakers for data efficiency
+
+class SpeakerVerificationDataset(Dataset):
+ def __init__(self, datasets_root: Path):
+ self.root = datasets_root
+ speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
+ if len(speaker_dirs) == 0:
+ raise Exception("No speakers found. Make sure you are pointing to the directory "
+ "containing all preprocessed speaker directories.")
+ self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
+ self.speaker_cycler = RandomCycler(self.speakers)
+
+ def __len__(self):
+ return int(1e10)
+
+ def __getitem__(self, index):
+ return next(self.speaker_cycler)
+
+ def get_logs(self):
+ log_string = ""
+ for log_fpath in self.root.glob("*.txt"):
+ with log_fpath.open("r") as log_file:
+ log_string += "".join(log_file.readlines())
+ return log_string
+
+
+class SpeakerVerificationDataLoader(DataLoader):
+ def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
+ batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
+ worker_init_fn=None):
+ self.utterances_per_speaker = utterances_per_speaker
+
+ super().__init__(
+ dataset=dataset,
+ batch_size=speakers_per_batch,
+ shuffle=False,
+ sampler=sampler,
+ batch_sampler=batch_sampler,
+ num_workers=num_workers,
+ collate_fn=self.collate,
+ pin_memory=pin_memory,
+ drop_last=False,
+ timeout=timeout,
+ worker_init_fn=worker_init_fn
+ )
+
+ def collate(self, speakers):
+ return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
+
\ No newline at end of file
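A rough way to drive the loader above, assuming preprocessed speaker directories under a hypothetical encoder_out/ root and the defaults from params_data.py / params_model.py:

    from pathlib import Path
    from speaker_encoder.data_objects import SpeakerVerificationDataset, SpeakerVerificationDataLoader

    dataset = SpeakerVerificationDataset(Path("encoder_out/"))  # hypothetical preprocessed root
    loader = SpeakerVerificationDataLoader(dataset, speakers_per_batch=64, utterances_per_speaker=10)
    batch = next(iter(loader))  # a SpeakerBatch
    print(batch.data.shape)     # (64 * 10, 160, 40): partial utterances x frames x mel channels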
diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff3185ec781eaf5be2a58d61c22b32586d366126
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py
@@ -0,0 +1,26 @@
+import numpy as np
+
+
+class Utterance:
+ def __init__(self, frames_fpath, wave_fpath):
+ self.frames_fpath = frames_fpath
+ self.wave_fpath = wave_fpath
+
+ def get_frames(self):
+ return np.load(self.frames_fpath)
+
+ def random_partial(self, n_frames):
+ """
+ Crops the frames into a partial utterance of n_frames
+
+ :param n_frames: The number of frames of the partial utterance
+ :return: the partial utterance frames and a tuple indicating the start and end of the
+ partial utterance in the complete utterance.
+ """
+ frames = self.get_frames()
+ if frames.shape[0] == n_frames:
+ start = 0
+ else:
+ start = np.random.randint(0, frames.shape[0] - n_frames)
+ end = start + n_frames
+ return frames[start:end], (start, end)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/speaker_encoder/hparams.py b/dreamvoice/train_utils/src/speaker_encoder/hparams.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac64bcc3bd9ec490e988ac894de93921ba20f607
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/hparams.py
@@ -0,0 +1,31 @@
+## Mel-filterbank
+mel_window_length = 25 # In milliseconds
+mel_window_step = 10 # In milliseconds
+mel_n_channels = 40
+
+
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160 # 1600 ms
+
+
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30 # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+
+
+## Audio volume normalization
+audio_norm_target_dBFS = -30
+
+
+## Model parameters
+model_hidden_size = 256
+model_embedding_size = 256
+model_num_layers = 3
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/speaker_encoder/inference.py b/dreamvoice/train_utils/src/speaker_encoder/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5662912a7cc0eb8818732d0b1d233ba1b195ec7
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/inference.py
@@ -0,0 +1,177 @@
+from speaker_encoder.params_data import *
+from speaker_encoder.model import SpeakerEncoder
+from speaker_encoder.audio import preprocess_wav # We want to expose this function from here
+from matplotlib import cm
+from speaker_encoder import audio
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+
+_model = None # type: SpeakerEncoder
+_device = None # type: torch.device
+
+
+def load_model(weights_fpath: Path, device=None):
+ """
+ Loads the model in memory. If this function is not explicitely called, it will be run on the
+ first call to embed_frames() with the default weights file.
+
+ :param weights_fpath: the path to saved model weights.
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
+ model will be loaded and will run on this device. Outputs will however always be on the cpu.
+ If None, will default to your GPU if it's available, otherwise your CPU.
+ """
+ # TODO: I think the slow loading of the encoder might have something to do with the device it
+ # was saved on. Worth investigating.
+ global _model, _device
+ if device is None:
+ _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ elif isinstance(device, str):
+ _device = torch.device(device)
+ else:
+ _device = device
+ _model = SpeakerEncoder(_device, torch.device("cpu"))
+ checkpoint = torch.load(weights_fpath)
+ _model.load_state_dict(checkpoint["model_state"])
+ _model.eval()
+ print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
+
+
+def is_loaded():
+ return _model is not None
+
+
+def embed_frames_batch(frames_batch):
+ """
+ Computes embeddings for a batch of mel spectrograms.
+
+ :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
+ (batch_size, n_frames, n_channels)
+ :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
+ """
+ if _model is None:
+ raise Exception("Model was not loaded. Call load_model() before inference.")
+
+ frames = torch.from_numpy(frames_batch).to(_device)
+ embed = _model.forward(frames).detach().cpu().numpy()
+ return embed
+
+
+def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
+ min_pad_coverage=0.75, overlap=0.5):
+ """
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
+ partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
+ spectrogram slices are returned, so as to make each partial utterance waveform correspond to
+ its spectrogram. This function assumes that the mel spectrogram parameters used are those
+ defined in params_data.py.
+
+ The returned ranges may be indexing further than the length of the waveform. It is
+ recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
+
+ :param n_samples: the number of samples in the waveform
+ :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
+ utterance
+ :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
+ enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
+ then the last partial utterance will be considered, as if we padded the audio. Otherwise,
+ it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
+ utterance, this parameter is ignored so that the function always returns at least 1 slice.
+ :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
+ utterances are entirely disjoint.
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
+ utterances.
+ """
+ assert 0 <= overlap < 1
+ assert 0 < min_pad_coverage <= 1
+
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+ frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
+
+ # Compute the slices
+ wav_slices, mel_slices = [], []
+ steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
+ for i in range(0, steps, frame_step):
+ mel_range = np.array([i, i + partial_utterance_n_frames])
+ wav_range = mel_range * samples_per_frame
+ mel_slices.append(slice(*mel_range))
+ wav_slices.append(slice(*wav_range))
+
+ # Evaluate whether extra padding is warranted or not
+ last_wav_range = wav_slices[-1]
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+ if coverage < min_pad_coverage and len(mel_slices) > 1:
+ mel_slices = mel_slices[:-1]
+ wav_slices = wav_slices[:-1]
+
+ return wav_slices, mel_slices
+
+
+def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
+ """
+ Computes an embedding for a single utterance.
+
+ # TODO: handle multiple wavs to benefit from batching on GPU
+ :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
+ :param using_partials: if True, then the utterance is split into partial utterances of
+ <partial_utterance_n_frames> frames and the utterance embedding is computed from their
+ normalized average. If False, the utterance is instead computed from feeding the entire
+ spectrogram to the network.
+ :param return_partials: if True, the partial embeddings will also be returned along with the
+ wav slices that correspond to the partial embeddings.
+ :param kwargs: additional arguments to compute_partial_slices()
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+ <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+ (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+ returned. If <using_partials> is simultaneously set to False, both these values will be None
+ instead.
+ """
+ # Process the entire utterance if not using partials
+ if not using_partials:
+ frames = audio.wav_to_mel_spectrogram(wav)
+ embed = embed_frames_batch(frames[None, ...])[0]
+ if return_partials:
+ return embed, None, None
+ return embed
+
+ # Compute where to split the utterance into partials and pad if necessary
+ wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
+ max_wave_length = wave_slices[-1].stop
+ if max_wave_length >= len(wav):
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+
+ # Split the utterance into partials
+ frames = audio.wav_to_mel_spectrogram(wav)
+ frames_batch = np.array([frames[s] for s in mel_slices])
+ partial_embeds = embed_frames_batch(frames_batch)
+
+ # Compute the utterance embedding from the partial embeddings
+ raw_embed = np.mean(partial_embeds, axis=0)
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
+
+ if return_partials:
+ return embed, partial_embeds, wave_slices
+ return embed
+
+
+def embed_speaker(wavs, **kwargs):
+ raise NotImplementedError()
+
+
+def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
+ if ax is None:
+ ax = plt.gca()
+
+ if shape is None:
+ height = int(np.sqrt(len(embed)))
+ shape = (height, -1)
+ embed = embed.reshape(shape)
+
+ cmap = cm.get_cmap()
+ mappable = ax.imshow(embed, cmap=cmap)
+ cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
+ mappable.set_clim(*color_range)
+
+ ax.set_xticks([]), ax.set_yticks([])
+ ax.set_title(title)
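A rough usage sketch for this inference module, assuming the GE2E checkpoint referenced later in the patch (speaker_encoder/ckpt/pretrained_bak_5805000.pt) is available and the input wav path is hypothetical:

    from pathlib import Path
    from speaker_encoder import inference as encoder

    encoder.load_model(Path("speaker_encoder/ckpt/pretrained_bak_5805000.pt"))
    wav = encoder.preprocess_wav(Path("speaker_a.wav"))  # resampling / normalization from audio.py
    embed = encoder.embed_utterance(wav)                 # (256,) L2-normed float32
    embed, partial_embeds, wav_slices = encoder.embed_utterance(wav, return_partials=True)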
diff --git a/dreamvoice/train_utils/src/speaker_encoder/model.py b/dreamvoice/train_utils/src/speaker_encoder/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4493a98b217e4bd082940cbe4d31b8169f18b5d9
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/model.py
@@ -0,0 +1,135 @@
+from speaker_encoder.params_model import *
+from speaker_encoder.params_data import *
+from scipy.interpolate import interp1d
+from sklearn.metrics import roc_curve
+from torch.nn.utils import clip_grad_norm_
+from scipy.optimize import brentq
+from torch import nn
+import numpy as np
+import torch
+
+
+class SpeakerEncoder(nn.Module):
+ def __init__(self, device, loss_device):
+ super().__init__()
+ self.loss_device = loss_device
+
+ # Network definition
+ self.lstm = nn.LSTM(input_size=mel_n_channels, # 40
+ hidden_size=model_hidden_size, # 256
+ num_layers=model_num_layers, # 3
+ batch_first=True).to(device)
+ self.linear = nn.Linear(in_features=model_hidden_size,
+ out_features=model_embedding_size).to(device)
+ self.relu = torch.nn.ReLU().to(device)
+
+ # Cosine similarity scaling (with fixed initial parameter values)
+ self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
+ self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
+
+ # Loss
+ self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
+
+ def do_gradient_ops(self):
+ # Gradient scale
+ self.similarity_weight.grad *= 0.01
+ self.similarity_bias.grad *= 0.01
+
+ # Gradient clipping
+ clip_grad_norm_(self.parameters(), 3, norm_type=2)
+
+ def forward(self, utterances, hidden_init=None):
+ """
+ Computes the embeddings of a batch of utterance spectrograms.
+
+ :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
+ (batch_size, n_frames, n_channels)
+ :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
+ batch_size, hidden_size). Will default to a tensor of zeros if None.
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size)
+ """
+ # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
+ # and the final cell state.
+ out, (hidden, cell) = self.lstm(utterances, hidden_init)
+
+ # We take only the hidden state of the last layer
+ embeds_raw = self.relu(self.linear(hidden[-1]))
+
+ # L2-normalize it
+ embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+ return embeds
+
+ def similarity_matrix(self, embeds):
+ """
+ Computes the similarity matrix according to section 2.1 of GE2E.
+
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, embedding_size)
+ :return: the similarity matrix as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, speakers_per_batch)
+ """
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
+
+ # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
+ centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
+ centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True)
+
+ # Exclusive centroids (1 per utterance)
+ centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
+ centroids_excl /= (utterances_per_speaker - 1)
+ centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True)
+
+ # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
+ # product of these vectors (which is just an element-wise multiplication reduced by a sum).
+ # We vectorize the computation for efficiency.
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
+ speakers_per_batch).to(self.loss_device)
+ mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)
+ for j in range(speakers_per_batch):
+ mask = np.where(mask_matrix[j])[0]
+ sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
+ sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
+
+ ## Even more vectorized version (slower maybe because of transpose)
+ # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
+ # ).to(self.loss_device)
+ # eye = np.eye(speakers_per_batch, dtype=np.int)
+ # mask = np.where(1 - eye)
+ # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
+ # mask = np.where(eye)
+ # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
+ # sim_matrix2 = sim_matrix2.transpose(1, 2)
+
+ sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
+ return sim_matrix
+
+ def loss(self, embeds):
+ """
+ Computes the softmax loss according to section 2.1 of GE2E.
+
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
+ utterances_per_speaker, embedding_size)
+ :return: the loss and the EER for this batch of embeddings.
+ """
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
+
+ # Loss
+ sim_matrix = self.similarity_matrix(embeds)
+ sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
+ speakers_per_batch))
+ ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
+ target = torch.from_numpy(ground_truth).long().to(self.loss_device)
+ loss = self.loss_fn(sim_matrix, target)
+
+ # EER (not backpropagated)
+ with torch.no_grad():
+ inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
+ labels = np.array([inv_argmax(i) for i in ground_truth])
+ preds = sim_matrix.detach().cpu().numpy()
+
+ # Snippet from https://yangcha.github.io/EER-ROC/
+ fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
+ eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
+
+ return loss, eer
\ No newline at end of file
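As a shape sketch for the GE2E pieces above: embeddings are grouped as (speakers_per_batch, utterances_per_speaker, embedding_size) before similarity_matrix() and loss() are applied, e.g. with random inputs on CPU:

    import torch
    from speaker_encoder.model import SpeakerEncoder

    device = loss_device = torch.device("cpu")
    model = SpeakerEncoder(device, loss_device)
    utterances = torch.rand(4 * 5, 160, 40)               # 4 speakers x 5 partials, 160 frames, 40 mels
    embeds = model(utterances)                            # (20, 256), rows L2-normed
    sim = model.similarity_matrix(embeds.view(4, 5, -1))  # (4, 5, 4)
    loss, eer = model.loss(embeds.view(4, 5, -1))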
diff --git a/dreamvoice/train_utils/src/speaker_encoder/params_data.py b/dreamvoice/train_utils/src/speaker_encoder/params_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..676e6dc197faf01648de7a830140172d5594b999
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/params_data.py
@@ -0,0 +1,29 @@
+
+## Mel-filterbank
+mel_window_length = 25 # In milliseconds
+mel_window_step = 10 # In milliseconds
+mel_n_channels = 40
+
+
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160 # 1600 ms
+# Number of spectrogram frames at inference
+inference_n_frames = 80 # 800 ms
+
+
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30 # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+
+
+## Audio volume normalization
+audio_norm_target_dBFS = -30
+
diff --git a/dreamvoice/train_utils/src/speaker_encoder/params_model.py b/dreamvoice/train_utils/src/speaker_encoder/params_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..32731f295b3b26e9e38bb9f9047d5c784649e127
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/params_model.py
@@ -0,0 +1,11 @@
+
+## Model parameters
+model_hidden_size = 256
+model_embedding_size = 256
+model_num_layers = 3
+
+
+## Training parameters
+learning_rate_init = 1e-4
+speakers_per_batch = 64
+utterances_per_speaker = 10
diff --git a/dreamvoice/train_utils/src/speaker_encoder/preprocess.py b/dreamvoice/train_utils/src/speaker_encoder/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecb9041551270629a27baab6d1f1525e380c5378
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/preprocess.py
@@ -0,0 +1,285 @@
+from multiprocess.pool import ThreadPool
+from speaker_encoder.params_data import *
+from speaker_encoder.config import librispeech_datasets, anglophone_nationalites
+from datetime import datetime
+from speaker_encoder import audio
+from pathlib import Path
+from tqdm import tqdm
+import numpy as np
+
+
+class DatasetLog:
+ """
+ Registers metadata about the dataset in a text file.
+ """
+ def __init__(self, root, name):
+ self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
+ self.sample_data = dict()
+
+ start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+ self.write_line("Creating dataset %s on %s" % (name, start_time))
+ self.write_line("-----")
+ self._log_params()
+
+ def _log_params(self):
+ from speaker_encoder import params_data
+ self.write_line("Parameter values:")
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+ value = getattr(params_data, param_name)
+ self.write_line("\t%s: %s" % (param_name, value))
+ self.write_line("-----")
+
+ def write_line(self, line):
+ self.text_file.write("%s\n" % line)
+
+ def add_sample(self, **kwargs):
+ for param_name, value in kwargs.items():
+ if not param_name in self.sample_data:
+ self.sample_data[param_name] = []
+ self.sample_data[param_name].append(value)
+
+ def finalize(self):
+ self.write_line("Statistics:")
+ for param_name, values in self.sample_data.items():
+ self.write_line("\t%s:" % param_name)
+ self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
+ self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
+ self.write_line("-----")
+ end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+ self.write_line("Finished on %s" % end_time)
+ self.text_file.close()
+
+
+def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
+ dataset_root = datasets_root.joinpath(dataset_name)
+ if not dataset_root.exists():
+ print("Couldn\'t find %s, skipping this dataset." % dataset_root)
+ return None, None
+ return dataset_root, DatasetLog(out_dir, dataset_name)
+
+
+def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
+ skip_existing, logger):
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
+
+ # Function to preprocess utterances for one speaker
+ def preprocess_speaker(speaker_dir: Path):
+ # Give a name to the speaker that includes its dataset
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+
+ # Create an output directory with that name, as well as a txt file containing a
+ # reference to each source file.
+ speaker_out_dir = out_dir.joinpath(speaker_name)
+ speaker_out_dir.mkdir(exist_ok=True)
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+
+ # There's a possibility that the preprocessing was interrupted earlier, check if
+ # there already is a sources file.
+ if sources_fpath.exists():
+ try:
+ with sources_fpath.open("r") as sources_file:
+ existing_fnames = {line.split(",")[0] for line in sources_file}
+ except:
+ existing_fnames = {}
+ else:
+ existing_fnames = {}
+
+ # Gather all audio files for that speaker recursively
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+ # Check if the target output file already exists
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
+ if skip_existing and out_fname in existing_fnames:
+ continue
+
+ # Load and preprocess the waveform
+ wav = audio.preprocess_wav(in_fpath)
+ if len(wav) == 0:
+ continue
+
+ # Create the mel spectrogram, discard those that are too short
+ frames = audio.wav_to_mel_spectrogram(wav)
+ if len(frames) < partials_n_frames:
+ continue
+
+ out_fpath = speaker_out_dir.joinpath(out_fname)
+ np.save(out_fpath, frames)
+ logger.add_sample(duration=len(wav) / sampling_rate)
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+
+ sources_file.close()
+
+ # Process the utterances for each speaker
+ with ThreadPool(8) as pool:
+ list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
+ unit="speakers"))
+ logger.finalize()
+ print("Done preprocessing %s.\n" % dataset_name)
+
+
+# Function to preprocess utterances for one speaker
+def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool):
+ # Give a name to the speaker that includes its dataset
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+
+ # Create an output directory with that name, as well as a txt file containing a
+ # reference to each source file.
+ speaker_out_dir = out_dir.joinpath(speaker_name)
+ speaker_out_dir.mkdir(exist_ok=True)
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+
+ # There's a possibility that the preprocessing was interrupted earlier, check if
+ # there already is a sources file.
+ # if sources_fpath.exists():
+ # try:
+ # with sources_fpath.open("r") as sources_file:
+ # existing_fnames = {line.split(",")[0] for line in sources_file}
+ # except:
+ # existing_fnames = {}
+ # else:
+ # existing_fnames = {}
+ existing_fnames = {}
+ # Gather all audio files for that speaker recursively
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
+
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+ # Check if the target output file already exists
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
+ if skip_existing and out_fname in existing_fnames:
+ continue
+
+ # Load and preprocess the waveform
+ wav = audio.preprocess_wav(in_fpath)
+ if len(wav) == 0:
+ continue
+
+ # Create the mel spectrogram, discard those that are too short
+ frames = audio.wav_to_mel_spectrogram(wav)
+ if len(frames) < partials_n_frames:
+ continue
+
+ out_fpath = speaker_out_dir.joinpath(out_fname)
+ np.save(out_fpath, frames)
+ # logger.add_sample(duration=len(wav) / sampling_rate)
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+
+ sources_file.close()
+ return len(wav)
+
+def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
+ skip_existing, logger):
+ # from multiprocessing import Pool, cpu_count
+ from pathos.multiprocessing import ProcessingPool as Pool
+ # Function to preprocess utterances for one speaker
+ def __preprocess_speaker(speaker_dir: Path):
+ # Give a name to the speaker that includes its dataset
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+
+ # Create an output directory with that name, as well as a txt file containing a
+ # reference to each source file.
+ speaker_out_dir = out_dir.joinpath(speaker_name)
+ speaker_out_dir.mkdir(exist_ok=True)
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+
+ existing_fnames = {}
+ # Gather all audio files for that speaker recursively
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
+ wav_lens = []
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+ # Check if the target output file already exists
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
+ if skip_existing and out_fname in existing_fnames:
+ continue
+
+ # Load and preprocess the waveform
+ wav = audio.preprocess_wav(in_fpath)
+ if len(wav) == 0:
+ continue
+
+ # Create the mel spectrogram, discard those that are too short
+ frames = audio.wav_to_mel_spectrogram(wav)
+ if len(frames) < partials_n_frames:
+ continue
+
+ out_fpath = speaker_out_dir.joinpath(out_fname)
+ np.save(out_fpath, frames)
+ # logger.add_sample(duration=len(wav) / sampling_rate)
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+ wav_lens.append(len(wav))
+ sources_file.close()
+ return wav_lens
+
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
+ # Process the utterances for each speaker
+ # with ThreadPool(8) as pool:
+ # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
+ # unit="speakers"))
+ pool = Pool(processes=20)
+ for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1):
+ for wav_len in wav_lens:
+ logger.add_sample(duration=wav_len / sampling_rate)
+ print(f'{i}/{len(speaker_dirs)} \r')
+
+ logger.finalize()
+ print("Done preprocessing %s.\n" % dataset_name)
+
+
+def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
+ for dataset_name in librispeech_datasets["train"]["other"]:
+ # Initialize the preprocessing
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Preprocess all speakers
+ speaker_dirs = list(dataset_root.glob("*"))
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
+ skip_existing, logger)
+
+
+def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
+ # Initialize the preprocessing
+ dataset_name = "VoxCeleb1"
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Get the contents of the meta file
+ with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
+ metadata = [line.split("\t") for line in metafile][1:]
+
+ # Select the ID and the nationality, filter out non-anglophone speakers
+ nationalities = {line[0]: line[3] for line in metadata}
+ # keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
+ # nationality.lower() in anglophone_nationalites]
+ keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items()]
+ print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
+ (len(keep_speaker_ids), len(nationalities)))
+
+ # Get the speaker directories for anglophone speakers only
+ speaker_dirs = dataset_root.joinpath("wav").glob("*")
+ speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
+ speaker_dir.name in keep_speaker_ids]
+ print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
+ (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
+
+ # Preprocess all speakers
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
+ skip_existing, logger)
+
+
+def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
+ # Initialize the preprocessing
+ dataset_name = "VoxCeleb2"
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+ if not dataset_root:
+ return
+
+ # Get the speaker directories
+ # Preprocess all speakers
+ speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
+ _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
+ skip_existing, logger)
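A rough invocation sketch, assuming the dataset roots are laid out as speaker_encoder/config.py expects; both paths here are hypothetical:

    from pathlib import Path
    from speaker_encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1

    datasets_root = Path("/data/datasets")  # hypothetical, contains the raw corpora
    out_dir = Path("/data/encoder_out")     # hypothetical, becomes the preprocessed speaker root
    out_dir.mkdir(parents=True, exist_ok=True)
    preprocess_librispeech(datasets_root, out_dir, skip_existing=True)
    preprocess_voxceleb1(datasets_root, out_dir, skip_existing=True)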
diff --git a/dreamvoice/train_utils/src/speaker_encoder/train.py b/dreamvoice/train_utils/src/speaker_encoder/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c2e7fa1b08b75de40adc0e05fa3b104cb02660b
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/train.py
@@ -0,0 +1,125 @@
+from speaker_encoder.visualizations import Visualizations
+from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
+from speaker_encoder.params_model import *
+from speaker_encoder.model import SpeakerEncoder
+from utils.profiler import Profiler
+from pathlib import Path
+import torch
+
+def sync(device: torch.device):
+ # FIXME
+ return
+ # For correct profiling (cuda operations are async)
+ if device.type == "cuda":
+ torch.cuda.synchronize(device)
+
+def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
+ backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
+ no_visdom: bool):
+ # Create a dataset and a dataloader
+ dataset = SpeakerVerificationDataset(clean_data_root)
+ loader = SpeakerVerificationDataLoader(
+ dataset,
+ speakers_per_batch, # 64
+ utterances_per_speaker, # 10
+ num_workers=8,
+ )
+
+ # Setup the device on which to run the forward pass and the loss. These can be different,
+ # because the forward pass is faster on the GPU whereas the loss is often (depending on your
+ # hyperparameters) faster on the CPU.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # FIXME: currently, the gradient is None if loss_device is cuda
+ loss_device = torch.device("cpu")
+
+ # Create the model and the optimizer
+ model = SpeakerEncoder(device, loss_device)
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
+ init_step = 1
+
+ # Configure file path for the model
+ state_fpath = models_dir.joinpath(run_id + ".pt")
+ backup_dir = models_dir.joinpath(run_id + "_backups")
+
+ # Load any existing model
+ if not force_restart:
+ if state_fpath.exists():
+ print("Found existing model \"%s\", loading it and resuming training." % run_id)
+ checkpoint = torch.load(state_fpath)
+ init_step = checkpoint["step"]
+ model.load_state_dict(checkpoint["model_state"])
+ optimizer.load_state_dict(checkpoint["optimizer_state"])
+ optimizer.param_groups[0]["lr"] = learning_rate_init
+ else:
+ print("No model \"%s\" found, starting training from scratch." % run_id)
+ else:
+ print("Starting the training from scratch.")
+ model.train()
+
+ # Initialize the visualization environment
+ vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
+ vis.log_dataset(dataset)
+ vis.log_params()
+ device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
+ vis.log_implementation({"Device": device_name})
+
+ # Training loop
+ profiler = Profiler(summarize_every=10, disabled=False)
+ for step, speaker_batch in enumerate(loader, init_step):
+ profiler.tick("Blocking, waiting for batch (threaded)")
+
+ # Forward pass
+ inputs = torch.from_numpy(speaker_batch.data).to(device)
+ sync(device)
+ profiler.tick("Data to %s" % device)
+ embeds = model(inputs)
+ sync(device)
+ profiler.tick("Forward pass")
+ embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
+ loss, eer = model.loss(embeds_loss)
+ sync(loss_device)
+ profiler.tick("Loss")
+
+ # Backward pass
+ model.zero_grad()
+ loss.backward()
+ profiler.tick("Backward pass")
+ model.do_gradient_ops()
+ optimizer.step()
+ profiler.tick("Parameter update")
+
+ # Update visualizations
+ # learning_rate = optimizer.param_groups[0]["lr"]
+ vis.update(loss.item(), eer, step)
+
+ # Draw projections and save them to the backup folder
+ if umap_every != 0 and step % umap_every == 0:
+ print("Drawing and saving projections (step %d)" % step)
+ backup_dir.mkdir(exist_ok=True)
+ projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
+ embeds = embeds.detach().cpu().numpy()
+ vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
+ vis.save()
+
+ # Overwrite the latest version of the model
+ if save_every != 0 and step % save_every == 0:
+ print("Saving the model (step %d)" % step)
+ torch.save({
+ "step": step + 1,
+ "model_state": model.state_dict(),
+ "optimizer_state": optimizer.state_dict(),
+ }, state_fpath)
+
+ # Make a backup
+ if backup_every != 0 and step % backup_every == 0:
+ print("Making a backup (step %d)" % step)
+ backup_dir.mkdir(exist_ok=True)
+ backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
+ torch.save({
+ "step": step + 1,
+ "model_state": model.state_dict(),
+ "optimizer_state": optimizer.state_dict(),
+ }, backup_fpath)
+
+ profiler.tick("Extras (visualizations, saving)")
+
\ No newline at end of file
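A rough way to launch this training loop, assuming the preprocessed data from preprocess.py, that the utils.profiler dependency is importable, and that visdom is disabled so no server is needed; all paths are hypothetical:

    from pathlib import Path
    from speaker_encoder.train import train

    train(run_id="ge2e_run",
          clean_data_root=Path("/data/encoder_out"),  # output of preprocess.py
          models_dir=Path("ckpts/speaker_encoder"),   # must exist before checkpoints are saved
          umap_every=500, save_every=500, backup_every=5000, vis_every=10,
          force_restart=False, visdom_server="http://localhost", no_visdom=True)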
diff --git a/dreamvoice/train_utils/src/speaker_encoder/visualizations.py b/dreamvoice/train_utils/src/speaker_encoder/visualizations.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d2c4c073c933d38970a83798f2d0ee37a85c48e
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/visualizations.py
@@ -0,0 +1,178 @@
+from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from datetime import datetime
+from time import perf_counter as timer
+import matplotlib.pyplot as plt
+import numpy as np
+# import webbrowser
+import visdom
+import umap
+
+colormap = np.array([
+ [76, 255, 0],
+ [0, 127, 70],
+ [255, 0, 0],
+ [255, 217, 38],
+ [0, 135, 255],
+ [165, 0, 165],
+ [255, 167, 255],
+ [0, 255, 255],
+ [255, 96, 38],
+ [142, 76, 0],
+ [33, 0, 127],
+ [0, 0, 0],
+ [183, 183, 183],
+ ], dtype=float) / 255
+
+
+class Visualizations:
+ def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
+ # Tracking data
+ self.last_update_timestamp = timer()
+ self.update_every = update_every
+ self.step_times = []
+ self.losses = []
+ self.eers = []
+ print("Updating the visualizations every %d steps." % update_every)
+
+ # If visdom is disabled TODO: use a better paradigm for that
+ self.disabled = disabled
+ if self.disabled:
+ return
+
+ # Set the environment name
+ now = str(datetime.now().strftime("%d-%m %Hh%M"))
+ if env_name is None:
+ self.env_name = now
+ else:
+ self.env_name = "%s (%s)" % (env_name, now)
+
+ # Connect to visdom and open the corresponding window in the browser
+ try:
+ self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
+ except ConnectionError:
+ raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
+ "start it.")
+ # webbrowser.open("http://localhost:8097/env/" + self.env_name)
+
+ # Create the windows
+ self.loss_win = None
+ self.eer_win = None
+ # self.lr_win = None
+ self.implementation_win = None
+ self.projection_win = None
+ self.implementation_string = ""
+
+ def log_params(self):
+ if self.disabled:
+ return
+ from speaker_encoder import params_data
+ from speaker_encoder import params_model
+ param_string = "Model parameters:
"
+ for param_name in (p for p in dir(params_model) if not p.startswith("__")):
+ value = getattr(params_model, param_name)
+ param_string += "\t%s: %s
" % (param_name, value)
+ param_string += "Data parameters:
"
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+ value = getattr(params_data, param_name)
+ param_string += "\t%s: %s
" % (param_name, value)
+ self.vis.text(param_string, opts={"title": "Parameters"})
+
+ def log_dataset(self, dataset: SpeakerVerificationDataset):
+ if self.disabled:
+ return
+ dataset_string = ""
+ dataset_string += "Speakers: %s\n" % len(dataset.speakers)
+ dataset_string += "\n" + dataset.get_logs()
+ dataset_string = dataset_string.replace("\n", "<br>")
+ self.vis.text(dataset_string, opts={"title": "Dataset"})
+
+ def log_implementation(self, params):
+ if self.disabled:
+ return
+ implementation_string = ""
+ for param, value in params.items():
+ implementation_string += "%s: %s\n" % (param, value)
+ implementation_string = implementation_string.replace("\n", "<br>")
+ self.implementation_string = implementation_string
+ self.implementation_win = self.vis.text(
+ implementation_string,
+ opts={"title": "Training implementation"}
+ )
+
+ def update(self, loss, eer, step):
+ # Update the tracking data
+ now = timer()
+ self.step_times.append(1000 * (now - self.last_update_timestamp))
+ self.last_update_timestamp = now
+ self.losses.append(loss)
+ self.eers.append(eer)
+ print(".", end="")
+
+ # Update the plots every <update_every> steps
+ if step % self.update_every != 0:
+ return
+ time_string = "Step time: mean: %5dms std: %5dms" % \
+ (int(np.mean(self.step_times)), int(np.std(self.step_times)))
+ print("\nStep %6d Loss: %.4f EER: %.4f %s" %
+ (step, np.mean(self.losses), np.mean(self.eers), time_string))
+ if not self.disabled:
+ self.loss_win = self.vis.line(
+ [np.mean(self.losses)],
+ [step],
+ win=self.loss_win,
+ update="append" if self.loss_win else None,
+ opts=dict(
+ legend=["Avg. loss"],
+ xlabel="Step",
+ ylabel="Loss",
+ title="Loss",
+ )
+ )
+ self.eer_win = self.vis.line(
+ [np.mean(self.eers)],
+ [step],
+ win=self.eer_win,
+ update="append" if self.eer_win else None,
+ opts=dict(
+ legend=["Avg. EER"],
+ xlabel="Step",
+ ylabel="EER",
+ title="Equal error rate"
+ )
+ )
+ if self.implementation_win is not None:
+ self.vis.text(
+ self.implementation_string + ("%s" % time_string),
+ win=self.implementation_win,
+ opts={"title": "Training implementation"},
+ )
+
+ # Reset the tracking
+ self.losses.clear()
+ self.eers.clear()
+ self.step_times.clear()
+
+ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
+ max_speakers=10):
+ max_speakers = min(max_speakers, len(colormap))
+ embeds = embeds[:max_speakers * utterances_per_speaker]
+
+ n_speakers = len(embeds) // utterances_per_speaker
+ ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
+ colors = [colormap[i] for i in ground_truth]
+
+ reducer = umap.UMAP()
+ projected = reducer.fit_transform(embeds)
+ plt.scatter(projected[:, 0], projected[:, 1], c=colors)
+ plt.gca().set_aspect("equal", "datalim")
+ plt.title("UMAP projection (step %d)" % step)
+ if not self.disabled:
+ self.projection_win = self.vis.matplot(plt, win=self.projection_win)
+ if out_fpath is not None:
+ plt.savefig(out_fpath)
+ plt.clf()
+
+ def save(self):
+ if not self.disabled:
+ self.vis.save([self.env_name])
+
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py b/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f69320ec75315ff9ce2efa158a53b1a823edd2e
--- /dev/null
+++ b/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py
@@ -0,0 +1,173 @@
+from speaker_encoder.hparams import *
+from speaker_encoder import audio
+from pathlib import Path
+from typing import Union, List
+from torch import nn
+from time import perf_counter as timer
+import numpy as np
+import torch
+
+
+class SpeakerEncoder(nn.Module):
+ def __init__(self, weights_fpath, device: Union[str, torch.device]=None, verbose=True):
+ """
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda").
+ If None, defaults to cuda if it is available on your machine, otherwise the model will
+ run on cpu. Outputs are always returned on the cpu, as numpy arrays.
+ """
+ super().__init__()
+
+ # Define the network
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+ self.relu = nn.ReLU()
+
+ # Get the target device
+ if device is None:
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ elif isinstance(device, str):
+ device = torch.device(device)
+ self.device = device
+
+ # Load the pretrained model's weights
+ # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt")
+ # if not weights_fpath.exists():
+ # raise Exception("Couldn't find the voice encoder pretrained model at %s." %
+ # weights_fpath)
+
+ start = timer()
+ checkpoint = torch.load(weights_fpath, map_location="cpu")
+
+ self.load_state_dict(checkpoint["model_state"], strict=False)
+ self.to(device)
+
+ if verbose:
+ print("Loaded the voice encoder model on %s in %.2f seconds." %
+ (device.type, timer() - start))
+
+ def forward(self, mels: torch.FloatTensor):
+ """
+ Computes the embeddings of a batch of utterance spectrograms.
+ :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape
+ (batch_size, n_frames, n_channels)
+ :return: the embeddings as a float32 tensor of shape (batch_size, embedding_size).
+ Embeddings are positive and L2-normed, thus they lie in the range [0, 1].
+ """
+ # Pass the input through the LSTM layers and retrieve the final hidden state of the last
+ # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings.
+ _, (hidden, _) = self.lstm(mels)
+ embeds_raw = self.relu(self.linear(hidden[-1]))
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+ @staticmethod
+ def compute_partial_slices(n_samples: int, rate, min_coverage):
+ """
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to
+ obtain partial utterances of <partials_n_frames> each. Both the waveform and the
+ mel spectrogram slices are returned, so as to make each partial utterance waveform
+ correspond to its spectrogram.
+
+ The returned ranges may be indexing further than the length of the waveform. It is
+ recommended that you pad the waveform with zeros up to wav_slices[-1].stop.
+
+ :param n_samples: the number of samples in the waveform
+ :param rate: how many partial utterances should occur per second. Partial utterances must
+ cover the span of the entire utterance, thus the rate should not be lower than the inverse
+ of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+ the minimum rate is thus 0.625.
+ :param min_coverage: when reaching the last partial utterance, it may or may not have
+ enough frames. If at least <min_coverage> of <partials_n_frames> are present,
+ then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+ it will be discarded. If there aren't enough frames for one partial utterance,
+ this parameter is ignored so that the function always returns at least one slice.
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
+ utterances.
+ """
+ assert 0 < min_coverage <= 1
+
+ # Compute how many frames separate two partial utterances
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+ frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
+ assert 0 < frame_step, "The rate is too high"
+ assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
+ (sampling_rate / (samples_per_frame * partials_n_frames))
+
+ # Compute the slices
+ wav_slices, mel_slices = [], []
+ steps = max(1, n_frames - partials_n_frames + frame_step + 1)
+ for i in range(0, steps, frame_step):
+ mel_range = np.array([i, i + partials_n_frames])
+ wav_range = mel_range * samples_per_frame
+ mel_slices.append(slice(*mel_range))
+ wav_slices.append(slice(*wav_range))
+
+ # Evaluate whether extra padding is warranted or not
+ last_wav_range = wav_slices[-1]
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+ if coverage < min_coverage and len(mel_slices) > 1:
+ mel_slices = mel_slices[:-1]
+ wav_slices = wav_slices[:-1]
+
+ return wav_slices, mel_slices
+
+ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
+ """
+ Computes an embedding for a single utterance. The utterance is divided in partial
+ utterances and an embedding is computed for each. The complete utterance embedding is the
+ L2-normed average embedding of the partial utterances.
+
+ TODO: independent batched version of this function
+
+ :param wav: a preprocessed utterance waveform as a numpy array of float32
+ :param return_partials: if True, the partial embeddings will also be returned along with
+ the wav slices corresponding to each partial utterance.
+ :param rate: how many partial utterances should occur per second. Partial utterances must
+ cover the span of the entire utterance, thus the rate should not be lower than the inverse
+ of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+ the minimum rate is thus 0.625.
+ :param min_coverage: when reaching the last partial utterance, it may or may not have
+ enough frames. If at least <min_coverage> of <partials_n_frames> are present,
+ then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+ it will be discarded. If there aren't enough frames for one partial utterance,
+ this parameter is ignored so that the function always returns at least one slice.
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+ <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+ (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+ returned.
+ """
+ # Compute where to split the utterance into partials and pad the waveform with zeros if
+ # the partial utterances cover a larger range.
+ wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
+ max_wave_length = wav_slices[-1].stop
+ if max_wave_length >= len(wav):
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+
+ # Split the utterance into partials and forward them through the model
+ mel = audio.wav_to_mel_spectrogram(wav)
+ mels = np.array([mel[s] for s in mel_slices])
+ with torch.no_grad():
+ mels = torch.from_numpy(mels).to(self.device)
+ partial_embeds = self(mels).cpu().numpy()
+
+ # Compute the utterance embedding from the partial embeddings
+ raw_embed = np.mean(partial_embeds, axis=0)
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
+
+ if return_partials:
+ return embed, partial_embeds, wav_slices
+ return embed
+
+ def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
+ """
+ Compute the embedding of a collection of wavs (presumably from the same speaker) by
+ averaging their embedding and L2-normalizing it.
+
+ :param wavs: list of wavs as numpy arrays of float32.
+ :param kwargs: extra arguments to embed_utterance()
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,).
+ """
+ raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \
+ for wav in wavs], axis=0)
+ return raw_embed / np.linalg.norm(raw_embed, 2)
\ No newline at end of file
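A rough usage sketch for the encoder above, assuming a compatible checkpoint at a hypothetical path and a wav preprocessed with speaker_encoder.audio:

    from pathlib import Path
    from speaker_encoder import audio
    from speaker_encoder.voice_encoder import SpeakerEncoder

    encoder = SpeakerEncoder("ckpts/encoder.pt", device="cpu")  # hypothetical checkpoint path
    wav = audio.preprocess_wav(Path("speaker_a.wav"))
    utt_embed = encoder.embed_utterance(wav)                    # (256,), L2-normed
    spk_embed = encoder.embed_speaker([wav])                    # normalized mean of utterance embeddings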
diff --git a/dreamvoice/train_utils/src/spk_ext.py b/dreamvoice/train_utils/src/spk_ext.py
index 6f20b75c46bb518143d9d5acd3481e84c71e0e47..5f348d653b2a945493ede4a2d7e55f5fd1d62288 100644
--- a/dreamvoice/train_utils/src/spk_ext.py
+++ b/dreamvoice/train_utils/src/spk_ext.py
@@ -46,4 +46,69 @@ def se_extractor(audio_path, vc):
gs.append(g.detach())
gs = torch.stack(gs).mean(0)
- return gs.cpu()
\ No newline at end of file
+ return gs.cpu()
+
+
+def process_audio_folder(input_folder, output_folder, model, device):
+ """
+ Process all audio files in a folder and its subfolders,
+ save the extracted features as .pt files in the output folder with the same structure.
+
+ Args:
+ input_folder (str): Path to the input folder containing audio files.
+ output_folder (str): Path to the output folder to save .pt files.
+ model: Pre-trained model for feature extraction.
+ device: Torch device (e.g., 'cpu' or 'cuda').
+ """
+ # Collect all audio file paths
+ audio_files = []
+ for root, _, files in os.walk(input_folder):
+ for file in files:
+ if file.endswith(('.wav', '.mp3', '.flac')): # Adjust for the audio formats you want to process
+ audio_files.append(os.path.join(root, file))
+
+ # Process each audio file with tqdm for progress
+ for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
+ # Construct output path
+ relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
+ output_dir = os.path.join(output_folder, relative_path)
+ os.makedirs(output_dir, exist_ok=True)
+ output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')
+
+ # Check if the .pt file already exists
+ if os.path.exists(output_path):
+ # print(f"Skipped (already exists): {output_path}")
+ continue # Skip processing this file
+ # Extract features
+ target_se = se_extractor(audio_path, model).to(device)
+ # Save the feature as .pt
+ torch.save(target_se, output_path)
+ # print(f"Processed and saved: {output_path}")
+
+
+if __name__ == '__main__':
+ ckpt_converter = 'checkpoints_v2/converter'
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+ model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+
+ # audio_path = 'debug.wav'
+ # target_se = se_extractor(audio_path, model).to(device)
+
+ # source_path = 'source.wav'
+ # source_se = se_extractor(source_path, model).to(device)
+
+ # encode_message = "@MyShell"
+ # model.convert(
+ # audio_src_path=source_path,
+ # src_se=source_se,
+ # tgt_se=target_se,
+ # output_path='output.wav',
+ # message=encode_message)
+ # input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
+ # output_folder = 'spk/VCTK-Corpus/'
+ # process_audio_folder(input_folder, output_folder, model, device)
+
+ input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
+ output_folder = 'spk/LibriTTS-R/train-clean-360/'
+ process_audio_folder(input_folder, output_folder, model, device)
\ No newline at end of file
diff --git a/dreamvoice/train_utils/src/train_freevc.py b/dreamvoice/train_utils/src/train_freevc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8e1fac17288da07a1f2bcb6f42f6f6c7b2e0b81
--- /dev/null
+++ b/dreamvoice/train_utils/src/train_freevc.py
@@ -0,0 +1,214 @@
+import yaml
+import random
+import argparse
+import os
+import time
+from tqdm import tqdm
+from pathlib import Path
+
+import torch
+from torch.utils.data import DataLoader
+
+from accelerate import Accelerator
+from diffusers import DDIMScheduler
+
+from configs.plugin import get_params
+from model.p2e_cross import P2E_Cross
+from modules.speaker_encoder.encoder import inference as spk_encoder
+from transformers import T5Tokenizer, T5EncoderModel, AutoModel
+from inference_freevc import eval_plugin
+from dataset.dreamvc import DreamData
+# from vc_wrapper import load_diffvc_models
+from freevc_wrapper import get_freevc_models
+from utils import minmax_norm_diff, reverse_minmax_norm_diff, scale_shift
+
+parser = argparse.ArgumentParser()
+
+# config settings
+parser.add_argument('--config-name', type=str, default='Plugin_freevc')
+parser.add_argument('--vc-unet-path', type=str, default='freevc')
+parser.add_argument('--speaker-path', type=str, default='speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+
+
+# training settings
+parser.add_argument("--amp", type=str, default='fp16')
+parser.add_argument('--epochs', type=int, default=200)
+parser.add_argument('--batch-size', type=int, default=32)
+parser.add_argument('--num-workers', type=int, default=8)
+parser.add_argument('--num-threads', type=int, default=1)
+parser.add_argument('--save-every', type=int, default=10)
+
+# log and random seed
+parser.add_argument('--random-seed', type=int, default=2023)
+parser.add_argument('--log-step', type=int, default=200)
+parser.add_argument('--log-dir', type=str, default='../logs/')
+parser.add_argument('--save-dir', type=str, default='../ckpts/')
+
+args = parser.parse_args()
+params = get_params(args.config_name)
+args.log_dir = args.log_dir + args.config_name + '/'
+
+with open('model/p2e_cross.yaml', 'r') as fp:
+ config = yaml.safe_load(fp)
+
+if os.path.exists(args.save_dir + args.config_name) is False:
+ os.makedirs(args.save_dir + args.config_name)
+
+if os.path.exists(args.log_dir) is False:
+ os.makedirs(args.log_dir)
+
+if __name__ == '__main__':
+ # Fix the random seed
+ random.seed(args.random_seed)
+ torch.manual_seed(args.random_seed)
+
+ # Set device
+ torch.set_num_threads(args.num_threads)
+ if torch.cuda.is_available():
+ args.device = 'cuda'
+ torch.cuda.manual_seed(args.random_seed)
+ torch.cuda.manual_seed_all(args.random_seed)
+ torch.backends.cuda.matmul.allow_tf32 = True
+ if torch.backends.cudnn.is_available():
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.allow_tf32 = True
+ torch.backends.cudnn.benchmark = False
+ else:
+ args.device = 'cpu'
+
+ train_set = DreamData(data_dir='../prepare_freevc/spk/', meta_dir='../prepare/plugin_meta.csv',
+ subset='train', prompt_dir='../prepare/prompts.csv',)
+ train_loader = DataLoader(train_set, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
+
+ # use accelerator for multi-gpu training
+ accelerator = Accelerator(mixed_precision=args.amp)
+
+ # vc_unet, hifigan, _, logmel, vc_scheduler = load_diffvc_models(args.vc_unet_path,
+ # args.vocoder_path,
+ # args.speaker_path,
+ # args.vc_config_path,
+ # accelerator.device)
+ freevc_24, cmodel, _, hps = get_freevc_models(args.vc_unet_path, args.speaker_path, accelerator.device)
+ # speaker
+ # spk_encoder.load_model(Path(args.speaker_path), accelerator.device)
+
+ # text encoder
+ tokenizer = T5Tokenizer.from_pretrained(params.text_encoder.model)
+ text_encoder = T5EncoderModel.from_pretrained(params.text_encoder.model).to(accelerator.device)
+ text_encoder.eval()
+
+ # main U-Net
+ model = P2E_Cross(config['diffwrap']).to(accelerator.device)
+ model.load_state_dict(torch.load('../ckpts/Plugin_freevc/49.pt')['model'])
+
+ total_params = sum([param.nelement() for param in model.parameters()])
+ print("Number of parameter: %.2fM" % (total_params / 1e6))
+
+ if params.diff.v_prediction:
+ print('v prediction')
+ noise_scheduler = DDIMScheduler(num_train_timesteps=params.diff.num_train_steps,
+ beta_start=params.diff.beta_start, beta_end=params.diff.beta_end,
+ rescale_betas_zero_snr=True,
+ timestep_spacing="trailing",
+ clip_sample=False,
+ prediction_type='v_prediction')
+ else:
+ print('noise prediction')
+ noise_scheduler = DDIMScheduler(num_train_timesteps=params.diff.num_train_steps,
+ beta_start=params.diff.beta_start, beta_end=params.diff.beta_end,
+ clip_sample=False,
+ prediction_type='epsilon')
+
+ optimizer = torch.optim.AdamW(model.parameters(),
+ lr=params.opt.learning_rate,
+ betas=(params.opt.beta1, params.opt.beta2),
+ weight_decay=params.opt.weight_decay,
+ eps=params.opt.adam_epsilon,
+ )
+ loss_func = torch.nn.MSELoss()
+
+ model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
+
+ global_step = 0
+ losses = 0
+
+ if accelerator.is_main_process:
+ eval_plugin(freevc_24, cmodel, [tokenizer, text_encoder],
+ model, noise_scheduler, (1, 256, 1),
+ val_meta='../prepare/val_meta.csv',
+ val_folder='/home/jerry/Projects/Dataset/Speech/vctk_libritts/',
+ guidance_scale=3.0, guidance_rescale=0.0,
+ ddim_steps=100, eta=1, random_seed=None,
+ device=accelerator.device,
+ epoch='test', save_path=args.log_dir + 'output/', val_num=10)
+ accelerator.wait_for_everyone()
+
+ for epoch in range(args.epochs):
+ model.train()
+ for step, batch in enumerate(tqdm(train_loader)):
+ spk_embed, prompt = batch
+ spk_embed = spk_embed.unsqueeze(-1)
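+ # add a trailing length dim so the embedding matches the (batch, 256, 1) shape used by eval_plugin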
+
+ with torch.no_grad():
+ text_batch = tokenizer(prompt,
+ max_length=32,
+ padding='max_length', truncation=True, return_tensors="pt")
+ text, text_mask = text_batch.input_ids.to(spk_embed.device), \
+ text_batch.attention_mask.to(spk_embed.device)
+ text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
+
+ spk_embed = scale_shift(spk_embed, 20, -0.035)
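+ # rescale the raw speaker embedding with scale_shift (utils.py) so its values sit in a range better suited to diffusion training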
+ # spk_embed = minmax_norm_diff(spk_embed, vmax=0.5, vmin=0.0)
+ # content_clip = align_seq(content_clip, audio_clip.shape[-1])
+ # f0_clip = align_seq(f0_clip, audio_clip.shape[-1])
+
+ # forward diffusion: add noise to the speaker embedding at a randomly sampled timestep
+ noise = torch.randn(spk_embed.shape).to(accelerator.device)
+ timesteps = torch.randint(0, params.diff.num_train_steps, (noise.shape[0],),
+ device=accelerator.device, ).long()
+ noisy_target = noise_scheduler.add_noise(spk_embed, noise, timesteps)
+ # v-prediction target: velocity computed by the scheduler from the clean embedding and the sampled noise
+ velocity = noise_scheduler.get_velocity(spk_embed, noise, timesteps)
+
+ # model forward pass; the text condition is randomly dropped with probability cfg_prob to enable classifier-free guidance
+ pred = model(noisy_target, timesteps, text, text_mask, train_cfg=True, cfg_prob=0.25)
+ # backward
+ if params.diff.v_prediction:
+ loss = loss_func(pred, velocity)
+ else:
+ loss = loss_func(pred, noise)
+
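+ # accelerator.backward applies loss scaling automatically when mixed precision (args.amp) is enabled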
+ accelerator.backward(loss)
+ optimizer.step()
+ optimizer.zero_grad()
+
+ global_step += 1
+ losses += loss.item()
+
+ if accelerator.is_main_process:
+ if global_step % args.log_step == 0:
+ with open(args.log_dir + 'diff_vc.txt', mode='a') as n:
+ n.write(time.asctime(time.localtime(time.time())))
+ n.write('\n')
+ n.write('Epoch: [{}][{}] Batch: [{}][{}] Loss: {:.6f}\n'.format(
+ epoch + 1, args.epochs, step + 1, len(train_loader), losses / args.log_step))
+ losses = 0.0
+
+ accelerator.wait_for_everyone()
+
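+ # periodic evaluation on the main process, then checkpointing of the unwrapped model weights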
+ if (epoch + 1) % args.save_every == 0:
+ if accelerator.is_main_process:
+ eval_plugin(freevc_24, cmodel, [tokenizer, text_encoder],
+ model, noise_scheduler, (1, 256, 1),
+ val_meta='../prepare/val_meta.csv',
+ val_folder='/home/jerry/Projects/Dataset/Speech/vctk_libritts/',
+ guidance_scale=3, guidance_rescale=0.0,
+ ddim_steps=50, eta=1, random_seed=2024,
+ device=accelerator.device,
+ epoch=epoch, save_path=args.log_dir + 'output/', val_num=10)
+
+ unwrapped_unet = accelerator.unwrap_model(model)
+ accelerator.save({
+ "model": unwrapped_unet.state_dict(),
+ }, args.save_dir + args.config_name + '/' + str(epoch) + '.pt')
diff --git a/dreamvoice/train_utils/src/train.py b/dreamvoice/train_utils/src/train_openvoice.py
similarity index 95%
rename from dreamvoice/train_utils/src/train.py
rename to dreamvoice/train_utils/src/train_openvoice.py
index a5fb5ac6226f985925fa2fbaf417fcfdd6782443..0bfd7d2ae77310eb645a2cfdf613ed6ecd6dc9b9 100644
--- a/dreamvoice/train_utils/src/train.py
+++ b/dreamvoice/train_utils/src/train_openvoice.py
@@ -25,7 +25,7 @@ from utils import minmax_norm_diff, reverse_minmax_norm_diff
parser = argparse.ArgumentParser()
# config settings
-parser.add_argument('--config-name', type=str, default='Plugin_base')
+parser.add_argument('--config-name', type=str, default='Plugin_freevc')
# training settings
parser.add_argument("--amp", type=str, default='fp16')
@@ -73,7 +73,7 @@ if __name__ == '__main__':
else:
args.device = 'cpu'
- train_set = DreamData(data_dir='../prepare/spk/', meta_dir='../prepare/plugin_meta.csv',
+ train_set = DreamData(data_dir='../prepare_freevc/spk/', meta_dir='../prepare/plugin_meta.csv',
subset='train', prompt_dir='../prepare/prompts.csv',)
train_loader = DataLoader(train_set, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
diff --git a/dreamvoice/train_utils/src/utils.py b/dreamvoice/train_utils/src/utils.py
index 0fe1dcba21ce183e2e7c26a711c702ca089813d3..6dc3e7629165253801ceb946e6d9ac80a89a25f2 100644
--- a/dreamvoice/train_utils/src/utils.py
+++ b/dreamvoice/train_utils/src/utils.py
@@ -15,3 +15,23 @@ def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: floa
tensor = (tensor + 1) / 2
tensor = tensor * (vmax - vmin) + vmin
return tensor
+
+
+def scale_shift(x, scale, shift):
+ return (x + shift) * scale
+
+
+def scale_shift_re(x, scale, shift):
+ return (x / scale) - shift
+
+
+def align_seq(source, target_length, mapping_method='hard'):
+ source_len = source.shape[1]
+ if mapping_method == 'hard':
+ mapping_idx = np.round(np.arange(target_length) * source_len / target_length).astype(int)
+ output = source[:, mapping_idx]
+ else:
+ # TBD
+ raise NotImplementedError
+
+ return output
\ No newline at end of file
diff --git a/freevc_example.py b/freevc_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..0228cdaf097068b714c511890a78e1e08196989e
--- /dev/null
+++ b/freevc_example.py
@@ -0,0 +1,25 @@
+import torch
+import librosa
+import soundfile as sf
+from dreamvoice import DreamVoice_Plugin
+from dreamvoice.freevc_wrapper import get_freevc_models, convert
+
+device = 'cuda'
+freevc, cmodel, hps = get_freevc_models('ckpts_freevc/', 'dreamvoice/', device)
+
+# init dreamvoice
+dreamvoice = DreamVoice_Plugin(config='plugin_freevc.yaml', device=device)
+
+# generate speaker
+prompt = "old female's voice, deep and dark"
+target_se = dreamvoice.gen_spk(prompt)
+
+# content source
+source_path = 'examples/test1.wav'
+audio_clip = librosa.load(source_path, sr=16000)[0]
+audio_clip = torch.tensor(audio_clip).unsqueeze(0).to(device)
+
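+# extract content features from the 16 kHz source audio with the content encoder (cmodel)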
+content = cmodel(audio_clip).last_hidden_state.transpose(1, 2).to(device)
+
+output, out_sr = convert(freevc, content, target_se)
+sf.write('output.wav', output, out_sr)
\ No newline at end of file
diff --git a/examples/openvoice_example.py b/openvoice_example.py
similarity index 92%
rename from examples/openvoice_example.py
rename to openvoice_example.py
index dccbe4b6688e717b551979409cb778dd856f6a1d..4d20e44399cb7d8c87399bc99622d9965b128295 100644
--- a/examples/openvoice_example.py
+++ b/openvoice_example.py
@@ -14,12 +14,12 @@ openvoice = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
openvoice.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
# generate speaker
-prompt = 'rough boy voice, young'
+prompt = 'female voice, bright and cute'
target_se = dreamvoice.gen_spk(prompt)
target_se = target_se.unsqueeze(-1)
# content source
-source_path = 'examples/test2.wav'
+source_path = 'segment_1.mp3'
source_se = se_extractor(source_path, openvoice).to(device)
# voice conversion