diff --git a/ckpts_freevc/freevc.json b/ckpts_freevc/freevc.json new file mode 100644 index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1 --- /dev/null +++ b/ckpts_freevc/freevc.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/ckpts_freevc/freevc.pth b/ckpts_freevc/freevc.pth new file mode 100644 index 0000000000000000000000000000000000000000..976143bef5d846836704a38f7ad57cb0535d40b8 --- /dev/null +++ b/ckpts_freevc/freevc.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2cc2d047f63b80d1d6780e37611cec11a01d597560393b1fe6118158b3bd47f +size 472644351 diff --git a/dreamvoice/freevc/.gitattributes b/dreamvoice/freevc/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..717eda91d34e790b2de5140dd1c46748bdddef26 --- /dev/null +++ b/dreamvoice/freevc/.gitattributes @@ -0,0 +1,34 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text 
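A minimal usage sketch for the FreeVC assets added in this patch, based on the helpers defined in dreamvoice/freevc_wrapper.py further below. It assumes the checkpoint and config under ckpts_freevc/ have been fetched via Git LFS and that an example utterance such as p225_001.wav is available locally; the speaker embedding is a placeholder here, since get_freevc_models does not return a speaker encoder.

import torch
import librosa
import soundfile as sf

from dreamvoice.freevc_wrapper import get_freevc_models, convert

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load ckpts_freevc/freevc.json + ckpts_freevc/freevc.pth and the WavLM content model.
freevc, cmodel, hps = get_freevc_models(path='ckpts_freevc', device=device)

# WavLM content features from the source utterance (16 kHz per hps.data.sampling_rate).
wav_src, _ = librosa.load('p225_001.wav', sr=hps.data.sampling_rate)
wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
with torch.no_grad():
    content = cmodel(wav_src).last_hidden_state.transpose(1, 2)

# Placeholder [1, 256] speaker embedding; in practice this would come from the
# pretrained speaker encoder used elsewhere in the repo (assumption).
g_tgt = torch.randn(1, 256, device=device)

audio, sr = convert(freevc, content, g_tgt)  # returns a float waveform and 16000
sf.write('output.wav', audio, sr)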
diff --git a/dreamvoice/freevc/.gitignore b/dreamvoice/freevc/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e4008401fb75eb82773c4bdb3f4b886e2e6d34c4 --- /dev/null +++ b/dreamvoice/freevc/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +flagged \ No newline at end of file diff --git a/dreamvoice/freevc/README.md b/dreamvoice/freevc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9 --- /dev/null +++ b/dreamvoice/freevc/README.md @@ -0,0 +1,13 @@ +--- +title: FreeVC +emoji: 🚀 +colorFrom: gray +colorTo: red +sdk: gradio +sdk_version: 3.13.0 +app_file: app.py +pinned: false +license: mit +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/dreamvoice/freevc/app.py b/dreamvoice/freevc/app.py new file mode 100644 index 0000000000000000000000000000000000000000..982821f01caea503d8451f6c8e99096918705d79 --- /dev/null +++ b/dreamvoice/freevc/app.py @@ -0,0 +1,92 @@ +import os +import torch +import librosa +import gradio as gr +from scipy.io.wavfile import write +from transformers import WavLMModel + +import utils +from models import SynthesizerTrn +from mel_processing import mel_spectrogram_torch +from speaker_encoder.voice_encoder import SpeakerEncoder + +''' +def get_wavlm(): + os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU') + shutil.move('WavLM-Large.pt', 'wavlm') +''' + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# print("Loading FreeVC...") +# hps = utils.get_hparams_from_file("configs/freevc.json") +# freevc = SynthesizerTrn( +# hps.data.filter_length // 2 + 1, +# hps.train.segment_size // hps.data.hop_length, +# **hps.model).to(device) +# _ = freevc.eval() +# _ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading FreeVC(24k)...") +hps = utils.get_hparams_from_file("configs/freevc-24.json") +freevc_24 = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc_24.eval() +_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None) + +# print("Loading FreeVC-s...") +# hps = utils.get_hparams_from_file("configs/freevc-s.json") +# freevc_s = SynthesizerTrn( +# hps.data.filter_length // 2 + 1, +# hps.train.segment_size // hps.data.hop_length, +# **hps.model).to(device) +# _ = freevc_s.eval() +# _ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None) +# +# print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +def convert(model, cmodel, src, tgt): + with torch.no_grad(): + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + # infer + if model == "FreeVC": + audio = freevc.infer(c, g=g_tgt) + elif model == "FreeVC-s": + audio = freevc_s.infer(c, mel=mel_tgt) + else: + audio = freevc_24.infer(c, g=g_tgt) + audio = audio[0][0].data.cpu().float().numpy() + if model == "FreeVC" or model == "FreeVC-s": + write("out.wav", hps.data.sampling_rate, audio) + 
else: + write("out.wav", 24000, audio) + out = "out.wav" + return out + +# model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model") +# audio1 = gr.inputs.Audio(label="Source Audio", type='filepath') +# audio2 = gr.inputs.Audio(label="Reference Audio", type='filepath') +# inputs = [model, audio1, audio2] +# outputs = gr.outputs.Audio(label="Output Audio", type='filepath') +# +# title = "FreeVC" +# description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please trim it before submitting." +# article = "
Paper | Github Repo
" +# +# examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']] +# +# gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch() +convert(freevc_24, cmodel, 'p225_001.wav', 'p226_002.wav') \ No newline at end of file diff --git a/dreamvoice/freevc/commons.py b/dreamvoice/freevc/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1 --- /dev/null +++ b/dreamvoice/freevc/commons.py @@ -0,0 +1,171 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def rand_spec_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d( + length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (num_timescales - 1)) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def 
cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1. 
/ norm_type) + return total_norm diff --git a/dreamvoice/freevc/configs/freevc-24.json b/dreamvoice/freevc/configs/freevc-24.json new file mode 100644 index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2 --- /dev/null +++ b/dreamvoice/freevc/configs/freevc-24.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8640, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8008" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,4,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/freevc/configs/freevc-s.json b/dreamvoice/freevc/configs/freevc-s.json new file mode 100644 index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1 --- /dev/null +++ b/dreamvoice/freevc/configs/freevc-s.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": false + } +} diff --git a/dreamvoice/freevc/configs/freevc.json b/dreamvoice/freevc/configs/freevc.json new file mode 100644 index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1 --- /dev/null +++ b/dreamvoice/freevc/configs/freevc.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + 
"use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/freevc/mel_processing.py b/dreamvoice/freevc/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54 --- /dev/null +++ b/dreamvoice/freevc/mel_processing.py @@ -0,0 +1,112 @@ +import math +import os +import random +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.data +import numpy as np +import librosa +import librosa.util as librosa_util +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + '_' + str(spec.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = 
spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/dreamvoice/freevc/models.py b/dreamvoice/freevc/models.py new file mode 100644 index 0000000000000000000000000000000000000000..11d3247337c6cd49351490c7f17cb33cea52e361 --- /dev/null +++ b/dreamvoice/freevc/models.py @@ -0,0 +1,351 @@ +import copy +import math +import torch +from torch import nn +from torch.nn import functional as F + +from .commons import sequence_mask, rand_slice_segments +from .modules import ResidualCouplingLayer, WN, Flip, ResBlock1, ResBlock2, LRELU_SLOPE + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .commons import init_weights, get_padding + + +class ResidualCouplingBlock(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class Encoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = 
nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) + resblock = ResBlock1 if resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append(weight_norm( + ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), + k, u, padding=(k-u)//2))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x) + else: + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + 
+ +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2,3,5,7,11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class SpeakerEncoder(torch.nn.Module): + def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): + mel_slices = [] + for i in range(0, total_frames-partial_frames, partial_hop): + mel_range = torch.arange(i, i+partial_frames) + mel_slices.append(mel_range) + + return mel_slices + + def embed_utterance(self, mel, partial_frames=128, partial_hop=64): + mel_len = mel.size(1) + last_mel = mel[:,-partial_frames:] + + if mel_len > partial_frames: + mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) + mels = list(mel[:,s] for s in mel_slices) + mels.append(last_mel) + mels = torch.stack(tuple(mels), 0).squeeze(1) + + with torch.no_grad(): + partial_embeds = self(mels) + embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) + #embed = embed / torch.linalg.norm(embed, 2) + else: + with torch.no_grad(): + embed = self(last_mel) + + return embed + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + use_spk, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = 
hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + self.use_spk = use_spk + + self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16) + self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + if not self.use_spk: + self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels) + + def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if spec_lengths == None: + spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) + + if not self.use_spk: + g = self.enc_spk(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + _, m_p, logs_p, _ = self.enc_p(c, c_lengths) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + z_p = self.flow(z, spec_mask, g=g) + + z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size) + o = self.dec(z_slice, g=g) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, c, g=None, mel=None, c_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if not self.use_spk: + g = self.enc_spk.embed_utterance(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g) + + return o diff --git a/dreamvoice/freevc/modules.py b/dreamvoice/freevc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..53a51558f78899cb0e77c595fe2ca9b3d3c762f5 --- /dev/null +++ b/dreamvoice/freevc/modules.py @@ -0,0 +1,341 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + 
self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers-1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size ** i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, + groups=channels, dilation=dilation, padding=padding + )) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + self.hidden_channels =hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, 
g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + else: + g_l = torch.zeros_like(x_in) + + acts = fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:,:self.hidden_channels,:] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:,self.hidden_channels:,:] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class 
Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels,1)) + self.logs = nn.Parameter(torch.zeros(channels,1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1,2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels]*2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1,2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/dreamvoice/freevc/requirements.txt b/dreamvoice/freevc/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c --- /dev/null +++ b/dreamvoice/freevc/requirements.txt @@ -0,0 +1,8 @@ +altair +httpx==0.24.1 +numpy +scipy +torch +transformers +librosa +webrtcvad==2.0.10 diff --git a/dreamvoice/freevc/utils.py b/dreamvoice/freevc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e931b1f56a976674425c5637b0767d3485c51f69 --- /dev/null +++ b/dreamvoice/freevc/utils.py @@ -0,0 +1,305 @@ +import os +import sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch +from torch.nn import functional as F +from .commons import sequence_mask + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def get_cmodel(rank): + checkpoint = torch.load('wavlm/WavLM-Large.pt') + cfg = WavLMConfig(checkpoint['cfg']) + cmodel = WavLM(cfg).cuda(rank) + cmodel.load_state_dict(checkpoint['model']) + cmodel.eval() + return cmodel + + +def get_content(cmodel, y): + with torch.no_grad(): + c = cmodel.extract_features(y.squeeze(1))[0] + c = c.transpose(1, 2) + return c + + +def get_vocoder(rank): + with open("hifigan/config.json", "r") as f: + config = json.load(f) + config = 
hifigan.AttrDict(config) + vocoder = hifigan.Generator(config) + ckpt = torch.load("hifigan/generator_v1") + vocoder.load_state_dict(ckpt["generator"]) + vocoder.eval() + vocoder.remove_weight_norm() + vocoder.cuda(rank) + return vocoder + + +def transform(mel, height): # 68-92 + #r = np.random.random() + #rate = r * 0.3 + 0.85 # 0.85-1.15 + #height = int(mel.size(-2) * rate) + tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1))) + if height >= mel.size(-2): + return tgt[:, :mel.size(-2), :] + else: + silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1) + silence += torch.randn_like(silence) / 10 + return torch.cat((tgt, silence), 1) + + +def stretch(mel, width): # 0.5-2 + return torchvision.transforms.functional.resize(mel, (mel.size(-2), width)) + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + if optimizer is not None: + optimizer.load_state_dict(checkpoint_dict['optimizer']) + saved_state_dict = checkpoint_dict['model'] + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict= {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info("Loaded checkpoint '{}' (iteration {})" .format( + checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info("Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path)) + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save({'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate}, checkpoint_path) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = 
data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding='utf-8') as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default="./configs/base.json", + help='JSON file for configuration') + parser.add_argument('-m', '--model', type=str, required=True, + help='Model name') + + args = parser.parse_args() + model_dir = os.path.join("./logs", args.model) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + )) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/dreamvoice/freevc_wrapper.py b/dreamvoice/freevc_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..958cd74a44e119cbafb2365ca1ebb4a7eac66c7e --- /dev/null +++ b/dreamvoice/freevc_wrapper.py @@ -0,0 +1,63 @@ +import os +import torch +import librosa +import soundfile as sf +from pathlib import Path + +from transformers import WavLMModel +from .freevc.utils import load_checkpoint, get_hparams_from_file +from .freevc.models import SynthesizerTrn +# from mel_processing import mel_spectrogram_torch +# from free_vc.speaker_encoder.voice_encoder import SpeakerEncoder +# from speaker_encoder.voice_encoder import SpeakerEncoder + + +def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'): + hps = get_hparams_from_file(f"{path}/freevc.json") + freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) + freevc.eval() + load_checkpoint(f"{path}/freevc.pth", freevc, None) + + cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + cmodel.eval() + + # smodel = spk_encoder.load_model(Path(speaker_path), device) + # smodel = spk_encoder.load_model(Path(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt"), 'cuda') + # smodel = SpeakerEncoder(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt", device) + + return freevc, cmodel, hps + + +@torch.no_grad() +def convert(freevc, content, speaker): + audio = freevc.infer(content, g=speaker) + audio = audio[0][0].data.cpu().float().numpy() + return audio, 16000 + + +if __name__ == '__main__': + freevc_24, cmodel, smodel, hps = get_freevc_models() + + tgt = 'p226_002.wav' + # src = 'p226_002.wav' + src = 'p225_001.wav' + device = 'cuda' + + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + # g_tgt = spk_encoder.embed_utterance_batch(torch.tensor(wav_tgt).unsqueeze(0).cuda()) + + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + content = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + + output, sr = convert(freevc_24, content, g_tgt) + + sf.write('output.wav', output, 
sr) \ No newline at end of file diff --git a/dreamvoice/plugin.py b/dreamvoice/plugin.py index 12243ecb47d63270aef13fd44c4dbd040198879d..aeef16a90eaa8851293ca2090bdcecb5544dde02 100644 --- a/dreamvoice/plugin.py +++ b/dreamvoice/plugin.py @@ -108,7 +108,6 @@ class DreamVoice_Plugin: self.spk_encoder = spk_encoder self.spk_embed_cache = None - @torch.no_grad() def gen_spk(self, prompt, prompt_guidance_scale=3, prompt_guidance_rescale=0.0, diff --git a/dreamvoice/plugin_ckpts/freevc.pt b/dreamvoice/plugin_ckpts/freevc.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5676c4bbc95085ed5a7da8b7d1d479849b1bd39 --- /dev/null +++ b/dreamvoice/plugin_ckpts/freevc.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0589fd38d965a7f8aab6eb3bedae5d1c007acb0f305e04bbe0fd4a771fff717d +size 104892189 diff --git a/dreamvoice/plugin_freevc.yaml b/dreamvoice/plugin_freevc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e67b8c26e8d4c0eb36d0650639e8a547f6e90691 --- /dev/null +++ b/dreamvoice/plugin_freevc.yaml @@ -0,0 +1,8 @@ +version: 1.1 + +lm_path: 'google/flan-t5-base' + +dreamvg: + config_path: 'src/configs/plugin_cross_freevc.yaml' + ckpt_path: 'plugin_ckpts/freevc.pt' + ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/plugin_ckpts/freevc.pt' \ No newline at end of file diff --git a/dreamvoice/src/configs/plugin_cross.yaml b/dreamvoice/src/configs/plugin_cross_freevc.yaml similarity index 100% rename from dreamvoice/src/configs/plugin_cross.yaml rename to dreamvoice/src/configs/plugin_cross_freevc.yaml diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/README.md @@ -0,0 +1,13 @@ +--- +title: FreeVC +emoji: 🚀 +colorFrom: gray +colorTo: red +sdk: gradio +sdk_version: 3.13.0 +app_file: app.py +pinned: false +license: mit +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/app.py b/dreamvoice/train_utils/prepare_freevc/freevc/app.py new file mode 100644 index 0000000000000000000000000000000000000000..040c13a7f789e9edf88565c756d1059c2a3f1e01 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/app.py @@ -0,0 +1,103 @@ +import os +import torch +import librosa +import gradio as gr +from scipy.io.wavfile import write +from transformers import WavLMModel + +import utils +from models import SynthesizerTrn +from mel_processing import mel_spectrogram_torch +from speaker_encoder.voice_encoder import SpeakerEncoder + +''' +def get_wavlm(): + os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU') + shutil.move('WavLM-Large.pt', 'wavlm') +''' + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +print("Loading FreeVC...") +hps = utils.get_hparams_from_file("configs/freevc.json") +freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc.eval() +_ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading FreeVC(24k)...") +hps = utils.get_hparams_from_file("configs/freevc-24.json") +freevc_24 = SynthesizerTrn( + 
hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc_24.eval() +_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None) + +print("Loading FreeVC-s...") +hps = utils.get_hparams_from_file("configs/freevc-s.json") +freevc_s = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc_s.eval() +_ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None) + +print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +def convert(model, src, tgt): + with torch.no_grad(): + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + if model == "FreeVC" or model == "FreeVC (24kHz)": + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + else: + wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device) + mel_tgt = mel_spectrogram_torch( + wav_tgt, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + # infer + if model == "FreeVC": + audio = freevc.infer(c, g=g_tgt) + elif model == "FreeVC-s": + audio = freevc_s.infer(c, mel=mel_tgt) + else: + audio = freevc_24.infer(c, g=g_tgt) + audio = audio[0][0].data.cpu().float().numpy() + if model == "FreeVC" or model == "FreeVC-s": + write("out.wav", hps.data.sampling_rate, audio) + else: + write("out.wav", 24000, audio) + out = "out.wav" + return out + +model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model") +audio1 = gr.Audio(label="Source Audio", type='filepath') +audio2 = gr.Audio(label="Reference Audio", type='filepath') +inputs = [model, audio1, audio2] +outputs = gr.Audio(label="Output Audio", type='filepath') + +title = "FreeVC" +description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please trim it before submitting." +article = "
Paper | Github Repo
" + +examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']] + +gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch() diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/commons.py b/dreamvoice/train_utils/prepare_freevc/freevc/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/commons.py @@ -0,0 +1,171 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def rand_spec_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d( + length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (num_timescales - 1)) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def 
cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1. 
/ norm_type) + return total_norm diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json new file mode 100644 index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8640, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8008" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,4,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json new file mode 100644 index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": false + } +} diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json new file mode 100644 index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + 
"epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py b/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..e16a7adcabb167ddc2c95e6d4bc722542f5fb716 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py @@ -0,0 +1,69 @@ +import os +import torch +import torch.nn.functional as F +import librosa +import sounddevice as sd +from transformers import WavLMModel +from scipy.io.wavfile import write +from models import SynthesizerTrn +from speaker_encoder.voice_encoder import SpeakerEncoder +import utils +import numpy as np +from transformers import T5Tokenizer, T5EncoderModel +from src.plugin_wrapper import DreamVG +import soundfile as sf + + +# Load configurations and models +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +print("Loading FreeVC...") +hps = utils.get_hparams_from_file("configs/freevc.json") +freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +freevc.eval() +utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) + +print("Loading Speaker Encoder...") +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +lm_path = 'google/flan-t5-base' +tokenizer = T5Tokenizer.from_pretrained(lm_path) +text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval() + +dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml', + ckpt_path='checkpoints/dreamvc_plugin.pt', + device=device) + + +prompt = "girl's voice, very young and cute" +prompt_guidance_scale = 3.0 + +text_batch = tokenizer(prompt, max_length=32, + padding='max_length', truncation=True, return_tensors="pt") +text, text_mask = text_batch.input_ids.to(device), \ + text_batch.attention_mask.to(device) +text = text_encoder(input_ids=text, attention_mask=text_mask)[0] +target_embedding = dreamvg.inference([text, text_mask], + guidance_scale=prompt_guidance_scale, + guidance_rescale=0.0, + ddim_steps=100, eta=1, + random_seed=None) + +# Convert to tensor and pad +audio, sr = librosa.load('segment_1.mp3', sr=16000) +audio = torch.from_numpy(audio).unsqueeze(0).to(device).float() +audio = F.pad(audio, (40, 40)) + +# Extract content features using WavLM +c = 
cmodel(audio).last_hidden_state.transpose(1, 2).to(device) + +audio = freevc.infer(c, g=target_embedding) +audio = audio[0][0].data.cpu().float().numpy() + +sf.write('freevc_out.wav', audio, 16000) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py b/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py @@ -0,0 +1,112 @@ +import math +import os +import random +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.data +import numpy as np +import librosa +import librosa.util as librosa_util +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + '_' + str(spec.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = 
librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/models.py b/dreamvoice/train_utils/prepare_freevc/freevc/models.py new file mode 100644 index 0000000000000000000000000000000000000000..f732af47416bc0ed884a821e063fed5b7eab7957 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/models.py @@ -0,0 +1,351 @@ +import copy +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import modules + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from commons import init_weights, get_padding + + +class ResidualCouplingBlock(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class Encoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, 
upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) + resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append(weight_norm( + ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), + k, u, padding=(k-u)//2))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x) + else: + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2,3,5,7,11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class SpeakerEncoder(torch.nn.Module): + def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): + mel_slices = [] + for i in range(0, total_frames-partial_frames, partial_hop): + mel_range = torch.arange(i, i+partial_frames) + mel_slices.append(mel_range) + + return mel_slices + + def embed_utterance(self, mel, partial_frames=128, partial_hop=64): + mel_len = mel.size(1) + last_mel = mel[:,-partial_frames:] + + if mel_len > partial_frames: + mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) + mels = list(mel[:,s] for s in mel_slices) + mels.append(last_mel) + mels = torch.stack(tuple(mels), 0).squeeze(1) + + with torch.no_grad(): + partial_embeds = self(mels) + embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) + #embed = embed / torch.linalg.norm(embed, 2) + else: + with torch.no_grad(): + embed = self(last_mel) + + return embed + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + use_spk, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + 
self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + self.use_spk = use_spk + + self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16) + self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + if not self.use_spk: + self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels) + + def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if spec_lengths == None: + spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) + + if not self.use_spk: + g = self.enc_spk(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + _, m_p, logs_p, _ = self.enc_p(c, c_lengths) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + z_p = self.flow(z, spec_mask, g=g) + + z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size) + o = self.dec(z_slice, g=g) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, c, g=None, mel=None, c_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if not self.use_spk: + g = self.enc_spk.embed_utterance(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g) + + return o diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/modules.py b/dreamvoice/train_utils/prepare_freevc/freevc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..1eeb47c190cdc4d42d5de5fa47f94ecc1b931c5d --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/modules.py @@ -0,0 +1,342 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +import commons +from commons import init_weights, get_padding + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers-1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size ** i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, + groups=channels, dilation=dilation, padding=padding + )) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + self.hidden_channels =hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) 
+ + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:,:self.hidden_channels,:] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:,self.hidden_channels:,:] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = 
torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels,1)) + self.logs = nn.Parameter(torch.zeros(channels,1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1,2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels]*2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1,2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt b/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt @@ -0,0 +1,8 @@ +altair +httpx==0.24.1 +numpy +scipy +torch +transformers +librosa +webrtcvad==2.0.10 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e084cf69514c429559d9a086b97f3721bd7a8b23 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml @@ -0,0 +1,47 @@ +version: 1.0 + +system: "base" + +model: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + 
clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d8b894cd095accdcb9eab7788e8088d0430eae1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml @@ -0,0 +1,34 @@ +version: 1.0 + +system: "base" + +diffwrap: + cls_embedding: + speaker_dim: 256 + feature_dim: 512 + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [128, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ] + up_block_types: + [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D" + ] + attention_head_dim: 32 + class_embed_type: 'identity' \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c41681e2b762ad7d037780e560f706eba443fd66 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml @@ -0,0 +1,45 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: false + + unet: + sample_size: [128, 256] + in_channels: 257 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 256, 512] + down_block_types: + [ + "DownBlock2D", + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 2.75 + shift: 5.80 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af34723cf72c0cdbb079f0d8797a39527c04f0ff --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml @@ -0,0 +1,33 @@ +version: 1.0 + +system: "cross" + +diffwrap: + cls_embedding: + content_dim: 768 + content_hidden: 256 + use_pitch: true + pitch_dim: 1 + pitch_hidden: 128 + + unet: + sample_size: [100, 256] + in_channels: 385 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [128, 256, 512] + down_block_types: + [ + "DownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "UpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7189aa2355830ed46a97fcb3f29b94b2e423198e --- /dev/null +++ 
b/dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml @@ -0,0 +1,39 @@ +version: 1.0 + +system: "cross" + +model: + cls_embedding: + content_dim: 768 + content_hidden: 256 + + unet: + sample_size: [1, 1] + in_channels: 256 + out_channels: 256 + layers_per_block: 2 + block_out_channels: [256] + down_block_types: + [ + "CrossAttnDownBlock2D", + ] + up_block_types: + [ + "CrossAttnUpBlock2D", + ] + attention_head_dim: 32 + cross_attention_dim: 768 + +scheduler: + num_train_steps: 1000 + beta_schedule: 'linear' + beta_start: 0.0001 + beta_end: 0.02 + num_infer_steps: 50 + rescale_betas_zero_snr: true + timestep_spacing: "trailing" + clip_sample: false + prediction_type: 'v_prediction' + scale: 0.05 + shift: -0.035 + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/debug.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e1e827b1e8f82be63a40ce6204d1d83c10afc3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py @@ -0,0 +1,103 @@ +import os +import torch +import librosa +import numpy as np +import soundfile as sf +import pandas as pd +# from feats.hubert_model import get_soft_model, get_hubert_soft_content +from feats.contentvec_hf import get_content_model, get_content +# from modules.speaker_encoder.encoder import inference as spk_encoder +# from pathlib import Path +from tqdm import tqdm +from multiprocessing import Process +import pyworld as pw + + +def resample_save(infolder, audio_path, model, + audio_sr=24000, content_sr=16000, min_length=1.92, + content_resolution=50, + save_path='features'): + if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False: + audio, sr = librosa.load(infolder + audio_path, sr=content_sr) + final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution) + # final_length = final_length / content_sr + + length = max(round(min_length*content_sr), round(final_length)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + + # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = get_content(model, torch.tensor(audio_save).unsqueeze(0)) + content = content.cpu() + os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True) + torch.save(content, save_path + '/' + 'content/' + audio_path+'.pt') + # print(audio_save.shape) + # print(content.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr)) + # print(save_path + '/' + 'audio_16k/' + audio_path) + + audio, sr = librosa.load(infolder + audio_path, sr=audio_sr) + length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr)) + assert length % 10 == 0 + audio = audio[:length] + audio_save = np.zeros(length, dtype=audio.dtype) + audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]] + # print(audio_save.shape) + os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True) + sf.write(save_path + '/' 
+ 'audio_24k/' + audio_path, audio_save, int(sr)) + + +def extract_f0(in_folder, audio_path, save_path): + audio, sr = librosa.load(in_folder + audio_path, sr=None) + assert sr == 16000 + if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False: + # wav = audio + # wav = np.pad(wav, int((1024-320)/2), mode='reflect') + # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr, + # fmin=librosa.note_to_hz('C2'), + # fmax=librosa.note_to_hz('C6')) + + _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000) + f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1] + + f0 = np.nan_to_num(f0) + os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True) + # print(save_path + '/' + 'f0/' + audio_path + '.pt') + torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt') + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_f0_main(in_folder, audio_paths, save_path): + for audio_path in tqdm(audio_paths): + extract_f0(in_folder, audio_path, save_path) + + +if __name__ == '__main__': + df = pd.read_csv('../test_data/vc_meta.csv') + # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda') + model = get_content_model().to('cuda') + # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda") + for i in tqdm(range(len(df))): + row = df.iloc[i] + in_path = row['path'] + resample_save('../test_data/', in_path, model, save_path='../features/') + + in_folder = '../features/audio_16k/' + audio_files = list(df['path']) + save_path = '../features/' + cores = 6 + + subsets = chunks(audio_files, cores) + + for subset in subsets: + t = Process(target=extract_f0_main, args=(in_folder, subset, save_path)) + t.start() \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py new file mode 100644 index 0000000000000000000000000000000000000000..099f5888a5f0e1eb5e9cf3c68814a0365ff75c30 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py @@ -0,0 +1,42 @@ +import torch +import librosa +from fairseq import checkpoint_utils +import torch.nn.functional as F + + +def get_model(vec_path): + print("load model(s) from {}".format(vec_path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [vec_path], + suffix="", + ) + model = models[0] + model.eval() + return model + + +@torch.no_grad() +def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + feats = wav_16k_tensor + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.to(wav_16k_tensor.device), + "padding_mask": padding_mask.to(wav_16k_tensor.device), + "output_layer": layer + } + logits = hmodel.extract_features(**inputs)[0] + # feats = hmodel.final_proj(logits[0]) + return logits + + +if __name__ == '__main__': + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + model = get_model('../../ckpts/checkpoint_best_legacy_500.pt') + model = model.cuda() + content = get_content(model, torch.tensor([audio])) + print(content) \ No newline at end of file diff --git 
a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..1dad4889234a27fd1631d9265684af14560c2638 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py @@ -0,0 +1,40 @@ +from transformers import HubertModel +import torch.nn as nn +import torch +import torch.nn.functional as F +import librosa + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + + # The final projection layer is only used for backward compatibility. + # Following https://github.com/auspicious3000/contentvec/issues/6 + # Remove this layer is necessary to achieve the desired outcome. + self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def get_content_model(config='lengyue233/content-vec-best'): + model = HubertModelWithFinalProj.from_pretrained(config) + model.eval() + return model + + +@torch.no_grad() +def get_content(model, wav_16k_tensor, device='cuda'): + # print(layer) + wav_16k_tensor = wav_16k_tensor.to(device) + # so that the output shape will be len(audio//320) + wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2)) + logits = model(wav_16k_tensor)['last_hidden_state'] + return logits + + +if __name__ == '__main__': + model = get_content_model().cuda() + audio, sr = librosa.load('test.wav', sr=16000) + audio = audio[:100*320] + audio = torch.tensor([audio]) + content = get_content(model, audio, 'cuda') + print(content) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0202868f93e8b1be2f925f2ec6b22f3df691e8c3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore @@ -0,0 +1,132 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# VSCode project settings +.vscode + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6eb2af050447968cc32481fcfe67b5a4c6cdc69e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Benjamin van Niekerk + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..68602858ed726acd4f99ce9fecca008f3511dc90 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md @@ -0,0 +1,161 @@ +# HuBERT + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2111.02392) +[![demo](https://img.shields.io/static/v1?message=Audio%20Samples&logo=Github&labelColor=grey&color=blue&logoColor=white&label=%20&style=flat)](https://bshall.github.io/soft-vc/) +[![colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb) + +Training and inference scripts for the HuBERT content encoders in [A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion](https://ieeexplore.ieee.org/abstract/document/9746484). +For more details see [soft-vc](https://github.com/bshall/soft-vc). Audio samples can be found [here](https://bshall.github.io/soft-vc/). Colab demo can be found [here](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb). + +
+[Figure: Soft-VC architecture diagram, "Soft-VC" (content-encoder.png in this directory)]
+
+**Fig 1:** Architecture of the voice conversion system. a) The discrete content encoder clusters audio features to produce a sequence of discrete speech units. b) The soft content encoder is trained to predict the discrete units. The acoustic model transforms the discrete/soft speech units into a target spectrogram. The vocoder converts the spectrogram into an audio waveform.
+
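+In this repository the encoder is also wrapped by `src/feats/hubert_model.py` (added later in this diff), which loads a local HuBERT-Soft checkpoint instead of going through `torch.hub`. A minimal sketch, where the import path and the checkpoint/audio paths are assumptions for illustration:
+
+```python
+import torch
+import librosa
+
+# Helpers defined in src/feats/hubert_model.py; import path is an assumption.
+from src.feats.hubert_model import get_soft_model, get_hubert_soft_content
+
+model = get_soft_model('pre_ckpts/hubert_soft.pt').to('cuda')  # checkpoint path is illustrative
+
+# 16 kHz mono audio, shaped (1, T); the helper adds the channel dimension itself.
+audio, sr = librosa.load('test.wav', sr=16000)
+units = get_hubert_soft_content(model, torch.tensor([audio]), device='cuda')
+print(units.shape)  # (1, N, 256) soft units at roughly 50 frames per second
+```
+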
+ +## Example Usage + +### Programmatic Usage + +```python +import torch, torchaudio + +# Load checkpoint (either hubert_soft or hubert_discrete) +hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda() + +# Load audio +wav, sr = torchaudio.load("path/to/wav") +assert sr == 16000 +wav = wav.unsqueeze(0).cuda() + +# Extract speech units +units = hubert.units(x) +``` + +### Script-Based Usage + +``` +usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir + +Encode an audio dataset. + +positional arguments: + {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete) + in-dir path to the dataset directory. + out-dir path to the output directory. + +optional arguments: + -h, --help show this help message and exit + --extension EXTENSION + extension of the audio files (defaults to .flac). +``` + +## Training + +### Step 1: Dataset Preparation + +Download and extract the [LibriSpeech](https://www.openslr.org/12) corpus. The training script expects the following tree structure for the dataset directory: + +``` +│ lengths.json +│ +└───wavs + ├───dev-* + │ ├───84 + │ ├───... + │ └───8842 + └───train-* + ├───19 + ├───... + └───8975 +``` + +The `train-*` and `dev-*` directories should contain the training and validation splits respectively. Note that there can be multiple `train` and `dev` folders e.g., `train-clean-100`, `train-other-500`, etc. Finally, the `lengths.json` file should contain key-value pairs with the file path and number of samples: + +```json +{ + "dev-clean/1272/128104/1272-128104-0000": 93680, + "dev-clean/1272/128104/1272-128104-0001": 77040, +} +``` + +### Step 2: Extract Discrete Speech Units + +Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script: + +``` +usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir + +Encode an audio dataset. + +positional arguments: + {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete) + in-dir path to the dataset directory. + out-dir path to the output directory. + +optional arguments: + -h, --help show this help message and exit + --extension EXTENSION + extension of the audio files (defaults to .flac). +``` + +for example: + +``` +python encode.py discrete path/to/LibriSpeech/wavs path/to/LibriSpeech/discrete +``` + +At this point the directory tree should look like: + +``` +│ lengths.json +│ +├───discrete +│ ├───... +└───wavs + ├───... +``` + +### Step 3: Train the HuBERT-Soft Content Encoder + +``` +usage: train.py [-h] [--resume RESUME] [--warmstart] [--mask] [--alpha ALPHA] dataset-dir checkpoint-dir + +Train HuBERT soft content encoder. + +positional arguments: + dataset-dir path to the data directory. + checkpoint-dir path to the checkpoint directory. + +optional arguments: + -h, --help show this help message and exit + --resume RESUME path to the checkpoint to resume from. + --warmstart whether to initialize from the fairseq HuBERT checkpoint. + --mask whether to use input masking. + --alpha ALPHA weight for the masked loss. 
+``` + +## Links + +- [Soft-VC repo](https://github.com/bshall/soft-vc) +- [Soft-VC paper](https://ieeexplore.ieee.org/abstract/document/9746484) +- [Official HuBERT repo](https://github.com/pytorch/fairseq) +- [HuBERT paper](https://arxiv.org/abs/2106.07447) + +## Citation + +If you found this work helpful please consider citing our paper: + +``` +@inproceedings{ + soft-vc-2022, + author={van Niekerk, Benjamin and Carbonneau, Marc-André and Zaïdi, Julian and Baas, Matthew and Seuté, Hugo and Kamper, Herman}, + booktitle={ICASSP}, + title={A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion}, + year={2022} +} +``` diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..18b754c73c63b79e943d51e76414f0056f05589f --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py @@ -0,0 +1,66 @@ +from pathlib import Path +import logging +import argparse + +import torch +import numpy as np +from sklearn.cluster import KMeans + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def cluster(args): + with open(args.subset) as file: + subset = [line.strip() for line in file] + + logger.info(f"Loading features from {args.in_dir}") + features = [] + for path in subset: + in_path = args.in_dir / path + features.append(np.load(in_path.with_suffix(".npy"))) + features = np.concatenate(features, axis=0) + + logger.info(f"Clustering features of shape: {features.shape}") + kmeans = KMeans(n_clusters=args.n_clusters).fit(features) + + checkpoint_path = args.checkpoint_dir / f"kmeans_{args.n_clusters}.pt" + checkpoint_path.parent.mkdir(exist_ok=True, parents=True) + torch.save( + checkpoint_path, + { + "n_features_in_": kmeans.n_features_in_, + "_n_threads": kmeans._n_threads, + "cluster_centers_": kmeans.cluster_centers_, + }, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Cluster speech features features.") + parser.add_argument( + "in_dir", + metavar="in-dir", + help="path to the encoded dataset", + type=Path, + ) + parser.add_argument( + "subset", + matavar="subset", + help="path to the .txt file containing the list of files to cluster", + type=Path, + ) + parser.add_argument( + "checkpoint_dir", + metavar="checkpoint-dir", + help="path to the checkpoint directory", + type=Path, + ) + parser.add_argument( + "--n-clusters", + help="number of clusters", + type=int, + default=100, + ) + args = parser.parse_args() + cluster(args) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png new file mode 100644 index 0000000000000000000000000000000000000000..fc59d538a9383896cf0c36e1d4a3f5030fce38fe Binary files /dev/null and b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png differ diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py new file mode 100644 index 0000000000000000000000000000000000000000..14246e985fb0e9dc157d290853af6dcf6036f61c --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py @@ -0,0 +1,60 @@ +import argparse +import logging +import numpy as np +from pathlib import Path +from tqdm import tqdm + +import torch 
+import torchaudio +from torchaudio.functional import resample + + +def encode_dataset(args): + print(f"Loading hubert checkpoint") + hubert = torch.hub.load( + "bshall/hubert:main", + f"hubert_{args.model}", + trust_repo=True, + ).cuda() + + print(f"Encoding dataset at {args.in_dir}") + for in_path in tqdm(list(args.in_dir.rglob(f"*{args.extension}"))): + wav, sr = torchaudio.load(in_path) + wav = resample(wav, sr, 16000) + wav = wav.unsqueeze(0).cuda() + + with torch.inference_mode(): + units = hubert.units(wav) + + out_path = args.out_dir / in_path.relative_to(args.in_dir) + out_path.parent.mkdir(parents=True, exist_ok=True) + np.save(out_path.with_suffix(".npy"), units.squeeze().cpu().numpy()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Encode an audio dataset.") + parser.add_argument( + "model", + help="available models (HuBERT-Soft or HuBERT-Discrete)", + choices=["soft", "discrete"], + ) + parser.add_argument( + "in_dir", + metavar="in-dir", + help="path to the dataset directory.", + type=Path, + ) + parser.add_argument( + "out_dir", + metavar="out-dir", + help="path to the output directory.", + type=Path, + ) + parser.add_argument( + "--extension", + help="extension of the audio files (defaults to .flac).", + default=".flac", + type=str, + ) + args = parser.parse_args() + encode_dataset(args) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..b58749e4a40b29eab470686b27e06a97bfecb321 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py @@ -0,0 +1,80 @@ +dependencies = ["torch", "torchaudio", "sklearn"] + +URLS = { + "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-discrete-96b248c5.pt", + "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-soft-35d9f29f.pt", + "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.2/kmeans100-50f36a95.pt", +} + +import torch +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +from sklearn.cluster import KMeans + +from hubert import HubertDiscrete, HubertSoft + + +def hubert_discrete( + pretrained: bool = True, + progress: bool = True, +) -> HubertDiscrete: + r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + pretrained (bool): load pretrained weights into the model + progress (bool): show progress bar when downloading model + """ + kmeans = kmeans100(pretrained=pretrained, progress=progress) + hubert = HubertDiscrete(kmeans) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-discrete"], progress=progress + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +def hubert_soft( + pretrained: bool = True, + progress: bool = True, +) -> HubertSoft: + r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. + Args: + pretrained (bool): load pretrained weights into the model. + progress (bool): show progress bar when downloading model. 
+ """ + hubert = HubertSoft() + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-soft"], + progress=progress, + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +def _kmeans( + num_clusters: int, pretrained: bool = True, progress: bool = True +) -> KMeans: + kmeans = KMeans(num_clusters) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + URLS[f"kmeans{num_clusters}"], progress=progress + ) + kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"] + kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"] + kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy() + return kmeans + + +def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans: + r""" + k-means checkpoint for HuBERT-Discrete with 100 clusters. + Args: + pretrained (bool): load pretrained weights into the model + progress (bool): show progress bar when downloading model + """ + return _kmeans(100, pretrained, progress) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e07f859e99f51dcf35639f26a3eb53a81c993f3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py @@ -0,0 +1,5 @@ +from .model import ( + Hubert, + HubertDiscrete, + HubertSoft, +) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3ac2b84f95340e088913e06db8e5db0a68e83c2e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py @@ -0,0 +1,91 @@ +import random +from pathlib import Path +import numpy as np +import json + +import torch +import torch.nn.functional as F +from torch.utils.data import Dataset +import torchaudio + + +class AcousticUnitsDataset(Dataset): + def __init__( + self, + root: Path, + sample_rate: int = 16000, + label_rate: int = 50, + min_samples: int = 32000, + max_samples: int = 250000, + train: bool = True, + ): + self.wavs_dir = root / "wavs" + self.units_dir = root / "discrete" + + with open(root / "lengths.json") as file: + self.lenghts = json.load(file) + + pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac" + metadata = ( + (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix()) + for path in self.wavs_dir.rglob(pattern) + ) + metadata = ((path, key) for path, key in metadata if key in self.lenghts) + self.metadata = [ + path for path, key in metadata if self.lenghts[key] > min_samples + ] + + self.sample_rate = sample_rate + self.label_rate = label_rate + self.min_samples = min_samples + self.max_samples = max_samples + self.train = train + + def __len__(self): + return len(self.metadata) + + def __getitem__(self, index): + wav_path = self.metadata[index] + units_path = self.units_dir / wav_path.relative_to(self.wavs_dir) + + wav, _ = torchaudio.load(wav_path) + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + codes = np.load(units_path.with_suffix(".npy")) + + return wav, torch.from_numpy(codes).long() + + def collate(self, batch): + wavs, codes = zip(*batch) + wavs, codes = list(wavs), list(codes) + + wav_lengths = [wav.size(-1) for wav in 
wavs] + code_lengths = [code.size(-1) for code in codes] + + wav_frames = min(self.max_samples, *wav_lengths) + + collated_wavs, wav_offsets = [], [] + for wav in wavs: + wav_diff = wav.size(-1) - wav_frames + wav_offset = random.randint(0, wav_diff) + wav = wav[:, wav_offset : wav_offset + wav_frames] + + collated_wavs.append(wav) + wav_offsets.append(wav_offset) + + rate = self.label_rate / self.sample_rate + code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets] + code_frames = round(wav_frames * rate) + remaining_code_frames = [ + length - offset for length, offset in zip(code_lengths, code_offsets) + ] + code_frames = min(code_frames, *remaining_code_frames) + + collated_codes = [] + for code, code_offset in zip(codes, code_offsets): + code = code[code_offset : code_offset + code_frames] + collated_codes.append(code) + + wavs = torch.stack(collated_wavs, dim=0) + codes = torch.stack(collated_codes, dim=0) + + return wavs, codes diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py new file mode 100644 index 0000000000000000000000000000000000000000..523dd95633ba73babff8b6836324ae0a7c2d267f --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py @@ -0,0 +1,241 @@ +import copy +from typing import Optional, Tuple +import random + +from sklearn.cluster import KMeans + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Hubert(nn.Module): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu", batch_first=True + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) + x[mask] = self.masked_spec_embed.to(x.dtype) + return x, mask + + def encode( + self, x: torch.Tensor, layer: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = self.feature_extractor(x) + x = self.feature_projection(x.transpose(1, 2)) + x, mask = self.mask(x) + x = x + self.positional_embedding(x) + x = self.dropout(self.norm(x)) + x = self.encoder(x, output_layer=layer) + return x, mask + + def logits(self, x: torch.Tensor) -> torch.Tensor: + logits = torch.cosine_similarity( + x.unsqueeze(2), + self.label_embedding.weight.unsqueeze(0).unsqueeze(0), + dim=-1, + ) + return logits / 0.1 + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x, mask = self.encode(x) + x = self.proj(x) + logits = self.logits(x) + return logits, mask + + +class HubertSoft(Hubert): + """HuBERT-Soft content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.""" + + def __init__(self): + super().__init__() + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.Tensor: + """Extract soft speech units. 
+ + Args: + wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples. + + Returns: + Tensor: soft speech units of shape (1, N, D), where N is the number of frames and D is the unit dimensions. + """ + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav) + return self.proj(x) + + +class HubertDiscrete(Hubert): + """HuBERT-Discrete content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.""" + + def __init__(self, kmeans: KMeans): + super().__init__(504) + self.kmeans = kmeans + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.LongTensor: + """Extract discrete speech units. + + Args: + wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples. + + Returns: + LongTensor: soft speech units of shape (N,), where N is the number of frames. + """ + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav, layer=7) + x = self.kmeans.predict(x.squeeze().cpu().numpy()) + return torch.tensor(x, dtype=torch.long, device=wav.device) + + +class FeatureExtractor(nn.Module): + def __init__(self): + super().__init__() + self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) + self.norm0 = nn.GroupNorm(512, 512) + self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) + self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.gelu(self.norm0(self.conv0(x))) + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = F.gelu(self.conv3(x)) + x = F.gelu(self.conv4(x)) + x = F.gelu(self.conv5(x)) + x = F.gelu(self.conv6(x)) + return x + + +class FeatureProjection(nn.Module): + def __init__(self): + super().__init__() + self.norm = nn.LayerNorm(512) + self.projection = nn.Linear(512, 768) + self.dropout = nn.Dropout(0.1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.norm(x) + x = self.projection(x) + x = self.dropout(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + 768, + 768, + kernel_size=128, + padding=128 // 2, + groups=16, + ) + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x.transpose(1, 2)) + x = F.gelu(x[:, :, :-1]) + return x.transpose(1, 2) + + +class TransformerEncoder(nn.Module): + def __init__( + self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int + ) -> None: + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for _ in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: torch.Tensor, + mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + output_layer: Optional[int] = None, + ) -> torch.Tensor: + output = src + for layer in self.layers[:output_layer]: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask + ) + return output + + +def _compute_mask( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + device: torch.device, + min_masks: int = 0, +) -> torch.Tensor: + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > 
sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = torch.ones( + (batch_size, sequence_length - (mask_length - 1)), device=device + ) + + # get random indices to mask + mask_indices = torch.multinomial(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + mask_indices = ( + mask_indices.unsqueeze(dim=-1) + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + offsets = ( + torch.arange(mask_length, device=device)[None, None, :] + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + mask_idxs = mask_indices + offsets + + # scatter indices to mask + mask = mask.scatter(1, mask_idxs, True) + + return mask diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d42ba3acb822938f246dba27b3de81ec51aa72b0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/utils.py @@ -0,0 +1,61 @@ +import torch + + +class Metric: + def __init__(self): + self.steps = 0 + self.value = 0 + + def update(self, value): + self.steps += 1 + self.value += (value - self.value) / self.steps + return self.value + + def reset(self): + self.steps = 0 + self.value = 0 + + +def save_checkpoint( + checkpoint_dir, + hubert, + optimizer, + scaler, + step, + loss, + best, + logger, +): + state = { + "hubert": hubert.state_dict(), + "optimizer": optimizer.state_dict(), + "scaler": scaler.state_dict(), + "step": step, + "loss": loss, + } + checkpoint_dir.mkdir(exist_ok=True, parents=True) + checkpoint_path = checkpoint_dir / f"model-{step}.pt" + torch.save(state, checkpoint_path) + if best: + best_path = checkpoint_dir / "model-best.pt" + torch.save(state, best_path) + logger.info(f"Saved checkpoint: {checkpoint_path.stem}") + + +def load_checkpoint( + load_path, + hubert, + optimizer, + scaler, + rank, + logger, +): + logger.info(f"Loading checkpoint from {load_path}") + checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"}) + hubert.load_state_dict(checkpoint["hubert"]) + if "scaler" in checkpoint: + scaler.load_state_dict(checkpoint["scaler"]) + if "optimizer" in checkpoint: + optimizer.load_state_dict(checkpoint["optimizer"]) + step, loss = checkpoint.get("step", 0), checkpoint.get("loss", float("inf")) + return step, loss diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5ca9de087f72e343ffb4e5ef00cdbb90765097 --- /dev/null +++ 
b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/train.py @@ -0,0 +1,459 @@ +import argparse +import logging +from pathlib import Path + +import torch +import torch.cuda.amp as amp +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.distributed as dist +from torch.utils.data.distributed import DistributedSampler +import torch.multiprocessing as mp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +from hubert.model import Hubert, URLS +from hubert.dataset import AcousticUnitsDataset +from hubert.utils import Metric, save_checkpoint, load_checkpoint + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +######################################################################################## +# Define hyperparameters for training: +######################################################################################## + +BATCH_SIZE = 32 +LEARNING_RATE = 2e-5 +BETAS = (0.9, 0.98) +EPS = 1e-06 +WEIGHT_DECAY = 1e-2 +MAX_NORM = 10 +STEPS = 25000 +LOG_INTERVAL = 5 +VALIDATION_INTERVAL = 1000 +CHECKPOINT_INTERVAL = 5000 +BACKEND = "nccl" +INIT_METHOD = "tcp://localhost:54321" + + +def train(rank, world_size, args): + dist.init_process_group( + BACKEND, + rank=rank, + world_size=world_size, + init_method=INIT_METHOD, + ) + + #################################################################################### + # Setup logging utilities: + #################################################################################### + + log_dir = args.checkpoint_dir / "logs" + log_dir.mkdir(exist_ok=True, parents=True) + + if rank == 0: + logger.setLevel(logging.INFO) + handler = logging.FileHandler(log_dir / f"{args.checkpoint_dir.stem}.log") + handler.setLevel(logging.INFO) + formatter = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S" + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + else: + logger.setLevel(logging.ERROR) + + writer = SummaryWriter(log_dir) if rank == 0 else None + + #################################################################################### + # Initialize models + #################################################################################### + + hubert = Hubert(mask=args.mask).to(rank) + + if args.warmstart: + checkpoint = torch.hub.load_state_dict_from_url( + URLS["hubert-discrete"], map_location={"cuda:0": f"cuda:{rank}"} + ) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + + # don't use warmstart weights for label embeddings and proj layer + del checkpoint["hubert"]["label_embedding.weight"] + del checkpoint["hubert"]["proj.weight"] + del checkpoint["hubert"]["proj.bias"] + + hubert.load_state_dict(checkpoint["hubert"], strict=False) + + hubert = DDP(hubert, device_ids=[rank]) + + #################################################################################### + # Initialze optimizer and grad scaler + #################################################################################### + + optimizer = optim.AdamW( + hubert.parameters(), + lr=LEARNING_RATE, + betas=BETAS, + eps=EPS, + weight_decay=WEIGHT_DECAY, + ) + scaler = amp.GradScaler() + + #################################################################################### + # Initialize datasets and dataloaders + 
#################################################################################### + + train_dataset = AcousticUnitsDataset( + root=args.dataset_dir, + train=True, + ) + train_sampler = DistributedSampler(train_dataset, drop_last=True) + train_loader = DataLoader( + train_dataset, + collate_fn=train_dataset.collate, + batch_size=BATCH_SIZE, + sampler=train_sampler, + num_workers=8, + pin_memory=True, + shuffle=False, + drop_last=True, + ) + + validation_dataset = AcousticUnitsDataset( + root=args.dataset_dir, + train=False, + ) + validation_loader = DataLoader( + validation_dataset, + batch_size=1, + shuffle=False, + num_workers=8, + pin_memory=True, + ) + + #################################################################################### + # Load checkpoint if args.resume is set + #################################################################################### + + if args.resume is not None: + global_step, best_loss = load_checkpoint( + load_path=args.resume, + hubert=hubert, + optimizer=optimizer, + scaler=scaler, + rank=rank, + logger=logger, + ) + else: + global_step, best_loss = 0, float("inf") + + # =================================================================================# + # Start training loop + # =================================================================================# + + n_epochs = STEPS // len(train_loader) + 1 + start_epoch = global_step // len(train_loader) + 1 + + logger.info("**" * 40) + logger.info(f"PyTorch version: {torch.__version__}") + logger.info(f"CUDA version: {torch.version.cuda}") + logger.info(f"CUDNN version: {torch.backends.cudnn.version()}") + logger.info(f"CUDNN enabled: {torch.backends.cudnn.enabled}") + logger.info(f"CUDNN deterministic: {torch.backends.cudnn.deterministic}") + logger.info(f"CUDNN benchmark: {torch.backends.cudnn.benchmark}") + logger.info(f"# of GPUS: {torch.cuda.device_count()}") + logger.info(f"batch size: {BATCH_SIZE}") + logger.info(f"iterations per epoch: {len(train_loader)}") + logger.info(f"# of epochs: {n_epochs}") + logger.info(f"started at epoch: {start_epoch}") + logger.info("**" * 40 + "\n") + + if args.mask: + average_masked_loss = Metric() + average_unmasked_loss = Metric() + average_masked_accuracy = Metric() + average_unmasked_accuracy = Metric() + + epoch_masked_loss = Metric() + epoch_unmasked_loss = Metric() + epoch_masked_accuracy = Metric() + epoch_unmasked_accuracy = Metric() + else: + average_loss = Metric() + average_accuracy = Metric() + + epoch_loss = Metric() + epoch_accuracy = Metric() + + validation_loss = Metric() + validation_accuracy = Metric() + + for epoch in range(start_epoch, n_epochs + 1): + train_sampler.set_epoch(epoch) + + hubert.train() + if args.mask: + epoch_masked_loss.reset() + epoch_unmasked_loss.reset() + epoch_masked_accuracy.reset() + epoch_unmasked_accuracy.reset() + else: + epoch_loss.reset() + epoch_accuracy.reset() + + for wavs, codes in train_loader: + global_step += 1 + wavs, codes = wavs.to(rank), codes.to(rank) + + ############################################################################ + # Compute training loss + ############################################################################ + + optimizer.zero_grad() + + with amp.autocast(): + logits, mask = hubert(wavs) + length = min( + mask.size(-1) if args.mask else float("inf"), codes.size(-1) + ) + logits = logits[:, :length, :] + codes = codes[:, :length] + if args.mask: + mask = mask[:, :length] + + if args.mask: + masked_loss = F.cross_entropy(logits[mask], codes[mask]) + unmasked_loss = 
F.cross_entropy(logits[~mask], codes[~mask]) + loss = args.alpha * masked_loss + (1 - args.alpha) * unmasked_loss + else: + loss = F.cross_entropy(logits.transpose(1, 2), codes) + + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + + nn.utils.clip_grad_norm_(hubert.parameters(), MAX_NORM) + + scaler.step(optimizer) + scaler.update() + + if args.mask: + masked_accuracy = logits[mask].argmax(dim=-1) == codes[mask] + masked_accuracy = torch.mean(masked_accuracy.float()) + + unmasked_accuracy = logits[~mask].argmax(dim=-1) == codes[~mask] + unmasked_accuracy = torch.mean(unmasked_accuracy.float()) + else: + accuracy = logits.argmax(dim=-1) == codes + accuracy = torch.mean(accuracy.float()) + + ############################################################################ + # Update and log training metrics + ############################################################################ + + if args.mask: + average_masked_loss.update(masked_loss.item()) + average_unmasked_loss.update(unmasked_loss.item()) + average_masked_accuracy.update(masked_accuracy.item()) + average_unmasked_accuracy.update(unmasked_accuracy.item()) + + epoch_masked_loss.update(masked_loss.item()) + epoch_unmasked_loss.update(unmasked_loss.item()) + epoch_masked_accuracy.update(masked_accuracy.item()) + epoch_unmasked_accuracy.update(unmasked_accuracy.item()) + else: + average_loss.update(loss.item()) + average_accuracy.update(accuracy.item()) + + epoch_loss.update(loss.item()) + epoch_accuracy.update(accuracy.item()) + + if rank == 0 and global_step % LOG_INTERVAL == 0: + if args.mask: + writer.add_scalar( + "train/masked_loss", + average_masked_loss.value, + global_step, + ) + writer.add_scalar( + "train/unmasked_loss", + average_unmasked_loss.value, + global_step, + ) + writer.add_scalar( + "train/masked_accuracy", + average_masked_accuracy.value * 100, + global_step, + ) + writer.add_scalar( + "train/unmasked_accuracy", + average_unmasked_accuracy.value * 100, + global_step, + ) + average_masked_loss.reset() + average_unmasked_loss.reset() + average_masked_accuracy.reset() + average_unmasked_accuracy.reset() + else: + writer.add_scalar( + "train/loss", + average_loss.value, + global_step, + ) + writer.add_scalar( + "train/accuracy", + average_accuracy.value, + global_step, + ) + average_loss.reset() + average_accuracy.reset() + + # --------------------------------------------------------------------------# + # Start validation loop + # --------------------------------------------------------------------------# + + if global_step % VALIDATION_INTERVAL == 0: + hubert.eval() + validation_loss.reset() + validation_accuracy.reset() + for wavs, codes in validation_loader: + wavs, codes = wavs.to(rank), codes.to(rank) + + with torch.no_grad(): + logits, _ = hubert(wavs) + logits = logits.transpose(1, 2) + + loss = F.cross_entropy(logits, codes) + + accuracy = logits.argmax(dim=1) == codes + accuracy = torch.mean(accuracy.float()) + + #################################################################### + # Update validation metrics + #################################################################### + + validation_loss.update(loss.item()) + validation_accuracy.update(accuracy.item()) + + hubert.train() + + ############################################################################ + # Log validation metrics + ############################################################################ + + if rank == 0: + writer.add_scalar( + "validation/unit_loss", + validation_loss.value, + global_step, + ) + writer.add_scalar( + 
"validation/unit_accuracy", + validation_accuracy.value * 100, + global_step, + ) + logger.info( + f"valid -- epoch: {epoch}, loss: {validation_loss.value:.4f}, accuracy: {validation_accuracy.value * 100:.2f}" + ) + + ############################################################################ + # Save model checkpoint + ############################################################################ + + new_best = best_loss > validation_loss.value + if new_best or global_step % CHECKPOINT_INTERVAL == 0: + if new_best: + logger.info("-------- new best model found!") + best_loss = validation_loss.value + + if rank == 0: + save_checkpoint( + checkpoint_dir=args.checkpoint_dir, + hubert=hubert, + optimizer=optimizer, + scaler=scaler, + step=global_step, + loss=validation_loss.value, + best=new_best, + logger=logger, + ) + + # -----------------------------------------------------------------------------# + # End validation loop + # -----------------------------------------------------------------------------# + + #################################################################################### + # Log training metrics + #################################################################################### + + logger.info( + f""" + train -- epoch: {epoch}, masked loss: {epoch_masked_loss.value:.4f}, unmasked loss: {epoch_unmasked_loss.value:.4f}, + masked accuracy: {epoch_masked_accuracy.value * 100:.2f}, umasked accuracy: {epoch_unmasked_accuracy.value * 100:.2f} + """ + ) + + # ==================================================================================# + # End training loop + # ==================================================================================# + + dist.destroy_process_group() + + +def train_hubert(args): + world_size = torch.cuda.device_count() + mp.spawn( + train, + args=(world_size, args), + nprocs=world_size, + join=True, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Train HuBERT soft content encoder.") + parser.add_argument( + "dataset_dir", + metavar="dataset-dir", + help="path to the data directory.", + type=Path, + ) + parser.add_argument( + "checkpoint_dir", + metavar="checkpoint-dir", + help="path to the checkpoint directory.", + type=Path, + ) + parser.add_argument( + "--resume", + help="path to the checkpoint to resume from.", + type=Path, + ) + parser.add_argument( + "--warmstart", + help="whether to initialize from the fairseq HuBERT checkpoint.", + action="store_true", + ) + parser.add_argument( + "--mask", + help="whether to use input masking.", + action="store_true", + ) + parser.add_argument( + "--alpha", + help="weight for the masked loss.", + default=1, + type=float, + ) + args = parser.parse_args() + + world_size = torch.cuda.device_count() + mp.spawn( + train, + args=(world_size, args), + nprocs=world_size, + join=True, + ) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a385090553f7106d30d530ea319f82c66a788ffd --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert_model.py @@ -0,0 +1,24 @@ +import torch, torchaudio +from .hubert.hubert import HubertSoft +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present +import librosa + + +def get_soft_model(model_path): + hubert = HubertSoft() + # Load checkpoint (either hubert_soft or hubert_discrete) + # hubert = torch.hub.load("bshall/hubert:main", 
"hubert_soft", trust_repo=True) + checkpoint = torch.load(model_path) + consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.") + hubert.load_state_dict(checkpoint["hubert"]) + hubert.eval() + return hubert + + +@torch.no_grad() +def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'): + wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1) + # print(wav_16k_tensor.shape) + units = hmodel.units(wav_16k_tensor) + # print(units.shape) + return units.cpu() \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fea82f9f64f7ae37aee38d799f703f11812ff2 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class DiffVC(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.speaker_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['speaker_dim'], self.config['cls_embedding']['feature_dim']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['feature_dim'], self.config['cls_embedding']['feature_dim'])) + self.uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['speaker_dim']) / + self.config['cls_embedding']['speaker_dim'] ** 0.5) + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, speaker, pitch, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + uncond = repeat(self.uncond, "c-> b c", b=B).to(target.dtype) + batch_mask = rand_bool(shape=(B, 1), proba=speaker_cfg, device=target.device) + speaker = torch.where(batch_mask, uncond, speaker) + + if 
self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + speaker = self.speaker_embedding(speaker) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, class_labels=speaker)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_base_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + p = torch.rand(2, 256, 1).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + spk = torch.rand(2, 256).to(device) + + output = model(x, t, y, spk, pitch=p, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py new file mode 100644 index 0000000000000000000000000000000000000000..774d3481fd23105e6f161e2b64ed2a757acba9c2 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/model_cross.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class DiffVC_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + self.content_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['content_dim'], self.config['cls_embedding']['content_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['content_hidden'], self.config['cls_embedding']['content_hidden'])) + + if self.config['cls_embedding']['use_pitch']: + self.pitch_control = True + self.pitch_embedding = nn.Sequential( + nn.Linear(self.config['cls_embedding']['pitch_dim'], self.config['cls_embedding']['pitch_hidden']), + nn.SiLU(), + nn.Linear(self.config['cls_embedding']['pitch_hidden'], + self.config['cls_embedding']['pitch_hidden'])) + + self.pitch_uncond = nn.Parameter(torch.randn(self.config['cls_embedding']['pitch_hidden']) / + 
self.config['cls_embedding']['pitch_hidden'] ** 0.5) + else: + print('no pitch module') + self.pitch_control = False + + def forward(self, target, t, content, prompt, prompt_mask=None, pitch=None, + train_cfg=False, speaker_cfg=0.0, pitch_cfg=0.0): + B, C, M, L = target.shape + content = self.content_embedding(content) + content = repeat(content, "b t c-> b c m t", m=M) + target = target.to(content.dtype) + x = torch.cat([target, content], dim=1) + + if self.pitch_control: + if pitch is not None: + pitch = self.pitch_embedding(pitch) + else: + pitch = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + + if train_cfg: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=speaker_cfg, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + if self.pitch_control: + batch_mask = rand_bool(shape=(B, 1, 1), proba=pitch_cfg, device=target.device) + pitch_uncond = repeat(self.pitch_uncond, "c-> b t c", b=B, t=L).to(target.dtype) + pitch = torch.where(batch_mask, pitch_uncond, pitch) + + prompt = self.context_embedding(prompt) + + if self.pitch_control: + pitch = repeat(pitch, "b t c-> b c m t", m=M) + x = torch.cat([x, pitch], dim=1) + + output = self.unet(sample=x, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output + + +if __name__ == "__main__": + with open('diffvc_cross_pitch.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = DiffVC_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 1, 100, 256)).to(device) + y = torch.rand((2, 256, 768)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + p = torch.rand(2, 256, 1).to(device) + + output = model(x, t, y, prompt, prompt_mask, p, train_cfg=True, speaker_cfg=0.25, pitch_cfg=0.5) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py new file mode 100644 index 0000000000000000000000000000000000000000..23e878e4daa06309e7ca9b6d970f333bcf9d4524 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/model/p2e_cross.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn +from diffusers import UNet2DModel, UNet2DConditionModel +import yaml +from einops import repeat, rearrange + +from typing import Any +from torch import Tensor + + +def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor: + if proba == 1: + return torch.ones(shape, device=device, dtype=torch.bool) + elif proba == 0: + return torch.zeros(shape, device=device, dtype=torch.bool) + else: + return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool) + + +class FixedEmbedding(nn.Module): + def __init__(self, features=128): + super().__init__() + self.embedding = nn.Embedding(1, features) + + def forward(self, y): + B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device + embed = self.embedding(torch.zeros(B, device=device).long()) + fixed_embedding = repeat(embed, "b c -> b l c", l=L) + return fixed_embedding + + +class P2E_Cross(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.unet = UNet2DConditionModel(**self.config['unet']) + # self.unet.set_use_memory_efficient_attention_xformers(True) + self.cfg_embedding = 
FixedEmbedding(self.config['unet']['cross_attention_dim']) + + self.context_embedding = nn.Sequential( + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']), + nn.SiLU(), + nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim'])) + + def forward(self, target, t, prompt, prompt_mask=None, + train_cfg=False, cfg_prob=0.0): + B, C = target.shape + target = target.unsqueeze(-1).unsqueeze(-1) + + if train_cfg: + if cfg_prob > 0.0: + # Randomly mask embedding + batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device) + fixed_embedding = self.cfg_embedding(prompt).to(target.dtype) + prompt = torch.where(batch_mask, fixed_embedding, prompt) + + prompt = self.context_embedding(prompt) + # fix the bug that prompt will copy dtype from target in diffusers + target = target.to(prompt.dtype) + + output = self.unet(sample=target, timestep=t, + encoder_hidden_states=prompt, + encoder_attention_mask=prompt_mask)['sample'] + + return output.squeeze(-1).squeeze(-1) + + +if __name__ == "__main__": + with open('p2e_cross.yaml', 'r') as fp: + config = yaml.safe_load(fp) + device = 'cuda' + + model = P2E_Cross(config['diffwrap']).to(device) + + x = torch.rand((2, 256)).to(device) + t = torch.randint(0, 1000, (2,)).long().to(device) + prompt = torch.rand(2, 64, 768).to(device) + prompt_mask = torch.ones(2, 64).to(device) + + output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e9663595cc28938f88d6299acd3ba791542e4c0c --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 NVIDIA CORPORATION. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6cff37786a486deb55bc070254027aa492c2e92 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/README.md @@ -0,0 +1,95 @@ +## BigVGAN: A Universal Neural Vocoder with Large-Scale Training +#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon + +
+ + +### [Paper](https://arxiv.org/abs/2206.04658) +### [Audio demo](https://bigvgan-demo.github.io/) + +## Installation +Clone the repository and install dependencies. +```shell +# the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries +git clone https://github.com/NVIDIA/BigVGAN +pip install -r requirements.txt +``` + +Create symbolic link to the root of the dataset. The codebase uses filelist with the relative path from the dataset. Below are the example commands for LibriTTS dataset. +``` shell +cd LibriTTS && \ +ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \ +ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \ +ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \ +ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \ +ln -s /path/to/your/LibriTTS/dev-other dev-other && \ +ln -s /path/to/your/LibriTTS/test-clean test-clean && \ +ln -s /path/to/your/LibriTTS/test-other test-other && \ +cd .. +``` + +## Training +Train BigVGAN model. Below is an example command for training BigVGAN using LibriTTS dataset at 24kHz with a full 100-band mel spectrogram as input. +```shell +python train.py \ +--config configs/bigvgan_24khz_100band.json \ +--input_wavs_dir LibriTTS \ +--input_training_file LibriTTS/train-full.txt \ +--input_validation_file LibriTTS/val-full.txt \ +--list_input_unseen_wavs_dir LibriTTS LibriTTS \ +--list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \ +--checkpoint_path exp/bigvgan +``` + +## Synthesis +Synthesize from BigVGAN model. Below is an example command for generating audio from the model. +It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`. +```shell +python inference.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_wavs_dir /path/to/your/input_wav \ +--output_dir /path/to/your/output_wav +``` + +`inference_e2e.py` supports synthesis directly from the mel spectrogram saved in `.npy` format, with shapes `[1, channel, frame]` or `[channel, frame]`. +It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`. + +Make sure that the STFT hyperparameters for mel spectrogram are the same as the model, which are defined in `config.json` of the corresponding model. +```shell +python inference_e2e.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_mels_dir /path/to/your/input_mel \ +--output_dir /path/to/your/output_wav +``` + +## Pretrained Models +We provide the [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq). +One can download the checkpoints of generator (e.g., g_05000000) and discriminator (e.g., do_05000000) within the listed folders. + +|Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned| +|------|---|---|---|---|------|---| +|bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No| +|bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No| +|bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No| +|bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No| + +The paper results are based on 24kHz BigVGAN models trained on LibriTTS dataset. +We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications. +Note that, the latest checkpoints use ``snakebeta`` activation with log scale parameterization, which have the best overall quality. 
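+
+The ``snakebeta`` activation referred to above is implemented in `activations/activations.py`, included later in this diff: each channel gets a trainable `alpha` (frequency) and `beta` (magnitude) parameter, stored in log scale when `alpha_logscale=True`. A minimal sketch of how it acts on a `(B, C, T)` feature map; the import path and tensor sizes are assumptions for illustration:
+
+```python
+import torch
+
+# SnakeBeta from activations/activations.py in this module; import path is an assumption.
+from activations.activations import SnakeBeta
+
+act = SnakeBeta(in_features=512, alpha_logscale=True)  # one alpha/beta pair per channel
+x = torch.randn(4, 512, 256)                           # (B, C, T)
+y = act(x)                                             # x + (1/beta) * sin^2(alpha * x), elementwise
+print(y.shape)                                         # torch.Size([4, 512, 256]), shape is preserved
+```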
+ + +## TODO + +Current codebase only provides a plain PyTorch implementation for the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future. + + +## References +* [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator) + +* [Snake](https://github.com/EdwardDixon/snake) (for periodic activation) + +* [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing) + +* [Julius](https://github.com/adefossez/julius) (for low-pass filter) + +* [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..61f2808a5466b3cf4d041059700993af5527dd29 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/activations/activations.py @@ -0,0 +1,120 @@ +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. + +import torch +from torch import nn, sin, pow +from torch.nn import Parameter + + +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. 
+ Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x + + +class SnakeBeta(nn.Module): + ''' + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + ''' + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + self.beta = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + self.beta = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + SnakeBeta ∶= x + 1/b * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2318b63198250856809c0cb46210a4147b829bc --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/__init__.py @@ -0,0 +1,6 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
+ +from .filter import * +from .resample import * +from .act import * \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py new file mode 100644 index 0000000000000000000000000000000000000000..028debd697dd60458aae75010057df038bd3518a --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/act.py @@ -0,0 +1,28 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch.nn as nn +from .resample import UpSample1d, DownSample1d + + +class Activation1d(nn.Module): + def __init__(self, + activation, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = activation + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad6ea87c1f10ddd94c544037791d7a4634d5ae1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/filter.py @@ -0,0 +1,95 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] + even = (kernel_size % 2 == 0) + half_size = kernel_size // 2 + + #For kaiser window + delta_f = 4 * half_width + A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 + if A > 50.: + beta = 0.1102 * (A - 8.7) + elif A >= 21.: + beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) + else: + beta = 0. + window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) + + # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio + if even: + time = (torch.arange(-half_size, half_size) + 0.5) + else: + time = torch.arange(kernel_size) - half_size + if cutoff == 0: + filter_ = torch.zeros_like(time) + else: + filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) + # Normalize filter to have sum = 1, otherwise we will have a small leakage + # of the constant component in the input signal. 
+ filter_ /= filter_.sum() + filter = filter_.view(1, 1, kernel_size) + + return filter + + +class LowPassFilter1d(nn.Module): + def __init__(self, + cutoff=0.5, + half_width=0.6, + stride: int = 1, + padding: bool = True, + padding_mode: str = 'replicate', + kernel_size: int = 12): + # kernel_size should be even number for stylegan3 setup, + # in this implementation, odd number is also possible. + super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, self.filter.expand(C, -1, -1), + stride=self.stride, groups=C) + + return out \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..750e6c3402cc5ac939c4b9d075246562e0e1d1a7 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/alias_free_torch/resample.py @@ -0,0 +1,49 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
+ +import torch.nn as nn +from torch.nn import functional as F +from .filter import LowPassFilter1d +from .filter import kaiser_sinc_filter1d + + +class UpSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.stride = ratio + self.pad = self.kernel_size // ratio - 1 + self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + kernel_size=self.kernel_size) + self.register_buffer("filter", filter) + + # x: [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + x = F.pad(x, (self.pad, self.pad), mode='replicate') + x = self.ratio * F.conv_transpose1d( + x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) + x = x[..., self.pad_left:-self.pad_right] + + return x + + +class DownSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + stride=ratio, + kernel_size=self.kernel_size) + + def forward(self, x): + xx = self.lowpass(x) + + return xx \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py new file mode 100644 index 0000000000000000000000000000000000000000..b8be238d4db710c8c9a338d336baea0138f18d1f --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/env.py @@ -0,0 +1,18 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a739344db3ec9ae08560e5477a394cca32d4a6d9 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/inference.py @@ -0,0 +1,36 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
+ +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +from scipy.io.wavfile import write +from .env import AttrDict +from .utils import MAX_WAV_VALUE +from .models import BigVGAN as Generator +import librosa + + +def load_model(model_path, device='cuda'): + config_file = os.path.join(os.path.split(model_path)[0], 'config.json') + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + + h = AttrDict(json_config) + + generator = Generator(h).to(device) + + cp_dict = torch.load(model_path, map_location=device) + generator.load_state_dict(cp_dict['generator']) + generator.eval() + generator.remove_weight_norm() + del cp_dict + return generator, h + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb40e0cff7819dcbe69555520253afd64580720 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/models.py @@ -0,0 +1,381 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + + +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from .activations import activations +from .utils import init_weights, get_padding +from .alias_free_torch import * + +LRELU_SLOPE = 0.1 + + +class AMPBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. 
check the config file and look for 'activation'.") + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class AMPBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + def forward(self, x): + for c, a in zip (self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class BigVGAN(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super(BigVGAN, self).__init__() + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. 
does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(nn.ModuleList([ + weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k - u) // 2)) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + + # weight initialization + for i in range(len(self.ups)): + self.ups[i].apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + for l_i in l: + remove_weight_norm(l_i) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.d_mult = h.discriminator_channel_mult + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, int(32*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(32*self.d_mult), int(128*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(128*self.d_mult), int(512*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(512*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(1024*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(int(1024*self.d_mult), 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in 
self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, h): + super(MultiPeriodDiscriminator, self).__init__() + self.mpd_reshapes = h.mpd_reshapes + print("mpd_reshapes: {}".format(self.mpd_reshapes)) + discriminators = [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] + self.discriminators = nn.ModuleList(discriminators) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorR(nn.Module): + def __init__(self, cfg, resolution): + super().__init__() + + self.resolution = resolution + assert len(self.resolution) == 3, \ + "MRD layer requires list with len=3, got {}".format(self.resolution) + self.lrelu_slope = LRELU_SLOPE + + norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm + if hasattr(cfg, "mrd_use_spectral_norm"): + print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm)) + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm + self.d_mult = cfg.discriminator_channel_mult + if hasattr(cfg, "mrd_channel_mult"): + print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult)) + self.d_mult = cfg.mrd_channel_mult + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, int(32*self.d_mult), (3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 3), padding=(1, 1))), + ]) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) + + def forward(self, x): + fmap = [] + + x = self.spectrogram(x) + x = x.unsqueeze(1) + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.lrelu_slope) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def spectrogram(self, x): + n_fft, hop_length, win_length = self.resolution + x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') + x = x.squeeze(1) + x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True) + x = torch.view_as_real(x) # [B, F, TT, 2] + mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] + + return mag + + +class MultiResolutionDiscriminator(nn.Module): + def __init__(self, cfg, debug=False): + super().__init__() + self.resolutions = cfg.resolutions + assert len(self.resolutions) == 3,\ + "MRD requires list of list with len=3, each element having a list with len=3. 
got {}".\ + format(self.resolutions) + self.discriminators = nn.ModuleList( + [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(x=y) + y_d_g, fmap_g = d(x=y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ed67f356aef6ce3af01b43d97d8aafb31c57b017 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/BigVGAN/utils.py @@ -0,0 +1,81 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm +matplotlib.use("Agg") +import matplotlib.pylab as plt +from scipy.io.wavfile import write + +MAX_WAV_VALUE = 32768.0 + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def plot_spectrogram_clipped(spectrogram, clip_max=2.): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none', vmin=1e-6, vmax=clip_max) + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] + +def save_audio(audio, path, sr): + # wav: torch with 1d shape + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype('int16') + write(path, 
sr, audio) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py new file mode 100644 index 0000000000000000000000000000000000000000..e550b871f5cd9564f4cf043ec4aa649a48b0b41f --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/mel.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +import torchaudio +import torchaudio.transforms as transforms + + +class LogMelSpectrogram(torch.nn.Module): + def __init__(self, sr=24000, frame_length=1920, hop_length=480, n_mel=128, f_min=0, f_max=12000,): + super().__init__() + self.frame_length = frame_length + self.hop_length = hop_length + self.mel = transforms.MelSpectrogram( + sample_rate=sr, + n_fft=frame_length, + win_length=frame_length, + hop_length=hop_length, + center=False, + power=1.0, + norm="slaney", + n_mels=n_mel, + mel_scale="slaney", + f_min=f_min, + f_max=f_max + ) + + @torch.no_grad() + def forward(self, x, target_length=None): + x = F.pad(x, ((self.frame_length - self.hop_length) // 2, + (self.frame_length - self.hop_length) // 2), "reflect") + mel = self.mel(x) + + target_length = mel.shape[-1] if target_length is None else target_length + logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device) + logmel[:, :, :mel.shape[2]] = mel + + logmel = torch.log(torch.clamp(logmel, min=1e-5)) + return logmel \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..5ed721bf8f29f5c8d947c2d333cc371021135fb0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/LICENSE @@ -0,0 +1,24 @@ +MIT License + +Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) +Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) +Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) +Original work Copyright (c) 2015 braindead (https://github.com/braindead) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
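For reference, `src/modules/mel.py` above defines the `LogMelSpectrogram` front end used alongside these modules. A minimal usage sketch follows, assuming `mel.py` is importable from the working directory; the input is placeholder audio.

```python
# Usage sketch for the LogMelSpectrogram module defined in mel.py above.
import torch
from mel import LogMelSpectrogram  # assumes src/modules/ is the working directory

mel_fn = LogMelSpectrogram(sr=24000, frame_length=1920, hop_length=480, n_mel=128)
wav = torch.randn(2, 24000)   # batch of two 1-second waveforms at 24 kHz (placeholder)
logmel = mel_fn(wav)          # -> [2, 128, T] log-compressed mel features
print(logmel.shape)
```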
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..95663cf5b29be905a8422176f661a8f7745b5cb0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/README.md @@ -0,0 +1,64 @@ +# Real-Time Voice Cloning +This repository is an implementation of [Transfer Learning from Speaker Verification to +Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801). + +SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text. + +**Video demonstration** (click the picture): + +[![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) + + + +### Papers implemented +| URL | Designation | Title | Implementation source | +| --- | ----------- | ----- | --------------------- | +|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | +|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | +|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) +|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | + +## News +**10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion. + +**28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below. + +**14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish. + +**13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this: +- **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors. +- **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info. + +**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it. + + +## Setup + +### 1. Install Requirements +1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory. +2. Python 3.7 is recommended. 
Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
+3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
+4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
+5. Install the remaining requirements with `pip install -r requirements.txt`
+
+### 2. (Optional) Download Pretrained Models
+Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
+
+### 3. (Optional) Test Configuration
+Before you download any dataset, you can begin by testing your configuration with:
+
+`python demo_cli.py`
+
+If all tests pass, you're good to go.
+
+### 4. (Optional) Download Datasets
+For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
+
+### 5. Launch the Toolbox
+You can then try the toolbox:
+
+`python demo_toolbox.py -d <datasets_root>`
+or
+`python demo_toolbox.py`
+
+depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).
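Beyond the GUI toolbox, the `encoder` package in this directory can also be used programmatically. A minimal, hedged sketch is shown below; the checkpoint and audio paths are placeholders, the package directory is assumed to be on `PYTHONPATH`, and the usual dependencies (PyTorch, librosa, webrtcvad) must be installed.

```python
# Sketch: embed a single utterance with the encoder package (paths are placeholders).
from pathlib import Path
from encoder import inference as encoder

encoder.load_model(Path("path/to/encoder_checkpoint.pt"), device="cpu")
wav = encoder.preprocess_wav("path/to/some_utterance.wav")  # resample, normalize volume, trim silences
embed = encoder.embed_utterance(wav)                        # float32 speaker embedding, shape (model_embedding_size,)
print(embed.shape)
```

`embed_utterance` splits the audio into overlapping partial utterances and averages their embeddings (see `compute_partial_slices` in `encoder/inference.py`).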
diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/__init__.py @@ -0,0 +1 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..de650b972fc7a4f3f8a698c128ee4642a373a6d6 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/audio.py @@ -0,0 +1,157 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from scipy.ndimage.morphology import binary_dilation +from .params_data import * +from pathlib import Path +from typing import Optional, Union +import numpy as np +import webrtcvad +import librosa +import struct + +import torch +from torchaudio.transforms import Resample +from librosa.filters import mel as librosa_mel_fn + + +int16_max = (2 ** 15) - 1 + + +def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], + source_sr: Optional[int] = None): + """ + Applies the preprocessing operations used in training the Speaker Encoder to a waveform + either on disk or in memory. The waveform will be resampled to match the data hyperparameters. + + :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not + just .wav), either the waveform as a numpy array of floats. + :param source_sr: if passing an audio waveform, the sampling rate of the waveform before + preprocessing. After preprocessing, the waveform's sampling rate will match the data + hyperparameters. If passing a filepath, the sampling rate will be automatically detected and + this argument will be ignored. + """ + # Load the wav from disk if needed + if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + wav, source_sr = librosa.load(fpath_or_wav, sr=None) + else: + wav = fpath_or_wav + + # Resample the wav if needed + if source_sr is not None and source_sr != sampling_rate: + wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences + wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) + wav = trim_long_silences(wav) + + return wav + + +def preprocess_wav_batch(wavs, source_sr=22050): + # This torch version is designed to cope with a batch of same lengths wavs + if sampling_rate != source_sr: + resample = Resample(source_sr, sampling_rate) + wavs = resample(wavs) + wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS, + increase_only=True) + # Trimming silence is not implemented in this version yet! + return wavs_preprocessed + + +def wav_to_mel_spectrogram(wav): + """ + Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. + Note: this not a log-mel spectrogram. 
+ """ + frames = librosa.feature.melspectrogram( + y=wav, + sr=sampling_rate, + n_fft=int(sampling_rate * mel_window_length / 1000), + hop_length=int(sampling_rate * mel_window_step / 1000), + n_mels=mel_n_channels + ) + return frames.astype(np.float32).T + + +def wav_to_mel_spectrogram_batch(wavs): + # This torch version is designed to cope with a batch of same lengths wavs + n_fft = int(sampling_rate * mel_window_length / 1000) + hop_length = int(sampling_rate * mel_window_step / 1000) + win_length = int(sampling_rate * mel_window_length / 1000) + window = torch.hann_window(n_fft).to(wavs) + mel_basis = torch.from_numpy(librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, + n_mels=mel_n_channels)).to(wavs) + s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=window, center=True, return_complex=False) + real_part, imag_part = s.unbind(-1) + stftm = real_part**2 + imag_part**2 + mels = torch.matmul(mel_basis, stftm) + return torch.transpose(mels, 1, 2) + + +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) + if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): + return wav + return wav * (10 ** (dBFS_change / 20)) + + +def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False): + # This torch version is designed to cope with a batch of same lengths wavs + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1)) + scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype) + if increase_only: + mask = (dBFS_change > 0).to(scales) + elif decrease_only: + mask = (dBFS_change < 0).to(scales) + else: + mask = torch.zeros_like(scales) + scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0) + return wavs * scales.unsqueeze(-1) + + +def trim_long_silences(wav): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. 
+ + :param wav: the raw waveform as a numpy array of floats + :return: the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask == True] diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ce1f5aab0d3899c5e5045b40d4cecee1a11d844c --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/config.py @@ -0,0 +1,47 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +librispeech_datasets = { + "train": { + "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], + "other": ["LibriSpeech/train-other-500"] + }, + "test": { + "clean": ["LibriSpeech/test-clean"], + "other": ["LibriSpeech/test-other"] + }, + "dev": { + "clean": ["LibriSpeech/dev-clean"], + "other": ["LibriSpeech/dev-other"] + }, +} +libritts_datasets = { + "train": { + "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], + "other": ["LibriTTS/train-other-500"] + }, + "test": { + "clean": ["LibriTTS/test-clean"], + "other": ["LibriTTS/test-other"] + }, + "dev": { + "clean": ["LibriTTS/dev-clean"], + "other": ["LibriTTS/dev-other"] + }, +} +voxceleb_datasets = { + "voxceleb1" : { + "train": ["VoxCeleb1/wav"], + "test": ["VoxCeleb1/test_wav"] + }, + "voxceleb2" : { + "train": ["VoxCeleb2/dev/aac"], + "test": ["VoxCeleb2/test_wav"] + } +} + +other_datasets = [ + "LJSpeech-1.1", + "VCTK-Corpus/wav48", +] + +anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9af30b406f2a8debe81a8275cb2682cbd896245a --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/__init__.py @@ -0,0 +1,4 @@ +""" from 
https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .speaker_verification_dataset import SpeakerVerificationDataset +from .speaker_verification_dataset import SpeakerVerificationDataLoader diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd5bb005923852327581e2dcaa03fec7dbce5b8 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py @@ -0,0 +1,39 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import random + +class RandomCycler: + """ + Creates an internal copy of a sequence and allows access to its items in a constrained random + order. For a source sequence of n items and one or several consecutive queries of a total + of m items, the following guarantees hold (one implies the other): + - Each item will be returned between m // n and ((m - 1) // n) + 1 times. + - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. + """ + + def __init__(self, source): + if len(source) == 0: + raise Exception("Can't create RandomCycler from an empty collection") + self.all_items = list(source) + self.next_items = [] + + def sample(self, count: int): + shuffle = lambda l: random.sample(l, len(l)) + + out = [] + while count > 0: + if count >= len(self.all_items): + out.extend(shuffle(list(self.all_items))) + count -= len(self.all_items) + continue + n = min(count, len(self.next_items)) + out.extend(self.next_items[:n]) + count -= n + self.next_items = self.next_items[n:] + if len(self.next_items) == 0: + self.next_items = shuffle(list(self.all_items)) + return out + + def __next__(self): + return self.sample(1)[0] + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d189c835859efefa686d49b53f4e79aa444d96 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker.py @@ -0,0 +1,42 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .random_cycler import RandomCycler +from .utterance import Utterance +from pathlib import Path + +# Contains the set of utterances of a single speaker +class Speaker: + def __init__(self, root: Path): + self.root = root + self.name = root.name + self.utterances = None + self.utterance_cycler = None + + def _load_utterances(self): + with self.root.joinpath("_sources.txt").open("r") as sources_file: + sources = [l.split(",") for l in sources_file] + sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} + self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] + self.utterance_cycler = RandomCycler(self.utterances) + + def random_partial(self, count, n_frames): + """ + Samples a batch of unique partial utterances from the disk in a way that all + utterances come up at least once every two cycles and in a random order every time. + + :param count: The number of partial utterances to sample from the set of utterances from + that speaker. 
Utterances are guaranteed not to be repeated if is not larger than + the number of utterances available. + :param n_frames: The number of frames in the partial utterance. + :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, + frames are the frames of the partial utterances and range is the range of the partial + utterance with regard to the complete utterance. + """ + if self.utterances is None: + self._load_utterances() + + utterances = self.utterance_cycler.sample(count) + + a = [(u,) + u.random_partial(n_frames) for u in utterances] + + return a diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..4080d636338bedcb8d1b8fc77945057027fd0ac1 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py @@ -0,0 +1,14 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np +from typing import List +from .speaker import Speaker + +class SpeakerBatch: + def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): + self.speakers = speakers + self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} + + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with + # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) + self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc31fee9e0d62545caa2599aebc22decfb50aa0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py @@ -0,0 +1,58 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .random_cycler import RandomCycler +from .speaker_batch import SpeakerBatch +from .speaker import Speaker +from ..params_data import partials_n_frames +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +# TODO: improve with a pool of speakers for data efficiency + +class SpeakerVerificationDataset(Dataset): + def __init__(self, datasets_root: Path): + self.root = datasets_root + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + if len(speaker_dirs) == 0: + raise Exception("No speakers found. 
Make sure you are pointing to the directory " + "containing all preprocessed speaker directories.") + self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] + self.speaker_cycler = RandomCycler(self.speakers) + + def __len__(self): + return int(1e10) + + def __getitem__(self, index): + return next(self.speaker_cycler) + + def get_logs(self): + log_string = "" + for log_fpath in self.root.glob("*.txt"): + with log_fpath.open("r") as log_file: + log_string += "".join(log_file.readlines()) + return log_string + + +class SpeakerVerificationDataLoader(DataLoader): + def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, + batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, + worker_init_fn=None): + self.utterances_per_speaker = utterances_per_speaker + + super().__init__( + dataset=dataset, + batch_size=speakers_per_batch, + shuffle=False, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=self.collate, + pin_memory=pin_memory, + drop_last=False, + timeout=timeout, + worker_init_fn=worker_init_fn + ) + + def collate(self, speakers): + return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py new file mode 100644 index 0000000000000000000000000000000000000000..2b878c58fd7d70d3ba0b33def66912adc1c1a45d --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/data_objects/utterance.py @@ -0,0 +1,28 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np + + +class Utterance: + def __init__(self, frames_fpath, wave_fpath): + self.frames_fpath = frames_fpath + self.wave_fpath = wave_fpath + + def get_frames(self): + return np.load(self.frames_fpath) + + def random_partial(self, n_frames): + """ + Crops the frames into a partial utterance of n_frames + + :param n_frames: The number of frames of the partial utterance + :return: the partial utterance frames and a tuple indicating the start and end of the + partial utterance in the complete utterance. + """ + frames = self.get_frames() + if frames.shape[0] == n_frames: + start = 0 + else: + start = np.random.randint(0, frames.shape[0] - n_frames) + end = start + n_frames + return frames[start:end], (start, end) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..37f1dc4fb86bbab07892e5e94464cc3e377f9b64 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/inference.py @@ -0,0 +1,211 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_data import * +from .model import SpeakerEncoder +from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram +from matplotlib import cm +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device="cpu"): + """ + Loads the model in memory. 
If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath, map_location="cpu") + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + _model = _model.to(device) + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +@torch.no_grad() +def embed_frames_batch(frames, use_torch=False): + if _model is None: + raise Exception("Model was not loaded. Call load_model() before inference.") + + if not use_torch: + frames = torch.from_numpy(frames) + frames = frames.to(_device) + + embeds = _model.forward(frames) + if not use_torch: + embeds = embeds.detach().cpu().numpy() + return embeds + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. 
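+    Worked example (illustrative only, assuming the defaults from params_data.py, i.e. a
+    16 kHz sampling rate, a 10 ms mel hop and partial_utterance_n_frames = 160): a 2 second
+    utterance gives n_samples = 32000, so samples_per_frame = 160, n_frames = 201 and
+    frame_step = 80. Two partials are produced, with mel slices [0:160] and [80:240] and
+    wav slices [0:25600] and [12800:38400]. The last partial covers
+    (32000 - 12800) / 25600 = 0.75 of real samples, which is not below the default
+    min_pad_coverage, so it is kept and the caller is expected to zero-pad the waveform up
+    to sample 38400 (as embed_utterance does below).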
+ """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +@torch.no_grad() +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. + :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. 
+ """ + # Process the entire utterance if not using partials + if not using_partials: + frames = wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +@torch.no_grad() +def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): + # This torch version is designed to cope with a batch of same lengths wavs + if not using_partials: + frames = wav_to_mel_spectrogram_batch(wavs) + embeds = embed_frames_batch(frames) + if return_partials: + return embeds, None, None + return embeds + + wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= wavs.shape[-1]: + wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), + dtype=wavs.dtype, device=wavs.device)], 1) + + frames = wav_to_mel_spectrogram_batch(wavs) + frames_batch = [] + for i in range(len(frames)): + frames_batch += [frames[i][s] for s in mel_slices] + frames_batch = torch.stack(frames_batch, 0) + partial_embeds = embed_frames_batch(frames_batch, use_torch=True) + partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) + + raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) + embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) + + if return_partials: + return embeds, partial_embeds, wave_slices + return embeds + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8d246bc359ce1ffc6229ba8a4ced24d07b77e703 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/model.py @@ -0,0 +1,137 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .params_model import * +from .params_data import * +from scipy.interpolate import interp1d +from sklearn.metrics import roc_curve +from torch.nn.utils import clip_grad_norm_ +from scipy.optimize import brentq +from torch import nn +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def 
__init__(self, device, loss_device): + super().__init__() + self.loss_device = loss_device + + # Network defition + self.lstm = nn.LSTM(input_size=mel_n_channels, + hidden_size=model_hidden_size, + num_layers=model_num_layers, + batch_first=True).to(device) + self.linear = nn.Linear(in_features=model_hidden_size, + out_features=model_embedding_size).to(device) + self.relu = torch.nn.ReLU().to(device) + + # Cosine similarity scaling (with fixed initial parameter values) + self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) + self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) + + # Loss + self.loss_fn = nn.CrossEntropyLoss().to(loss_device) + + def do_gradient_ops(self): + # Gradient scale + self.similarity_weight.grad *= 0.01 + self.similarity_bias.grad *= 0.01 + + # Gradient clipping + clip_grad_norm_(self.parameters(), 3, norm_type=2) + + def forward(self, utterances, hidden_init=None): + """ + Computes the embeddings of a batch of utterance spectrograms. + + :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape + (batch_size, n_frames, n_channels) + :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, + batch_size, hidden_size). Will default to a tensor of zeros if None. + :return: the embeddings as a tensor of shape (batch_size, embedding_size) + """ + # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state + # and the final cell state. + out, (hidden, cell) = self.lstm(utterances, hidden_init) + + # We take only the hidden state of the last layer + embeds_raw = self.relu(self.linear(hidden[-1])) + + # L2-normalize it + embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + return embeds + + def similarity_matrix(self, embeds): + """ + Computes the similarity matrix according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the similarity matrix as a tensor of shape (speakers_per_batch, + utterances_per_speaker, speakers_per_batch) + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation + centroids_incl = torch.mean(embeds, dim=1, keepdim=True) + centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) + + # Exclusive centroids (1 per utterance) + centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) + + # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot + # product of these vectors (which is just an element-wise multiplication reduced by a sum). + # We vectorize the computation for efficiency. 
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, + speakers_per_batch).to(self.loss_device) + mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) + for j in range(speakers_per_batch): + mask = np.where(mask_matrix[j])[0] + sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) + sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) + + ## Even more vectorized version (slower maybe because of transpose) + # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker + # ).to(self.loss_device) + # eye = np.eye(speakers_per_batch, dtype=np.int) + # mask = np.where(1 - eye) + # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) + # mask = np.where(eye) + # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) + # sim_matrix2 = sim_matrix2.transpose(1, 2) + + sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias + return sim_matrix + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, + speakers_per_batch)) + ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) + target = torch.from_numpy(ground_truth).long().to(self.loss_device) + loss = self.loss_fn(sim_matrix, target) + + # EER (not backpropagated) + with torch.no_grad(): + inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] + labels = np.array([inv_argmax(i) for i in ground_truth]) + preds = sim_matrix.detach().cpu().numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py new file mode 100644 index 0000000000000000000000000000000000000000..62d04121aed3d7862889ad6c771055db9b74ab6e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_data.py @@ -0,0 +1,30 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms +# Number of spectrogram frames at inference +inference_n_frames = 80 # 800 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. 
+vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9c535205028bfec75ba7c58ea7e750ba3fff1633 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/params_model.py @@ -0,0 +1,12 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 + + +## Training parameters +learning_rate_init = 1e-4 +speakers_per_batch = 64 +utterances_per_speaker = 10 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..c59165a54e509fa63793fb1503bc6d6e346c741e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/preprocess.py @@ -0,0 +1,177 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from multiprocess.pool import ThreadPool +from .params_data import * +from .config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. + """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." 
% dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. + if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + nationality.lower() in anglophone_nationalites] + print("VoxCeleb1: using samples from %d (presumed 
anglophone) speakers out of %d." % + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." % + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..250d038a33b72d09dfe67811c917708aa0ea6714 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/train.py @@ -0,0 +1,127 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .visualizations import Visualizations +from .data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset +from .params_model import * +from .model import SpeakerEncoder +from .utils.profiler import Profiler +from pathlib import Path +import torch + +def sync(device: torch.device): + # FIXME + return + # For correct profiling (cuda operations are async) + if device.type == "cuda": + torch.cuda.synchronize(device) + +def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, + backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, + no_visdom: bool): + # Create a dataset and a dataloader + dataset = SpeakerVerificationDataset(clean_data_root) + loader = SpeakerVerificationDataLoader( + dataset, + speakers_per_batch, + utterances_per_speaker, + num_workers=8, + ) + + # Setup the device on which to run the forward pass and the loss. These can be different, + # because the forward pass is faster on the GPU whereas the loss is often (depending on your + # hyperparameters) faster on the CPU. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # FIXME: currently, the gradient is None if loss_device is cuda + loss_device = torch.device("cpu") + + # Create the model and the optimizer + model = SpeakerEncoder(device, loss_device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) + init_step = 1 + + # Configure file path for the model + state_fpath = models_dir.joinpath(run_id + ".pt") + backup_dir = models_dir.joinpath(run_id + "_backups") + + # Load any existing model + if not force_restart: + if state_fpath.exists(): + print("Found existing model \"%s\", loading it and resuming training." 
% run_id) + checkpoint = torch.load(state_fpath) + init_step = checkpoint["step"] + model.load_state_dict(checkpoint["model_state"]) + optimizer.load_state_dict(checkpoint["optimizer_state"]) + optimizer.param_groups[0]["lr"] = learning_rate_init + else: + print("No model \"%s\" found, starting training from scratch." % run_id) + else: + print("Starting the training from scratch.") + model.train() + + # Initialize the visualization environment + vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) + vis.log_dataset(dataset) + vis.log_params() + device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") + vis.log_implementation({"Device": device_name}) + + # Training loop + profiler = Profiler(summarize_every=10, disabled=False) + for step, speaker_batch in enumerate(loader, init_step): + profiler.tick("Blocking, waiting for batch (threaded)") + + # Forward pass + inputs = torch.from_numpy(speaker_batch.data).to(device) + sync(device) + profiler.tick("Data to %s" % device) + embeds = model(inputs) + sync(device) + profiler.tick("Forward pass") + embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) + loss, eer = model.loss(embeds_loss) + sync(loss_device) + profiler.tick("Loss") + + # Backward pass + model.zero_grad() + loss.backward() + profiler.tick("Backward pass") + model.do_gradient_ops() + optimizer.step() + profiler.tick("Parameter update") + + # Update visualizations + # learning_rate = optimizer.param_groups[0]["lr"] + vis.update(loss.item(), eer, step) + + # Draw projections and save them to the backup folder + if umap_every != 0 and step % umap_every == 0: + print("Drawing and saving projections (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + embeds = embeds.detach().cpu().numpy() + vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) + vis.save() + + # Overwrite the latest version of the model + if save_every != 0 and step % save_every == 0: + print("Saving the model (step %d)" % step) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, state_fpath) + + # Make a backup + if backup_every != 0 and step % backup_every == 0: + print("Making a backup (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, backup_fpath) + + profiler.tick("Extras (visualizations, saving)") + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..447ea1d797a6737a516e5f881cd1fb8e2841ad8e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/__init__.py @@ -0,0 +1 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py new file mode 100644 index 
0000000000000000000000000000000000000000..6de50f3ec61f6b61798299726b13a1caa1638abb --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/argutils.py @@ -0,0 +1,42 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from pathlib import Path +import numpy as np +import argparse + +_type_priorities = [ # In decreasing order + Path, + str, + int, + float, + bool, +] + +def _priority(o): + p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) + if p is not None: + return p + p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None) + if p is not None: + return p + return len(_type_priorities) + +def print_args(args: argparse.Namespace, parser=None): + args = vars(args) + if parser is None: + priorities = list(map(_priority, args.values())) + else: + all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] + priority = lambda p: all_params.index(p) if p in all_params else len(all_params) + priorities = list(map(priority, args.keys())) + + pad = max(map(len, args.keys())) + 3 + indices = np.lexsort((list(args.keys()), priorities)) + items = list(args.items()) + + print("Arguments:") + for i in indices: + param, value = items[i] + print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) + print("") + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py new file mode 100644 index 0000000000000000000000000000000000000000..43de43e4c29821df5d20d8303ce491101a041a86 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/logmmse.py @@ -0,0 +1,222 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +import numpy as np +import math +from scipy.special import expn +from collections import namedtuple + +NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") + + +def profile_noise(noise, sampling_rate, window_size=0): + """ + Creates a profile of the noise in a given waveform. + + :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. + :param sampling_rate: the sampling rate of the audio + :param window_size: the size of the window the logmmse algorithm operates on. A default value + will be picked if left as 0. + :return: a NoiseProfile object + """ + noise, dtype = to_float(noise) + noise += np.finfo(np.float64).eps + + if window_size == 0: + window_size = int(math.floor(0.02 * sampling_rate)) + + if window_size % 2 == 1: + window_size = window_size + 1 + + perc = 50 + len1 = int(math.floor(window_size * perc / 100)) + len2 = int(window_size - len1) + + win = np.hanning(window_size) + win = win * len2 / np.sum(win) + n_fft = 2 * window_size + + noise_mean = np.zeros(n_fft) + n_frames = len(noise) // window_size + for j in range(0, window_size * n_frames, window_size): + noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) + noise_mu2 = (noise_mean / n_frames) ** 2 + + return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) + + +def denoise(wav, noise_profile: NoiseProfile, eta=0.15): + """ + Cleans the noise from a speech waveform given a noise profile. The waveform must have the + same sampling rate as the one used to create the noise profile. 
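+    Minimal usage sketch (illustrative; it assumes the first half second of the recording
+    contains background noise only):
+
+        profile = profile_noise(wav[:sampling_rate // 2], sampling_rate)
+        clean_wav = denoise(wav, profile)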
+ + :param wav: a speech waveform as a numpy array of floats or ints. + :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of + the same) waveform. + :param eta: voice threshold for noise update. While the voice activation detection value is + below this threshold, the noise profile will be continuously updated throughout the audio. + Set to 0 to disable updating the noise profile. + :return: the clean wav as a numpy array of floats or ints of the same length. + """ + wav, dtype = to_float(wav) + wav += np.finfo(np.float64).eps + p = noise_profile + + nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) + x_final = np.zeros(nframes * p.len2) + + aa = 0.98 + mu = 0.98 + ksi_min = 10 ** (-25 / 10) + + x_old = np.zeros(p.len1) + xk_prev = np.zeros(p.len1) + noise_mu2 = p.noise_mu2 + for k in range(0, nframes * p.len2, p.len2): + insign = p.win * wav[k:k + p.window_size] + + spec = np.fft.fft(insign, p.n_fft, axis=0) + sig = np.absolute(spec) + sig2 = sig ** 2 + + gammak = np.minimum(sig2 / noise_mu2, 40) + + if xk_prev.all() == 0: + ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) + else: + ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) + ksi = np.maximum(ksi_min, ksi) + + log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) + vad_decision = np.sum(log_sigma_k) / p.window_size + if vad_decision < eta: + noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 + + a = ksi / (1 + ksi) + vk = a * gammak + ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) + hw = a * np.exp(ei_vk) + sig = sig * hw + xk_prev = sig ** 2 + xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) + xi_w = np.real(xi_w) + + x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] + x_old = xi_w[p.len1:p.window_size] + + output = from_float(x_final, dtype) + output = np.pad(output, (0, len(wav) - len(output)), mode="constant") + return output + + +## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that +## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of +## webrctvad +# def vad(wav, sampling_rate, eta=0.15, window_size=0): +# """ +# TODO: fix doc +# Creates a profile of the noise in a given waveform. +# +# :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. +# :param sampling_rate: the sampling rate of the audio +# :param window_size: the size of the window the logmmse algorithm operates on. A default value +# will be picked if left as 0. +# :param eta: voice threshold for noise update. While the voice activation detection value is +# below this threshold, the noise profile will be continuously updated throughout the audio. +# Set to 0 to disable updating the noise profile. 
+# """ +# wav, dtype = to_float(wav) +# wav += np.finfo(np.float64).eps +# +# if window_size == 0: +# window_size = int(math.floor(0.02 * sampling_rate)) +# +# if window_size % 2 == 1: +# window_size = window_size + 1 +# +# perc = 50 +# len1 = int(math.floor(window_size * perc / 100)) +# len2 = int(window_size - len1) +# +# win = np.hanning(window_size) +# win = win * len2 / np.sum(win) +# n_fft = 2 * window_size +# +# wav_mean = np.zeros(n_fft) +# n_frames = len(wav) // window_size +# for j in range(0, window_size * n_frames, window_size): +# wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) +# noise_mu2 = (wav_mean / n_frames) ** 2 +# +# wav, dtype = to_float(wav) +# wav += np.finfo(np.float64).eps +# +# nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) +# vad = np.zeros(nframes * len2, dtype=np.bool) +# +# aa = 0.98 +# mu = 0.98 +# ksi_min = 10 ** (-25 / 10) +# +# xk_prev = np.zeros(len1) +# noise_mu2 = noise_mu2 +# for k in range(0, nframes * len2, len2): +# insign = win * wav[k:k + window_size] +# +# spec = np.fft.fft(insign, n_fft, axis=0) +# sig = np.absolute(spec) +# sig2 = sig ** 2 +# +# gammak = np.minimum(sig2 / noise_mu2, 40) +# +# if xk_prev.all() == 0: +# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) +# else: +# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) +# ksi = np.maximum(ksi_min, ksi) +# +# log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) +# vad_decision = np.sum(log_sigma_k) / window_size +# if vad_decision < eta: +# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 +# print(vad_decision) +# +# a = ksi / (1 + ksi) +# vk = a * gammak +# ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) +# hw = a * np.exp(ei_vk) +# sig = sig * hw +# xk_prev = sig ** 2 +# +# vad[k:k + len2] = vad_decision >= eta +# +# vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") +# return vad + + +def to_float(_input): + if _input.dtype == np.float64: + return _input, _input.dtype + elif _input.dtype == np.float32: + return _input.astype(np.float64), _input.dtype + elif _input.dtype == np.uint8: + return (_input - 128) / 128., _input.dtype + elif _input.dtype == np.int16: + return _input / 32768., _input.dtype + elif _input.dtype == np.int32: + return _input / 2147483648., _input.dtype + raise ValueError('Unsupported wave file format') + + +def from_float(_input, dtype): + if dtype == np.float64: + return _input, np.float64 + elif dtype == np.float32: + return _input.astype(np.float32) + elif dtype == np.uint8: + return ((_input * 128) + 128).astype(np.uint8) + elif dtype == np.int16: + return (_input * 32768).astype(np.int16) + elif dtype == np.int32: + print(_input) + return (_input * 2147483648).astype(np.int32) + raise ValueError('Unsupported wave file format') diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..f0176f632b58dfde15e31c04e79543b629bd4499 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/utils/profiler.py @@ -0,0 +1,47 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from time import perf_counter as timer +from collections import OrderedDict +import numpy as np + + +class Profiler: + def __init__(self, summarize_every=5, disabled=False): + self.last_tick = timer() + self.logs = OrderedDict() + 
self.summarize_every = summarize_every + self.disabled = disabled + + def tick(self, name): + if self.disabled: + return + + # Log the time needed to execute that function + if not name in self.logs: + self.logs[name] = [] + if len(self.logs[name]) >= self.summarize_every: + self.summarize() + self.purge_logs() + self.logs[name].append(timer() - self.last_tick) + + self.reset_timer() + + def purge_logs(self): + for name in self.logs: + self.logs[name].clear() + + def reset_timer(self): + self.last_tick = timer() + + def summarize(self): + n = max(map(len, self.logs.values())) + assert n == self.summarize_every + print("\nAverage execution time over %d steps:" % n) + + name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] + pad = max(map(len, name_msgs)) + for name_msg, deltas in zip(name_msgs, self.logs.values()): + print(" %s mean: %4.0fms std: %4.0fms" % + (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) + print("", flush=True) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b0ffc1f3c54d85158521cac6d09f05dd21de6d --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/modules/speaker_encoder/encoder/visualizations.py @@ -0,0 +1,180 @@ +""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ + +from .data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from datetime import datetime +from time import perf_counter as timer +import matplotlib.pyplot as plt +import numpy as np +# import webbrowser +import visdom +import umap + +colormap = np.array([ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], +], dtype=np.float) / 255 + + +class Visualizations: + def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): + # Tracking data + self.last_update_timestamp = timer() + self.update_every = update_every + self.step_times = [] + self.losses = [] + self.eers = [] + print("Updating the visualizations every %d steps." % update_every) + + # If visdom is disabled TODO: use a better paradigm for that + self.disabled = disabled + if self.disabled: + return + + # Set the environment name + now = str(datetime.now().strftime("%d-%m %Hh%M")) + if env_name is None: + self.env_name = now + else: + self.env_name = "%s (%s)" % (env_name, now) + + # Connect to visdom and open the corresponding window in the browser + try: + self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) + except ConnectionError: + raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " + "start it.") + # webbrowser.open("http://localhost:8097/env/" + self.env_name) + + # Create the windows + self.loss_win = None + self.eer_win = None + # self.lr_win = None + self.implementation_win = None + self.projection_win = None + self.implementation_string = "" + + def log_params(self): + if self.disabled: + return + from encoder import params_data + from encoder import params_model + param_string = "Model parameters:
" + for param_name in (p for p in dir(params_model) if not p.startswith("__")): + value = getattr(params_model, param_name) + param_string += "\t%s: %s
" % (param_name, value) + param_string += "Data parameters:
" + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + param_string += "\t%s: %s
" % (param_name, value) + self.vis.text(param_string, opts={"title": "Parameters"}) + + def log_dataset(self, dataset: SpeakerVerificationDataset): + if self.disabled: + return + dataset_string = "" + dataset_string += "Speakers: %s\n" % len(dataset.speakers) + dataset_string += "\n" + dataset.get_logs() + dataset_string = dataset_string.replace("\n", "
") + self.vis.text(dataset_string, opts={"title": "Dataset"}) + + def log_implementation(self, params): + if self.disabled: + return + implementation_string = "" + for param, value in params.items(): + implementation_string += "%s: %s\n" % (param, value) + implementation_string = implementation_string.replace("\n", "
") + self.implementation_string = implementation_string + self.implementation_win = self.vis.text( + implementation_string, + opts={"title": "Training implementation"} + ) + + def update(self, loss, eer, step): + # Update the tracking data + now = timer() + self.step_times.append(1000 * (now - self.last_update_timestamp)) + self.last_update_timestamp = now + self.losses.append(loss) + self.eers.append(eer) + print(".", end="") + + # Update the plots every steps + if step % self.update_every != 0: + return + time_string = "Step time: mean: %5dms std: %5dms" % \ + (int(np.mean(self.step_times)), int(np.std(self.step_times))) + print("\nStep %6d Loss: %.4f EER: %.4f %s" % + (step, np.mean(self.losses), np.mean(self.eers), time_string)) + if not self.disabled: + self.loss_win = self.vis.line( + [np.mean(self.losses)], + [step], + win=self.loss_win, + update="append" if self.loss_win else None, + opts=dict( + legend=["Avg. loss"], + xlabel="Step", + ylabel="Loss", + title="Loss", + ) + ) + self.eer_win = self.vis.line( + [np.mean(self.eers)], + [step], + win=self.eer_win, + update="append" if self.eer_win else None, + opts=dict( + legend=["Avg. EER"], + xlabel="Step", + ylabel="EER", + title="Equal error rate" + ) + ) + if self.implementation_win is not None: + self.vis.text( + self.implementation_string + ("%s" % time_string), + win=self.implementation_win, + opts={"title": "Training implementation"}, + ) + + # Reset the tracking + self.losses.clear() + self.eers.clear() + self.step_times.clear() + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, + max_speakers=10): + max_speakers = min(max_speakers, len(colormap)) + embeds = embeds[:max_speakers * utterances_per_speaker] + + n_speakers = len(embeds) // utterances_per_speaker + ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) + colors = [colormap[i] for i in ground_truth] + + reducer = umap.UMAP() + projected = reducer.fit_transform(embeds) + plt.scatter(projected[:, 0], projected[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection (step %d)" % step) + if not self.disabled: + self.projection_win = self.vis.matplot(plt, win=self.projection_win) + if out_fpath is not None: + plt.savefig(out_fpath) + plt.clf() + + def save(self): + if not self.disabled: + self.vis.save([self.env_name]) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1878ce622f8077b5a50d950e6a25cfad13b84fb5 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/plugin_wrapper.py @@ -0,0 +1,76 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.p2e_cross import P2E_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class DreamVG(object): + def __init__(self, + config_path='configs/plugin_cross.yaml', + ckpt_path='../ckpts/dreamvc_plugin.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = P2E_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + 
timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.spk_shape = config['model']['unet']['in_channels'] + + @torch.no_grad() + def inference(self, text, + guidance_scale=5, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023, + ): + text, text_mask = text + self.model.eval() + + gen_shape = (1, self.spk_shape) + + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, text, text_mask, train_cfg=False) + output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, text, text_mask, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5) + pred = scale_shift_re(latents, 1/self.scale, self.shift) + # pred = torch.clip(pred, min=0.0, max=0.5) + return pred \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/train_plugin.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/train_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/train_vc.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/train_vc.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90f60fdd89ad8575faafe45188bd1d968852fc67 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1c10f81868cda758c332b8abe826634a13610a --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/utils/utils.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import torch + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def scale_shift(x, scale, shift): + return (x+shift) * scale + + +def scale_shift_re(x, scale, shift): + return (x/scale) - shift + + +def align_seq(source, target_length, mapping_method='hard'): + source_len = source.shape[1] + if mapping_method == 'hard': + mapping_idx = np.round(np.arange(target_length) * source_len / target_length) + output = source[:, mapping_idx] + else: + # TBD + raise NotImplementedError + + return output + + +def save_plot(tensor, savepath): + tensor = tensor.squeeze().cpu() + plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.savefig(savepath) + plt.close() + + +def save_audio(file_path, sampling_rate, audio): + audio = np.clip(audio.cpu().squeeze().numpy(), -0.999, 0.999) + wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) + + +def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, vmin, vmax) + tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1 + return tensor + + +def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, -1.0, 1.0) + tensor = (tensor + 1) / 2 + tensor = tensor * (vmax - vmin) + vmin + return tensor + + +if __name__ == "__main__": + + a = torch.rand(2, 10) + target_len = 15 + + b = align_seq(a, target_len) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py b/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3b7f73ffaf1fb97edd55bce29850a2cc21cfd3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/src/vc_wrapper.py @@ -0,0 +1,144 @@ +import yaml +import torch +from diffusers import DDIMScheduler +from .model.model import DiffVC +from .model.model_cross import DiffVC_Cross +from .utils import scale_shift, scale_shift_re, rescale_noise_cfg + + +class ReDiffVC(object): + def __init__(self, + config_path='configs/diffvc_base.yaml', + ckpt_path='../ckpts/dreamvc_base.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + spk_embed, 
content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + + +class DreamVC(object): + def __init__(self, + config_path='configs/diffvc_cross.yaml', + ckpt_path='../ckpts/dreamvc_cross.pt', + device='cpu'): + + with open(config_path, 'r') as fp: + config = yaml.safe_load(fp) + + self.device = device + self.model = DiffVC_Cross(config['model']).to(device) + self.model.load_state_dict(torch.load(ckpt_path)['model']) + self.model.eval() + + noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'], + beta_start=config['scheduler']['beta_start'], + beta_end=config['scheduler']['beta_end'], + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + self.noise_scheduler = noise_scheduler + self.scale = config['scheduler']['scale'] + self.shift = config['scheduler']['shift'] + self.melshape = config['model']['unet']['sample_size'][0] + + @torch.no_grad() + def inference(self, + text, content_clip, f0_clip=None, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023): + + text, text_mask = text + self.model.eval() + if random_seed is not None: + generator = torch.Generator(device=self.device).manual_seed(random_seed) + else: + generator = torch.Generator(device=self.device) + generator.seed() + + self.noise_scheduler.set_timesteps(ddim_steps) + + # init noise + gen_shape = (1, 1, self.melshape, content_clip.shape[-2]) + noise = torch.randn(gen_shape, generator=generator, device=self.device) + latents = noise + + for t in self.noise_scheduler.timesteps: + latents = self.noise_scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False) + output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True, + speaker_cfg=1.0, pitch_cfg=0.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, 
train_cfg=False) + + latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift) + return pred + diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/stream.py b/dreamvoice/train_utils/prepare_freevc/freevc/stream.py new file mode 100644 index 0000000000000000000000000000000000000000..5e83847c3ed3e2db37c1adcef4c635b4ea30ebd0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/stream.py @@ -0,0 +1,158 @@ +import os +import torch +import torch.nn.functional as F +import librosa +import sounddevice as sd +from transformers import WavLMModel +from scipy.io.wavfile import write +from models import SynthesizerTrn +from speaker_encoder.voice_encoder import SpeakerEncoder +import utils +import numpy as np +from transformers import T5Tokenizer, T5EncoderModel +from src.plugin_wrapper import DreamVG + + +# Load configurations and models +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +print("Loading FreeVC...") +hps = utils.get_hparams_from_file("configs/freevc.json") +freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +freevc.eval() +utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) + +print("Loading Speaker Encoder...") +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +lm_path = 'google/flan-t5-base' +tokenizer = T5Tokenizer.from_pretrained(lm_path) +text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval() + +dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml', + ckpt_path='checkpoints/dreamvc_plugin.pt', + device=device) + + +# Constants for overlap-add +CHUNK_SIZE = 47040 +OVERLAP = 960 +BUFFER_SIZE = OVERLAP + CHUNK_SIZE +fade_size = OVERLAP +HANN_WINDOW = np.ones(BUFFER_SIZE) +HANN_WINDOW[:fade_size] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size)) +HANN_WINDOW[-fade_size:] = 0.5 * (1 - np.cos(np.pi * np.arange(fade_size) / fade_size))[::-1] + +# Initialize buffers +input_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32) +output_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32) + + +@torch.no_grad() +def convert_realtime_with_buffers(audio_chunk, tgt_embedding, freevc, cmodel): + """Process audio in chunks with overlap and manage input/output buffers.""" + global input_buffer, output_buffer, HANN_WINDOW, BUFFER_SIZE, CHUNK_SIZE + + # Add incoming audio chunk to input buffer + input_buffer[:OVERLAP] = input_buffer[-OVERLAP:] + input_buffer[OVERLAP:] = audio_chunk + + # Downsample to 16,000 Hz + chunk = input_buffer + chunk = librosa.resample(chunk, orig_sr=48000, target_sr=16000) + + # Convert to tensor and pad + chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device).float() + chunk_tensor = F.pad(chunk_tensor, (40, 40)) + + # Extract content features using WavLM + c = cmodel(chunk_tensor).last_hidden_state.transpose(1, 2).to(device) + + # Generate converted audio using FreeVC + audio = freevc.infer(c, g=tgt_embedding) + audio = audio[0][0].data.cpu().float().numpy() + + # Upsample back to 48,000 Hz + audio = librosa.resample(audio, orig_sr=16000, target_sr=48000) + + # Apply Hann window to the output + windowed_output = audio * HANN_WINDOW + + # Add the new processed audio to the output buffer with overlap + 
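+    # The lines below implement a simple crossfade between consecutive chunks: the tail
+    # of the previous output (OVERLAP = 960 samples, i.e. 20 ms at 48 kHz, already faded
+    # out by HANN_WINDOW) is moved to the head of output_buffer, the rest of the buffer
+    # is cleared, and the freshly windowed chunk (faded in at its head) is summed on top,
+    # so the two chunks overlap-add smoothly before the first CHUNK_SIZE samples are
+    # returned.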
output_buffer[:OVERLAP] = output_buffer[-OVERLAP:] + output_buffer[OVERLAP:] = 0 + output_buffer += windowed_output + + normalization_factors = np.zeros(BUFFER_SIZE) + normalization_factors[:OVERLAP] += HANN_WINDOW[-OVERLAP:] + normalization_factors += HANN_WINDOW + normalization_factors = np.clip(normalization_factors, 1e-6, None) + # output_buffer[:CHUNK_SIZE] = output_buffer[:CHUNK_SIZE] / normalization_factors[:CHUNK_SIZE] + + return output_buffer[:CHUNK_SIZE] + + +def prepare_target_embedding(tgt_audio_path): + """Preprocess target audio and get speaker embedding.""" + wav_tgt, _ = librosa.load(tgt_audio_path, sr=16000) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + return g_tgt + + +# Prepare the target speaker embedding +# target_audio = "p225_001.wav" # Target speaker audio +# target_embedding = prepare_target_embedding(target_audio) +prompt = "A young girl voice, very cute" +prompt_guidance_scale = 3.0 + +text_batch = tokenizer(prompt, max_length=32, + padding='max_length', truncation=True, return_tensors="pt") +text, text_mask = text_batch.input_ids.to(device), \ + text_batch.attention_mask.to(device) +text = text_encoder(input_ids=text, attention_mask=text_mask)[0] +target_embedding = dreamvg.inference([text, text_mask], + guidance_scale=prompt_guidance_scale, + guidance_rescale=0.0, + ddim_steps=100, eta=1, + random_seed=None) + +# Stream settings +SAMPLING_RATE = 48000 +INPUT_DEVICE = 69 +OUTPUT_DEVICE = 58 + + +def audio_callback(indata, outdata, frames, time, status): + """Callback function for real-time audio processing with input and output buffers.""" + global input_buffer, output_buffer + + if status: + print(f"Status: {status}") + # Reshape and process input audio + indata = indata[:, 0] # Mono input + converted_audio = convert_realtime_with_buffers(indata, target_embedding, freevc, cmodel) + # Write the converted audio to the output stream + outdata[:] = converted_audio.reshape(-1, 1) + + +# Start the audio stream with the updated callback +with sd.Stream( + samplerate=SAMPLING_RATE, + blocksize=CHUNK_SIZE, + channels=1, + dtype='float32', + latency='low', + device=(INPUT_DEVICE, OUTPUT_DEVICE), + callback=audio_callback): + try: + sd.sleep(1000000) + except KeyboardInterrupt: + print("Voice conversion stopped.") diff --git a/dreamvoice/train_utils/prepare_freevc/freevc/utils.py b/dreamvoice/train_utils/prepare_freevc/freevc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ff09995743b34dc0c96c81a5fc0ae72c3eda5843 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/freevc/utils.py @@ -0,0 +1,305 @@ +import os +import sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch +from torch.nn import functional as F +from commons import sequence_mask + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def get_cmodel(rank): + checkpoint = torch.load('wavlm/WavLM-Large.pt') + cfg = WavLMConfig(checkpoint['cfg']) + cmodel = WavLM(cfg).cuda(rank) + cmodel.load_state_dict(checkpoint['model']) + cmodel.eval() + return cmodel + + +def get_content(cmodel, y): + with torch.no_grad(): + c = cmodel.extract_features(y.squeeze(1))[0] + c = c.transpose(1, 2) + return c + + +def get_vocoder(rank): + with open("hifigan/config.json", "r") as f: + config = json.load(f) + config = hifigan.AttrDict(config) + 
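+    # NOTE: this module references several names that are never imported at the top of
+    # the file: `hifigan` (here), `glob` (latest_checkpoint_path), `torchvision`
+    # (transform/stretch) and WavLM/WavLMConfig (get_cmodel). To actually run these
+    # helpers you would presumably need something like the upstream FreeVC imports:
+    #   import glob
+    #   import torchvision
+    #   import hifigan
+    #   from wavlm import WavLM, WavLMConfig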
vocoder = hifigan.Generator(config) + ckpt = torch.load("hifigan/generator_v1") + vocoder.load_state_dict(ckpt["generator"]) + vocoder.eval() + vocoder.remove_weight_norm() + vocoder.cuda(rank) + return vocoder + + +def transform(mel, height): # 68-92 + #r = np.random.random() + #rate = r * 0.3 + 0.85 # 0.85-1.15 + #height = int(mel.size(-2) * rate) + tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1))) + if height >= mel.size(-2): + return tgt[:, :mel.size(-2), :] + else: + silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1) + silence += torch.randn_like(silence) / 10 + return torch.cat((tgt, silence), 1) + + +def stretch(mel, width): # 0.5-2 + return torchvision.transforms.functional.resize(mel, (mel.size(-2), width)) + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + if optimizer is not None: + optimizer.load_state_dict(checkpoint_dict['optimizer']) + saved_state_dict = checkpoint_dict['model'] + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict= {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info("Loaded checkpoint '{}' (iteration {})" .format( + checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info("Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path)) + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save({'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate}, checkpoint_path) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + 
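+    # NOTE: np.fromstring on binary buffers is deprecated in recent NumPy releases; a
+    # drop-in replacement for the conversion above is
+    #   data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+    # (newer Matplotlib versions also replace tostring_rgb() with buffer_rgba()).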
plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding='utf-8') as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default="./configs/base.json", + help='JSON file for configuration') + parser.add_argument('-m', '--model', type=str, required=True, + help='Model name') + + args = parser.parse_args() + model_dir = os.path.join("./logs", args.model) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + )) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/dreamvoice/train_utils/prepare_freevc/get_dist.py b/dreamvoice/train_utils/prepare_freevc/get_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..f9ad1dcbbc5a83c38ceb9101c5ae6cd744959f6e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/get_dist.py @@ -0,0 +1,49 @@ +import os +import torch +import random +import numpy as np + + +# Function to recursively find all .pt files in a directory +def find_pt_files(root_dir): + pt_files = [] + for dirpath, _, filenames in os.walk(root_dir): + for file in filenames: + if file.endswith('.pt'): + pt_files.append(os.path.join(dirpath, file)) + return pt_files + + +# Function to compute statistics for a given tensor list +def compute_statistics(tensor_list): + all_data = torch.cat(tensor_list) + mean = torch.mean(all_data).item() + std = torch.std(all_data).item() + max_val = torch.max(all_data).item() + min_val = torch.min(all_data).item() + return mean, std, max_val, min_val + + +# Root directory containing .pt files in subfolders +root_dir = "spk" + +# Find all .pt files +pt_files = find_pt_files(root_dir) + +# Randomly sample 1000 .pt files (or fewer if less than 1000 files are available) +sampled_files = random.sample(pt_files, min(1000, len(pt_files))) + +# Load tensors from sampled files +tensor_list = [] +for file in sampled_files: + tensor = torch.load(file) + tensor_list.append(tensor.view(-1)) # Flatten the tensor + +# Compute statistics +mean, std, max_val, min_val = compute_statistics(tensor_list) + +# Print the results +print(f"Mean: {mean}") +print(f"Std: {std}") +print(f"Max: {max_val}") +print(f"Min: {min_val}") \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/__init__.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..dfb47c9e72f3364d8317b79a80ce62030d2403fd --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/audio.py @@ -0,0 +1,107 @@ +from scipy.ndimage.morphology import binary_dilation +from speaker_encoder.params_data import * +from 
pathlib import Path +from typing import Optional, Union +import numpy as np +import webrtcvad +import librosa +import struct + +int16_max = (2 ** 15) - 1 + + +def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], + source_sr: Optional[int] = None): + """ + Applies the preprocessing operations used in training the Speaker Encoder to a waveform + either on disk or in memory. The waveform will be resampled to match the data hyperparameters. + + :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not + just .wav), either the waveform as a numpy array of floats. + :param source_sr: if passing an audio waveform, the sampling rate of the waveform before + preprocessing. After preprocessing, the waveform's sampling rate will match the data + hyperparameters. If passing a filepath, the sampling rate will be automatically detected and + this argument will be ignored. + """ + # Load the wav from disk if needed + if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + wav, source_sr = librosa.load(fpath_or_wav, sr=None) + else: + wav = fpath_or_wav + + # Resample the wav if needed + if source_sr is not None and source_sr != sampling_rate: + wav = librosa.resample(wav, source_sr, sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences + wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) + wav = trim_long_silences(wav) + + return wav + + +def wav_to_mel_spectrogram(wav): + """ + Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. + Note: this not a log-mel spectrogram. + """ + frames = librosa.feature.melspectrogram( + y=wav, + sr=sampling_rate, + n_fft=int(sampling_rate * mel_window_length / 1000), + hop_length=int(sampling_rate * mel_window_step / 1000), + n_mels=mel_n_channels + ) + return frames.astype(np.float32).T + + +def trim_long_silences(wav): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. 
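+    (In this copy the VAD parameters are taken from params_data.py.) The waveform is
+    packed to 16-bit PCM, scanned with webrtcvad in windows of vad_window_length ms,
+    and the voiced/unvoiced flags are smoothed with a moving average and dilated by up
+    to vad_max_silence_length windows before the silent samples are masked out.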
+ + :param wav: the raw waveform as a numpy array of floats + :return: the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask == True] + + +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) + if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): + return wav + return wav * (10 ** (dBFS_change / 20)) diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt new file mode 100644 index 0000000000000000000000000000000000000000..662d22b686114b4b6124330a688007d9495d22c8 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/ckpt/pretrained_bak_5805000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca +size 17090379 diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..e45430c7d03d160dc64d450c1af81180f419eb51 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/compute_embed.py @@ -0,0 +1,40 @@ +from speaker_encoder import inference as encoder +from multiprocessing.pool import Pool +from functools import partial +from pathlib import Path +# from utils import logmmse +# from tqdm import tqdm +# import numpy as np +# import librosa + + +def embed_utterance(fpaths, encoder_model_fpath): + if not encoder.is_loaded(): + encoder.load_model(encoder_model_fpath) + + # Compute the speaker embedding of the utterance + wav_fpath, embed_fpath = fpaths + wav = np.load(wav_fpath) + wav = encoder.preprocess_wav(wav) + embed = encoder.embed_utterance(wav) + np.save(embed_fpath, embed, allow_pickle=False) + + +def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int): + + wav_dir 
= outdir_root.joinpath("audio") + metadata_fpath = synthesizer_root.joinpath("train.txt") + assert wav_dir.exists() and metadata_fpath.exists() + embed_dir = synthesizer_root.joinpath("embeds") + embed_dir.mkdir(exist_ok=True) + + # Gather the input wave filepath and the target output embed filepath + with metadata_fpath.open("r") as metadata_file: + metadata = [line.split("|") for line in metadata_file] + fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] + + # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. + # Embed the utterances in separate threads + func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) + job = Pool(n_processes).imap(func, fpaths) + list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d12228c81152487da24a6090e5a736f9de0755b0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/config.py @@ -0,0 +1,45 @@ +librispeech_datasets = { + "train": { + "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], + "other": ["LibriSpeech/train-other-500"] + }, + "test": { + "clean": ["LibriSpeech/test-clean"], + "other": ["LibriSpeech/test-other"] + }, + "dev": { + "clean": ["LibriSpeech/dev-clean"], + "other": ["LibriSpeech/dev-other"] + }, +} +libritts_datasets = { + "train": { + "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], + "other": ["LibriTTS/train-other-500"] + }, + "test": { + "clean": ["LibriTTS/test-clean"], + "other": ["LibriTTS/test-other"] + }, + "dev": { + "clean": ["LibriTTS/dev-clean"], + "other": ["LibriTTS/dev-other"] + }, +} +voxceleb_datasets = { + "voxceleb1" : { + "train": ["VoxCeleb1/wav"], + "test": ["VoxCeleb1/test_wav"] + }, + "voxceleb2" : { + "train": ["VoxCeleb2/dev/aac"], + "test": ["VoxCeleb2/test_wav"] + } +} + +other_datasets = [ + "LJSpeech-1.1", + "VCTK-Corpus/wav48", +] + +anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..740f750a9746e5ace34f1bf875d9ac07677e1ed6 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/__init__.py @@ -0,0 +1,2 @@ +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py new file mode 100644 index 0000000000000000000000000000000000000000..7e5cf738d3ca5214034ce3babdedf6eaea64c469 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/random_cycler.py @@ -0,0 +1,37 @@ +import random + +class RandomCycler: + """ + Creates an internal copy of a sequence and allows access to its items in a constrained random + order. 
For a source sequence of n items and one or several consecutive queries of a total + of m items, the following guarantees hold (one implies the other): + - Each item will be returned between m // n and ((m - 1) // n) + 1 times. + - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. + """ + + def __init__(self, source): + if len(source) == 0: + raise Exception("Can't create RandomCycler from an empty collection") + self.all_items = list(source) + self.next_items = [] + + def sample(self, count: int): + shuffle = lambda l: random.sample(l, len(l)) + + out = [] + while count > 0: + if count >= len(self.all_items): + out.extend(shuffle(list(self.all_items))) + count -= len(self.all_items) + continue + n = min(count, len(self.next_items)) + out.extend(self.next_items[:n]) + count -= n + self.next_items = self.next_items[n:] + if len(self.next_items) == 0: + self.next_items = shuffle(list(self.all_items)) + return out + + def __next__(self): + return self.sample(1)[0] + diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..cb320b211f0de5b3a6fbb83380d8a8b9677151b2 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker.py @@ -0,0 +1,40 @@ +from speaker_encoder.data_objects.random_cycler import RandomCycler +from speaker_encoder.data_objects.utterance import Utterance +from pathlib import Path + +# Contains the set of utterances of a single speaker +class Speaker: + def __init__(self, root: Path): + self.root = root + self.name = root.name + self.utterances = None + self.utterance_cycler = None + + def _load_utterances(self): + with self.root.joinpath("_sources.txt").open("r") as sources_file: + sources = [l.split(",") for l in sources_file] + sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} + self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] + self.utterance_cycler = RandomCycler(self.utterances) + + def random_partial(self, count, n_frames): + """ + Samples a batch of unique partial utterances from the disk in a way that all + utterances come up at least once every two cycles and in a random order every time. + + :param count: The number of partial utterances to sample from the set of utterances from + that speaker. Utterances are guaranteed not to be repeated if is not larger than + the number of utterances available. + :param n_frames: The number of frames in the partial utterance. + :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, + frames are the frames of the partial utterances and range is the range of the partial + utterance with regard to the complete utterance. 
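+        For example, random_partial(4, 160) returns four (utterance, frames, (start, end))
+        tuples, where frames is a (160, mel_n_channels) slice of the stored spectrogram.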
+ """ + if self.utterances is None: + self._load_utterances() + + utterances = self.utterance_cycler.sample(count) + + a = [(u,) + u.random_partial(n_frames) for u in utterances] + + return a diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..d2dd5493a599e74cea594510af94015464072cb3 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_batch.py @@ -0,0 +1,12 @@ +import numpy as np +from typing import List +from speaker_encoder.data_objects.speaker import Speaker + +class SpeakerBatch: + def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): + self.speakers = speakers + self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} + + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with + # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) + self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..be4568923a21e8f28a229899e137d0186e0b1250 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/speaker_verification_dataset.py @@ -0,0 +1,56 @@ +from speaker_encoder.data_objects.random_cycler import RandomCycler +from speaker_encoder.data_objects.speaker_batch import SpeakerBatch +from speaker_encoder.data_objects.speaker import Speaker +from speaker_encoder.params_data import partials_n_frames +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +# TODO: improve with a pool of speakers for data efficiency + +class SpeakerVerificationDataset(Dataset): + def __init__(self, datasets_root: Path): + self.root = datasets_root + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + if len(speaker_dirs) == 0: + raise Exception("No speakers found. 
Make sure you are pointing to the directory " + "containing all preprocessed speaker directories.") + self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] + self.speaker_cycler = RandomCycler(self.speakers) + + def __len__(self): + return int(1e10) + + def __getitem__(self, index): + return next(self.speaker_cycler) + + def get_logs(self): + log_string = "" + for log_fpath in self.root.glob("*.txt"): + with log_fpath.open("r") as log_file: + log_string += "".join(log_file.readlines()) + return log_string + + +class SpeakerVerificationDataLoader(DataLoader): + def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, + batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, + worker_init_fn=None): + self.utterances_per_speaker = utterances_per_speaker + + super().__init__( + dataset=dataset, + batch_size=speakers_per_batch, + shuffle=False, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=self.collate, + pin_memory=pin_memory, + drop_last=False, + timeout=timeout, + worker_init_fn=worker_init_fn + ) + + def collate(self, speakers): + return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py new file mode 100644 index 0000000000000000000000000000000000000000..ff3185ec781eaf5be2a58d61c22b32586d366126 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/data_objects/utterance.py @@ -0,0 +1,26 @@ +import numpy as np + + +class Utterance: + def __init__(self, frames_fpath, wave_fpath): + self.frames_fpath = frames_fpath + self.wave_fpath = wave_fpath + + def get_frames(self): + return np.load(self.frames_fpath) + + def random_partial(self, n_frames): + """ + Crops the frames into a partial utterance of n_frames + + :param n_frames: The number of frames of the partial utterance + :return: the partial utterance frames and a tuple indicating the start and end of the + partial utterance in the complete utterance. + """ + frames = self.get_frames() + if frames.shape[0] == n_frames: + start = 0 + else: + start = np.random.randint(0, frames.shape[0] - n_frames) + end = start + n_frames + return frames[start:end], (start, end) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py new file mode 100644 index 0000000000000000000000000000000000000000..ac64bcc3bd9ec490e988ac894de93921ba20f607 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/hparams.py @@ -0,0 +1,31 @@ +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. 
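+# With 30 ms VAD windows, the default of 6 below bridges silences of roughly 180 ms.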
+vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c5662912a7cc0eb8818732d0b1d233ba1b195ec7 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/inference.py @@ -0,0 +1,177 @@ +from speaker_encoder.params_data import * +from speaker_encoder.model import SpeakerEncoder +from speaker_encoder.audio import preprocess_wav # We want to expose this function from here +from matplotlib import cm +from speaker_encoder import audio +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device=None): + """ + Loads the model in memory. If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath) + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +def embed_frames_batch(frames_batch): + """ + Computes embeddings for a batch of mel spectrogram. + + :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape + (batch_size, n_frames, n_channels) + :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) + """ + if _model is None: + raise Exception("Model was not loaded. Call load_model() before inference.") + + frames = torch.from_numpy(frames_batch).to(_device) + embed = _model.forward(frames).detach().cpu().numpy() + return embed + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 
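+    For instance, with the default parameters (16 kHz audio, 10 ms hop, 160-frame
+    partials, overlap=0.5) a 2 s waveform (n_samples=32000) produces two partials with
+    mel slices [0:160] and [80:240] and wav slices [0:25600] and [12800:38400], so the
+    caller pads the waveform to 38400 samples.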
+ + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. + """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. + :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. 
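+    A minimal usage sketch (the file name is illustrative, and load_model() must have
+    been called first):
+        wav = preprocess_wav("utterance.wav")
+        embed = embed_utterance(wav)   # unit-norm float32 vector of shape (256,)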
+ """ + # Process the entire utterance if not using partials + if not using_partials: + frames = audio.wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = audio.wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py new file mode 100644 index 0000000000000000000000000000000000000000..4493a98b217e4bd082940cbe4d31b8169f18b5d9 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/model.py @@ -0,0 +1,135 @@ +from speaker_encoder.params_model import * +from speaker_encoder.params_data import * +from scipy.interpolate import interp1d +from sklearn.metrics import roc_curve +from torch.nn.utils import clip_grad_norm_ +from scipy.optimize import brentq +from torch import nn +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, device, loss_device): + super().__init__() + self.loss_device = loss_device + + # Network defition + self.lstm = nn.LSTM(input_size=mel_n_channels, # 40 + hidden_size=model_hidden_size, # 256 + num_layers=model_num_layers, # 3 + batch_first=True).to(device) + self.linear = nn.Linear(in_features=model_hidden_size, + out_features=model_embedding_size).to(device) + self.relu = torch.nn.ReLU().to(device) + + # Cosine similarity scaling (with fixed initial parameter values) + self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) + self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) + + # Loss + self.loss_fn = nn.CrossEntropyLoss().to(loss_device) + + def do_gradient_ops(self): + # Gradient scale + self.similarity_weight.grad *= 0.01 + self.similarity_bias.grad *= 0.01 + + # Gradient clipping + clip_grad_norm_(self.parameters(), 3, norm_type=2) + + def forward(self, utterances, hidden_init=None): + """ + Computes the embeddings of a batch of utterance spectrograms. + + :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape + (batch_size, n_frames, n_channels) + :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, + batch_size, hidden_size). Will default to a tensor of zeros if None. 
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size) + """ + # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state + # and the final cell state. + out, (hidden, cell) = self.lstm(utterances, hidden_init) + + # We take only the hidden state of the last layer + embeds_raw = self.relu(self.linear(hidden[-1])) + + # L2-normalize it + embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + return embeds + + def similarity_matrix(self, embeds): + """ + Computes the similarity matrix according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the similarity matrix as a tensor of shape (speakers_per_batch, + utterances_per_speaker, speakers_per_batch) + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation + centroids_incl = torch.mean(embeds, dim=1, keepdim=True) + centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) + + # Exclusive centroids (1 per utterance) + centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) + + # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot + # product of these vectors (which is just an element-wise multiplication reduced by a sum). + # We vectorize the computation for efficiency. + sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, + speakers_per_batch).to(self.loss_device) + mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) + for j in range(speakers_per_batch): + mask = np.where(mask_matrix[j])[0] + sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) + sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) + + ## Even more vectorized version (slower maybe because of transpose) + # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker + # ).to(self.loss_device) + # eye = np.eye(speakers_per_batch, dtype=np.int) + # mask = np.where(1 - eye) + # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) + # mask = np.where(eye) + # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) + # sim_matrix2 = sim_matrix2.transpose(1, 2) + + sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias + return sim_matrix + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. 
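+        With the training defaults (speakers_per_batch=64, utterances_per_speaker=10,
+        embedding_size=256), embeds has shape (64, 10, 256) and the similarity matrix is
+        reshaped to (640, 64) before the cross-entropy loss. Note that the legacy np.int
+        alias used in this file was removed in NumPy 1.24; plain int is a drop-in
+        replacement on newer NumPy versions.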
+ """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, + speakers_per_batch)) + ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) + target = torch.from_numpy(ground_truth).long().to(self.loss_device) + loss = self.loss_fn(sim_matrix, target) + + # EER (not backpropagated) + with torch.no_grad(): + inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] + labels = np.array([inv_argmax(i) for i in ground_truth]) + preds = sim_matrix.detach().cpu().numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py new file mode 100644 index 0000000000000000000000000000000000000000..676e6dc197faf01648de7a830140172d5594b999 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_data.py @@ -0,0 +1,29 @@ + +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms +# Number of spectrogram frames at inference +inference_n_frames = 80 # 800 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. +vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py new file mode 100644 index 0000000000000000000000000000000000000000..32731f295b3b26e9e38bb9f9047d5c784649e127 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/params_model.py @@ -0,0 +1,11 @@ + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 + + +## Training parameters +learning_rate_init = 1e-4 +speakers_per_batch = 64 +utterances_per_speaker = 10 diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..ecb9041551270629a27baab6d1f1525e380c5378 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/preprocess.py @@ -0,0 +1,285 @@ +from multiprocess.pool import ThreadPool +from speaker_encoder.params_data import * +from speaker_encoder.config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from speaker_encoder import audio +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. 
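+    One "Log_<dataset>.txt" file is written per dataset; it records the params_data
+    values, the duration of every processed utterance, and summary statistics on
+    finalize().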
+ """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from speaker_encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." % dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
+ if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +# Function to preprocess utterances for one speaker +def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
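+    # NOTE: in this variant the resume logic below is commented out, so existing_fnames
+    # stays empty and the per-file skip never triggers; skip_existing only controls
+    # whether _sources.txt is appended to or overwritten.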
+ # if sources_fpath.exists(): + # try: + # with sources_fpath.open("r") as sources_file: + # existing_fnames = {line.split(",")[0] for line in sources_file} + # except: + # existing_fnames = {} + # else: + # existing_fnames = {} + existing_fnames = {} + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + # logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + return len(wav) + +def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + # from multiprocessing import Pool, cpu_count + from pathos.multiprocessing import ProcessingPool as Pool + # Function to preprocess utterances for one speaker + def __preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + existing_fnames = {} + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + wav_lens = [] + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + # logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + wav_lens.append(len(wav)) + sources_file.close() + return wav_lens + + print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) + # Process the utterances for each speaker + # with ThreadPool(8) as pool: + # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + # unit="speakers")) + pool = Pool(processes=20) + for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1): + for wav_len in wav_lens: + logger.add_sample(duration=wav_len / sampling_rate) + print(f'{i}/{len(speaker_dirs)} \r') + + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + # keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + # nationality.lower() in anglophone_nationalites] + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items()] + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..1c2e7fa1b08b75de40adc0e05fa3b104cb02660b --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/train.py @@ -0,0 +1,125 @@ +from speaker_encoder.visualizations import Visualizations +from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset +from speaker_encoder.params_model import * +from speaker_encoder.model import SpeakerEncoder +from utils.profiler import Profiler +from pathlib import Path +import torch + +def sync(device: torch.device): + # FIXME + return + # For correct profiling (cuda operations are async) + if device.type == "cuda": + torch.cuda.synchronize(device) + +def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, + backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, + no_visdom: bool): + # Create a dataset and a dataloader + dataset = SpeakerVerificationDataset(clean_data_root) + loader = SpeakerVerificationDataLoader( + dataset, + speakers_per_batch, # 64 + utterances_per_speaker, # 10 + num_workers=8, + ) + + # Setup the device on which to run the forward pass and the loss. These can be different, + # because the forward pass is faster on the GPU whereas the loss is often (depending on your + # hyperparameters) faster on the CPU. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # FIXME: currently, the gradient is None if loss_device is cuda + loss_device = torch.device("cpu") + + # Create the model and the optimizer + model = SpeakerEncoder(device, loss_device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) + init_step = 1 + + # Configure file path for the model + state_fpath = models_dir.joinpath(run_id + ".pt") + backup_dir = models_dir.joinpath(run_id + "_backups") + + # Load any existing model + if not force_restart: + if state_fpath.exists(): + print("Found existing model \"%s\", loading it and resuming training." % run_id) + checkpoint = torch.load(state_fpath) + init_step = checkpoint["step"] + model.load_state_dict(checkpoint["model_state"]) + optimizer.load_state_dict(checkpoint["optimizer_state"]) + optimizer.param_groups[0]["lr"] = learning_rate_init + else: + print("No model \"%s\" found, starting training from scratch." 
% run_id) + else: + print("Starting the training from scratch.") + model.train() + + # Initialize the visualization environment + vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) + vis.log_dataset(dataset) + vis.log_params() + device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") + vis.log_implementation({"Device": device_name}) + + # Training loop + profiler = Profiler(summarize_every=10, disabled=False) + for step, speaker_batch in enumerate(loader, init_step): + profiler.tick("Blocking, waiting for batch (threaded)") + + # Forward pass + inputs = torch.from_numpy(speaker_batch.data).to(device) + sync(device) + profiler.tick("Data to %s" % device) + embeds = model(inputs) + sync(device) + profiler.tick("Forward pass") + embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) + loss, eer = model.loss(embeds_loss) + sync(loss_device) + profiler.tick("Loss") + + # Backward pass + model.zero_grad() + loss.backward() + profiler.tick("Backward pass") + model.do_gradient_ops() + optimizer.step() + profiler.tick("Parameter update") + + # Update visualizations + # learning_rate = optimizer.param_groups[0]["lr"] + vis.update(loss.item(), eer, step) + + # Draw projections and save them to the backup folder + if umap_every != 0 and step % umap_every == 0: + print("Drawing and saving projections (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + embeds = embeds.detach().cpu().numpy() + vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) + vis.save() + + # Overwrite the latest version of the model + if save_every != 0 and step % save_every == 0: + print("Saving the model (step %d)" % step) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, state_fpath) + + # Make a backup + if backup_every != 0 and step % backup_every == 0: + print("Making a backup (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, backup_fpath) + + profiler.tick("Extras (visualizations, saving)") + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py new file mode 100644 index 0000000000000000000000000000000000000000..5d2c4c073c933d38970a83798f2d0ee37a85c48e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/visualizations.py @@ -0,0 +1,178 @@ +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from datetime import datetime +from time import perf_counter as timer +import matplotlib.pyplot as plt +import numpy as np +# import webbrowser +import visdom +import umap + +colormap = np.array([ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], +], dtype=np.float) / 255 + + +class Visualizations: + def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): + # Tracking data + self.last_update_timestamp = timer() + self.update_every = update_every + self.step_times = [] + 
self.losses = []
+        self.eers = []
+        print("Updating the visualizations every %d steps." % update_every)
+
+        # If visdom is disabled TODO: use a better paradigm for that
+        self.disabled = disabled
+        if self.disabled:
+            return
+
+        # Set the environment name
+        now = str(datetime.now().strftime("%d-%m %Hh%M"))
+        if env_name is None:
+            self.env_name = now
+        else:
+            self.env_name = "%s (%s)" % (env_name, now)
+
+        # Connect to visdom and open the corresponding window in the browser
+        try:
+            self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
+        except ConnectionError:
+            raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
+                            "start it.")
+        # webbrowser.open("http://localhost:8097/env/" + self.env_name)
+
+        # Create the windows
+        self.loss_win = None
+        self.eer_win = None
+        # self.lr_win = None
+        self.implementation_win = None
+        self.projection_win = None
+        self.implementation_string = ""
+
+    def log_params(self):
+        if self.disabled:
+            return
+        from speaker_encoder import params_data
+        from speaker_encoder import params_model
+        param_string = "Model parameters:<br>"
+        for param_name in (p for p in dir(params_model) if not p.startswith("__")):
+            value = getattr(params_model, param_name)
+            param_string += "\t%s: %s<br>" % (param_name, value)
+        param_string += "Data parameters:<br>"
+        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+            value = getattr(params_data, param_name)
+            param_string += "\t%s: %s<br>" % (param_name, value)
+        self.vis.text(param_string, opts={"title": "Parameters"})
+
+    def log_dataset(self, dataset: SpeakerVerificationDataset):
+        if self.disabled:
+            return
+        dataset_string = ""
+        dataset_string += "Speakers: %s\n" % len(dataset.speakers)
+        dataset_string += "\n" + dataset.get_logs()
+        dataset_string = dataset_string.replace("\n", "<br>")
+        self.vis.text(dataset_string, opts={"title": "Dataset"})
+
+    def log_implementation(self, params):
+        if self.disabled:
+            return
+        implementation_string = ""
+        for param, value in params.items():
+            implementation_string += "%s: %s\n" % (param, value)
+            implementation_string = implementation_string.replace("\n", "<br>
") + self.implementation_string = implementation_string + self.implementation_win = self.vis.text( + implementation_string, + opts={"title": "Training implementation"} + ) + + def update(self, loss, eer, step): + # Update the tracking data + now = timer() + self.step_times.append(1000 * (now - self.last_update_timestamp)) + self.last_update_timestamp = now + self.losses.append(loss) + self.eers.append(eer) + print(".", end="") + + # Update the plots every steps + if step % self.update_every != 0: + return + time_string = "Step time: mean: %5dms std: %5dms" % \ + (int(np.mean(self.step_times)), int(np.std(self.step_times))) + print("\nStep %6d Loss: %.4f EER: %.4f %s" % + (step, np.mean(self.losses), np.mean(self.eers), time_string)) + if not self.disabled: + self.loss_win = self.vis.line( + [np.mean(self.losses)], + [step], + win=self.loss_win, + update="append" if self.loss_win else None, + opts=dict( + legend=["Avg. loss"], + xlabel="Step", + ylabel="Loss", + title="Loss", + ) + ) + self.eer_win = self.vis.line( + [np.mean(self.eers)], + [step], + win=self.eer_win, + update="append" if self.eer_win else None, + opts=dict( + legend=["Avg. EER"], + xlabel="Step", + ylabel="EER", + title="Equal error rate" + ) + ) + if self.implementation_win is not None: + self.vis.text( + self.implementation_string + ("%s" % time_string), + win=self.implementation_win, + opts={"title": "Training implementation"}, + ) + + # Reset the tracking + self.losses.clear() + self.eers.clear() + self.step_times.clear() + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, + max_speakers=10): + max_speakers = min(max_speakers, len(colormap)) + embeds = embeds[:max_speakers * utterances_per_speaker] + + n_speakers = len(embeds) // utterances_per_speaker + ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) + colors = [colormap[i] for i in ground_truth] + + reducer = umap.UMAP() + projected = reducer.fit_transform(embeds) + plt.scatter(projected[:, 0], projected[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection (step %d)" % step) + if not self.disabled: + self.projection_win = self.vis.matplot(plt, win=self.projection_win) + if out_fpath is not None: + plt.savefig(out_fpath) + plt.clf() + + def save(self): + if not self.disabled: + self.vis.save([self.env_name]) + \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3f69320ec75315ff9ce2efa158a53b1a823edd2e --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/speaker_encoder/voice_encoder.py @@ -0,0 +1,173 @@ +from speaker_encoder.hparams import * +from speaker_encoder import audio +from pathlib import Path +from typing import Union, List +from torch import nn +from time import perf_counter as timer +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, weights_fpath, device: Union[str, torch.device]=None, verbose=True): + """ + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). + If None, defaults to cuda if it is available on your machine, otherwise the model will + run on cpu. Outputs are always returned on the cpu, as numpy arrays. 
+ """ + super().__init__() + + # Define the network + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + # Get the target device + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + device = torch.device(device) + self.device = device + + # Load the pretrained model'speaker weights + # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") + # if not weights_fpath.exists(): + # raise Exception("Couldn't find the voice encoder pretrained model at %s." % + # weights_fpath) + + start = timer() + checkpoint = torch.load(weights_fpath, map_location="cpu") + + self.load_state_dict(checkpoint["model_state"], strict=False) + self.to(device) + + if verbose: + print("Loaded the voice encoder model on %s in %.2f seconds." % + (device.type, timer() - start)) + + def forward(self, mels: torch.FloatTensor): + """ + Computes the embeddings of a batch of utterance spectrograms. + :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape + (batch_size, n_frames, n_channels) + :return: the embeddings as a float 32 tensor of shape (batch_size, embedding_size). + Embeddings are positive and L2-normed, thus they lay in the range [0, 1]. + """ + # Pass the input through the LSTM layers and retrieve the final hidden state of the last + # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings. + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + @staticmethod + def compute_partial_slices(n_samples: int, rate, min_coverage): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to + obtain partial utterances of each. Both the waveform and the + mel spectrogram slices are returned, so as to make each partial utterance waveform + correspond to its spectrogram. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wav_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param rate: how many partial utterances should occur per second. Partial utterances must + cover the span of the entire utterance, thus the rate should not be lower than the inverse + of the duration of a partial utterance. By default, partial utterances are 1.6s long and + the minimum rate is thus 0.625. + :param min_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered by zero-padding the audio. Otherwise, + it will be discarded. If there aren't enough frames for one partial utterance, + this parameter is ignored so that the function always returns at least one slice. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. 
+ """ + assert 0 < min_coverage <= 1 + + # Compute how many frames separate two partial utterances + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) + assert 0 < frame_step, "The rate is too high" + assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \ + (sampling_rate / (samples_per_frame * partials_n_frames)) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partials_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partials_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + """ + Computes an embedding for a single utterance. The utterance is divided in partial + utterances and an embedding is computed for each. The complete utterance embedding is the + L2-normed average embedding of the partial utterances. + + TODO: independent batched version of this function + + :param wav: a preprocessed utterance waveform as a numpy array of float32 + :param return_partials: if True, the partial embeddings will also be returned along with + the wav slices corresponding to each partial utterance. + :param rate: how many partial utterances should occur per second. Partial utterances must + cover the span of the entire utterance, thus the rate should not be lower than the inverse + of the duration of a partial utterance. By default, partial utterances are 1.6s long and + the minimum rate is thus 0.625. + :param min_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered by zero-padding the audio. Otherwise, + it will be discarded. If there aren't enough frames for one partial utterance, + this parameter is ignored so that the function always returns at least one slice. + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. + """ + # Compute where to split the utterance into partials and pad the waveform with zeros if + # the partial utterances cover a larger range. 
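+        # For example, with the usual speaker-encoder hparams (16 kHz audio, 10 ms mel hop,
+        # 1.6 s / 160-frame partials; assumed here, see speaker_encoder/hparams.py) and the
+        # default rate=1.3, compute_partial_slices steps the window forward by
+        # round((16000 / 1.3) / 160) = 77 frames, so consecutive partials overlap by about half.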
+ wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage) + max_wave_length = wav_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials and forward them through the model + mel = audio.wav_to_mel_spectrogram(wav) + mels = np.array([mel[s] for s in mel_slices]) + with torch.no_grad(): + mels = torch.from_numpy(mels).to(self.device) + partial_embeds = self(mels).cpu().numpy() + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wav_slices + return embed + + def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + """ + Compute the embedding of a collection of wavs (presumably from the same speaker) by + averaging their embedding and L2-normalizing it. + + :param wavs: list of wavs a numpy arrays of float32. + :param kwargs: extra arguments to embed_utterance() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + """ + raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \ + for wav in wavs], axis=0) + return raw_embed / np.linalg.norm(raw_embed, 2) \ No newline at end of file diff --git a/dreamvoice/train_utils/prepare_freevc/spk_ext.py b/dreamvoice/train_utils/prepare_freevc/spk_ext.py new file mode 100644 index 0000000000000000000000000000000000000000..c6a71ad6a6131fb67729f1cc6f161dd3fcf276b0 --- /dev/null +++ b/dreamvoice/train_utils/prepare_freevc/spk_ext.py @@ -0,0 +1,90 @@ +import os +import torch +import librosa +from tqdm import tqdm +from speaker_encoder.voice_encoder import SpeakerEncoder +from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments + + +@torch.no_grad() +def se_extractor(audio_path, smodel): + # vad + SAMPLE_RATE = 16000 + audio_vad = get_audio_tensor(audio_path) + segments = get_vad_segments( + audio_vad, + output_sample=True, + min_speech_duration=0.1, + min_silence_duration=1, + method="silero", + ) + segments = [(seg["start"], seg["end"]) for seg in segments] + segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments] + + if len(segments) == 0: + segments = [(0, len(audio_vad)/SAMPLE_RATE)] + print(segments) + + # spk + gs = [] + + audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE) + # audio = torch.tensor(audio).float().to(device) + + for s, e in segments: + y = audio[int(SAMPLE_RATE*s):int(SAMPLE_RATE*e)] + g = smodel.embed_utterance(y) + g = torch.from_numpy(g).unsqueeze(0) + gs.append(g) + + gs = torch.stack(gs).mean(0) + return gs.cpu() + + +def process_audio_folder(input_folder, output_folder, model, device): + """ + Process all audio files in a folder and its subfolders, + save the extracted features as .pt files in the output folder with the same structure. + + Args: + input_folder (str): Path to the input folder containing audio files. + output_folder (str): Path to the output folder to save .pt files. + model: Pre-trained model for feature extraction. + device: Torch device (e.g., 'cpu' or 'cuda'). 
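+
+    Note: the output folder mirrors the input folder tree, with one .pt file (the speaker
+    embedding returned by se_extractor) written per audio file; files whose .pt output
+    already exists are skipped.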
+ """ + # Collect all audio file paths + audio_files = [] + for root, _, files in os.walk(input_folder): + for file in files: + if file.endswith(('.wav', '.mp3', '.flac')): # Adjust for the audio formats you want to process + audio_files.append(os.path.join(root, file)) + + # Process each audio file with tqdm for progress + for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"): + # Construct output path + relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder) + output_dir = os.path.join(output_folder, relative_path) + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt') + + # Check if the .pt file already exists + if os.path.exists(output_path): + # print(f"Skipped (already exists): {output_path}") + continue # Skip processing this file + # Extract features + target_se = se_extractor(audio_path, model).to(device) + # Save the feature as .pt + torch.save(target_se, output_path) + # print(f"Processed and saved: {output_path}") + + +if __name__ == '__main__': + smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + device = 'cuda' + # input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360' + # output_folder = 'spk/LibriTTS-R/train-clean-360/' + # process_audio_folder(input_folder, output_folder, smodel, device) + + input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/' + output_folder = 'spk/VCTK/VCTK-Corpus/' + process_audio_folder(input_folder, output_folder, smodel, device) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/configs/plugin.py b/dreamvoice/train_utils/src/configs/plugin.py index 5e9a409af86ef67361bae0b7c3ee3b747ee907eb..70a55a8f5edd7d95486b9d28b076fab96b916fb7 100644 --- a/dreamvoice/train_utils/src/configs/plugin.py +++ b/dreamvoice/train_utils/src/configs/plugin.py @@ -18,7 +18,7 @@ class AttrDict(dict): all_params = { - 'Plugin_base': AttrDict( + 'Plugin_freevc': AttrDict( # Diff params diff=AttrDict( num_train_steps=1000, diff --git a/dreamvoice/train_utils/src/dataset/vcdata.py b/dreamvoice/train_utils/src/dataset/vcdata.py new file mode 100644 index 0000000000000000000000000000000000000000..c7a1d99a243869794900eed6189a32bad930aea4 --- /dev/null +++ b/dreamvoice/train_utils/src/dataset/vcdata.py @@ -0,0 +1,146 @@ +import pandas as pd +import os +import random +import ast +import numpy as np +import torch +from einops import repeat, rearrange +import librosa + +from torch.utils.data import Dataset +import torchaudio + + +def log_f0(f0, f0_min=librosa.note_to_hz('C2'), scales=4): + f0[f0 < f0_min] = 0.0 + f0_log = torch.zeros_like(f0) + f0_log[f0 != 0] = 12*np.log2(f0[f0 != 0]/f0_min) + 1 + # f0_mel_min = 12*np.log2(f0_min/f0_min) + 1 + # f0_mel_max = 12*np.log2(f0_max/f0_min) + 1 + f0_log /= (scales*12) + return f0_log + + +class VCData(Dataset): + def __init__(self, + data_dir, meta_dir, subset, prompt_dir, + seg_length=1.92, speaker_length=4, + sr=24000, content_sr=50, speaker_sr=16000, + plugin_mode=False + ): + self.datadir = data_dir + meta = pd.read_csv(meta_dir) + self.meta = meta[meta['subset'] == subset] + self.subset = subset + self.prompts = pd.read_csv(prompt_dir) + self.seg_len = seg_length + self.speaker_length = speaker_length + self.sr = sr + self.content_sr = content_sr + self.speaker_sr = speaker_sr + self.plugin_mode = plugin_mode + + def get_audio_content(self, audio_path, content_path, f0_path): + audio_path = self.datadir + 
audio_path + audio, sr = torchaudio.load(audio_path) + assert sr == self.sr + + # 1, T, C + content = torch.load(self.datadir + content_path) + + total_length = content.shape[1] + if int(total_length - int(self.content_sr * self.seg_len)) > 0: + start = np.random.randint(0, int(total_length - self.content_sr * self.seg_len) + 1) + else: + start = 0 + end = min(start + int(self.seg_len * self.content_sr), content.shape[1]) + + # use last frame for padding + content_clip = repeat(content[:, -1, :], "b c-> b t c", t=int(self.content_sr * self.seg_len)).clone() + content_clip[:, :end - start, :] = content[:, start: end, :] + + audio_clip = torch.zeros(int(self.seg_len * self.sr)) + # print(start) + # print(end) + audio_start = round(start * self.sr / self.content_sr) + audio_end = round(end * self.sr / self.content_sr) + # print(audio_start) + # print(audio_end) + # print(audio.shape) + + audio_clip[:audio_end - audio_start] = audio[0, audio_start: audio_end].clone() + + if f0_path: + f0 = torch.load(self.datadir + f0_path).float() + f0_clip = torch.zeros(int(self.content_sr * self.seg_len)) + f0_clip[:end-start] = f0[start:end] + f0_clip = log_f0(f0_clip) + f0_clip = f0_clip.unsqueeze(-1) + else: + f0_clip = None + + return audio_clip, content_clip[0], f0_clip + + def get_speaker(self, speaker_path): + audio_path = self.datadir + speaker_path + audio, sr = torchaudio.load(audio_path) + assert sr == self.speaker_sr + # if sr != self.speaker_sr: + # resampler = torchaudio.transforms.Resample(sr, self.speaker_sr, dtype=audio.dtype) + # audio = resampler(audio) + + audio_clip = torch.zeros(self.speaker_length * self.speaker_sr) + + total_length = audio.shape[1] + if int(total_length - self.speaker_sr * self.speaker_length) > 0: + start = np.random.randint(0, int(total_length - self.speaker_sr * self.speaker_length) + 1) + else: + start = 0 + end = min(start + self.speaker_sr * self.speaker_length, total_length) + + audio_clip[:end-start] = audio[0, start: end] + + return audio_clip + + def __getitem__(self, index): + row = self.meta.iloc[index] + + if self.plugin_mode: + audio_clip, content_clip, f0_clip = [''], [''], [''] + else: + # load current audio + audio_path = row['audio_path'] + content_path = row['content_path'] + f0_path = row['f0_path'] + audio_clip, content_clip, f0_clip = self.get_audio_content(audio_path, content_path, f0_path) + + # get speaker + if self.subset == 'train': + speaker = row['speaker'] + else: + speaker = row['speaker_val'] + + speaker_row = self.meta[self.meta['speaker'] == speaker].sample(1) + speaker_path = speaker_row.iloc[0]['speaker_path'] + speaker_clip = self.get_speaker(speaker_path) + # print(speaker_clip.shape) + # print(speaker_path) + # print(speaker) + + # get prompt + prompts = self.prompts[self.prompts['ID'] == speaker]['prompts'].iloc[0] + prompts = ast.literal_eval(prompts) + prompt = random.choice(prompts) + + return audio_clip, content_clip, f0_clip, speaker_clip, prompt + + def __len__(self): + return len(self.meta) + + +if __name__ == '__main__': + from tqdm import tqdm + data = VCData('../../features/', '../../data/meta_val.csv', 'val', '../../data/speaker_gender.csv') + for i in tqdm(range(len(data))): + x = data[i] + # print(x[-1]) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/freevc/.gitattributes b/dreamvoice/train_utils/src/freevc/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..717eda91d34e790b2de5140dd1c46748bdddef26 --- /dev/null +++ 
b/dreamvoice/train_utils/src/freevc/.gitattributes @@ -0,0 +1,34 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/dreamvoice/train_utils/src/freevc/.gitignore b/dreamvoice/train_utils/src/freevc/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e4008401fb75eb82773c4bdb3f4b886e2e6d34c4 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +flagged \ No newline at end of file diff --git a/dreamvoice/train_utils/src/freevc/README.md b/dreamvoice/train_utils/src/freevc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..663ea823d354d9634023a02ba8d7e6b55e7108f9 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/README.md @@ -0,0 +1,13 @@ +--- +title: FreeVC +emoji: 🚀 +colorFrom: gray +colorTo: red +sdk: gradio +sdk_version: 3.13.0 +app_file: app.py +pinned: false +license: mit +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/dreamvoice/train_utils/src/freevc/app.py b/dreamvoice/train_utils/src/freevc/app.py new file mode 100644 index 0000000000000000000000000000000000000000..982821f01caea503d8451f6c8e99096918705d79 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/app.py @@ -0,0 +1,92 @@ +import os +import torch +import librosa +import gradio as gr +from scipy.io.wavfile import write +from transformers import WavLMModel + +import utils +from models import SynthesizerTrn +from mel_processing import mel_spectrogram_torch +from speaker_encoder.voice_encoder import SpeakerEncoder + +''' +def get_wavlm(): + os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU') + shutil.move('WavLM-Large.pt', 'wavlm') +''' + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# print("Loading FreeVC...") +# hps = utils.get_hparams_from_file("configs/freevc.json") +# freevc = SynthesizerTrn( +# hps.data.filter_length // 2 + 1, +# hps.train.segment_size // hps.data.hop_length, +# **hps.model).to(device) +# _ = freevc.eval() +# _ = 
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) +smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') + +print("Loading FreeVC(24k)...") +hps = utils.get_hparams_from_file("configs/freevc-24.json") +freevc_24 = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) +_ = freevc_24.eval() +_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None) + +# print("Loading FreeVC-s...") +# hps = utils.get_hparams_from_file("configs/freevc-s.json") +# freevc_s = SynthesizerTrn( +# hps.data.filter_length // 2 + 1, +# hps.train.segment_size // hps.data.hop_length, +# **hps.model).to(device) +# _ = freevc_s.eval() +# _ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None) +# +# print("Loading WavLM for content...") +cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + +def convert(model, cmodel, src, tgt): + with torch.no_grad(): + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + # infer + if model == "FreeVC": + audio = freevc.infer(c, g=g_tgt) + elif model == "FreeVC-s": + audio = freevc_s.infer(c, mel=mel_tgt) + else: + audio = freevc_24.infer(c, g=g_tgt) + audio = audio[0][0].data.cpu().float().numpy() + if model == "FreeVC" or model == "FreeVC-s": + write("out.wav", hps.data.sampling_rate, audio) + else: + write("out.wav", 24000, audio) + out = "out.wav" + return out + +# model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model") +# audio1 = gr.inputs.Audio(label="Source Audio", type='filepath') +# audio2 = gr.inputs.Audio(label="Reference Audio", type='filepath') +# inputs = [model, audio1, audio2] +# outputs = gr.outputs.Audio(label="Output Audio", type='filepath') +# +# title = "FreeVC" +# description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please trim it before submitting." +# article = "
Paper | Github Repo
" +# +# examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']] +# +# gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch() +convert(freevc_24, cmodel, 'p225_001.wav', 'p226_002.wav') \ No newline at end of file diff --git a/dreamvoice/train_utils/src/freevc/commons.py b/dreamvoice/train_utils/src/freevc/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..19a72264e8d69ca5525337c27c5a3203653b63e1 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/commons.py @@ -0,0 +1,171 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def rand_spec_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d( + length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (num_timescales - 1)) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + 
signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1. 
/ norm_type) + return total_norm diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc-24.json b/dreamvoice/train_utils/src/freevc/configs/freevc-24.json new file mode 100644 index 0000000000000000000000000000000000000000..91afef364d2a94757408e972c75fa29bb4439af2 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/configs/freevc-24.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8640, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8008" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,4,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc-s.json b/dreamvoice/train_utils/src/freevc/configs/freevc-s.json new file mode 100644 index 0000000000000000000000000000000000000000..e1eb790bae9497768154c9e23955bbeb1a7445a1 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/configs/freevc-s.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": false + } +} diff --git a/dreamvoice/train_utils/src/freevc/configs/freevc.json b/dreamvoice/train_utils/src/freevc/configs/freevc.json new file mode 100644 index 0000000000000000000000000000000000000000..062ced66de9f20918ff02abdd61187043c02e6c1 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/configs/freevc.json @@ -0,0 +1,54 @@ +{ + "train": { + "log_interval": 200, + "eval_interval": 10000, + "seed": 1234, + "epochs": 10000, + "learning_rate": 2e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 64, + 
"fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 8960, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 128, + "port": "8001" + }, + "data": { + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", + "max_wav_value": 32768.0, + "sampling_rate": 16000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 256, + "ssl_dim": 1024, + "use_spk": true + } +} diff --git a/dreamvoice/train_utils/src/freevc/mel_processing.py b/dreamvoice/train_utils/src/freevc/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..f99e8bf8a632655181a2ce41fd325e7ebec52f54 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/mel_processing.py @@ -0,0 +1,112 @@ +import math +import os +import random +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.data +import numpy as np +import librosa +import librosa.util as librosa_util +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + '_' + str(spec.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, 
fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/dreamvoice/train_utils/src/freevc/models.py b/dreamvoice/train_utils/src/freevc/models.py new file mode 100644 index 0000000000000000000000000000000000000000..11d3247337c6cd49351490c7f17cb33cea52e361 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/models.py @@ -0,0 +1,351 @@ +import copy +import math +import torch +from torch import nn +from torch.nn import functional as F + +from .commons import sequence_mask, rand_slice_segments +from .modules import ResidualCouplingLayer, WN, Flip, ResBlock1, ResBlock2, LRELU_SLOPE + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .commons import init_weights, get_padding + + +class ResidualCouplingBlock(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class Encoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = 
n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) + resblock = ResBlock1 if resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append(weight_norm( + ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), + k, u, padding=(k-u)//2))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x) + else: + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, 
c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2,3,5,7,11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class SpeakerEncoder(torch.nn.Module): + def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): + mel_slices = [] + for i in range(0, total_frames-partial_frames, partial_hop): + mel_range = torch.arange(i, i+partial_frames) + mel_slices.append(mel_range) + + return mel_slices + + def embed_utterance(self, mel, partial_frames=128, partial_hop=64): + mel_len = mel.size(1) + last_mel = mel[:,-partial_frames:] + + if mel_len > partial_frames: + mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) + mels = list(mel[:,s] for s in mel_slices) + mels.append(last_mel) + mels = torch.stack(tuple(mels), 0).squeeze(1) + + with torch.no_grad(): + partial_embeds = self(mels) + embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) + #embed = embed / torch.linalg.norm(embed, 2) + else: + with torch.no_grad(): + embed = self(last_mel) + + return embed + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + 
upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + use_spk, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + self.use_spk = use_spk + + self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16) + self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + if not self.use_spk: + self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels) + + def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if spec_lengths == None: + spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) + + if not self.use_spk: + g = self.enc_spk(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + _, m_p, logs_p, _ = self.enc_p(c, c_lengths) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + z_p = self.flow(z, spec_mask, g=g) + + z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size) + o = self.dec(z_slice, g=g) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, c, g=None, mel=None, c_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if not self.use_spk: + g = self.enc_spk.embed_utterance(mel.transpose(1,2)) + g = g.unsqueeze(-1) + + z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g) + + return o diff --git a/dreamvoice/train_utils/src/freevc/modules.py b/dreamvoice/train_utils/src/freevc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..53a51558f78899cb0e77c595fe2ca9b3d3c762f5 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/modules.py @@ -0,0 +1,341 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, 
-1) + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers-1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size ** i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, + groups=channels, dilation=dilation, padding=padding + )) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + self.hidden_channels =hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + 
res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + else: + g_l = torch.zeros_like(x_in) + + acts = fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:,:self.hidden_channels,:] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:,self.hidden_channels:,:] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + 
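+# Illustrative sanity check (a sketch, not used anywhere in this repo): both residual block
+# variants are shape-preserving, since every Conv1d runs with stride 1 and "same"-style
+# padding from get_padding(), e.g.
+#
+#     >>> block = ResBlock1(channels=64, kernel_size=3, dilation=(1, 3, 5))
+#     >>> x = torch.randn(2, 64, 100)
+#     >>> tuple(block(x).shape)
+#     (2, 64, 100)
+#
+# This is what allows the HiFi-GAN style Generator in models.py to sum num_kernels such
+# blocks after each ConvTranspose1d stage and divide by num_kernels, while the transposed
+# convolutions alone set the overall upsampling factor (the product of upsample_rates).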
+class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels,1)) + self.logs = nn.Parameter(torch.zeros(channels,1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1,2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels]*2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1,2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/dreamvoice/train_utils/src/freevc/requirements.txt b/dreamvoice/train_utils/src/freevc/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..acb6e357a9135378fe36583db58af502f840078c --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/requirements.txt @@ -0,0 +1,8 @@ +altair +httpx==0.24.1 +numpy +scipy +torch +transformers +librosa +webrtcvad==2.0.10 diff --git a/dreamvoice/train_utils/src/freevc/utils.py b/dreamvoice/train_utils/src/freevc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e931b1f56a976674425c5637b0767d3485c51f69 --- /dev/null +++ b/dreamvoice/train_utils/src/freevc/utils.py @@ -0,0 +1,305 @@ +import os +import sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch +from torch.nn import functional as F +from .commons import sequence_mask + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def get_cmodel(rank): + checkpoint = torch.load('wavlm/WavLM-Large.pt') + cfg = WavLMConfig(checkpoint['cfg']) + cmodel = 
WavLM(cfg).cuda(rank) + cmodel.load_state_dict(checkpoint['model']) + cmodel.eval() + return cmodel + + +def get_content(cmodel, y): + with torch.no_grad(): + c = cmodel.extract_features(y.squeeze(1))[0] + c = c.transpose(1, 2) + return c + + +def get_vocoder(rank): + with open("hifigan/config.json", "r") as f: + config = json.load(f) + config = hifigan.AttrDict(config) + vocoder = hifigan.Generator(config) + ckpt = torch.load("hifigan/generator_v1") + vocoder.load_state_dict(ckpt["generator"]) + vocoder.eval() + vocoder.remove_weight_norm() + vocoder.cuda(rank) + return vocoder + + +def transform(mel, height): # 68-92 + #r = np.random.random() + #rate = r * 0.3 + 0.85 # 0.85-1.15 + #height = int(mel.size(-2) * rate) + tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1))) + if height >= mel.size(-2): + return tgt[:, :mel.size(-2), :] + else: + silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1) + silence += torch.randn_like(silence) / 10 + return torch.cat((tgt, silence), 1) + + +def stretch(mel, width): # 0.5-2 + return torchvision.transforms.functional.resize(mel, (mel.size(-2), width)) + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + if optimizer is not None: + optimizer.load_state_dict(checkpoint_dict['optimizer']) + saved_state_dict = checkpoint_dict['model'] + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict= {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logger.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logger.info("Loaded checkpoint '{}' (iteration {})" .format( + checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info("Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path)) + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save({'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate}, checkpoint_path) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = 
plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding='utf-8') as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default="./configs/base.json", + help='JSON file for configuration') + parser.add_argument('-m', '--model', type=str, required=True, + help='Model name') + + args = parser.parse_args() + model_dir = os.path.join("./logs", args.model) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + config_path = args.config + config_save_path = os.path.join(model_dir, "config.json") + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams =HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + )) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/dreamvoice/train_utils/src/freevc_wrapper.py b/dreamvoice/train_utils/src/freevc_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..11a46fa184110368939cfc1bf3cc9f47a9c8092d --- /dev/null +++ b/dreamvoice/train_utils/src/freevc_wrapper.py @@ -0,0 +1,63 @@ +import os +import torch +import librosa +import soundfile as sf +from pathlib import Path + +from transformers import WavLMModel +from freevc.utils import load_checkpoint, get_hparams_from_file +from freevc.models import SynthesizerTrn +# from mel_processing import mel_spectrogram_torch +# from free_vc.speaker_encoder.voice_encoder import SpeakerEncoder +from speaker_encoder.voice_encoder import SpeakerEncoder + + +def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'): + hps = get_hparams_from_file("freevc/configs/freevc.json") + freevc = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).to(device) + freevc.eval() + load_checkpoint("../prepare_freevc/ckpts/freevc.pth", freevc, None) + + cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) + cmodel.eval() + + # smodel = spk_encoder.load_model(Path(speaker_path), device) + # smodel = spk_encoder.load_model(Path(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt"), 'cuda') + smodel = SpeakerEncoder(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt", device) + + return freevc, cmodel, smodel, hps + + +@torch.no_grad() +def convert(freevc, content, speaker): + audio = freevc.infer(content, g=speaker) + audio = audio[0][0].data.cpu().float().numpy() + return audio, 24000 + + +if __name__ == '__main__': + freevc_24, cmodel, smodel, hps = get_freevc_models() + + tgt = 'p226_002.wav' + # src = 'p226_002.wav' + src = 'p225_001.wav' + device = 'cuda' + + # tgt + wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate) + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + g_tgt = smodel.embed_utterance(wav_tgt) + g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device) + # g_tgt = spk_encoder.embed_utterance_batch(torch.tensor(wav_tgt).unsqueeze(0).cuda()) + + # src + wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device) + content = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device) + + output, sr 
= convert(freevc_24, content, g_tgt) + + sf.write('output.wav', output, sr) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/inference_freevc.py b/dreamvoice/train_utils/src/inference_freevc.py new file mode 100644 index 0000000000000000000000000000000000000000..0bdf218f87549460a47cb224c509c3f0fc80d6b0 --- /dev/null +++ b/dreamvoice/train_utils/src/inference_freevc.py @@ -0,0 +1,124 @@ +import os +import torch +import soundfile as sf +import pandas as pd +import librosa +from utils import minmax_norm_diff, reverse_minmax_norm_diff, scale_shift_re +from freevc_wrapper import convert +import time + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +@torch.no_grad() +def inference_timbre(gen_shape, text, + model, scheduler, + guidance_scale=5, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2023, + device='cuda', + ): + text, text_mask = text + model.eval() + + if random_seed is not None: + generator = torch.Generator(device=device).manual_seed(random_seed) + else: + generator = torch.Generator(device=device) + generator.seed() + + scheduler.set_timesteps(ddim_steps) + + # init noise + noise = torch.randn(gen_shape, generator=generator, device=device) + latents = noise + + for t in scheduler.timesteps: + latents = scheduler.scale_model_input(latents, t) + + if guidance_scale: + output_text = model(latents, t, text, text_mask, train_cfg=False) + output_uncond = model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0) + + output_pred = output_uncond + guidance_scale * (output_text - output_uncond) + if guidance_rescale > 0.0: + output_pred = rescale_noise_cfg(output_pred, output_text, + guidance_rescale=guidance_rescale) + else: + output_pred = model(latents, t, text, text_mask, train_cfg=False) + + latents = scheduler.step(model_output=output_pred, timestep=t, sample=latents, + eta=eta, generator=generator).prev_sample + + # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5) + pred = scale_shift_re(latents, 20, -0.035) + pred = torch.clip(pred, min=0.0, max=0.5) + return pred + + +@torch.no_grad() +def eval_plugin(freevc, cmodel, text_model, + timbre_model, timbre_scheduler, timbre_shape, + val_meta, val_folder, + guidance_scale=3, guidance_rescale=0.7, + ddim_steps=50, eta=1, random_seed=2024, + device='cuda', + epoch=0, save_path='logs/eval/', val_num=10, sr=16000): + + tokenizer, text_encoder = text_model + + df = pd.read_csv(val_meta) + + save_path = save_path + str(epoch) + '/' + os.makedirs(save_path, exist_ok=True) + + step = 0 + + for i in range(len(df)): + row = df.iloc[i] + + source_path = val_folder + row['path'] + # prompt = [row['prompt']] + prompt = ["female's voice"] + with torch.no_grad(): + text_batch = tokenizer(prompt, + max_length=32, + padding='max_length', truncation=True, return_tensors="pt") + text, 
text_mask = text_batch.input_ids.to(device), \ + text_batch.attention_mask.to(device) + text = text_encoder(input_ids=text, attention_mask=text_mask)[0] + + audio_clip = librosa.load(source_path, sr=16000)[0] + audio_clip = torch.tensor(audio_clip).unsqueeze(0).to(device) + + content = cmodel(audio_clip).last_hidden_state.transpose(1, 2).to(device) + + # start_time = time.time() + spk_embed = inference_timbre(timbre_shape, [text, text_mask], + timbre_model, timbre_scheduler, + guidance_scale=guidance_scale, guidance_rescale=guidance_rescale, + ddim_steps=ddim_steps, eta=eta, random_seed=random_seed, + device=device) + spk_embed = spk_embed.squeeze(-1) + + output, out_sr = convert(freevc, content, spk_embed) + # end_time = time.time() + # print(end_time-start_time) + # print(pred.shape) + sf.write(save_path + f'{step}_{prompt[0]}' + '.wav', output, samplerate=sr) + + step += 1 + + if step >= val_num: + break diff --git a/dreamvoice/train_utils/src/speaker_encoder/__init__.py b/dreamvoice/train_utils/src/speaker_encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dreamvoice/train_utils/src/speaker_encoder/audio.py b/dreamvoice/train_utils/src/speaker_encoder/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..dfb47c9e72f3364d8317b79a80ce62030d2403fd --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/audio.py @@ -0,0 +1,107 @@ +from scipy.ndimage.morphology import binary_dilation +from speaker_encoder.params_data import * +from pathlib import Path +from typing import Optional, Union +import numpy as np +import webrtcvad +import librosa +import struct + +int16_max = (2 ** 15) - 1 + + +def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], + source_sr: Optional[int] = None): + """ + Applies the preprocessing operations used in training the Speaker Encoder to a waveform + either on disk or in memory. The waveform will be resampled to match the data hyperparameters. + + :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not + just .wav), either the waveform as a numpy array of floats. + :param source_sr: if passing an audio waveform, the sampling rate of the waveform before + preprocessing. After preprocessing, the waveform's sampling rate will match the data + hyperparameters. If passing a filepath, the sampling rate will be automatically detected and + this argument will be ignored. + """ + # Load the wav from disk if needed + if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): + wav, source_sr = librosa.load(fpath_or_wav, sr=None) + else: + wav = fpath_or_wav + + # Resample the wav if needed + if source_sr is not None and source_sr != sampling_rate: + wav = librosa.resample(wav, source_sr, sampling_rate) + + # Apply the preprocessing: normalize volume and shorten long silences + wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) + wav = trim_long_silences(wav) + + return wav + + +def wav_to_mel_spectrogram(wav): + """ + Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. + Note: this not a log-mel spectrogram. 
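+    With the mel parameters used here (16 kHz audio, 25 ms windows, 10 ms hop, 40 mel
+    channels) the returned array has shape (n_frames, mel_n_channels), i.e. roughly
+    100 frames per second of audio.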
+ """ + frames = librosa.feature.melspectrogram( + y=wav, + sr=sampling_rate, + n_fft=int(sampling_rate * mel_window_length / 1000), + hop_length=int(sampling_rate * mel_window_step / 1000), + n_mels=mel_n_channels + ) + return frames.astype(np.float32).T + + +def trim_long_silences(wav): + """ + Ensures that segments without voice in the waveform remain no longer than a + threshold determined by the VAD parameters in params.py. + + :param wav: the raw waveform as a numpy array of floats + :return: the same waveform with silences trimmed away (length <= original wav length) + """ + # Compute the voice detection window size + samples_per_window = (vad_window_length * sampling_rate) // 1000 + + # Trim the end of the audio to have a multiple of the window size + wav = wav[:len(wav) - (len(wav) % samples_per_window)] + + # Convert the float waveform to 16-bit mono PCM + pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) + + # Perform voice activation detection + voice_flags = [] + vad = webrtcvad.Vad(mode=3) + for window_start in range(0, len(wav), samples_per_window): + window_end = window_start + samples_per_window + voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], + sample_rate=sampling_rate)) + voice_flags = np.array(voice_flags) + + # Smooth the voice detection with a moving average + def moving_average(array, width): + array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) + ret = np.cumsum(array_padded, dtype=float) + ret[width:] = ret[width:] - ret[:-width] + return ret[width - 1:] / width + + audio_mask = moving_average(voice_flags, vad_moving_average_width) + audio_mask = np.round(audio_mask).astype(np.bool) + + # Dilate the voiced regions + audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) + audio_mask = np.repeat(audio_mask, samples_per_window) + + return wav[audio_mask == True] + + +def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): + if increase_only and decrease_only: + raise ValueError("Both increase only and decrease only are set") + dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) + if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): + return wav + return wav * (10 ** (dBFS_change / 20)) diff --git a/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt b/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt new file mode 100644 index 0000000000000000000000000000000000000000..662d22b686114b4b6124330a688007d9495d22c8 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/ckpt/pretrained_bak_5805000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca +size 17090379 diff --git a/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py b/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..e45430c7d03d160dc64d450c1af81180f419eb51 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/compute_embed.py @@ -0,0 +1,40 @@ +from speaker_encoder import inference as encoder +from multiprocessing.pool import Pool +from functools import partial +from pathlib import Path +# from utils import logmmse +# from tqdm import tqdm +# import numpy as np +# import librosa + + +def embed_utterance(fpaths, encoder_model_fpath): + if not encoder.is_loaded(): + 
encoder.load_model(encoder_model_fpath) + + # Compute the speaker embedding of the utterance + wav_fpath, embed_fpath = fpaths + wav = np.load(wav_fpath) + wav = encoder.preprocess_wav(wav) + embed = encoder.embed_utterance(wav) + np.save(embed_fpath, embed, allow_pickle=False) + + +def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int): + + wav_dir = outdir_root.joinpath("audio") + metadata_fpath = synthesizer_root.joinpath("train.txt") + assert wav_dir.exists() and metadata_fpath.exists() + embed_dir = synthesizer_root.joinpath("embeds") + embed_dir.mkdir(exist_ok=True) + + # Gather the input wave filepath and the target output embed filepath + with metadata_fpath.open("r") as metadata_file: + metadata = [line.split("|") for line in metadata_file] + fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] + + # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. + # Embed the utterances in separate threads + func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) + job = Pool(n_processes).imap(func, fpaths) + list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/config.py b/dreamvoice/train_utils/src/speaker_encoder/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d12228c81152487da24a6090e5a736f9de0755b0 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/config.py @@ -0,0 +1,45 @@ +librispeech_datasets = { + "train": { + "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], + "other": ["LibriSpeech/train-other-500"] + }, + "test": { + "clean": ["LibriSpeech/test-clean"], + "other": ["LibriSpeech/test-other"] + }, + "dev": { + "clean": ["LibriSpeech/dev-clean"], + "other": ["LibriSpeech/dev-other"] + }, +} +libritts_datasets = { + "train": { + "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], + "other": ["LibriTTS/train-other-500"] + }, + "test": { + "clean": ["LibriTTS/test-clean"], + "other": ["LibriTTS/test-other"] + }, + "dev": { + "clean": ["LibriTTS/dev-clean"], + "other": ["LibriTTS/dev-other"] + }, +} +voxceleb_datasets = { + "voxceleb1" : { + "train": ["VoxCeleb1/wav"], + "test": ["VoxCeleb1/test_wav"] + }, + "voxceleb2" : { + "train": ["VoxCeleb2/dev/aac"], + "test": ["VoxCeleb2/test_wav"] + } +} + +other_datasets = [ + "LJSpeech-1.1", + "VCTK-Corpus/wav48", +] + +anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..740f750a9746e5ace34f1bf875d9ac07677e1ed6 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/__init__.py @@ -0,0 +1,2 @@ +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py new file mode 100644 index 0000000000000000000000000000000000000000..7e5cf738d3ca5214034ce3babdedf6eaea64c469 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/random_cycler.py @@ -0,0 +1,37 @@ +import random + +class 
RandomCycler: + """ + Creates an internal copy of a sequence and allows access to its items in a constrained random + order. For a source sequence of n items and one or several consecutive queries of a total + of m items, the following guarantees hold (one implies the other): + - Each item will be returned between m // n and ((m - 1) // n) + 1 times. + - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. + """ + + def __init__(self, source): + if len(source) == 0: + raise Exception("Can't create RandomCycler from an empty collection") + self.all_items = list(source) + self.next_items = [] + + def sample(self, count: int): + shuffle = lambda l: random.sample(l, len(l)) + + out = [] + while count > 0: + if count >= len(self.all_items): + out.extend(shuffle(list(self.all_items))) + count -= len(self.all_items) + continue + n = min(count, len(self.next_items)) + out.extend(self.next_items[:n]) + count -= n + self.next_items = self.next_items[n:] + if len(self.next_items) == 0: + self.next_items = shuffle(list(self.all_items)) + return out + + def __next__(self): + return self.sample(1)[0] + diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..cb320b211f0de5b3a6fbb83380d8a8b9677151b2 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker.py @@ -0,0 +1,40 @@ +from speaker_encoder.data_objects.random_cycler import RandomCycler +from speaker_encoder.data_objects.utterance import Utterance +from pathlib import Path + +# Contains the set of utterances of a single speaker +class Speaker: + def __init__(self, root: Path): + self.root = root + self.name = root.name + self.utterances = None + self.utterance_cycler = None + + def _load_utterances(self): + with self.root.joinpath("_sources.txt").open("r") as sources_file: + sources = [l.split(",") for l in sources_file] + sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} + self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] + self.utterance_cycler = RandomCycler(self.utterances) + + def random_partial(self, count, n_frames): + """ + Samples a batch of unique partial utterances from the disk in a way that all + utterances come up at least once every two cycles and in a random order every time. + + :param count: The number of partial utterances to sample from the set of utterances from + that speaker. Utterances are guaranteed not to be repeated if is not larger than + the number of utterances available. + :param n_frames: The number of frames in the partial utterance. + :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, + frames are the frames of the partial utterances and range is the range of the partial + utterance with regard to the complete utterance. 
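+        For example (illustrative values), random_partial(count=4, n_frames=160) returns four
+        tuples (utterance, frames, (start, end)), where each frames array has shape
+        (n_frames, mel_n_channels).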
+ """ + if self.utterances is None: + self._load_utterances() + + utterances = self.utterance_cycler.sample(count) + + a = [(u,) + u.random_partial(n_frames) for u in utterances] + + return a diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..d2dd5493a599e74cea594510af94015464072cb3 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_batch.py @@ -0,0 +1,12 @@ +import numpy as np +from typing import List +from speaker_encoder.data_objects.speaker import Speaker + +class SpeakerBatch: + def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): + self.speakers = speakers + self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} + + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with + # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) + self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..be4568923a21e8f28a229899e137d0186e0b1250 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/speaker_verification_dataset.py @@ -0,0 +1,56 @@ +from speaker_encoder.data_objects.random_cycler import RandomCycler +from speaker_encoder.data_objects.speaker_batch import SpeakerBatch +from speaker_encoder.data_objects.speaker import Speaker +from speaker_encoder.params_data import partials_n_frames +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +# TODO: improve with a pool of speakers for data efficiency + +class SpeakerVerificationDataset(Dataset): + def __init__(self, datasets_root: Path): + self.root = datasets_root + speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] + if len(speaker_dirs) == 0: + raise Exception("No speakers found. 
Make sure you are pointing to the directory " + "containing all preprocessed speaker directories.") + self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] + self.speaker_cycler = RandomCycler(self.speakers) + + def __len__(self): + return int(1e10) + + def __getitem__(self, index): + return next(self.speaker_cycler) + + def get_logs(self): + log_string = "" + for log_fpath in self.root.glob("*.txt"): + with log_fpath.open("r") as log_file: + log_string += "".join(log_file.readlines()) + return log_string + + +class SpeakerVerificationDataLoader(DataLoader): + def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, + batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, + worker_init_fn=None): + self.utterances_per_speaker = utterances_per_speaker + + super().__init__( + dataset=dataset, + batch_size=speakers_per_batch, + shuffle=False, + sampler=sampler, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=self.collate, + pin_memory=pin_memory, + drop_last=False, + timeout=timeout, + worker_init_fn=worker_init_fn + ) + + def collate(self, speakers): + return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) + \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py b/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py new file mode 100644 index 0000000000000000000000000000000000000000..ff3185ec781eaf5be2a58d61c22b32586d366126 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/data_objects/utterance.py @@ -0,0 +1,26 @@ +import numpy as np + + +class Utterance: + def __init__(self, frames_fpath, wave_fpath): + self.frames_fpath = frames_fpath + self.wave_fpath = wave_fpath + + def get_frames(self): + return np.load(self.frames_fpath) + + def random_partial(self, n_frames): + """ + Crops the frames into a partial utterance of n_frames + + :param n_frames: The number of frames of the partial utterance + :return: the partial utterance frames and a tuple indicating the start and end of the + partial utterance in the complete utterance. + """ + frames = self.get_frames() + if frames.shape[0] == n_frames: + start = 0 + else: + start = np.random.randint(0, frames.shape[0] - n_frames) + end = start + n_frames + return frames[start:end], (start, end) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/hparams.py b/dreamvoice/train_utils/src/speaker_encoder/hparams.py new file mode 100644 index 0000000000000000000000000000000000000000..ac64bcc3bd9ec490e988ac894de93921ba20f607 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/hparams.py @@ -0,0 +1,31 @@ +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. 
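+# Rough intuition: with the 30 ms VAD window above, silence gaps of up to about
+# 6 * 30 ms = 180 ms between voiced regions are bridged and kept (via the binary dilation
+# in audio.py), while the 8-window moving average smooths speech/non-speech decisions over
+# roughly 240 ms of context.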
+vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/inference.py b/dreamvoice/train_utils/src/speaker_encoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c5662912a7cc0eb8818732d0b1d233ba1b195ec7 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/inference.py @@ -0,0 +1,177 @@ +from speaker_encoder.params_data import * +from speaker_encoder.model import SpeakerEncoder +from speaker_encoder.audio import preprocess_wav # We want to expose this function from here +from matplotlib import cm +from speaker_encoder import audio +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import torch + +_model = None # type: SpeakerEncoder +_device = None # type: torch.device + + +def load_model(weights_fpath: Path, device=None): + """ + Loads the model in memory. If this function is not explicitely called, it will be run on the + first call to embed_frames() with the default weights file. + + :param weights_fpath: the path to saved model weights. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. + If None, will default to your GPU if it"s available, otherwise your CPU. + """ + # TODO: I think the slow loading of the encoder might have something to do with the device it + # was saved on. Worth investigating. + global _model, _device + if device is None: + _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + _device = torch.device(device) + _model = SpeakerEncoder(_device, torch.device("cpu")) + checkpoint = torch.load(weights_fpath) + _model.load_state_dict(checkpoint["model_state"]) + _model.eval() + print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) + + +def is_loaded(): + return _model is not None + + +def embed_frames_batch(frames_batch): + """ + Computes embeddings for a batch of mel spectrogram. + + :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape + (batch_size, n_frames, n_channels) + :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) + """ + if _model is None: + raise Exception("Model was not loaded. Call load_model() before inference.") + + frames = torch.from_numpy(frames_batch).to(_device) + embed = _model.forward(frames).detach().cpu().numpy() + return embed + + +def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, + min_pad_coverage=0.75, overlap=0.5): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those + defined in params_data.py. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 
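+
+    Worked example (illustrative, for 16 kHz audio with a 10 ms hop): a 1.5 s utterance is
+    24000 samples, i.e. about 150 frames, so with the default partial_utterance_n_frames=160
+    a single mel slice [0, 160) and wav slice [0, 25600) are returned, and the caller pads
+    the waveform with 0.1 s of zeros to cover that last slice.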
+ + :param n_samples: the number of samples in the waveform + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + utterance + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + utterance, this parameter is ignored so that the function always returns at least 1 slice. + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. + """ + assert 0 <= overlap < 1 + assert 0 < min_pad_coverage <= 1 + + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partial_utterance_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_pad_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + +def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): + """ + Computes an embedding for a single utterance. + + # TODO: handle multiple wavs to benefit from batching on GPU + :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire + spectogram to the network. + :param return_partials: if True, the partial embeddings will also be returned along with the + wav slices that correspond to the partial embeddings. + :param kwargs: additional arguments to compute_partial_splits() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None + instead. 
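+
+    Example (an illustrative sketch; the file paths are placeholders):
+
+        load_model(Path("speaker_encoder/ckpt/pretrained_bak_5805000.pt"))
+        wav = preprocess_wav("utterance.wav")
+        embed = embed_utterance(wav)  # float32, shape (model_embedding_size,), unit L2 norm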
+ """ + # Process the entire utterance if not using partials + if not using_partials: + frames = audio.wav_to_mel_spectrogram(wav) + embed = embed_frames_batch(frames[None, ...])[0] + if return_partials: + return embed, None, None + return embed + + # Compute where to split the utterance into partials and pad if necessary + wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) + max_wave_length = wave_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials + frames = audio.wav_to_mel_spectrogram(wav) + frames_batch = np.array([frames[s] for s in mel_slices]) + partial_embeds = embed_frames_batch(frames_batch) + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wave_slices + return embed + + +def embed_speaker(wavs, **kwargs): + raise NotImplemented() + + +def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + if ax is None: + ax = plt.gca() + + if shape is None: + height = int(np.sqrt(len(embed))) + shape = (height, -1) + embed = embed.reshape(shape) + + cmap = cm.get_cmap() + mappable = ax.imshow(embed, cmap=cmap) + cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) + cbar.set_clim(*color_range) + + ax.set_xticks([]), ax.set_yticks([]) + ax.set_title(title) diff --git a/dreamvoice/train_utils/src/speaker_encoder/model.py b/dreamvoice/train_utils/src/speaker_encoder/model.py new file mode 100644 index 0000000000000000000000000000000000000000..4493a98b217e4bd082940cbe4d31b8169f18b5d9 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/model.py @@ -0,0 +1,135 @@ +from speaker_encoder.params_model import * +from speaker_encoder.params_data import * +from scipy.interpolate import interp1d +from sklearn.metrics import roc_curve +from torch.nn.utils import clip_grad_norm_ +from scipy.optimize import brentq +from torch import nn +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, device, loss_device): + super().__init__() + self.loss_device = loss_device + + # Network defition + self.lstm = nn.LSTM(input_size=mel_n_channels, # 40 + hidden_size=model_hidden_size, # 256 + num_layers=model_num_layers, # 3 + batch_first=True).to(device) + self.linear = nn.Linear(in_features=model_hidden_size, + out_features=model_embedding_size).to(device) + self.relu = torch.nn.ReLU().to(device) + + # Cosine similarity scaling (with fixed initial parameter values) + self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) + self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) + + # Loss + self.loss_fn = nn.CrossEntropyLoss().to(loss_device) + + def do_gradient_ops(self): + # Gradient scale + self.similarity_weight.grad *= 0.01 + self.similarity_bias.grad *= 0.01 + + # Gradient clipping + clip_grad_norm_(self.parameters(), 3, norm_type=2) + + def forward(self, utterances, hidden_init=None): + """ + Computes the embeddings of a batch of utterance spectrograms. + + :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape + (batch_size, n_frames, n_channels) + :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, + batch_size, hidden_size). Will default to a tensor of zeros if None. 
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size) + """ + # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state + # and the final cell state. + out, (hidden, cell) = self.lstm(utterances, hidden_init) + + # We take only the hidden state of the last layer + embeds_raw = self.relu(self.linear(hidden[-1])) + + # L2-normalize it + embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + return embeds + + def similarity_matrix(self, embeds): + """ + Computes the similarity matrix according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the similarity matrix as a tensor of shape (speakers_per_batch, + utterances_per_speaker, speakers_per_batch) + """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation + centroids_incl = torch.mean(embeds, dim=1, keepdim=True) + centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) + + # Exclusive centroids (1 per utterance) + centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) + centroids_excl /= (utterances_per_speaker - 1) + centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) + + # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot + # product of these vectors (which is just an element-wise multiplication reduced by a sum). + # We vectorize the computation for efficiency. + sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, + speakers_per_batch).to(self.loss_device) + mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) + for j in range(speakers_per_batch): + mask = np.where(mask_matrix[j])[0] + sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) + sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) + + ## Even more vectorized version (slower maybe because of transpose) + # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker + # ).to(self.loss_device) + # eye = np.eye(speakers_per_batch, dtype=np.int) + # mask = np.where(1 - eye) + # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) + # mask = np.where(eye) + # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) + # sim_matrix2 = sim_matrix2.transpose(1, 2) + + sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias + return sim_matrix + + def loss(self, embeds): + """ + Computes the softmax loss according the section 2.1 of GE2E. + + :param embeds: the embeddings as a tensor of shape (speakers_per_batch, + utterances_per_speaker, embedding_size) + :return: the loss and the EER for this batch of embeddings. 
+ """ + speakers_per_batch, utterances_per_speaker = embeds.shape[:2] + + # Loss + sim_matrix = self.similarity_matrix(embeds) + sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, + speakers_per_batch)) + ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) + target = torch.from_numpy(ground_truth).long().to(self.loss_device) + loss = self.loss_fn(sim_matrix, target) + + # EER (not backpropagated) + with torch.no_grad(): + inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] + labels = np.array([inv_argmax(i) for i in ground_truth]) + preds = sim_matrix.detach().cpu().numpy() + + # Snippet from https://yangcha.github.io/EER-ROC/ + fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) + eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + return loss, eer \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/params_data.py b/dreamvoice/train_utils/src/speaker_encoder/params_data.py new file mode 100644 index 0000000000000000000000000000000000000000..676e6dc197faf01648de7a830140172d5594b999 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/params_data.py @@ -0,0 +1,29 @@ + +## Mel-filterbank +mel_window_length = 25 # In milliseconds +mel_window_step = 10 # In milliseconds +mel_n_channels = 40 + + +## Audio +sampling_rate = 16000 +# Number of spectrogram frames in a partial utterance +partials_n_frames = 160 # 1600 ms +# Number of spectrogram frames at inference +inference_n_frames = 80 # 800 ms + + +## Voice Activation Detection +# Window size of the VAD. Must be either 10, 20 or 30 milliseconds. +# This sets the granularity of the VAD. Should not need to be changed. +vad_window_length = 30 # In milliseconds +# Number of frames to average together when performing the moving average smoothing. +# The larger this value, the larger the VAD variations must be to not get smoothed out. +vad_moving_average_width = 8 +# Maximum number of consecutive silent frames a segment can have. +vad_max_silence_length = 6 + + +## Audio volume normalization +audio_norm_target_dBFS = -30 + diff --git a/dreamvoice/train_utils/src/speaker_encoder/params_model.py b/dreamvoice/train_utils/src/speaker_encoder/params_model.py new file mode 100644 index 0000000000000000000000000000000000000000..32731f295b3b26e9e38bb9f9047d5c784649e127 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/params_model.py @@ -0,0 +1,11 @@ + +## Model parameters +model_hidden_size = 256 +model_embedding_size = 256 +model_num_layers = 3 + + +## Training parameters +learning_rate_init = 1e-4 +speakers_per_batch = 64 +utterances_per_speaker = 10 diff --git a/dreamvoice/train_utils/src/speaker_encoder/preprocess.py b/dreamvoice/train_utils/src/speaker_encoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..ecb9041551270629a27baab6d1f1525e380c5378 --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/preprocess.py @@ -0,0 +1,285 @@ +from multiprocess.pool import ThreadPool +from speaker_encoder.params_data import * +from speaker_encoder.config import librispeech_datasets, anglophone_nationalites +from datetime import datetime +from speaker_encoder import audio +from pathlib import Path +from tqdm import tqdm +import numpy as np + + +class DatasetLog: + """ + Registers metadata about the dataset in a text file. 
+ """ + def __init__(self, root, name): + self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") + self.sample_data = dict() + + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Creating dataset %s on %s" % (name, start_time)) + self.write_line("-----") + self._log_params() + + def _log_params(self): + from speaker_encoder import params_data + self.write_line("Parameter values:") + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + self.write_line("\t%s: %s" % (param_name, value)) + self.write_line("-----") + + def write_line(self, line): + self.text_file.write("%s\n" % line) + + def add_sample(self, **kwargs): + for param_name, value in kwargs.items(): + if not param_name in self.sample_data: + self.sample_data[param_name] = [] + self.sample_data[param_name].append(value) + + def finalize(self): + self.write_line("Statistics:") + for param_name, values in self.sample_data.items(): + self.write_line("\t%s:" % param_name) + self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) + self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) + self.write_line("-----") + end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) + self.write_line("Finished on %s" % end_time) + self.text_file.close() + + +def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): + dataset_root = datasets_root.joinpath(dataset_name) + if not dataset_root.exists(): + print("Couldn\'t find %s, skipping this dataset." % dataset_root) + return None, None + return dataset_root, DatasetLog(out_dir, dataset_name) + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) + + # Function to preprocess utterances for one speaker + def preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
+ if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: + existing_fnames = {} + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + + # Process the utterances for each speaker + with ThreadPool(8) as pool: + list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + unit="speakers")) + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +# Function to preprocess utterances for one speaker +def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. 
+ # if sources_fpath.exists(): + # try: + # with sources_fpath.open("r") as sources_file: + # existing_fnames = {line.split(",")[0] for line in sources_file} + # except: + # existing_fnames = {} + # else: + # existing_fnames = {} + existing_fnames = {} + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + # logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + + sources_file.close() + return len(wav) + +def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension, + skip_existing, logger): + # from multiprocessing import Pool, cpu_count + from pathos.multiprocessing import ProcessingPool as Pool + # Function to preprocess utterances for one speaker + def __preprocess_speaker(speaker_dir: Path): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + existing_fnames = {} + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + wav_lens = [] + for in_fpath in speaker_dir.glob("**/*.%s" % extension): + # Check if the target output file already exists + out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) + out_fname = out_fname.replace(".%s" % extension, ".npy") + if skip_existing and out_fname in existing_fnames: + continue + + # Load and preprocess the waveform + wav = audio.preprocess_wav(in_fpath) + if len(wav) == 0: + continue + + # Create the mel spectrogram, discard those that are too short + frames = audio.wav_to_mel_spectrogram(wav) + if len(frames) < partials_n_frames: + continue + + out_fpath = speaker_out_dir.joinpath(out_fname) + np.save(out_fpath, frames) + # logger.add_sample(duration=len(wav) / sampling_rate) + sources_file.write("%s,%s\n" % (out_fname, in_fpath)) + wav_lens.append(len(wav)) + sources_file.close() + return wav_lens + + print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) + # Process the utterances for each speaker + # with ThreadPool(8) as pool: + # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), + # unit="speakers")) + pool = Pool(processes=20) + for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1): + for wav_len in wav_lens: + logger.add_sample(duration=wav_len / sampling_rate) + print(f'{i}/{len(speaker_dirs)} \r') + + logger.finalize() + print("Done preprocessing %s.\n" % dataset_name) + + +def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): + for dataset_name in librispeech_datasets["train"]["other"]: + # Initialize the preprocessing + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Preprocess all speakers + speaker_dirs = list(dataset_root.glob("*")) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", + skip_existing, logger) + + +def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb1" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the contents of the meta file + with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: + metadata = [line.split("\t") for line in metafile][1:] + + # Select the ID and the nationality, filter out non-anglophone speakers + nationalities = {line[0]: line[3] for line in metadata} + # keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + # nationality.lower() in anglophone_nationalites] + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items()] + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % + (len(keep_speaker_ids), len(nationalities))) + + # Get the speaker directories for anglophone speakers only + speaker_dirs = dataset_root.joinpath("wav").glob("*") + speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if + speaker_dir.name in keep_speaker_ids] + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% + (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) + + # Preprocess all speakers + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", + skip_existing, logger) + + +def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): + # Initialize the preprocessing + dataset_name = "VoxCeleb2" + dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) + if not dataset_root: + return + + # Get the speaker directories + # Preprocess all speakers + speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) + _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", + skip_existing, logger) diff --git a/dreamvoice/train_utils/src/speaker_encoder/train.py b/dreamvoice/train_utils/src/speaker_encoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..1c2e7fa1b08b75de40adc0e05fa3b104cb02660b --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/train.py @@ -0,0 +1,125 @@ +from speaker_encoder.visualizations import Visualizations +from speaker_encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset +from speaker_encoder.params_model import * +from speaker_encoder.model import SpeakerEncoder +from utils.profiler import Profiler +from pathlib import Path +import torch + +def sync(device: torch.device): + # FIXME + return + # For correct profiling (cuda operations are async) + if device.type == "cuda": + torch.cuda.synchronize(device) + +def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, + backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, + no_visdom: bool): + # Create a dataset and a dataloader + dataset = SpeakerVerificationDataset(clean_data_root) + loader = SpeakerVerificationDataLoader( + dataset, + speakers_per_batch, # 64 + utterances_per_speaker, # 10 + num_workers=8, + ) + + # Setup the device on which to run the forward pass and the loss. These can be different, + # because the forward pass is faster on the GPU whereas the loss is often (depending on your + # hyperparameters) faster on the CPU. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # FIXME: currently, the gradient is None if loss_device is cuda + loss_device = torch.device("cpu") + + # Create the model and the optimizer + model = SpeakerEncoder(device, loss_device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) + init_step = 1 + + # Configure file path for the model + state_fpath = models_dir.joinpath(run_id + ".pt") + backup_dir = models_dir.joinpath(run_id + "_backups") + + # Load any existing model + if not force_restart: + if state_fpath.exists(): + print("Found existing model \"%s\", loading it and resuming training." % run_id) + checkpoint = torch.load(state_fpath) + init_step = checkpoint["step"] + model.load_state_dict(checkpoint["model_state"]) + optimizer.load_state_dict(checkpoint["optimizer_state"]) + optimizer.param_groups[0]["lr"] = learning_rate_init + else: + print("No model \"%s\" found, starting training from scratch." 
% run_id) + else: + print("Starting the training from scratch.") + model.train() + + # Initialize the visualization environment + vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) + vis.log_dataset(dataset) + vis.log_params() + device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") + vis.log_implementation({"Device": device_name}) + + # Training loop + profiler = Profiler(summarize_every=10, disabled=False) + for step, speaker_batch in enumerate(loader, init_step): + profiler.tick("Blocking, waiting for batch (threaded)") + + # Forward pass + inputs = torch.from_numpy(speaker_batch.data).to(device) + sync(device) + profiler.tick("Data to %s" % device) + embeds = model(inputs) + sync(device) + profiler.tick("Forward pass") + embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) + loss, eer = model.loss(embeds_loss) + sync(loss_device) + profiler.tick("Loss") + + # Backward pass + model.zero_grad() + loss.backward() + profiler.tick("Backward pass") + model.do_gradient_ops() + optimizer.step() + profiler.tick("Parameter update") + + # Update visualizations + # learning_rate = optimizer.param_groups[0]["lr"] + vis.update(loss.item(), eer, step) + + # Draw projections and save them to the backup folder + if umap_every != 0 and step % umap_every == 0: + print("Drawing and saving projections (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + embeds = embeds.detach().cpu().numpy() + vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) + vis.save() + + # Overwrite the latest version of the model + if save_every != 0 and step % save_every == 0: + print("Saving the model (step %d)" % step) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, state_fpath) + + # Make a backup + if backup_every != 0 and step % backup_every == 0: + print("Making a backup (step %d)" % step) + backup_dir.mkdir(exist_ok=True) + backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + torch.save({ + "step": step + 1, + "model_state": model.state_dict(), + "optimizer_state": optimizer.state_dict(), + }, backup_fpath) + + profiler.tick("Extras (visualizations, saving)") + \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/visualizations.py b/dreamvoice/train_utils/src/speaker_encoder/visualizations.py new file mode 100644 index 0000000000000000000000000000000000000000..5d2c4c073c933d38970a83798f2d0ee37a85c48e --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/visualizations.py @@ -0,0 +1,178 @@ +from speaker_encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset +from datetime import datetime +from time import perf_counter as timer +import matplotlib.pyplot as plt +import numpy as np +# import webbrowser +import visdom +import umap + +colormap = np.array([ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], +], dtype=np.float) / 255 + + +class Visualizations: + def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): + # Tracking data + self.last_update_timestamp = timer() + self.update_every = update_every + self.step_times = [] + self.losses = [] + self.eers = [] + 
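# --- Illustrative annotation (not part of the patch): the training loop above reshapes the
# embeddings to (speakers_per_batch, utterances_per_speaker, -1) and calls model.loss(). A
# toy-sized sketch of the target construction used inside that GE2E softmax loss (training
# itself uses 64 speakers x 10 utterances, per params_model.py):
import numpy as np
import torch
import torch.nn.functional as F

speakers_per_batch, utterances_per_speaker = 4, 3
sim_matrix = torch.randn(speakers_per_batch, utterances_per_speaker, speakers_per_batch)
sim_flat = sim_matrix.reshape(speakers_per_batch * utterances_per_speaker, speakers_per_batch)
ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)   # [0,0,0,1,1,1,...]
loss = F.cross_entropy(sim_flat, torch.from_numpy(ground_truth).long())           # each row vs. its true speaker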
print("Updating the visualizations every %d steps." % update_every) + + # If visdom is disabled TODO: use a better paradigm for that + self.disabled = disabled + if self.disabled: + return + + # Set the environment name + now = str(datetime.now().strftime("%d-%m %Hh%M")) + if env_name is None: + self.env_name = now + else: + self.env_name = "%s (%s)" % (env_name, now) + + # Connect to visdom and open the corresponding window in the browser + try: + self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) + except ConnectionError: + raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " + "start it.") + # webbrowser.open("http://localhost:8097/env/" + self.env_name) + + # Create the windows + self.loss_win = None + self.eer_win = None + # self.lr_win = None + self.implementation_win = None + self.projection_win = None + self.implementation_string = "" + + def log_params(self): + if self.disabled: + return + from speaker_encoder import params_data + from speaker_encoder import params_model + param_string = "Model parameters:
" + for param_name in (p for p in dir(params_model) if not p.startswith("__")): + value = getattr(params_model, param_name) + param_string += "\t%s: %s
" % (param_name, value) + param_string += "Data parameters:
" + for param_name in (p for p in dir(params_data) if not p.startswith("__")): + value = getattr(params_data, param_name) + param_string += "\t%s: %s
" % (param_name, value) + self.vis.text(param_string, opts={"title": "Parameters"}) + + def log_dataset(self, dataset: SpeakerVerificationDataset): + if self.disabled: + return + dataset_string = "" + dataset_string += "Speakers: %s\n" % len(dataset.speakers) + dataset_string += "\n" + dataset.get_logs() + dataset_string = dataset_string.replace("\n", "
") + self.vis.text(dataset_string, opts={"title": "Dataset"}) + + def log_implementation(self, params): + if self.disabled: + return + implementation_string = "" + for param, value in params.items(): + implementation_string += "%s: %s\n" % (param, value) + implementation_string = implementation_string.replace("\n", "
") + self.implementation_string = implementation_string + self.implementation_win = self.vis.text( + implementation_string, + opts={"title": "Training implementation"} + ) + + def update(self, loss, eer, step): + # Update the tracking data + now = timer() + self.step_times.append(1000 * (now - self.last_update_timestamp)) + self.last_update_timestamp = now + self.losses.append(loss) + self.eers.append(eer) + print(".", end="") + + # Update the plots every steps + if step % self.update_every != 0: + return + time_string = "Step time: mean: %5dms std: %5dms" % \ + (int(np.mean(self.step_times)), int(np.std(self.step_times))) + print("\nStep %6d Loss: %.4f EER: %.4f %s" % + (step, np.mean(self.losses), np.mean(self.eers), time_string)) + if not self.disabled: + self.loss_win = self.vis.line( + [np.mean(self.losses)], + [step], + win=self.loss_win, + update="append" if self.loss_win else None, + opts=dict( + legend=["Avg. loss"], + xlabel="Step", + ylabel="Loss", + title="Loss", + ) + ) + self.eer_win = self.vis.line( + [np.mean(self.eers)], + [step], + win=self.eer_win, + update="append" if self.eer_win else None, + opts=dict( + legend=["Avg. EER"], + xlabel="Step", + ylabel="EER", + title="Equal error rate" + ) + ) + if self.implementation_win is not None: + self.vis.text( + self.implementation_string + ("%s" % time_string), + win=self.implementation_win, + opts={"title": "Training implementation"}, + ) + + # Reset the tracking + self.losses.clear() + self.eers.clear() + self.step_times.clear() + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, + max_speakers=10): + max_speakers = min(max_speakers, len(colormap)) + embeds = embeds[:max_speakers * utterances_per_speaker] + + n_speakers = len(embeds) // utterances_per_speaker + ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) + colors = [colormap[i] for i in ground_truth] + + reducer = umap.UMAP() + projected = reducer.fit_transform(embeds) + plt.scatter(projected[:, 0], projected[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection (step %d)" % step) + if not self.disabled: + self.projection_win = self.vis.matplot(plt, win=self.projection_win) + if out_fpath is not None: + plt.savefig(out_fpath) + plt.clf() + + def save(self): + if not self.disabled: + self.vis.save([self.env_name]) + \ No newline at end of file diff --git a/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py b/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3f69320ec75315ff9ce2efa158a53b1a823edd2e --- /dev/null +++ b/dreamvoice/train_utils/src/speaker_encoder/voice_encoder.py @@ -0,0 +1,173 @@ +from speaker_encoder.hparams import * +from speaker_encoder import audio +from pathlib import Path +from typing import Union, List +from torch import nn +from time import perf_counter as timer +import numpy as np +import torch + + +class SpeakerEncoder(nn.Module): + def __init__(self, weights_fpath, device: Union[str, torch.device]=None, verbose=True): + """ + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). + If None, defaults to cuda if it is available on your machine, otherwise the model will + run on cpu. Outputs are always returned on the cpu, as numpy arrays. 
+ """ + super().__init__() + + # Define the network + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + # Get the target device + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + elif isinstance(device, str): + device = torch.device(device) + self.device = device + + # Load the pretrained model'speaker weights + # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") + # if not weights_fpath.exists(): + # raise Exception("Couldn't find the voice encoder pretrained model at %s." % + # weights_fpath) + + start = timer() + checkpoint = torch.load(weights_fpath, map_location="cpu") + + self.load_state_dict(checkpoint["model_state"], strict=False) + self.to(device) + + if verbose: + print("Loaded the voice encoder model on %s in %.2f seconds." % + (device.type, timer() - start)) + + def forward(self, mels: torch.FloatTensor): + """ + Computes the embeddings of a batch of utterance spectrograms. + :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape + (batch_size, n_frames, n_channels) + :return: the embeddings as a float 32 tensor of shape (batch_size, embedding_size). + Embeddings are positive and L2-normed, thus they lay in the range [0, 1]. + """ + # Pass the input through the LSTM layers and retrieve the final hidden state of the last + # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings. + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + @staticmethod + def compute_partial_slices(n_samples: int, rate, min_coverage): + """ + Computes where to split an utterance waveform and its corresponding mel spectrogram to + obtain partial utterances of each. Both the waveform and the + mel spectrogram slices are returned, so as to make each partial utterance waveform + correspond to its spectrogram. + + The returned ranges may be indexing further than the length of the waveform. It is + recommended that you pad the waveform with zeros up to wav_slices[-1].stop. + + :param n_samples: the number of samples in the waveform + :param rate: how many partial utterances should occur per second. Partial utterances must + cover the span of the entire utterance, thus the rate should not be lower than the inverse + of the duration of a partial utterance. By default, partial utterances are 1.6s long and + the minimum rate is thus 0.625. + :param min_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered by zero-padding the audio. Otherwise, + it will be discarded. If there aren't enough frames for one partial utterance, + this parameter is ignored so that the function always returns at least one slice. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial + utterances. 
+ """ + assert 0 < min_coverage <= 1 + + # Compute how many frames separate two partial utterances + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) + frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) + assert 0 < frame_step, "The rate is too high" + assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \ + (sampling_rate / (samples_per_frame * partials_n_frames)) + + # Compute the slices + wav_slices, mel_slices = [], [] + steps = max(1, n_frames - partials_n_frames + frame_step + 1) + for i in range(0, steps, frame_step): + mel_range = np.array([i, i + partials_n_frames]) + wav_range = mel_range * samples_per_frame + mel_slices.append(slice(*mel_range)) + wav_slices.append(slice(*wav_range)) + + # Evaluate whether extra padding is warranted or not + last_wav_range = wav_slices[-1] + coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) + if coverage < min_coverage and len(mel_slices) > 1: + mel_slices = mel_slices[:-1] + wav_slices = wav_slices[:-1] + + return wav_slices, mel_slices + + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + """ + Computes an embedding for a single utterance. The utterance is divided in partial + utterances and an embedding is computed for each. The complete utterance embedding is the + L2-normed average embedding of the partial utterances. + + TODO: independent batched version of this function + + :param wav: a preprocessed utterance waveform as a numpy array of float32 + :param return_partials: if True, the partial embeddings will also be returned along with + the wav slices corresponding to each partial utterance. + :param rate: how many partial utterances should occur per second. Partial utterances must + cover the span of the entire utterance, thus the rate should not be lower than the inverse + of the duration of a partial utterance. By default, partial utterances are 1.6s long and + the minimum rate is thus 0.625. + :param min_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered by zero-padding the audio. Otherwise, + it will be discarded. If there aren't enough frames for one partial utterance, + this parameter is ignored so that the function always returns at least one slice. + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. + """ + # Compute where to split the utterance into partials and pad the waveform with zeros if + # the partial utterances cover a larger range. 
+ wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage) + max_wave_length = wav_slices[-1].stop + if max_wave_length >= len(wav): + wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") + + # Split the utterance into partials and forward them through the model + mel = audio.wav_to_mel_spectrogram(wav) + mels = np.array([mel[s] for s in mel_slices]) + with torch.no_grad(): + mels = torch.from_numpy(mels).to(self.device) + partial_embeds = self(mels).cpu().numpy() + + # Compute the utterance embedding from the partial embeddings + raw_embed = np.mean(partial_embeds, axis=0) + embed = raw_embed / np.linalg.norm(raw_embed, 2) + + if return_partials: + return embed, partial_embeds, wav_slices + return embed + + def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + """ + Compute the embedding of a collection of wavs (presumably from the same speaker) by + averaging their embedding and L2-normalizing it. + + :param wavs: list of wavs a numpy arrays of float32. + :param kwargs: extra arguments to embed_utterance() + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + """ + raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \ + for wav in wavs], axis=0) + return raw_embed / np.linalg.norm(raw_embed, 2) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/spk_ext.py b/dreamvoice/train_utils/src/spk_ext.py index 6f20b75c46bb518143d9d5acd3481e84c71e0e47..5f348d653b2a945493ede4a2d7e55f5fd1d62288 100644 --- a/dreamvoice/train_utils/src/spk_ext.py +++ b/dreamvoice/train_utils/src/spk_ext.py @@ -46,4 +46,69 @@ def se_extractor(audio_path, vc): gs.append(g.detach()) gs = torch.stack(gs).mean(0) - return gs.cpu() \ No newline at end of file + return gs.cpu() + + +def process_audio_folder(input_folder, output_folder, model, device): + """ + Process all audio files in a folder and its subfolders, + save the extracted features as .pt files in the output folder with the same structure. + + Args: + input_folder (str): Path to the input folder containing audio files. + output_folder (str): Path to the output folder to save .pt files. + model: Pre-trained model for feature extraction. + device: Torch device (e.g., 'cpu' or 'cuda'). 
+ """ + # Collect all audio file paths + audio_files = [] + for root, _, files in os.walk(input_folder): + for file in files: + if file.endswith(('.wav', '.mp3', '.flac')): # Adjust for the audio formats you want to process + audio_files.append(os.path.join(root, file)) + + # Process each audio file with tqdm for progress + for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"): + # Construct output path + relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder) + output_dir = os.path.join(output_folder, relative_path) + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt') + + # Check if the .pt file already exists + if os.path.exists(output_path): + # print(f"Skipped (already exists): {output_path}") + continue # Skip processing this file + # Extract features + target_se = se_extractor(audio_path, model).to(device) + # Save the feature as .pt + torch.save(target_se, output_path) + # print(f"Processed and saved: {output_path}") + + +if __name__ == '__main__': + ckpt_converter = 'checkpoints_v2/converter' + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device) + model.load_ckpt(f'{ckpt_converter}/checkpoint.pth') + + # audio_path = 'debug.wav' + # target_se = se_extractor(audio_path, model).to(device) + + # source_path = 'source.wav' + # source_se = se_extractor(source_path, model).to(device) + + # encode_message = "@MyShell" + # model.convert( + # audio_src_path=source_path, + # src_se=source_se, + # tgt_se=target_se, + # output_path='output.wav', + # message=encode_message) + # input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/' + # output_folder = 'spk/VCTK-Corpus/' + # process_audio_folder(input_folder, output_folder, model, device) + + input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360' + output_folder = 'spk/LibriTTS-R/train-clean-360/' + process_audio_folder(input_folder, output_folder, model, device) \ No newline at end of file diff --git a/dreamvoice/train_utils/src/train_freevc.py b/dreamvoice/train_utils/src/train_freevc.py new file mode 100644 index 0000000000000000000000000000000000000000..e8e1fac17288da07a1f2bcb6f42f6f6c7b2e0b81 --- /dev/null +++ b/dreamvoice/train_utils/src/train_freevc.py @@ -0,0 +1,214 @@ +import yaml +import random +import argparse +import os +import time +from tqdm import tqdm +from pathlib import Path + +import torch +from torch.utils.data import DataLoader + +from accelerate import Accelerator +from diffusers import DDIMScheduler + +from configs.plugin import get_params +from model.p2e_cross import P2E_Cross +from modules.speaker_encoder.encoder import inference as spk_encoder +from transformers import T5Tokenizer, T5EncoderModel, AutoModel +from inference_freevc import eval_plugin +from dataset.dreamvc import DreamData +# from vc_wrapper import load_diffvc_models +from freevc_wrapper import get_freevc_models +from utils import minmax_norm_diff, reverse_minmax_norm_diff, scale_shift + +parser = argparse.ArgumentParser() + +# config settings +parser.add_argument('--config-name', type=str, default='Plugin_freevc') +parser.add_argument('--vc-unet-path', type=str, default='freevc') +parser.add_argument('--speaker-path', type=str, default='speaker_encoder/ckpt/pretrained_bak_5805000.pt') + + +# training settings +parser.add_argument("--amp", type=str, default='fp16') 
+parser.add_argument('--epochs', type=int, default=200) +parser.add_argument('--batch-size', type=int, default=32) +parser.add_argument('--num-workers', type=int, default=8) +parser.add_argument('--num-threads', type=int, default=1) +parser.add_argument('--save-every', type=int, default=10) + +# log and random seed +parser.add_argument('--random-seed', type=int, default=2023) +parser.add_argument('--log-step', type=int, default=200) +parser.add_argument('--log-dir', type=str, default='../logs/') +parser.add_argument('--save-dir', type=str, default='../ckpts/') + +args = parser.parse_args() +params = get_params(args.config_name) +args.log_dir = args.log_dir + args.config_name + '/' + +with open('model/p2e_cross.yaml', 'r') as fp: + config = yaml.safe_load(fp) + +if os.path.exists(args.save_dir + args.config_name) is False: + os.makedirs(args.save_dir + args.config_name) + +if os.path.exists(args.log_dir) is False: + os.makedirs(args.log_dir) + +if __name__ == '__main__': + # Fix the random seed + random.seed(args.random_seed) + torch.manual_seed(args.random_seed) + + # Set device + torch.set_num_threads(args.num_threads) + if torch.cuda.is_available(): + args.device = 'cuda' + torch.cuda.manual_seed(args.random_seed) + torch.cuda.manual_seed_all(args.random_seed) + torch.backends.cuda.matmul.allow_tf32 = True + if torch.backends.cudnn.is_available(): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.allow_tf32 = True + torch.backends.cudnn.benchmark = False + else: + args.device = 'cpu' + + train_set = DreamData(data_dir='../prepare_freevc/spk/', meta_dir='../prepare/plugin_meta.csv', + subset='train', prompt_dir='../prepare/prompts.csv',) + train_loader = DataLoader(train_set, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True) + + # use accelerator for multi-gpu training + accelerator = Accelerator(mixed_precision=args.amp) + + # vc_unet, hifigan, _, logmel, vc_scheduler = load_diffvc_models(args.vc_unet_path, + # args.vocoder_path, + # args.speaker_path, + # args.vc_config_path, + # accelerator.device) + freevc_24, cmodel, _, hps = get_freevc_models(args.vc_unet_path, args.speaker_path, accelerator.device) + # speaker + # spk_encoder.load_model(Path(args.speaker_path), accelerator.device) + + # text encoder + tokenizer = T5Tokenizer.from_pretrained(params.text_encoder.model) + text_encoder = T5EncoderModel.from_pretrained(params.text_encoder.model).to(accelerator.device) + text_encoder.eval() + + # main U-Net + model = P2E_Cross(config['diffwrap']).to(accelerator.device) + model.load_state_dict(torch.load('../ckpts/Plugin_freevc/49.pt')['model']) + + total_params = sum([param.nelement() for param in model.parameters()]) + print("Number of parameter: %.2fM" % (total_params / 1e6)) + + if params.diff.v_prediction: + print('v prediction') + noise_scheduler = DDIMScheduler(num_train_timesteps=params.diff.num_train_steps, + beta_start=params.diff.beta_start, beta_end=params.diff.beta_end, + rescale_betas_zero_snr=True, + timestep_spacing="trailing", + clip_sample=False, + prediction_type='v_prediction') + else: + print('noise prediction') + noise_scheduler = DDIMScheduler(num_train_timesteps=args.num_train_steps, + beta_start=args.beta_start, beta_end=args.beta_end, + clip_sample=False, + prediction_type='epsilon') + + optimizer = torch.optim.AdamW(model.parameters(), + lr=params.opt.learning_rate, + betas=(params.opt.beta1, params.opt.beta2), + weight_decay=params.opt.weight_decay, + eps=params.opt.adam_epsilon, + ) + loss_func = torch.nn.MSELoss() + + 
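# --- Illustrative annotation (not part of the patch): the DDIM scheduler configured above is
# used in the loop below via add_noise() and get_velocity(). A minimal sketch, with illustrative
# beta values, of what the v-prediction regression target is:
import torch
from diffusers import DDIMScheduler

sched = DDIMScheduler(num_train_timesteps=1000, beta_start=1e-4, beta_end=0.02,
                      clip_sample=False, prediction_type='v_prediction')
x0 = torch.randn(2, 256, 1)               # stands in for the scaled speaker embedding
noise = torch.randn_like(x0)
t = torch.tensor([500, 500])

noisy = sched.add_noise(x0, noise, t)     # sqrt(a_t) * x0 + sqrt(1 - a_t) * noise
v = sched.get_velocity(x0, noise, t)      # sqrt(a_t) * noise - sqrt(1 - a_t) * x0
a_t = sched.alphas_cumprod[t].view(-1, 1, 1)
assert torch.allclose(v, a_t.sqrt() * noise - (1 - a_t).sqrt() * x0, atol=1e-6)
# The U-Net is trained with MSE between its prediction and v (or against the raw noise in epsilon mode).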
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) + + global_step = 0 + losses = 0 + + if accelerator.is_main_process: + eval_plugin(freevc_24, cmodel, [tokenizer, text_encoder], + model, noise_scheduler, (1, 256, 1), + val_meta='../prepare/val_meta.csv', + val_folder='/home/jerry/Projects/Dataset/Speech/vctk_libritts/', + guidance_scale=3.0, guidance_rescale=0.0, + ddim_steps=100, eta=1, random_seed=None, + device=accelerator.device, + epoch='test', save_path=args.log_dir + 'output/', val_num=10) + accelerator.wait_for_everyone() + + for epoch in range(args.epochs): + model.train() + for step, batch in enumerate(tqdm(train_loader)): + spk_embed, prompt = batch + spk_embed = spk_embed.unsqueeze(-1) + + with torch.no_grad(): + text_batch = tokenizer(prompt, + max_length=32, + padding='max_length', truncation=True, return_tensors="pt") + text, text_mask = text_batch.input_ids.to(spk_embed.device), \ + text_batch.attention_mask.to(spk_embed.device) + text = text_encoder(input_ids=text, attention_mask=text_mask)[0] + + spk_embed = scale_shift(spk_embed, 20, -0.035) + # spk_embed = minmax_norm_diff(spk_embed, vmax=0.5, vmin=0.0) + # content_clip = align_seq(content_clip, audio_clip.shape[-1]) + # f0_clip = align_seq(f0_clip, audio_clip.shape[-1]) + + # adding noise + noise = torch.randn(spk_embed.shape).to(accelerator.device) + timesteps = torch.randint(0, params.diff.num_train_steps, (noise.shape[0],), + device=accelerator.device, ).long() + noisy_target = noise_scheduler.add_noise(spk_embed, noise, timesteps) + # v prediction - model output + velocity = noise_scheduler.get_velocity(spk_embed, noise, timesteps) + + # inference + pred = model(noisy_target, timesteps, text, text_mask, train_cfg=True, cfg_prob=0.25) + # backward + if params.diff.v_prediction: + loss = loss_func(pred, velocity) + else: + loss = loss_func(pred, noise) + + accelerator.backward(loss) + optimizer.step() + optimizer.zero_grad() + + global_step += 1 + losses += loss.item() + + if accelerator.is_main_process: + if global_step % args.log_step == 0: + n = open(args.log_dir + 'diff_vc.txt', mode='a') + n.write(time.asctime(time.localtime(time.time()))) + n.write('\n') + n.write('Epoch: [{}][{}] Batch: [{}][{}] Loss: {:.6f}\n'.format( + epoch + 1, args.epochs, step + 1, len(train_loader), losses / args.log_step)) + n.close() + losses = 0.0 + + accelerator.wait_for_everyone() + + if (epoch + 1) % args.save_every == 0: + if accelerator.is_main_process: + eval_plugin(freevc_24, cmodel, [tokenizer, text_encoder], + model, noise_scheduler, (1, 256, 1), + val_meta='../prepare/val_meta.csv', + val_folder='/home/jerry/Projects/Dataset/Speech/vctk_libritts/', + guidance_scale=3, guidance_rescale=0.0, + ddim_steps=50, eta=1, random_seed=2024, + device=accelerator.device, + epoch=epoch, save_path=args.log_dir + 'output/', val_num=10) + + unwrapped_unet = accelerator.unwrap_model(model) + accelerator.save({ + "model": unwrapped_unet.state_dict(), + }, args.save_dir + args.config_name + '/' + str(epoch) + '.pt') diff --git a/dreamvoice/train_utils/src/train.py b/dreamvoice/train_utils/src/train_openvoice.py similarity index 95% rename from dreamvoice/train_utils/src/train.py rename to dreamvoice/train_utils/src/train_openvoice.py index a5fb5ac6226f985925fa2fbaf417fcfdd6782443..0bfd7d2ae77310eb645a2cfdf613ed6ecd6dc9b9 100644 --- a/dreamvoice/train_utils/src/train.py +++ b/dreamvoice/train_utils/src/train_openvoice.py @@ -25,7 +25,7 @@ from utils import minmax_norm_diff, reverse_minmax_norm_diff parser = 
argparse.ArgumentParser() # config settings -parser.add_argument('--config-name', type=str, default='Plugin_base') +parser.add_argument('--config-name', type=str, default='Plugin_freevc') # training settings parser.add_argument("--amp", type=str, default='fp16') @@ -73,7 +73,7 @@ if __name__ == '__main__': else: args.device = 'cpu' - train_set = DreamData(data_dir='../prepare/spk/', meta_dir='../prepare/plugin_meta.csv', + train_set = DreamData(data_dir='../prepare_freevc/spk/', meta_dir='../prepare/plugin_meta.csv', subset='train', prompt_dir='../prepare/prompts.csv',) train_loader = DataLoader(train_set, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True) diff --git a/dreamvoice/train_utils/src/utils.py b/dreamvoice/train_utils/src/utils.py index 0fe1dcba21ce183e2e7c26a711c702ca089813d3..6dc3e7629165253801ceb946e6d9ac80a89a25f2 100644 --- a/dreamvoice/train_utils/src/utils.py +++ b/dreamvoice/train_utils/src/utils.py @@ -15,3 +15,23 @@ def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: floa tensor = (tensor + 1) / 2 tensor = tensor * (vmax - vmin) + vmin return tensor + + +def scale_shift(x, scale, shift): + return (x+shift) * scale + + +def scale_shift_re(x, scale, shift): + return (x/scale) - shift + + +def align_seq(source, target_length, mapping_method='hard'): + source_len = source.shape[1] + if mapping_method == 'hard': + mapping_idx = np.round(np.arange(target_length) * source_len / target_length) + output = source[:, mapping_idx] + else: + # TBD + raise NotImplementedError + + return output \ No newline at end of file diff --git a/freevc_example.py b/freevc_example.py new file mode 100644 index 0000000000000000000000000000000000000000..0228cdaf097068b714c511890a78e1e08196989e --- /dev/null +++ b/freevc_example.py @@ -0,0 +1,25 @@ +import torch +import librosa +import soundfile as sf +from dreamvoice import DreamVoice_Plugin +from dreamvoice.freevc_wrapper import get_freevc_models, convert + +freevc, cmodel, hps = get_freevc_models('ckpts_freevc/', 'dreamvoice/', 'cuda') +device = 'cuda' + +# init dreamvoice +dreamvoice = DreamVoice_Plugin(config='plugin_freevc.yaml', device=device) + +# generate speaker +prompt = "old female's voice, deep and dark" +target_se = dreamvoice.gen_spk(prompt) + +# content source +source_path = 'examples/test1.wav' +audio_clip = librosa.load(source_path, sr=16000)[0] +audio_clip = torch.tensor(audio_clip).unsqueeze(0).to(device) + +content = cmodel(audio_clip).last_hidden_state.transpose(1, 2).to(device) + +output, out_sr = convert(freevc, content, target_se) +sf.write('output.wav', output, out_sr) \ No newline at end of file diff --git a/examples/openvoice_example.py b/openvoice_example.py similarity index 92% rename from examples/openvoice_example.py rename to openvoice_example.py index dccbe4b6688e717b551979409cb778dd856f6a1d..4d20e44399cb7d8c87399bc99622d9965b128295 100644 --- a/examples/openvoice_example.py +++ b/openvoice_example.py @@ -14,12 +14,12 @@ openvoice = ToneColorConverter(f'{ckpt_converter}/config.json', device=device) openvoice.load_ckpt(f'{ckpt_converter}/checkpoint.pth') # generate speaker -prompt = 'rough boy voice, young' +prompt = 'female voice, bright and cute' target_se = dreamvoice.gen_spk(prompt) target_se = target_se.unsqueeze(-1) # content source -source_path = 'examples/test2.wav' +source_path = 'segment_1.mp3' source_se = se_extractor(source_path, openvoice).to(device) # voice conversion