Higobeatz committed
Commit 0dabde8 · 1 Parent(s): 446d934

freevc plugin

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. ckpts_freevc/freevc.json +54 -0
  2. ckpts_freevc/freevc.pth +3 -0
  3. dreamvoice/freevc/.gitattributes +34 -0
  4. dreamvoice/freevc/.gitignore +2 -0
  5. dreamvoice/freevc/README.md +13 -0
  6. dreamvoice/freevc/app.py +92 -0
  7. dreamvoice/freevc/commons.py +171 -0
  8. dreamvoice/freevc/configs/freevc-24.json +54 -0
  9. dreamvoice/freevc/configs/freevc-s.json +54 -0
  10. dreamvoice/freevc/configs/freevc.json +54 -0
  11. dreamvoice/freevc/mel_processing.py +112 -0
  12. dreamvoice/freevc/models.py +351 -0
  13. dreamvoice/freevc/modules.py +341 -0
  14. dreamvoice/freevc/requirements.txt +8 -0
  15. dreamvoice/freevc/utils.py +305 -0
  16. dreamvoice/freevc_wrapper.py +63 -0
  17. dreamvoice/plugin.py +0 -1
  18. dreamvoice/plugin_ckpts/freevc.pt +3 -0
  19. dreamvoice/plugin_freevc.yaml +8 -0
  20. dreamvoice/src/configs/{plugin_cross.yaml → plugin_cross_freevc.yaml} +0 -0
  21. dreamvoice/train_utils/prepare_freevc/freevc/README.md +13 -0
  22. dreamvoice/train_utils/prepare_freevc/freevc/app.py +103 -0
  23. dreamvoice/train_utils/prepare_freevc/freevc/commons.py +171 -0
  24. dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json +54 -0
  25. dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json +54 -0
  26. dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json +54 -0
  27. dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py +69 -0
  28. dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py +112 -0
  29. dreamvoice/train_utils/prepare_freevc/freevc/models.py +351 -0
  30. dreamvoice/train_utils/prepare_freevc/freevc/modules.py +342 -0
  31. dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt +8 -0
  32. dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml +47 -0
  33. dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml +34 -0
  34. dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml +45 -0
  35. dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml +33 -0
  36. dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml +39 -0
  37. dreamvoice/train_utils/prepare_freevc/freevc/src/debug.py +0 -0
  38. dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py +103 -0
  39. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py +42 -0
  40. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py +40 -0
  41. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore +132 -0
  42. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE +21 -0
  43. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md +161 -0
  44. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py +66 -0
  45. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png +0 -0
  46. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py +60 -0
  47. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py +80 -0
  48. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py +5 -0
  49. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py +91 -0
  50. dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py +241 -0
ckpts_freevc/freevc.json ADDED
@@ -0,0 +1,54 @@
+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 10000,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 2e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 64,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 8960,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 128,
+    "port": "8001"
+  },
+  "data": {
+    "training_files":"filelists/train.txt",
+    "validation_files":"filelists/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 16000,
+    "filter_length": 1280,
+    "hop_length": 320,
+    "win_length": 1280,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,8,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 1024,
+    "use_spk": true
+  }
+}
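For orientation only (not part of the commit): a minimal sketch of how this config/checkpoint pair is typically loaded with the helpers added elsewhere in this commit (utils.get_hparams_from_file, SynthesizerTrn, utils.load_checkpoint), mirroring dreamvoice/freevc/app.py. It assumes dreamvoice.freevc is importable as a package and that paths are relative to the repo root.

# Sketch, assuming dreamvoice.freevc is an importable package.
from dreamvoice.freevc import utils
from dreamvoice.freevc.models import SynthesizerTrn

hps = utils.get_hparams_from_file("ckpts_freevc/freevc.json")
model = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,                 # spec_channels = 641
    hps.train.segment_size // hps.data.hop_length,   # segment_size in frames = 28
    **hps.model)
_ = model.eval()
_ = utils.load_checkpoint("ckpts_freevc/freevc.pth", model, None)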
ckpts_freevc/freevc.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2cc2d047f63b80d1d6780e37611cec11a01d597560393b1fe6118158b3bd47f
+size 472644351
dreamvoice/freevc/.gitattributes ADDED
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
dreamvoice/freevc/.gitignore ADDED
@@ -0,0 +1,2 @@
+__pycache__
+flagged
dreamvoice/freevc/README.md ADDED
@@ -0,0 +1,13 @@
+---
+title: FreeVC
+emoji: 🚀
+colorFrom: gray
+colorTo: red
+sdk: gradio
+sdk_version: 3.13.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
dreamvoice/freevc/app.py ADDED
@@ -0,0 +1,92 @@
+import os
+import torch
+import librosa
+import gradio as gr
+from scipy.io.wavfile import write
+from transformers import WavLMModel
+
+import utils
+from models import SynthesizerTrn
+from mel_processing import mel_spectrogram_torch
+from speaker_encoder.voice_encoder import SpeakerEncoder
+
+'''
+def get_wavlm():
+    os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
+    shutil.move('WavLM-Large.pt', 'wavlm')
+'''
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# print("Loading FreeVC...")
+# hps = utils.get_hparams_from_file("configs/freevc.json")
+# freevc = SynthesizerTrn(
+#     hps.data.filter_length // 2 + 1,
+#     hps.train.segment_size // hps.data.hop_length,
+#     **hps.model).to(device)
+# _ = freevc.eval()
+# _ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
+smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
+
+print("Loading FreeVC(24k)...")
+hps = utils.get_hparams_from_file("configs/freevc-24.json")
+freevc_24 = SynthesizerTrn(
+    hps.data.filter_length // 2 + 1,
+    hps.train.segment_size // hps.data.hop_length,
+    **hps.model).to(device)
+_ = freevc_24.eval()
+_ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
+
+# print("Loading FreeVC-s...")
+# hps = utils.get_hparams_from_file("configs/freevc-s.json")
+# freevc_s = SynthesizerTrn(
+#     hps.data.filter_length // 2 + 1,
+#     hps.train.segment_size // hps.data.hop_length,
+#     **hps.model).to(device)
+# _ = freevc_s.eval()
+# _ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None)
+#
+# print("Loading WavLM for content...")
+cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
+
+def convert(model, cmodel, src, tgt):
+    with torch.no_grad():
+        # tgt
+        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
+        g_tgt = smodel.embed_utterance(wav_tgt)
+        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
+
+        # src
+        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
+        c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
+        # infer
+        if model == "FreeVC":
+            audio = freevc.infer(c, g=g_tgt)
+        elif model == "FreeVC-s":
+            audio = freevc_s.infer(c, mel=mel_tgt)
+        else:
+            audio = freevc_24.infer(c, g=g_tgt)
+        audio = audio[0][0].data.cpu().float().numpy()
+        if model == "FreeVC" or model == "FreeVC-s":
+            write("out.wav", hps.data.sampling_rate, audio)
+        else:
+            write("out.wav", 24000, audio)
+        out = "out.wav"
+        return out
+
+# model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model")
+# audio1 = gr.inputs.Audio(label="Source Audio", type='filepath')
+# audio2 = gr.inputs.Audio(label="Reference Audio", type='filepath')
+# inputs = [model, audio1, audio2]
+# outputs = gr.outputs.Audio(label="Output Audio", type='filepath')
+#
+# title = "FreeVC"
+# description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio, or click the example to load. Read more at the links below. Note: It seems that the WavLM checkpoint in HuggingFace is a little different from the one used to train FreeVC, which may degrade the performance a bit. In addition, speaker similarity can be largely affected if there are too much silence in the reference audio, so please <strong>trim</strong> it before submitting."
+# article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2210.15418' target='_blank'>Paper</a> | <a href='https://github.com/OlaWod/FreeVC' target='_blank'>Github Repo</a></p>"
+#
+# examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']]
+#
+# gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch()
+convert(freevc_24, cmodel, 'p225_001.wav', 'p226_002.wav')
dreamvoice/freevc/commons.py ADDED
@@ -0,0 +1,171 @@
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size*dilation - dilation)/2)
+
+
+def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
+
+
+def intersperse(lst, item):
+    result = [item] * (len(lst) * 2 + 1)
+    result[1::2] = lst
+    return result
+
+
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+    """KL(P||Q)"""
+    kl = (logs_q - logs_p) - 0.5
+    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
+    return kl
+
+
+def rand_gumbel(shape):
+    """Sample from the Gumbel distribution, protect from overflows."""
+    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+    return -torch.log(-torch.log(uniform_samples))
+
+
+def rand_gumbel_like(x):
+    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+    return g
+
+
+def slice_segments(x, ids_str, segment_size=4):
+    ret = torch.zeros_like(x[:, :, :segment_size])
+    for i in range(x.size(0)):
+        idx_str = ids_str[i]
+        idx_end = idx_str + segment_size
+        ret[i] = x[i, :, idx_str:idx_end]
+    return ret
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+    b, d, t = x.size()
+    if x_lengths is None:
+        x_lengths = t
+    ids_str_max = x_lengths - segment_size + 1
+    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+    ret = slice_segments(x, ids_str, segment_size)
+    return ret, ids_str
+
+
+def rand_spec_segments(x, x_lengths=None, segment_size=4):
+    b, d, t = x.size()
+    if x_lengths is None:
+        x_lengths = t
+    ids_str_max = x_lengths - segment_size
+    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+    ret = slice_segments(x, ids_str, segment_size)
+    return ret, ids_str
+
+
+def get_timing_signal_1d(
+        length, channels, min_timescale=1.0, max_timescale=1.0e4):
+    position = torch.arange(length, dtype=torch.float)
+    num_timescales = channels // 2
+    log_timescale_increment = (
+        math.log(float(max_timescale) / float(min_timescale)) /
+        (num_timescales - 1))
+    inv_timescales = min_timescale * torch.exp(
+        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+    signal = F.pad(signal, [0, 0, 0, channels % 2])
+    signal = signal.view(1, channels, length)
+    return signal
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+    b, channels, length = x.size()
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+    b, channels, length = x.size()
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+def subsequent_mask(length):
+    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+    return mask
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
+
+
+def shift_1d(x):
+    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+    return x
+
+
+def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def generate_path(duration, mask):
+    """
+    duration: [b, 1, t_x]
+    mask: [b, 1, t_y, t_x]
+    """
+    device = duration.device
+
+    b, _, t_y, t_x = mask.shape
+    cum_duration = torch.cumsum(duration, -1)
+
+    cum_duration_flat = cum_duration.view(b * t_x)
+    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+    path = path.unsqueeze(1).transpose(2,3) * mask
+    return path
+
+
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = list(filter(lambda p: p.grad is not None, parameters))
+    norm_type = float(norm_type)
+    if clip_value is not None:
+        clip_value = float(clip_value)
+
+    total_norm = 0
+    for p in parameters:
+        param_norm = p.grad.data.norm(norm_type)
+        total_norm += param_norm.item() ** norm_type
+        if clip_value is not None:
+            p.grad.data.clamp_(min=-clip_value, max=clip_value)
+    total_norm = total_norm ** (1. / norm_type)
+    return total_norm
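For reference (not part of the commit): a minimal sketch of the two helpers from this file that the FreeVC training code relies on, sequence_mask and rand_slice_segments, showing the expected shapes. It assumes it is run from inside dreamvoice/freevc/ so the module imports directly.

# Sketch, run from inside dreamvoice/freevc/.
import torch
from commons import sequence_mask, rand_slice_segments

lengths = torch.tensor([100, 80])
mask = sequence_mask(lengths, max_length=128)   # [2, 128] bool, True where frame index < length
x = torch.randn(2, 192, 128)                    # [batch, channels, frames]
sl, ids = rand_slice_segments(x, lengths, segment_size=32)  # sl: [2, 192, 32], ids: start frame per item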
dreamvoice/freevc/configs/freevc-24.json ADDED
@@ -0,0 +1,54 @@
+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 10000,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 2e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 64,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 8640,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 128,
+    "port": "8008"
+  },
+  "data": {
+    "training_files":"filelists/train.txt",
+    "validation_files":"filelists/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 16000,
+    "filter_length": 1280,
+    "hop_length": 320,
+    "win_length": 1280,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,6,4,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 1024,
+    "use_spk": true
+  }
+}
dreamvoice/freevc/configs/freevc-s.json ADDED
@@ -0,0 +1,54 @@
+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 10000,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 2e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 64,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 8960,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 128,
+    "port": "8001"
+  },
+  "data": {
+    "training_files":"filelists/train.txt",
+    "validation_files":"filelists/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 16000,
+    "filter_length": 1280,
+    "hop_length": 320,
+    "win_length": 1280,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,8,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 1024,
+    "use_spk": false
+  }
+}
dreamvoice/freevc/configs/freevc.json ADDED
@@ -0,0 +1,54 @@
+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 10000,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 2e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 64,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 8960,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 128,
+    "port": "8001"
+  },
+  "data": {
+    "training_files":"filelists/train.txt",
+    "validation_files":"filelists/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 16000,
+    "filter_length": 1280,
+    "hop_length": 320,
+    "win_length": 1280,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,8,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 1024,
+    "use_spk": true
+  }
+}
dreamvoice/freevc/mel_processing.py ADDED
@@ -0,0 +1,112 @@
+import math
+import os
+import random
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.data
+import numpy as np
+import librosa
+import librosa.util as librosa_util
+from librosa.util import normalize, pad_center, tiny
+from scipy.signal import get_window
+from scipy.io.wavfile import read
+from librosa.filters import mel as librosa_mel_fn
+
+MAX_WAV_VALUE = 32768.0
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    """
+    PARAMS
+    ------
+    C: compression factor
+    """
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+    """
+    PARAMS
+    ------
+    C: compression factor used to compress
+    """
+    return torch.exp(x) / C
+
+
+def spectral_normalize_torch(magnitudes):
+    output = dynamic_range_compression_torch(magnitudes)
+    return output
+
+
+def spectral_de_normalize_torch(magnitudes):
+    output = dynamic_range_decompression_torch(magnitudes)
+    return output
+
+
+mel_basis = {}
+hann_window = {}
+
+
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+    if torch.min(y) < -1.:
+        print('min value is ', torch.min(y))
+    if torch.max(y) > 1.:
+        print('max value is ', torch.max(y))
+
+    global hann_window
+    dtype_device = str(y.dtype) + '_' + str(y.device)
+    wnsize_dtype_device = str(win_size) + '_' + dtype_device
+    if wnsize_dtype_device not in hann_window:
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+    y = y.squeeze(1)
+
+    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+    return spec
+
+
+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
+    global mel_basis
+    dtype_device = str(spec.dtype) + '_' + str(spec.device)
+    fmax_dtype_device = str(fmax) + '_' + dtype_device
+    if fmax_dtype_device not in mel_basis:
+        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
+    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+    spec = spectral_normalize_torch(spec)
+    return spec
+
+
+def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
+    if torch.min(y) < -1.:
+        print('min value is ', torch.min(y))
+    if torch.max(y) > 1.:
+        print('max value is ', torch.max(y))
+
+    global mel_basis, hann_window
+    dtype_device = str(y.dtype) + '_' + str(y.device)
+    fmax_dtype_device = str(fmax) + '_' + dtype_device
+    wnsize_dtype_device = str(win_size) + '_' + dtype_device
+    if fmax_dtype_device not in mel_basis:
+        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
+    if wnsize_dtype_device not in hann_window:
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+    y = y.squeeze(1)
+
+    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+
+    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+    spec = spectral_normalize_torch(spec)
+
+    return spec
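For reference (not part of the commit): a minimal sketch of calling mel_spectrogram_torch with the data settings from configs/freevc.json (filter_length 1280, hop 320, win 1280, 80 mel bins, 16 kHz, fmin 0, fmax null). The waveform values are kept in [-1, 1] as the function expects; it is assumed to be run from inside dreamvoice/freevc/.

# Sketch, assuming it is run from inside dreamvoice/freevc/.
import torch
from mel_processing import mel_spectrogram_torch

wav = torch.rand(1, 16000) * 2 - 1   # [batch, samples], one second at 16 kHz in [-1, 1]
mel = mel_spectrogram_torch(
    wav,
    n_fft=1280, num_mels=80, sampling_rate=16000,
    hop_size=320, win_size=1280, fmin=0.0, fmax=None)
# mel: [1, 80, frames], frames ≈ samples / hop_size (50 here)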
dreamvoice/freevc/models.py ADDED
@@ -0,0 +1,351 @@
+import copy
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .commons import sequence_mask, rand_slice_segments
+from .modules import ResidualCouplingLayer, WN, Flip, ResBlock1, ResBlock2, LRELU_SLOPE
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from .commons import init_weights, get_padding
+
+
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+        for i in range(n_flows):
+            self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
+            self.flows.append(Flip())
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+
+
+class Encoder(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 gin_channels=0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths, g=None):
+        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+
+
+class Generator(torch.nn.Module):
+    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+        resblock = ResBlock1 if resblock == '1' else ResBlock2
+
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(weight_norm(
+                ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
+                                k, u, padding=(k-u)//2)))
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel//(2**(i+1))
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(resblock(ch, k, d))
+
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+    def forward(self, x, g=None):
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i*self.num_kernels+j](x)
+                else:
+                    xs += self.resblocks[i*self.num_kernels+j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
+
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+
+
+class DiscriminatorP(torch.nn.Module):
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList([
+            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+        ])
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        fmap = []
+
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0:  # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+
+class DiscriminatorS(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList([
+            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+        ])
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        fmap = []
+
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminator, self).__init__()
+        periods = [2,3,5,7,11]
+
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+        self.discriminators = nn.ModuleList(discs)
+
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            y_d_gs.append(y_d_g)
+            fmap_rs.append(fmap_r)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class SpeakerEncoder(torch.nn.Module):
+    def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
+        super(SpeakerEncoder, self).__init__()
+        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+        self.relu = nn.ReLU()
+
+    def forward(self, mels):
+        self.lstm.flatten_parameters()
+        _, (hidden, _) = self.lstm(mels)
+        embeds_raw = self.relu(self.linear(hidden[-1]))
+        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
+        mel_slices = []
+        for i in range(0, total_frames-partial_frames, partial_hop):
+            mel_range = torch.arange(i, i+partial_frames)
+            mel_slices.append(mel_range)
+
+        return mel_slices
+
+    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
+        mel_len = mel.size(1)
+        last_mel = mel[:,-partial_frames:]
+
+        if mel_len > partial_frames:
+            mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
+            mels = list(mel[:,s] for s in mel_slices)
+            mels.append(last_mel)
+            mels = torch.stack(tuple(mels), 0).squeeze(1)
+
+            with torch.no_grad():
+                partial_embeds = self(mels)
+            embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
+            #embed = embed / torch.linalg.norm(embed, 2)
+        else:
+            with torch.no_grad():
+                embed = self(last_mel)
+
+        return embed
+
+
+class SynthesizerTrn(nn.Module):
+    """
+    Synthesizer for Training
+    """
+
+    def __init__(self,
+                 spec_channels,
+                 segment_size,
+                 inter_channels,
+                 hidden_channels,
+                 filter_channels,
+                 n_heads,
+                 n_layers,
+                 kernel_size,
+                 p_dropout,
+                 resblock,
+                 resblock_kernel_sizes,
+                 resblock_dilation_sizes,
+                 upsample_rates,
+                 upsample_initial_channel,
+                 upsample_kernel_sizes,
+                 gin_channels,
+                 ssl_dim,
+                 use_spk,
+                 **kwargs):
+
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.ssl_dim = ssl_dim
+        self.use_spk = use_spk
+
+        self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
+        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+
+        if not self.use_spk:
+            self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)
+
+    def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
+        if c_lengths == None:
+            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+        if spec_lengths == None:
+            spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
+
+        if not self.use_spk:
+            g = self.enc_spk(mel.transpose(1,2))
+        g = g.unsqueeze(-1)
+
+        _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
+        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
+        z_p = self.flow(z, spec_mask, g=g)
+
+        z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size)
+        o = self.dec(z_slice, g=g)
+
+        return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+    def infer(self, c, g=None, mel=None, c_lengths=None):
+        if c_lengths == None:
+            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+        if not self.use_spk:
+            g = self.enc_spk.embed_utterance(mel.transpose(1,2))
+        g = g.unsqueeze(-1)
+
+        z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
+        z = self.flow(z_p, c_mask, g=g, reverse=True)
+        o = self.dec(z * c_mask, g=g)
+
+        return o
dreamvoice/freevc/modules.py ADDED
@@ -0,0 +1,341 @@
+import copy
+import math
+import numpy as np
+import scipy
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
+
+
+LRELU_SLOPE = 0.1
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 0."
+
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(
+            nn.ReLU(),
+            nn.Dropout(p_dropout))
+        for _ in range(n_layers-1):
+            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
+
+
+class DDSConv(nn.Module):
+    """
+    Dialted and Depth-Separable Convolution
+    """
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for i in range(n_layers):
+            dilation = kernel_size ** i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+                                            groups=channels, dilation=dilation, padding=padding
+                                            ))
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+
+    def forward(self, x, x_mask, g=None):
+        if g is not None:
+            x = x + g
+        for i in range(self.n_layers):
+            y = self.convs_sep[i](x * x_mask)
+            y = self.norms_1[i](y)
+            y = F.gelu(y)
+            y = self.convs_1x1[i](y)
+            y = self.norms_2[i](y)
+            y = F.gelu(y)
+            y = self.drop(y)
+            x = x + y
+        return x * x_mask
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert(kernel_size % 2 == 1)
+        self.hidden_channels =hidden_channels
+        self.kernel_size = kernel_size,
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+
+        if gin_channels != 0:
+            cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
+            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
+        for i in range(n_layers):
+            dilation = dilation_rate ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
+                                       dilation=dilation, padding=padding)
+            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        if g is not None:
+            g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
+            else:
+                g_l = torch.zeros_like(x_in)
+
+            acts = fused_add_tanh_sigmoid_multiply(
+                x_in,
+                g_l,
+                n_channels_tensor)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:,:self.hidden_channels,:]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:,self.hidden_channels:,:]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+
+    def remove_weight_norm(self):
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c2(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.convs = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1])))
+        ])
+        self.convs.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+class Log(nn.Module):
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+            logdet = torch.sum(-y, [1, 2])
+            return y, logdet
+        else:
+            x = torch.exp(x) * x_mask
+            return x
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+class ElementwiseAffine(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.channels = channels
+        self.m = nn.Parameter(torch.zeros(channels,1))
+        self.logs = nn.Parameter(torch.zeros(channels,1))
+
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = self.m + torch.exp(self.logs) * x
+            y = y * x_mask
+            logdet = torch.sum(self.logs * x_mask, [1,2])
+            return y, logdet
+        else:
+            x = (x - self.m) * torch.exp(-self.logs) * x_mask
+            return x
+
+
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 p_dropout=0,
+                 gin_channels=0,
+                 mean_only=False):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels]*2, 1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels]*2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            x1 = m + x1 * torch.exp(logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1,2])
+            return x, logdet
+        else:
+            x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
dreamvoice/freevc/requirements.txt ADDED
@@ -0,0 +1,8 @@
+altair
+httpx==0.24.1
+numpy
+scipy
+torch
+transformers
+librosa
+webrtcvad==2.0.10
dreamvoice/freevc/utils.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import argparse
4
+ import logging
5
+ import json
6
+ import subprocess
7
+ import numpy as np
8
+ from scipy.io.wavfile import read
9
+ import torch
10
+ from torch.nn import functional as F
11
+ from .commons import sequence_mask
12
+
13
+ MATPLOTLIB_FLAG = False
14
+
15
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
16
+ logger = logging
17
+
18
+
19
+ def get_cmodel(rank):
20
+ checkpoint = torch.load('wavlm/WavLM-Large.pt')
21
+ cfg = WavLMConfig(checkpoint['cfg'])
22
+ cmodel = WavLM(cfg).cuda(rank)
23
+ cmodel.load_state_dict(checkpoint['model'])
24
+ cmodel.eval()
25
+ return cmodel
26
+
27
+
28
+ def get_content(cmodel, y):
29
+ with torch.no_grad():
30
+ c = cmodel.extract_features(y.squeeze(1))[0]
31
+ c = c.transpose(1, 2)
32
+ return c
33
+
34
+
35
+ def get_vocoder(rank):
36
+ with open("hifigan/config.json", "r") as f:
37
+ config = json.load(f)
38
+ config = hifigan.AttrDict(config)
39
+ vocoder = hifigan.Generator(config)
40
+ ckpt = torch.load("hifigan/generator_v1")
41
+ vocoder.load_state_dict(ckpt["generator"])
42
+ vocoder.eval()
43
+ vocoder.remove_weight_norm()
44
+ vocoder.cuda(rank)
45
+ return vocoder
46
+
47
+
48
+ def transform(mel, height): # 68-92
49
+ #r = np.random.random()
50
+ #rate = r * 0.3 + 0.85 # 0.85-1.15
51
+ #height = int(mel.size(-2) * rate)
52
+ tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
53
+ if height >= mel.size(-2):
54
+ return tgt[:, :mel.size(-2), :]
55
+ else:
56
+ silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1)
57
+ silence += torch.randn_like(silence) / 10
58
+ return torch.cat((tgt, silence), 1)
59
+
60
+
61
+ def stretch(mel, width): # 0.5-2
62
+ return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))
63
+
64
+
65
+ def load_checkpoint(checkpoint_path, model, optimizer=None):
66
+ assert os.path.isfile(checkpoint_path)
67
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
68
+ iteration = checkpoint_dict['iteration']
69
+ learning_rate = checkpoint_dict['learning_rate']
70
+ if optimizer is not None:
71
+ optimizer.load_state_dict(checkpoint_dict['optimizer'])
72
+ saved_state_dict = checkpoint_dict['model']
73
+ if hasattr(model, 'module'):
74
+ state_dict = model.module.state_dict()
75
+ else:
76
+ state_dict = model.state_dict()
77
+ new_state_dict= {}
78
+ for k, v in state_dict.items():
79
+ try:
80
+ new_state_dict[k] = saved_state_dict[k]
81
+ except:
82
+ logger.info("%s is not in the checkpoint" % k)
83
+ new_state_dict[k] = v
84
+ if hasattr(model, 'module'):
85
+ model.module.load_state_dict(new_state_dict)
86
+ else:
87
+ model.load_state_dict(new_state_dict)
88
+ logger.info("Loaded checkpoint '{}' (iteration {})" .format(
89
+ checkpoint_path, iteration))
90
+ return model, optimizer, learning_rate, iteration
91
+
92
+
93
+ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
94
+ logger.info("Saving model and optimizer state at iteration {} to {}".format(
95
+ iteration, checkpoint_path))
96
+ if hasattr(model, 'module'):
97
+ state_dict = model.module.state_dict()
98
+ else:
99
+ state_dict = model.state_dict()
100
+ torch.save({'model': state_dict,
101
+ 'iteration': iteration,
102
+ 'optimizer': optimizer.state_dict(),
103
+ 'learning_rate': learning_rate}, checkpoint_path)
104
+
105
+
106
+ def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
107
+ for k, v in scalars.items():
108
+ writer.add_scalar(k, v, global_step)
109
+ for k, v in histograms.items():
110
+ writer.add_histogram(k, v, global_step)
111
+ for k, v in images.items():
112
+ writer.add_image(k, v, global_step, dataformats='HWC')
113
+ for k, v in audios.items():
114
+ writer.add_audio(k, v, global_step, audio_sampling_rate)
115
+
116
+
117
+ def latest_checkpoint_path(dir_path, regex="G_*.pth"):  # note: uses glob, which is not imported in this file
118
+ f_list = glob.glob(os.path.join(dir_path, regex))
119
+ f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
120
+ x = f_list[-1]
121
+ print(x)
122
+ return x
123
+
124
+
125
+ def plot_spectrogram_to_numpy(spectrogram):
126
+ global MATPLOTLIB_FLAG
127
+ if not MATPLOTLIB_FLAG:
128
+ import matplotlib
129
+ matplotlib.use("Agg")
130
+ MATPLOTLIB_FLAG = True
131
+ mpl_logger = logging.getLogger('matplotlib')
132
+ mpl_logger.setLevel(logging.WARNING)
133
+ import matplotlib.pylab as plt
134
+ import numpy as np
135
+
136
+ fig, ax = plt.subplots(figsize=(10,2))
137
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
138
+ interpolation='none')
139
+ plt.colorbar(im, ax=ax)
140
+ plt.xlabel("Frames")
141
+ plt.ylabel("Channels")
142
+ plt.tight_layout()
143
+
144
+ fig.canvas.draw()
145
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.frombuffer replaces the deprecated np.fromstring
146
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
147
+ plt.close()
148
+ return data
149
+
150
+
151
+ def plot_alignment_to_numpy(alignment, info=None):
152
+ global MATPLOTLIB_FLAG
153
+ if not MATPLOTLIB_FLAG:
154
+ import matplotlib
155
+ matplotlib.use("Agg")
156
+ MATPLOTLIB_FLAG = True
157
+ mpl_logger = logging.getLogger('matplotlib')
158
+ mpl_logger.setLevel(logging.WARNING)
159
+ import matplotlib.pylab as plt
160
+ import numpy as np
161
+
162
+ fig, ax = plt.subplots(figsize=(6, 4))
163
+ im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
164
+ interpolation='none')
165
+ fig.colorbar(im, ax=ax)
166
+ xlabel = 'Decoder timestep'
167
+ if info is not None:
168
+ xlabel += '\n\n' + info
169
+ plt.xlabel(xlabel)
170
+ plt.ylabel('Encoder timestep')
171
+ plt.tight_layout()
172
+
173
+ fig.canvas.draw()
174
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.frombuffer replaces the deprecated np.fromstring
175
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
176
+ plt.close()
177
+ return data
178
+
179
+
180
+ def load_wav_to_torch(full_path):
181
+ sampling_rate, data = read(full_path)
182
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
183
+
184
+
185
+ def load_filepaths_and_text(filename, split="|"):
186
+ with open(filename, encoding='utf-8') as f:
187
+ filepaths_and_text = [line.strip().split(split) for line in f]
188
+ return filepaths_and_text
189
+
190
+
191
+ def get_hparams(init=True):
192
+ parser = argparse.ArgumentParser()
193
+ parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
194
+ help='JSON file for configuration')
195
+ parser.add_argument('-m', '--model', type=str, required=True,
196
+ help='Model name')
197
+
198
+ args = parser.parse_args()
199
+ model_dir = os.path.join("./logs", args.model)
200
+
201
+ if not os.path.exists(model_dir):
202
+ os.makedirs(model_dir)
203
+
204
+ config_path = args.config
205
+ config_save_path = os.path.join(model_dir, "config.json")
206
+ if init:
207
+ with open(config_path, "r") as f:
208
+ data = f.read()
209
+ with open(config_save_path, "w") as f:
210
+ f.write(data)
211
+ else:
212
+ with open(config_save_path, "r") as f:
213
+ data = f.read()
214
+ config = json.loads(data)
215
+
216
+ hparams = HParams(**config)
217
+ hparams.model_dir = model_dir
218
+ return hparams
219
+
220
+
221
+ def get_hparams_from_dir(model_dir):
222
+ config_save_path = os.path.join(model_dir, "config.json")
223
+ with open(config_save_path, "r") as f:
224
+ data = f.read()
225
+ config = json.loads(data)
226
+
227
+ hparams = HParams(**config)
228
+ hparams.model_dir = model_dir
229
+ return hparams
230
+
231
+
232
+ def get_hparams_from_file(config_path):
233
+ with open(config_path, "r") as f:
234
+ data = f.read()
235
+ config = json.loads(data)
236
+
237
+ hparams = HParams(**config)
238
+ return hparams
239
+
240
+
241
+ def check_git_hash(model_dir):
242
+ source_dir = os.path.dirname(os.path.realpath(__file__))
243
+ if not os.path.exists(os.path.join(source_dir, ".git")):
244
+ logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
245
+ source_dir
246
+ ))
247
+ return
248
+
249
+ cur_hash = subprocess.getoutput("git rev-parse HEAD")
250
+
251
+ path = os.path.join(model_dir, "githash")
252
+ if os.path.exists(path):
253
+ saved_hash = open(path).read()
254
+ if saved_hash != cur_hash:
255
+ logger.warning("git hash values are different. {}(saved) != {}(current)".format(
256
+ saved_hash[:8], cur_hash[:8]))
257
+ else:
258
+ open(path, "w").write(cur_hash)
259
+
260
+
261
+ def get_logger(model_dir, filename="train.log"):
262
+ global logger
263
+ logger = logging.getLogger(os.path.basename(model_dir))
264
+ logger.setLevel(logging.DEBUG)
265
+
266
+ formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
267
+ if not os.path.exists(model_dir):
268
+ os.makedirs(model_dir)
269
+ h = logging.FileHandler(os.path.join(model_dir, filename))
270
+ h.setLevel(logging.DEBUG)
271
+ h.setFormatter(formatter)
272
+ logger.addHandler(h)
273
+ return logger
274
+
275
+
276
+ class HParams():
277
+ def __init__(self, **kwargs):
278
+ for k, v in kwargs.items():
279
+ if type(v) == dict:
280
+ v = HParams(**v)
281
+ self[k] = v
282
+
283
+ def keys(self):
284
+ return self.__dict__.keys()
285
+
286
+ def items(self):
287
+ return self.__dict__.items()
288
+
289
+ def values(self):
290
+ return self.__dict__.values()
291
+
292
+ def __len__(self):
293
+ return len(self.__dict__)
294
+
295
+ def __getitem__(self, key):
296
+ return getattr(self, key)
297
+
298
+ def __setitem__(self, key, value):
299
+ return setattr(self, key, value)
300
+
301
+ def __contains__(self, key):
302
+ return key in self.__dict__
303
+
304
+ def __repr__(self):
305
+ return self.__dict__.__repr__()
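
For reference, a minimal sketch of how the HParams container above is normally obtained and used; the config path is the one added under ckpts_freevc/ in this commit, so adjust it to your checkout:

import torch  # not required here, only to mirror the usual environment
from dreamvoice.freevc.utils import get_hparams_from_file

hps = get_hparams_from_file("ckpts_freevc/freevc.json")
print(hps.data.sampling_rate)                 # nested dicts become HParams, so attribute access works
print(hps["train"]["segment_size"])           # item-style access works as well
n_frames = hps.train.segment_size // hps.data.hop_length   # value later passed to SynthesizerTrn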
dreamvoice/freevc_wrapper.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import librosa
4
+ import soundfile as sf
5
+ from pathlib import Path
6
+
7
+ from transformers import WavLMModel
8
+ from .freevc.utils import load_checkpoint, get_hparams_from_file
9
+ from .freevc.models import SynthesizerTrn
10
+ # from mel_processing import mel_spectrogram_torch
11
+ # from free_vc.speaker_encoder.voice_encoder import SpeakerEncoder
12
+ # from speaker_encoder.voice_encoder import SpeakerEncoder
13
+
14
+
15
+ def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'):
16
+ hps = get_hparams_from_file(f"{path}/freevc.json")
17
+ freevc = SynthesizerTrn(
18
+ hps.data.filter_length // 2 + 1,
19
+ hps.train.segment_size // hps.data.hop_length,
20
+ **hps.model).to(device)
21
+ freevc.eval()
22
+ load_checkpoint(f"{path}/freevc.pth", freevc, None)
23
+
24
+ cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
25
+ cmodel.eval()
26
+
27
+ # smodel = spk_encoder.load_model(Path(speaker_path), device)
28
+ # smodel = spk_encoder.load_model(Path(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt"), 'cuda')
29
+ # smodel = SpeakerEncoder(f"speaker_encoder/ckpt/pretrained_bak_5805000.pt", device)
30
+
31
+ return freevc, cmodel, hps
32
+
33
+
34
+ @torch.no_grad()
35
+ def convert(freevc, content, speaker):
36
+ audio = freevc.infer(content, g=speaker)
37
+ audio = audio[0][0].data.cpu().float().numpy()
38
+ return audio, 16000
39
+
40
+
41
+ if __name__ == '__main__':
42
+ freevc_24, cmodel, hps = get_freevc_models()  # get_freevc_models returns three values; the speaker encoder (smodel) used below still needs to be loaded separately (see the commented-out lines above)
43
+
44
+ tgt = 'p226_002.wav'
45
+ # src = 'p226_002.wav'
46
+ src = 'p225_001.wav'
47
+ device = 'cuda'
48
+
49
+ # tgt
50
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
51
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
52
+ g_tgt = smodel.embed_utterance(wav_tgt)
53
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
54
+ # g_tgt = spk_encoder.embed_utterance_batch(torch.tensor(wav_tgt).unsqueeze(0).cuda())
55
+
56
+ # src
57
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
58
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
59
+ content = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
60
+
61
+ output, sr = convert(freevc_24, content, g_tgt)
62
+
63
+ sf.write('output.wav', output, sr)
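
A minimal usage sketch of the wrapper above. The checkpoint folder and the source of the speaker embedding are assumptions: any (1, gin_channels, 1)-shaped embedding, for example one produced by the DreamVG plugin, can be passed as `speaker`; a random tensor is used here only as a placeholder.

import torch
import librosa
import soundfile as sf
from dreamvoice.freevc_wrapper import get_freevc_models, convert

device = 'cuda' if torch.cuda.is_available() else 'cpu'
freevc, cmodel, hps = get_freevc_models(path='ckpts_freevc', device=device)

wav, _ = librosa.load('source.wav', sr=hps.data.sampling_rate)        # 16 kHz content audio
wav = torch.from_numpy(wav).unsqueeze(0).to(device)
with torch.no_grad():
    content = cmodel(wav).last_hidden_state.transpose(1, 2)           # (1, 1024, T) WavLM features

speaker = torch.randn(1, 256, 1, device=device)   # placeholder speaker embedding (gin_channels = 256)
audio, sr = convert(freevc, content, speaker)
sf.write('converted.wav', audio, sr)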
dreamvoice/plugin.py CHANGED
@@ -108,7 +108,6 @@ class DreamVoice_Plugin:
108
  self.spk_encoder = spk_encoder
109
  self.spk_embed_cache = None
110
 
111
-
112
  @torch.no_grad()
113
  def gen_spk(self, prompt,
114
  prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
 
108
  self.spk_encoder = spk_encoder
109
  self.spk_embed_cache = None
110
 
 
111
  @torch.no_grad()
112
  def gen_spk(self, prompt,
113
  prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
dreamvoice/plugin_ckpts/freevc.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0589fd38d965a7f8aab6eb3bedae5d1c007acb0f305e04bbe0fd4a771fff717d
3
+ size 104892189
dreamvoice/plugin_freevc.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ version: 1.1
2
+
3
+ lm_path: 'google/flan-t5-base'
4
+
5
+ dreamvg:
6
+ config_path: 'src/configs/plugin_cross_freevc.yaml'
7
+ ckpt_path: 'plugin_ckpts/freevc.pt'
8
+ ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/plugin_ckpts/freevc.pt'
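
A sketch of how the fields in plugin_freevc.yaml map onto the objects used by the pipeline script further below. Loading the yaml by hand like this, the plugin_wrapper module path, and the path prefixes are assumptions made for illustration:

import yaml
import torch
from transformers import T5Tokenizer, T5EncoderModel
from dreamvoice.src.plugin_wrapper import DreamVG

with open('dreamvoice/plugin_freevc.yaml') as f:
    cfg = yaml.safe_load(f)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = T5Tokenizer.from_pretrained(cfg['lm_path'])                 # 'google/flan-t5-base'
text_encoder = T5EncoderModel.from_pretrained(cfg['lm_path']).to(device).eval()
dreamvg = DreamVG(config_path='dreamvoice/' + cfg['dreamvg']['config_path'],
                  ckpt_path='dreamvoice/' + cfg['dreamvg']['ckpt_path'],
                  device=device)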
dreamvoice/src/configs/{plugin_cross.yaml → plugin_cross_freevc.yaml} RENAMED
File without changes
dreamvoice/train_utils/prepare_freevc/freevc/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: FreeVC
3
+ emoji: 🚀
4
+ colorFrom: gray
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 3.13.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
dreamvoice/train_utils/prepare_freevc/freevc/app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import librosa
4
+ import gradio as gr
5
+ from scipy.io.wavfile import write
6
+ from transformers import WavLMModel
7
+
8
+ import utils
9
+ from models import SynthesizerTrn
10
+ from mel_processing import mel_spectrogram_torch
11
+ from speaker_encoder.voice_encoder import SpeakerEncoder
12
+
13
+ '''
14
+ def get_wavlm():
15
+ os.system('gdown https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU')
16
+ shutil.move('WavLM-Large.pt', 'wavlm')
17
+ '''
18
+
19
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
+
21
+ print("Loading FreeVC...")
22
+ hps = utils.get_hparams_from_file("configs/freevc.json")
23
+ freevc = SynthesizerTrn(
24
+ hps.data.filter_length // 2 + 1,
25
+ hps.train.segment_size // hps.data.hop_length,
26
+ **hps.model).to(device)
27
+ _ = freevc.eval()
28
+ _ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
29
+ smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
30
+
31
+ print("Loading FreeVC(24k)...")
32
+ hps = utils.get_hparams_from_file("configs/freevc-24.json")
33
+ freevc_24 = SynthesizerTrn(
34
+ hps.data.filter_length // 2 + 1,
35
+ hps.train.segment_size // hps.data.hop_length,
36
+ **hps.model).to(device)
37
+ _ = freevc_24.eval()
38
+ _ = utils.load_checkpoint("checkpoints/freevc-24.pth", freevc_24, None)
39
+
40
+ print("Loading FreeVC-s...")
41
+ hps = utils.get_hparams_from_file("configs/freevc-s.json")
42
+ freevc_s = SynthesizerTrn(
43
+ hps.data.filter_length // 2 + 1,
44
+ hps.train.segment_size // hps.data.hop_length,
45
+ **hps.model).to(device)
46
+ _ = freevc_s.eval()
47
+ _ = utils.load_checkpoint("checkpoints/freevc-s.pth", freevc_s, None)
48
+
49
+ print("Loading WavLM for content...")
50
+ cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
51
+
52
+ def convert(model, src, tgt):
53
+ with torch.no_grad():
54
+ # tgt
55
+ wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
56
+ wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
57
+ if model == "FreeVC" or model == "FreeVC (24kHz)":
58
+ g_tgt = smodel.embed_utterance(wav_tgt)
59
+ g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
60
+ else:
61
+ wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(device)
62
+ mel_tgt = mel_spectrogram_torch(
63
+ wav_tgt,
64
+ hps.data.filter_length,
65
+ hps.data.n_mel_channels,
66
+ hps.data.sampling_rate,
67
+ hps.data.hop_length,
68
+ hps.data.win_length,
69
+ hps.data.mel_fmin,
70
+ hps.data.mel_fmax
71
+ )
72
+ # src
73
+ wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
74
+ wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
75
+ c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
76
+ # infer
77
+ if model == "FreeVC":
78
+ audio = freevc.infer(c, g=g_tgt)
79
+ elif model == "FreeVC-s":
80
+ audio = freevc_s.infer(c, mel=mel_tgt)
81
+ else:
82
+ audio = freevc_24.infer(c, g=g_tgt)
83
+ audio = audio[0][0].data.cpu().float().numpy()
84
+ if model == "FreeVC" or model == "FreeVC-s":
85
+ write("out.wav", hps.data.sampling_rate, audio)
86
+ else:
87
+ write("out.wav", 24000, audio)
88
+ out = "out.wav"
89
+ return out
90
+
91
+ model = gr.Dropdown(choices=["FreeVC", "FreeVC-s", "FreeVC (24kHz)"], value="FreeVC",type="value", label="Model")
92
+ audio1 = gr.Audio(label="Source Audio", type='filepath')
93
+ audio2 = gr.Audio(label="Reference Audio", type='filepath')
94
+ inputs = [model, audio1, audio2]
95
+ outputs = gr.Audio(label="Output Audio", type='filepath')
96
+
97
+ title = "FreeVC"
98
+ description = "Gradio Demo for FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion. To use it, simply upload your audio or click an example to load it. Read more at the links below. Note: the WavLM checkpoint on Hugging Face seems to differ slightly from the one used to train FreeVC, which may degrade performance a bit. In addition, speaker similarity can be strongly affected if there is too much silence in the reference audio, so please <strong>trim</strong> it before submitting."
99
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2210.15418' target='_blank'>Paper</a> | <a href='https://github.com/OlaWod/FreeVC' target='_blank'>Github Repo</a></p>"
100
+
101
+ examples=[["FreeVC", 'p225_001.wav', 'p226_002.wav'], ["FreeVC-s", 'p226_002.wav', 'p225_001.wav'], ["FreeVC (24kHz)", 'p225_001.wav', 'p226_002.wav']]
102
+
103
+ gr.Interface(convert, inputs, outputs, title=title, description=description, article=article, examples=examples, enable_queue=True).launch()
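
The Gradio UI above wraps a single function, so it can also be exercised without the web interface; a small sketch using the example clips referenced in `examples` (the checkpoints must already be in place):

out_path = convert("FreeVC", "p225_001.wav", "p226_002.wav")
print(out_path)   # "out.wav", written at hps.data.sampling_rate (16 kHz) for this model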
dreamvoice/train_utils/prepare_freevc/freevc/commons.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+
8
+ def init_weights(m, mean=0.0, std=0.01):
9
+ classname = m.__class__.__name__
10
+ if classname.find("Conv") != -1:
11
+ m.weight.data.normal_(mean, std)
12
+
13
+
14
+ def get_padding(kernel_size, dilation=1):
15
+ return int((kernel_size*dilation - dilation)/2)
16
+
17
+
18
+ def convert_pad_shape(pad_shape):
19
+ l = pad_shape[::-1]
20
+ pad_shape = [item for sublist in l for item in sublist]
21
+ return pad_shape
22
+
23
+
24
+ def intersperse(lst, item):
25
+ result = [item] * (len(lst) * 2 + 1)
26
+ result[1::2] = lst
27
+ return result
28
+
29
+
30
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
31
+ """KL(P||Q)"""
32
+ kl = (logs_q - logs_p) - 0.5
33
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34
+ return kl
35
+
36
+
37
+ def rand_gumbel(shape):
38
+ """Sample from the Gumbel distribution, protect from overflows."""
39
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40
+ return -torch.log(-torch.log(uniform_samples))
41
+
42
+
43
+ def rand_gumbel_like(x):
44
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45
+ return g
46
+
47
+
48
+ def slice_segments(x, ids_str, segment_size=4):
49
+ ret = torch.zeros_like(x[:, :, :segment_size])
50
+ for i in range(x.size(0)):
51
+ idx_str = ids_str[i]
52
+ idx_end = idx_str + segment_size
53
+ ret[i] = x[i, :, idx_str:idx_end]
54
+ return ret
55
+
56
+
57
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
58
+ b, d, t = x.size()
59
+ if x_lengths is None:
60
+ x_lengths = t
61
+ ids_str_max = x_lengths - segment_size + 1
62
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63
+ ret = slice_segments(x, ids_str, segment_size)
64
+ return ret, ids_str
65
+
66
+
67
+ def rand_spec_segments(x, x_lengths=None, segment_size=4):
68
+ b, d, t = x.size()
69
+ if x_lengths is None:
70
+ x_lengths = t
71
+ ids_str_max = x_lengths - segment_size
72
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
73
+ ret = slice_segments(x, ids_str, segment_size)
74
+ return ret, ids_str
75
+
76
+
77
+ def get_timing_signal_1d(
78
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
79
+ position = torch.arange(length, dtype=torch.float)
80
+ num_timescales = channels // 2
81
+ log_timescale_increment = (
82
+ math.log(float(max_timescale) / float(min_timescale)) /
83
+ (num_timescales - 1))
84
+ inv_timescales = min_timescale * torch.exp(
85
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
86
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
87
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
88
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
89
+ signal = signal.view(1, channels, length)
90
+ return signal
91
+
92
+
93
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
94
+ b, channels, length = x.size()
95
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
96
+ return x + signal.to(dtype=x.dtype, device=x.device)
97
+
98
+
99
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
100
+ b, channels, length = x.size()
101
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
102
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
103
+
104
+
105
+ def subsequent_mask(length):
106
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
107
+ return mask
108
+
109
+
110
+ @torch.jit.script
111
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
112
+ n_channels_int = n_channels[0]
113
+ in_act = input_a + input_b
114
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
115
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
116
+ acts = t_act * s_act
117
+ return acts
118
+
119
+
120
+ def convert_pad_shape(pad_shape):  # duplicate of the helper defined earlier in this file
121
+ l = pad_shape[::-1]
122
+ pad_shape = [item for sublist in l for item in sublist]
123
+ return pad_shape
124
+
125
+
126
+ def shift_1d(x):
127
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
128
+ return x
129
+
130
+
131
+ def sequence_mask(length, max_length=None):
132
+ if max_length is None:
133
+ max_length = length.max()
134
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
135
+ return x.unsqueeze(0) < length.unsqueeze(1)
136
+
137
+
138
+ def generate_path(duration, mask):
139
+ """
140
+ duration: [b, 1, t_x]
141
+ mask: [b, 1, t_y, t_x]
142
+ """
143
+ device = duration.device
144
+
145
+ b, _, t_y, t_x = mask.shape
146
+ cum_duration = torch.cumsum(duration, -1)
147
+
148
+ cum_duration_flat = cum_duration.view(b * t_x)
149
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
150
+ path = path.view(b, t_x, t_y)
151
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
152
+ path = path.unsqueeze(1).transpose(2,3) * mask
153
+ return path
154
+
155
+
156
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
157
+ if isinstance(parameters, torch.Tensor):
158
+ parameters = [parameters]
159
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
160
+ norm_type = float(norm_type)
161
+ if clip_value is not None:
162
+ clip_value = float(clip_value)
163
+
164
+ total_norm = 0
165
+ for p in parameters:
166
+ param_norm = p.grad.data.norm(norm_type)
167
+ total_norm += param_norm.item() ** norm_type
168
+ if clip_value is not None:
169
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
170
+ total_norm = total_norm ** (1. / norm_type)
171
+ return total_norm
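
A small sketch of the two helpers in commons.py that the synthesizer leans on most, sequence_mask and rand_slice_segments (shapes are illustrative; run from the freevc directory so `commons` is importable):

import torch
from commons import sequence_mask, rand_slice_segments

lengths = torch.tensor([100, 80, 60])
mask = sequence_mask(lengths)                       # (3, 100) boolean mask, True inside each sequence
print(mask.shape, mask[1, 79].item(), mask[1, 80].item())   # torch.Size([3, 100]) True False

x = torch.randn(3, 192, 100)                        # (batch, channels, frames)
segments, ids = rand_slice_segments(x, lengths, segment_size=32)
print(segments.shape)                               # torch.Size([3, 192, 32])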
dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-24.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 10000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 64,
11
+ "fp16_run": false,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8640,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0,
18
+ "use_sr": true,
19
+ "max_speclen": 128,
20
+ "port": "8008"
21
+ },
22
+ "data": {
23
+ "training_files":"filelists/train.txt",
24
+ "validation_files":"filelists/val.txt",
25
+ "max_wav_value": 32768.0,
26
+ "sampling_rate": 16000,
27
+ "filter_length": 1280,
28
+ "hop_length": 320,
29
+ "win_length": 1280,
30
+ "n_mel_channels": 80,
31
+ "mel_fmin": 0.0,
32
+ "mel_fmax": null
33
+ },
34
+ "model": {
35
+ "inter_channels": 192,
36
+ "hidden_channels": 192,
37
+ "filter_channels": 768,
38
+ "n_heads": 2,
39
+ "n_layers": 6,
40
+ "kernel_size": 3,
41
+ "p_dropout": 0.1,
42
+ "resblock": "1",
43
+ "resblock_kernel_sizes": [3,7,11],
44
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
45
+ "upsample_rates": [10,6,4,2],
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [16,16,4,4],
48
+ "n_layers_q": 3,
49
+ "use_spectral_norm": false,
50
+ "gin_channels": 256,
51
+ "ssl_dim": 1024,
52
+ "use_spk": true
53
+ }
54
+ }
dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc-s.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 10000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 64,
11
+ "fp16_run": false,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8960,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0,
18
+ "use_sr": true,
19
+ "max_speclen": 128,
20
+ "port": "8001"
21
+ },
22
+ "data": {
23
+ "training_files":"filelists/train.txt",
24
+ "validation_files":"filelists/val.txt",
25
+ "max_wav_value": 32768.0,
26
+ "sampling_rate": 16000,
27
+ "filter_length": 1280,
28
+ "hop_length": 320,
29
+ "win_length": 1280,
30
+ "n_mel_channels": 80,
31
+ "mel_fmin": 0.0,
32
+ "mel_fmax": null
33
+ },
34
+ "model": {
35
+ "inter_channels": 192,
36
+ "hidden_channels": 192,
37
+ "filter_channels": 768,
38
+ "n_heads": 2,
39
+ "n_layers": 6,
40
+ "kernel_size": 3,
41
+ "p_dropout": 0.1,
42
+ "resblock": "1",
43
+ "resblock_kernel_sizes": [3,7,11],
44
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
45
+ "upsample_rates": [10,8,2,2],
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [16,16,4,4],
48
+ "n_layers_q": 3,
49
+ "use_spectral_norm": false,
50
+ "gin_channels": 256,
51
+ "ssl_dim": 1024,
52
+ "use_spk": false
53
+ }
54
+ }
dreamvoice/train_utils/prepare_freevc/freevc/configs/freevc.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 10000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 64,
11
+ "fp16_run": false,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8960,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0,
18
+ "use_sr": true,
19
+ "max_speclen": 128,
20
+ "port": "8001"
21
+ },
22
+ "data": {
23
+ "training_files":"filelists/train.txt",
24
+ "validation_files":"filelists/val.txt",
25
+ "max_wav_value": 32768.0,
26
+ "sampling_rate": 16000,
27
+ "filter_length": 1280,
28
+ "hop_length": 320,
29
+ "win_length": 1280,
30
+ "n_mel_channels": 80,
31
+ "mel_fmin": 0.0,
32
+ "mel_fmax": null
33
+ },
34
+ "model": {
35
+ "inter_channels": 192,
36
+ "hidden_channels": 192,
37
+ "filter_channels": 768,
38
+ "n_heads": 2,
39
+ "n_layers": 6,
40
+ "kernel_size": 3,
41
+ "p_dropout": 0.1,
42
+ "resblock": "1",
43
+ "resblock_kernel_sizes": [3,7,11],
44
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
45
+ "upsample_rates": [10,8,2,2],
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [16,16,4,4],
48
+ "n_layers_q": 3,
49
+ "use_spectral_norm": false,
50
+ "gin_channels": 256,
51
+ "ssl_dim": 1024,
52
+ "use_spk": true
53
+ }
54
+ }
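
How the numbers in freevc.json feed the model constructor used by app.py and the wrapper; a small sketch assuming the working directory is this freevc folder:

import utils

hps = utils.get_hparams_from_file("configs/freevc.json")
spec_channels = hps.data.filter_length // 2 + 1                   # 1280 // 2 + 1 = 641 frequency bins
segment_frames = hps.train.segment_size // hps.data.hop_length    # 8960 // 320 = 28 frames per training slice
assert 10 * 8 * 2 * 2 == hps.data.hop_length                      # decoder upsampling (10*8*2*2) matches the hop size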
dreamvoice/train_utils/prepare_freevc/freevc/freevc_pipeline.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import librosa
5
+ import sounddevice as sd
6
+ from transformers import WavLMModel
7
+ from scipy.io.wavfile import write
8
+ from models import SynthesizerTrn
9
+ from speaker_encoder.voice_encoder import SpeakerEncoder
10
+ import utils
11
+ import numpy as np
12
+ from transformers import T5Tokenizer, T5EncoderModel
13
+ from src.plugin_wrapper import DreamVG
14
+ import soundfile as sf
15
+
16
+
17
+ # Load configurations and models
18
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
+
20
+ print("Loading FreeVC...")
21
+ hps = utils.get_hparams_from_file("configs/freevc.json")
22
+ freevc = SynthesizerTrn(
23
+ hps.data.filter_length // 2 + 1,
24
+ hps.train.segment_size // hps.data.hop_length,
25
+ **hps.model).to(device)
26
+ freevc.eval()
27
+ utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
28
+
29
+ print("Loading Speaker Encoder...")
30
+ smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
31
+
32
+ print("Loading WavLM for content...")
33
+ cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
34
+
35
+ lm_path = 'google/flan-t5-base'
36
+ tokenizer = T5Tokenizer.from_pretrained(lm_path)
37
+ text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()
38
+
39
+ dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
40
+ ckpt_path='checkpoints/dreamvc_plugin.pt',
41
+ device=device)
42
+
43
+
44
+ prompt = "girl's voice, very young and cute"
45
+ prompt_guidance_scale = 3.0
46
+
47
+ text_batch = tokenizer(prompt, max_length=32,
48
+ padding='max_length', truncation=True, return_tensors="pt")
49
+ text, text_mask = text_batch.input_ids.to(device), \
50
+ text_batch.attention_mask.to(device)
51
+ text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
52
+ target_embedding = dreamvg.inference([text, text_mask],
53
+ guidance_scale=prompt_guidance_scale,
54
+ guidance_rescale=0.0,
55
+ ddim_steps=100, eta=1,
56
+ random_seed=None)
57
+
58
+ # Convert to tensor and pad
59
+ audio, sr = librosa.load('segment_1.mp3', sr=16000)
60
+ audio = torch.from_numpy(audio).unsqueeze(0).to(device).float()
61
+ audio = F.pad(audio, (40, 40))
62
+
63
+ # Extract content features using WavLM
64
+ c = cmodel(audio).last_hidden_state.transpose(1, 2).to(device)
65
+
66
+ audio = freevc.infer(c, g=target_embedding)
67
+ audio = audio[0][0].data.cpu().float().numpy()
68
+
69
+ sf.write('freevc_out.wav', audio, 16000)
dreamvoice/train_utils/prepare_freevc/freevc/mel_processing.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import random
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ import torch.utils.data
8
+ import numpy as np
9
+ import librosa
10
+ import librosa.util as librosa_util
11
+ from librosa.util import normalize, pad_center, tiny
12
+ from scipy.signal import get_window
13
+ from scipy.io.wavfile import read
14
+ from librosa.filters import mel as librosa_mel_fn
15
+
16
+ MAX_WAV_VALUE = 32768.0
17
+
18
+
19
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20
+ """
21
+ PARAMS
22
+ ------
23
+ C: compression factor
24
+ """
25
+ return torch.log(torch.clamp(x, min=clip_val) * C)
26
+
27
+
28
+ def dynamic_range_decompression_torch(x, C=1):
29
+ """
30
+ PARAMS
31
+ ------
32
+ C: compression factor used to compress
33
+ """
34
+ return torch.exp(x) / C
35
+
36
+
37
+ def spectral_normalize_torch(magnitudes):
38
+ output = dynamic_range_compression_torch(magnitudes)
39
+ return output
40
+
41
+
42
+ def spectral_de_normalize_torch(magnitudes):
43
+ output = dynamic_range_decompression_torch(magnitudes)
44
+ return output
45
+
46
+
47
+ mel_basis = {}
48
+ hann_window = {}
49
+
50
+
51
+ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52
+ if torch.min(y) < -1.:
53
+ print('min value is ', torch.min(y))
54
+ if torch.max(y) > 1.:
55
+ print('max value is ', torch.max(y))
56
+
57
+ global hann_window
58
+ dtype_device = str(y.dtype) + '_' + str(y.device)
59
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
60
+ if wnsize_dtype_device not in hann_window:
61
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
62
+
63
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
64
+ y = y.squeeze(1)
65
+
66
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
67
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
68
+
69
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
70
+ return spec
71
+
72
+
73
+ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
74
+ global mel_basis
75
+ dtype_device = str(spec.dtype) + '_' + str(spec.device)
76
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
77
+ if fmax_dtype_device not in mel_basis:
78
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
79
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
80
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
81
+ spec = spectral_normalize_torch(spec)
82
+ return spec
83
+
84
+
85
+ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
86
+ if torch.min(y) < -1.:
87
+ print('min value is ', torch.min(y))
88
+ if torch.max(y) > 1.:
89
+ print('max value is ', torch.max(y))
90
+
91
+ global mel_basis, hann_window
92
+ dtype_device = str(y.dtype) + '_' + str(y.device)
93
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
94
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
95
+ if fmax_dtype_device not in mel_basis:
96
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
97
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
98
+ if wnsize_dtype_device not in hann_window:
99
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100
+
101
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102
+ y = y.squeeze(1)
103
+
104
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
106
+
107
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108
+
109
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110
+ spec = spectral_normalize_torch(spec)
111
+
112
+ return spec
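
An example call of mel_spectrogram_torch with the freevc.json settings (input is a batched waveform in [-1, 1]; the numbers only illustrate the expected shapes):

import torch
from mel_processing import mel_spectrogram_torch

wav = torch.randn(1, 16000).clamp(-1, 1)           # one second of audio at 16 kHz
mel = mel_spectrogram_torch(wav, n_fft=1280, num_mels=80, sampling_rate=16000,
                            hop_size=320, win_size=1280, fmin=0.0, fmax=None)
print(mel.shape)                                    # torch.Size([1, 80, 50]) – 50 frames per second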
dreamvoice/train_utils/prepare_freevc/freevc/models.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ import commons
8
+ import modules
9
+
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from commons import init_weights, get_padding
13
+
14
+
15
+ class ResidualCouplingBlock(nn.Module):
16
+ def __init__(self,
17
+ channels,
18
+ hidden_channels,
19
+ kernel_size,
20
+ dilation_rate,
21
+ n_layers,
22
+ n_flows=4,
23
+ gin_channels=0):
24
+ super().__init__()
25
+ self.channels = channels
26
+ self.hidden_channels = hidden_channels
27
+ self.kernel_size = kernel_size
28
+ self.dilation_rate = dilation_rate
29
+ self.n_layers = n_layers
30
+ self.n_flows = n_flows
31
+ self.gin_channels = gin_channels
32
+
33
+ self.flows = nn.ModuleList()
34
+ for i in range(n_flows):
35
+ self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
36
+ self.flows.append(modules.Flip())
37
+
38
+ def forward(self, x, x_mask, g=None, reverse=False):
39
+ if not reverse:
40
+ for flow in self.flows:
41
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
42
+ else:
43
+ for flow in reversed(self.flows):
44
+ x = flow(x, x_mask, g=g, reverse=reverse)
45
+ return x
46
+
47
+
48
+ class Encoder(nn.Module):
49
+ def __init__(self,
50
+ in_channels,
51
+ out_channels,
52
+ hidden_channels,
53
+ kernel_size,
54
+ dilation_rate,
55
+ n_layers,
56
+ gin_channels=0):
57
+ super().__init__()
58
+ self.in_channels = in_channels
59
+ self.out_channels = out_channels
60
+ self.hidden_channels = hidden_channels
61
+ self.kernel_size = kernel_size
62
+ self.dilation_rate = dilation_rate
63
+ self.n_layers = n_layers
64
+ self.gin_channels = gin_channels
65
+
66
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
67
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
68
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
69
+
70
+ def forward(self, x, x_lengths, g=None):
71
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
72
+ x = self.pre(x) * x_mask
73
+ x = self.enc(x, x_mask, g=g)
74
+ stats = self.proj(x) * x_mask
75
+ m, logs = torch.split(stats, self.out_channels, dim=1)
76
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
77
+ return z, m, logs, x_mask
78
+
79
+
80
+ class Generator(torch.nn.Module):
81
+ def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
82
+ super(Generator, self).__init__()
83
+ self.num_kernels = len(resblock_kernel_sizes)
84
+ self.num_upsamples = len(upsample_rates)
85
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
86
+ resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
87
+
88
+ self.ups = nn.ModuleList()
89
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
90
+ self.ups.append(weight_norm(
91
+ ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
92
+ k, u, padding=(k-u)//2)))
93
+
94
+ self.resblocks = nn.ModuleList()
95
+ for i in range(len(self.ups)):
96
+ ch = upsample_initial_channel//(2**(i+1))
97
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
98
+ self.resblocks.append(resblock(ch, k, d))
99
+
100
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
101
+ self.ups.apply(init_weights)
102
+
103
+ if gin_channels != 0:
104
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
105
+
106
+ def forward(self, x, g=None):
107
+ x = self.conv_pre(x)
108
+ if g is not None:
109
+ x = x + self.cond(g)
110
+
111
+ for i in range(self.num_upsamples):
112
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
113
+ x = self.ups[i](x)
114
+ xs = None
115
+ for j in range(self.num_kernels):
116
+ if xs is None:
117
+ xs = self.resblocks[i*self.num_kernels+j](x)
118
+ else:
119
+ xs += self.resblocks[i*self.num_kernels+j](x)
120
+ x = xs / self.num_kernels
121
+ x = F.leaky_relu(x)
122
+ x = self.conv_post(x)
123
+ x = torch.tanh(x)
124
+
125
+ return x
126
+
127
+ def remove_weight_norm(self):
128
+ print('Removing weight norm...')
129
+ for l in self.ups:
130
+ remove_weight_norm(l)
131
+ for l in self.resblocks:
132
+ l.remove_weight_norm()
133
+
134
+
135
+ class DiscriminatorP(torch.nn.Module):
136
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
137
+ super(DiscriminatorP, self).__init__()
138
+ self.period = period
139
+ self.use_spectral_norm = use_spectral_norm
140
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
141
+ self.convs = nn.ModuleList([
142
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
143
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
144
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
145
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
146
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
147
+ ])
148
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
149
+
150
+ def forward(self, x):
151
+ fmap = []
152
+
153
+ # 1d to 2d
154
+ b, c, t = x.shape
155
+ if t % self.period != 0: # pad first
156
+ n_pad = self.period - (t % self.period)
157
+ x = F.pad(x, (0, n_pad), "reflect")
158
+ t = t + n_pad
159
+ x = x.view(b, c, t // self.period, self.period)
160
+
161
+ for l in self.convs:
162
+ x = l(x)
163
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
164
+ fmap.append(x)
165
+ x = self.conv_post(x)
166
+ fmap.append(x)
167
+ x = torch.flatten(x, 1, -1)
168
+
169
+ return x, fmap
170
+
171
+
172
+ class DiscriminatorS(torch.nn.Module):
173
+ def __init__(self, use_spectral_norm=False):
174
+ super(DiscriminatorS, self).__init__()
175
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
176
+ self.convs = nn.ModuleList([
177
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
178
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
179
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
180
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
181
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
182
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
183
+ ])
184
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
185
+
186
+ def forward(self, x):
187
+ fmap = []
188
+
189
+ for l in self.convs:
190
+ x = l(x)
191
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
192
+ fmap.append(x)
193
+ x = self.conv_post(x)
194
+ fmap.append(x)
195
+ x = torch.flatten(x, 1, -1)
196
+
197
+ return x, fmap
198
+
199
+
200
+ class MultiPeriodDiscriminator(torch.nn.Module):
201
+ def __init__(self, use_spectral_norm=False):
202
+ super(MultiPeriodDiscriminator, self).__init__()
203
+ periods = [2,3,5,7,11]
204
+
205
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
206
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
207
+ self.discriminators = nn.ModuleList(discs)
208
+
209
+ def forward(self, y, y_hat):
210
+ y_d_rs = []
211
+ y_d_gs = []
212
+ fmap_rs = []
213
+ fmap_gs = []
214
+ for i, d in enumerate(self.discriminators):
215
+ y_d_r, fmap_r = d(y)
216
+ y_d_g, fmap_g = d(y_hat)
217
+ y_d_rs.append(y_d_r)
218
+ y_d_gs.append(y_d_g)
219
+ fmap_rs.append(fmap_r)
220
+ fmap_gs.append(fmap_g)
221
+
222
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
223
+
224
+
225
+ class SpeakerEncoder(torch.nn.Module):
226
+ def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
227
+ super(SpeakerEncoder, self).__init__()
228
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
229
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
230
+ self.relu = nn.ReLU()
231
+
232
+ def forward(self, mels):
233
+ self.lstm.flatten_parameters()
234
+ _, (hidden, _) = self.lstm(mels)
235
+ embeds_raw = self.relu(self.linear(hidden[-1]))
236
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
237
+
238
+ def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
239
+ mel_slices = []
240
+ for i in range(0, total_frames-partial_frames, partial_hop):
241
+ mel_range = torch.arange(i, i+partial_frames)
242
+ mel_slices.append(mel_range)
243
+
244
+ return mel_slices
245
+
246
+ def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
247
+ mel_len = mel.size(1)
248
+ last_mel = mel[:,-partial_frames:]
249
+
250
+ if mel_len > partial_frames:
251
+ mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
252
+ mels = list(mel[:,s] for s in mel_slices)
253
+ mels.append(last_mel)
254
+ mels = torch.stack(tuple(mels), 0).squeeze(1)
255
+
256
+ with torch.no_grad():
257
+ partial_embeds = self(mels)
258
+ embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
259
+ #embed = embed / torch.linalg.norm(embed, 2)
260
+ else:
261
+ with torch.no_grad():
262
+ embed = self(last_mel)
263
+
264
+ return embed
265
+
266
+
267
+ class SynthesizerTrn(nn.Module):
268
+ """
269
+ Synthesizer for Training
270
+ """
271
+
272
+ def __init__(self,
273
+ spec_channels,
274
+ segment_size,
275
+ inter_channels,
276
+ hidden_channels,
277
+ filter_channels,
278
+ n_heads,
279
+ n_layers,
280
+ kernel_size,
281
+ p_dropout,
282
+ resblock,
283
+ resblock_kernel_sizes,
284
+ resblock_dilation_sizes,
285
+ upsample_rates,
286
+ upsample_initial_channel,
287
+ upsample_kernel_sizes,
288
+ gin_channels,
289
+ ssl_dim,
290
+ use_spk,
291
+ **kwargs):
292
+
293
+ super().__init__()
294
+ self.spec_channels = spec_channels
295
+ self.inter_channels = inter_channels
296
+ self.hidden_channels = hidden_channels
297
+ self.filter_channels = filter_channels
298
+ self.n_heads = n_heads
299
+ self.n_layers = n_layers
300
+ self.kernel_size = kernel_size
301
+ self.p_dropout = p_dropout
302
+ self.resblock = resblock
303
+ self.resblock_kernel_sizes = resblock_kernel_sizes
304
+ self.resblock_dilation_sizes = resblock_dilation_sizes
305
+ self.upsample_rates = upsample_rates
306
+ self.upsample_initial_channel = upsample_initial_channel
307
+ self.upsample_kernel_sizes = upsample_kernel_sizes
308
+ self.segment_size = segment_size
309
+ self.gin_channels = gin_channels
310
+ self.ssl_dim = ssl_dim
311
+ self.use_spk = use_spk
312
+
313
+ self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
314
+ self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
315
+ self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
316
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
317
+
318
+ if not self.use_spk:
319
+ self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)
320
+
321
+ def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
322
+ if c_lengths == None:
323
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
324
+ if spec_lengths == None:
325
+ spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
326
+
327
+ if not self.use_spk:
328
+ g = self.enc_spk(mel.transpose(1,2))
329
+ g = g.unsqueeze(-1)
330
+
331
+ _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
332
+ z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
333
+ z_p = self.flow(z, spec_mask, g=g)
334
+
335
+ z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size)
336
+ o = self.dec(z_slice, g=g)
337
+
338
+ return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
339
+
340
+ def infer(self, c, g=None, mel=None, c_lengths=None):
341
+ if c_lengths == None:
342
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
343
+ if not self.use_spk:
344
+ g = self.enc_spk.embed_utterance(mel.transpose(1,2))
345
+ g = g.unsqueeze(-1)
346
+
347
+ z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
348
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
349
+ o = self.dec(z * c_mask, g=g)
350
+
351
+ return o
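
A shape walk-through for SynthesizerTrn.infer with the freevc.json configuration. This is only a sketch: the weights are random, so the output is noise, and it assumes the working directory is this freevc folder so that `utils`, `commons` and `modules` resolve:

import torch
import utils
from models import SynthesizerTrn

hps = utils.get_hparams_from_file("configs/freevc.json")
net_g = SynthesizerTrn(hps.data.filter_length // 2 + 1,
                       hps.train.segment_size // hps.data.hop_length,
                       **hps.model).eval()

c = torch.randn(1, hps.model.ssl_dim, 100)      # 100 frames of WavLM content features
g = torch.randn(1, hps.model.gin_channels, 1)   # one speaker embedding per utterance
with torch.no_grad():
    wav = net_g.infer(c, g=g)
print(wav.shape)                                 # torch.Size([1, 1, 32000]) – 100 frames * hop 320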
dreamvoice/train_utils/prepare_freevc/freevc/modules.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm
11
+
12
+ import commons
13
+ from commons import init_weights, get_padding
14
+
15
+
16
+ LRELU_SLOPE = 0.1
17
+
18
+
19
+ class LayerNorm(nn.Module):
20
+ def __init__(self, channels, eps=1e-5):
21
+ super().__init__()
22
+ self.channels = channels
23
+ self.eps = eps
24
+
25
+ self.gamma = nn.Parameter(torch.ones(channels))
26
+ self.beta = nn.Parameter(torch.zeros(channels))
27
+
28
+ def forward(self, x):
29
+ x = x.transpose(1, -1)
30
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
31
+ return x.transpose(1, -1)
32
+
33
+
34
+ class ConvReluNorm(nn.Module):
35
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
36
+ super().__init__()
37
+ self.in_channels = in_channels
38
+ self.hidden_channels = hidden_channels
39
+ self.out_channels = out_channels
40
+ self.kernel_size = kernel_size
41
+ self.n_layers = n_layers
42
+ self.p_dropout = p_dropout
43
+ assert n_layers > 1, "Number of layers should be larger than 1."
44
+
45
+ self.conv_layers = nn.ModuleList()
46
+ self.norm_layers = nn.ModuleList()
47
+ self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
48
+ self.norm_layers.append(LayerNorm(hidden_channels))
49
+ self.relu_drop = nn.Sequential(
50
+ nn.ReLU(),
51
+ nn.Dropout(p_dropout))
52
+ for _ in range(n_layers-1):
53
+ self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
54
+ self.norm_layers.append(LayerNorm(hidden_channels))
55
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
56
+ self.proj.weight.data.zero_()
57
+ self.proj.bias.data.zero_()
58
+
59
+ def forward(self, x, x_mask):
60
+ x_org = x
61
+ for i in range(self.n_layers):
62
+ x = self.conv_layers[i](x * x_mask)
63
+ x = self.norm_layers[i](x)
64
+ x = self.relu_drop(x)
65
+ x = x_org + self.proj(x)
66
+ return x * x_mask
67
+
68
+
69
+ class DDSConv(nn.Module):
70
+ """
71
+ Dilated and Depth-Separable Convolution
72
+ """
73
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
74
+ super().__init__()
75
+ self.channels = channels
76
+ self.kernel_size = kernel_size
77
+ self.n_layers = n_layers
78
+ self.p_dropout = p_dropout
79
+
80
+ self.drop = nn.Dropout(p_dropout)
81
+ self.convs_sep = nn.ModuleList()
82
+ self.convs_1x1 = nn.ModuleList()
83
+ self.norms_1 = nn.ModuleList()
84
+ self.norms_2 = nn.ModuleList()
85
+ for i in range(n_layers):
86
+ dilation = kernel_size ** i
87
+ padding = (kernel_size * dilation - dilation) // 2
88
+ self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
89
+ groups=channels, dilation=dilation, padding=padding
90
+ ))
91
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
92
+ self.norms_1.append(LayerNorm(channels))
93
+ self.norms_2.append(LayerNorm(channels))
94
+
95
+ def forward(self, x, x_mask, g=None):
96
+ if g is not None:
97
+ x = x + g
98
+ for i in range(self.n_layers):
99
+ y = self.convs_sep[i](x * x_mask)
100
+ y = self.norms_1[i](y)
101
+ y = F.gelu(y)
102
+ y = self.convs_1x1[i](y)
103
+ y = self.norms_2[i](y)
104
+ y = F.gelu(y)
105
+ y = self.drop(y)
106
+ x = x + y
107
+ return x * x_mask
108
+
109
+
110
+ class WN(torch.nn.Module):
111
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
112
+ super(WN, self).__init__()
113
+ assert(kernel_size % 2 == 1)
114
+ self.hidden_channels = hidden_channels
115
+ self.kernel_size = kernel_size  # trailing comma removed; it made this attribute a one-element tuple
116
+ self.dilation_rate = dilation_rate
117
+ self.n_layers = n_layers
118
+ self.gin_channels = gin_channels
119
+ self.p_dropout = p_dropout
120
+
121
+ self.in_layers = torch.nn.ModuleList()
122
+ self.res_skip_layers = torch.nn.ModuleList()
123
+ self.drop = nn.Dropout(p_dropout)
124
+
125
+ if gin_channels != 0:
126
+ cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
127
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
128
+
129
+ for i in range(n_layers):
130
+ dilation = dilation_rate ** i
131
+ padding = int((kernel_size * dilation - dilation) / 2)
132
+ in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
133
+ dilation=dilation, padding=padding)
134
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
135
+ self.in_layers.append(in_layer)
136
+
137
+ # last one is not necessary
138
+ if i < n_layers - 1:
139
+ res_skip_channels = 2 * hidden_channels
140
+ else:
141
+ res_skip_channels = hidden_channels
142
+
143
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
144
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
145
+ self.res_skip_layers.append(res_skip_layer)
146
+
147
+ def forward(self, x, x_mask, g=None, **kwargs):
148
+ output = torch.zeros_like(x)
149
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
150
+
151
+ if g is not None:
152
+ g = self.cond_layer(g)
153
+
154
+ for i in range(self.n_layers):
155
+ x_in = self.in_layers[i](x)
156
+ if g is not None:
157
+ cond_offset = i * 2 * self.hidden_channels
158
+ g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
159
+ else:
160
+ g_l = torch.zeros_like(x_in)
161
+
162
+ acts = commons.fused_add_tanh_sigmoid_multiply(
163
+ x_in,
164
+ g_l,
165
+ n_channels_tensor)
166
+ acts = self.drop(acts)
167
+
168
+ res_skip_acts = self.res_skip_layers[i](acts)
169
+ if i < self.n_layers - 1:
170
+ res_acts = res_skip_acts[:,:self.hidden_channels,:]
171
+ x = (x + res_acts) * x_mask
172
+ output = output + res_skip_acts[:,self.hidden_channels:,:]
173
+ else:
174
+ output = output + res_skip_acts
175
+ return output * x_mask
176
+
177
+ def remove_weight_norm(self):
178
+ if self.gin_channels != 0:
179
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
180
+ for l in self.in_layers:
181
+ torch.nn.utils.remove_weight_norm(l)
182
+ for l in self.res_skip_layers:
183
+ torch.nn.utils.remove_weight_norm(l)
184
+
185
+
186
+ class ResBlock1(torch.nn.Module):
187
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
188
+ super(ResBlock1, self).__init__()
189
+ self.convs1 = nn.ModuleList([
190
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
191
+ padding=get_padding(kernel_size, dilation[0]))),
192
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
193
+ padding=get_padding(kernel_size, dilation[1]))),
194
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
195
+ padding=get_padding(kernel_size, dilation[2])))
196
+ ])
197
+ self.convs1.apply(init_weights)
198
+
199
+ self.convs2 = nn.ModuleList([
200
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
201
+ padding=get_padding(kernel_size, 1))),
202
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
203
+ padding=get_padding(kernel_size, 1))),
204
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
205
+ padding=get_padding(kernel_size, 1)))
206
+ ])
207
+ self.convs2.apply(init_weights)
208
+
209
+ def forward(self, x, x_mask=None):
210
+ for c1, c2 in zip(self.convs1, self.convs2):
211
+ xt = F.leaky_relu(x, LRELU_SLOPE)
212
+ if x_mask is not None:
213
+ xt = xt * x_mask
214
+ xt = c1(xt)
215
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
216
+ if x_mask is not None:
217
+ xt = xt * x_mask
218
+ xt = c2(xt)
219
+ x = xt + x
220
+ if x_mask is not None:
221
+ x = x * x_mask
222
+ return x
223
+
224
+ def remove_weight_norm(self):
225
+ for l in self.convs1:
226
+ remove_weight_norm(l)
227
+ for l in self.convs2:
228
+ remove_weight_norm(l)
229
+
230
+
231
+ class ResBlock2(torch.nn.Module):
232
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
233
+ super(ResBlock2, self).__init__()
234
+ self.convs = nn.ModuleList([
235
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
236
+ padding=get_padding(kernel_size, dilation[0]))),
237
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
238
+ padding=get_padding(kernel_size, dilation[1])))
239
+ ])
240
+ self.convs.apply(init_weights)
241
+
242
+ def forward(self, x, x_mask=None):
243
+ for c in self.convs:
244
+ xt = F.leaky_relu(x, LRELU_SLOPE)
245
+ if x_mask is not None:
246
+ xt = xt * x_mask
247
+ xt = c(xt)
248
+ x = xt + x
249
+ if x_mask is not None:
250
+ x = x * x_mask
251
+ return x
252
+
253
+ def remove_weight_norm(self):
254
+ for l in self.convs:
255
+ remove_weight_norm(l)
256
+
257
+
258
+ class Log(nn.Module):
259
+ def forward(self, x, x_mask, reverse=False, **kwargs):
260
+ if not reverse:
261
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
262
+ logdet = torch.sum(-y, [1, 2])
263
+ return y, logdet
264
+ else:
265
+ x = torch.exp(x) * x_mask
266
+ return x
267
+
268
+
269
+ class Flip(nn.Module):
270
+ def forward(self, x, *args, reverse=False, **kwargs):
271
+ x = torch.flip(x, [1])
272
+ if not reverse:
273
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
274
+ return x, logdet
275
+ else:
276
+ return x
277
+
278
+
279
+ class ElementwiseAffine(nn.Module):
280
+ def __init__(self, channels):
281
+ super().__init__()
282
+ self.channels = channels
283
+ self.m = nn.Parameter(torch.zeros(channels,1))
284
+ self.logs = nn.Parameter(torch.zeros(channels,1))
285
+
286
+ def forward(self, x, x_mask, reverse=False, **kwargs):
287
+ if not reverse:
288
+ y = self.m + torch.exp(self.logs) * x
289
+ y = y * x_mask
290
+ logdet = torch.sum(self.logs * x_mask, [1,2])
291
+ return y, logdet
292
+ else:
293
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
294
+ return x
295
+
296
+
297
+ class ResidualCouplingLayer(nn.Module):
298
+ def __init__(self,
299
+ channels,
300
+ hidden_channels,
301
+ kernel_size,
302
+ dilation_rate,
303
+ n_layers,
304
+ p_dropout=0,
305
+ gin_channels=0,
306
+ mean_only=False):
307
+ assert channels % 2 == 0, "channels should be divisible by 2"
308
+ super().__init__()
309
+ self.channels = channels
310
+ self.hidden_channels = hidden_channels
311
+ self.kernel_size = kernel_size
312
+ self.dilation_rate = dilation_rate
313
+ self.n_layers = n_layers
314
+ self.half_channels = channels // 2
315
+ self.mean_only = mean_only
316
+
317
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
318
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
319
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
320
+ self.post.weight.data.zero_()
321
+ self.post.bias.data.zero_()
322
+
323
+ def forward(self, x, x_mask, g=None, reverse=False):
324
+ x0, x1 = torch.split(x, [self.half_channels]*2, 1)
325
+ h = self.pre(x0) * x_mask
326
+ h = self.enc(h, x_mask, g=g)
327
+ stats = self.post(h) * x_mask
328
+ if not self.mean_only:
329
+ m, logs = torch.split(stats, [self.half_channels]*2, 1)
330
+ else:
331
+ m = stats
332
+ logs = torch.zeros_like(m)
333
+
334
+ if not reverse:
335
+ x1 = m + x1 * torch.exp(logs) * x_mask
336
+ x = torch.cat([x0, x1], 1)
337
+ logdet = torch.sum(logs, [1,2])
338
+ return x, logdet
339
+ else:
340
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
341
+ x = torch.cat([x0, x1], 1)
342
+ return x
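
The flow modules above (`Log`, `Flip`, `ElementwiseAffine`, `ResidualCouplingLayer`) are invertible: calling them with `reverse=False` returns the transformed tensor plus a log-determinant, and calling them again with `reverse=True` undoes the transformation. A minimal round-trip sketch, assuming the `modules.py` above is importable (the import path and tensor shapes are illustrative only):

```python
import torch
from modules import ResidualCouplingLayer  # import path is an assumption for this sketch

layer = ResidualCouplingLayer(channels=192, hidden_channels=192, kernel_size=5,
                              dilation_rate=1, n_layers=4, mean_only=True)
x = torch.randn(2, 192, 100)      # (batch, channels, frames)
x_mask = torch.ones(2, 1, 100)    # all frames valid in this toy example

z, logdet = layer(x, x_mask)                # forward: transform one half conditioned on the other
x_rec = layer(z, x_mask, reverse=True)      # inverse: recover the original tensor
print(torch.allclose(x, x_rec, atol=1e-5))  # True
```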
dreamvoice/train_utils/prepare_freevc/freevc/requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ altair
2
+ httpx==0.24.1
3
+ numpy
4
+ scipy
5
+ torch
6
+ transformers
7
+ librosa
8
+ webrtcvad==2.0.10
dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base.yaml ADDED
@@ -0,0 +1,47 @@
1
+ version: 1.0
2
+
3
+ system: "base"
4
+
5
+ model:
6
+ cls_embedding:
7
+ speaker_dim: 256
8
+ feature_dim: 512
9
+ content_dim: 768
10
+ content_hidden: 256
11
+ use_pitch: false
12
+
13
+ unet:
14
+ sample_size: [128, 256]
15
+ in_channels: 257
16
+ out_channels: 1
17
+ layers_per_block: 2
18
+ block_out_channels: [128, 256, 256, 512]
19
+ down_block_types:
20
+ [
21
+ "DownBlock2D",
22
+ "DownBlock2D",
23
+ "AttnDownBlock2D",
24
+ "AttnDownBlock2D",
25
+ ]
26
+ up_block_types:
27
+ [
28
+ "AttnUpBlock2D",
29
+ "AttnUpBlock2D",
30
+ "UpBlock2D",
31
+ "UpBlock2D"
32
+ ]
33
+ attention_head_dim: 32
34
+ class_embed_type: 'identity'
35
+
36
+ scheduler:
37
+ num_train_steps: 1000
38
+ beta_schedule: 'linear'
39
+ beta_start: 0.0001
40
+ beta_end: 0.02
41
+ num_infer_steps: 50
42
+ rescale_betas_zero_snr: true
43
+ timestep_spacing: "trailing"
44
+ clip_sample: false
45
+ prediction_type: 'v_prediction'
46
+ scale: 2.75
47
+ shift: 5.80
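
The `unet` and `scheduler` blocks in these configs line up with `diffusers` constructor arguments; the same pattern applies to the pitch and cross-attention variants that follow. A minimal loading sketch, assuming a `diffusers` version that supports `rescale_betas_zero_snr` and `timestep_spacing` (the actual training wrapper may build the model differently, and the YAML nesting is handled defensively here):

```python
import yaml
from diffusers import UNet2DModel, DDIMScheduler

with open("diffvc_base.yaml") as f:
    cfg = yaml.safe_load(f)

model_cfg = cfg["model"]
unet = UNet2DModel(**model_cfg["unet"])   # every key under `unet` is a valid UNet2DModel argument

sch = model_cfg.get("scheduler", cfg.get("scheduler"))  # nesting may differ; handle both readings
scheduler = DDIMScheduler(
    num_train_timesteps=sch["num_train_steps"],
    beta_schedule=sch["beta_schedule"],
    beta_start=sch["beta_start"],
    beta_end=sch["beta_end"],
    rescale_betas_zero_snr=sch["rescale_betas_zero_snr"],
    timestep_spacing=sch["timestep_spacing"],
    clip_sample=sch["clip_sample"],
    prediction_type=sch["prediction_type"],
)
scheduler.set_timesteps(sch["num_infer_steps"])
# `scale` and `shift` are not scheduler arguments; they appear to be feature
# normalization constants consumed elsewhere in the training code.
```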
dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_base_pitch.yaml ADDED
@@ -0,0 +1,34 @@
1
+ version: 1.0
2
+
3
+ system: "base"
4
+
5
+ diffwrap:
6
+ cls_embedding:
7
+ speaker_dim: 256
8
+ feature_dim: 512
9
+ content_dim: 768
10
+ content_hidden: 256
11
+ use_pitch: true
12
+ pitch_dim: 1
13
+ pitch_hidden: 128
14
+
15
+ unet:
16
+ sample_size: [128, 256]
17
+ in_channels: 385
18
+ out_channels: 1
19
+ layers_per_block: 2
20
+ block_out_channels: [128, 256, 512]
21
+ down_block_types:
22
+ [
23
+ "DownBlock2D",
24
+ "AttnDownBlock2D",
25
+ "AttnDownBlock2D",
26
+ ]
27
+ up_block_types:
28
+ [
29
+ "AttnUpBlock2D",
30
+ "AttnUpBlock2D",
31
+ "UpBlock2D"
32
+ ]
33
+ attention_head_dim: 32
34
+ class_embed_type: 'identity'
dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross.yaml ADDED
@@ -0,0 +1,45 @@
1
+ version: 1.0
2
+
3
+ system: "cross"
4
+
5
+ model:
6
+ cls_embedding:
7
+ content_dim: 768
8
+ content_hidden: 256
9
+ use_pitch: false
10
+
11
+ unet:
12
+ sample_size: [128, 256]
13
+ in_channels: 257
14
+ out_channels: 1
15
+ layers_per_block: 2
16
+ block_out_channels: [128, 256, 256, 512]
17
+ down_block_types:
18
+ [
19
+ "DownBlock2D",
20
+ "DownBlock2D",
21
+ "CrossAttnDownBlock2D",
22
+ "CrossAttnDownBlock2D",
23
+ ]
24
+ up_block_types:
25
+ [
26
+ "CrossAttnUpBlock2D",
27
+ "CrossAttnUpBlock2D",
28
+ "UpBlock2D",
29
+ "UpBlock2D",
30
+ ]
31
+ attention_head_dim: 32
32
+ cross_attention_dim: 768
33
+
34
+ scheduler:
35
+ num_train_steps: 1000
36
+ beta_schedule: 'linear'
37
+ beta_start: 0.0001
38
+ beta_end: 0.02
39
+ num_infer_steps: 50
40
+ rescale_betas_zero_snr: true
41
+ timestep_spacing: "trailing"
42
+ clip_sample: false
43
+ prediction_type: 'v_prediction'
44
+ scale: 2.75
45
+ shift: 5.80
dreamvoice/train_utils/prepare_freevc/freevc/src/configs/diffvc_cross_pitch.yaml ADDED
@@ -0,0 +1,33 @@
1
+ version: 1.0
2
+
3
+ system: "cross"
4
+
5
+ diffwrap:
6
+ cls_embedding:
7
+ content_dim: 768
8
+ content_hidden: 256
9
+ use_pitch: true
10
+ pitch_dim: 1
11
+ pitch_hidden: 128
12
+
13
+ unet:
14
+ sample_size: [100, 256]
15
+ in_channels: 385
16
+ out_channels: 1
17
+ layers_per_block: 2
18
+ block_out_channels: [128, 256, 512]
19
+ down_block_types:
20
+ [
21
+ "DownBlock2D",
22
+ "CrossAttnDownBlock2D",
23
+ "CrossAttnDownBlock2D",
24
+ ]
25
+ up_block_types:
26
+ [
27
+ "CrossAttnUpBlock2D",
28
+ "CrossAttnUpBlock2D",
29
+ "UpBlock2D",
30
+ ]
31
+ attention_head_dim: 32
32
+ cross_attention_dim: 768
33
+
dreamvoice/train_utils/prepare_freevc/freevc/src/configs/plugin_cross.yaml ADDED
@@ -0,0 +1,39 @@
1
+ version: 1.0
2
+
3
+ system: "cross"
4
+
5
+ model:
6
+ cls_embedding:
7
+ content_dim: 768
8
+ content_hidden: 256
9
+
10
+ unet:
11
+ sample_size: [1, 1]
12
+ in_channels: 256
13
+ out_channels: 256
14
+ layers_per_block: 2
15
+ block_out_channels: [256]
16
+ down_block_types:
17
+ [
18
+ "CrossAttnDownBlock2D",
19
+ ]
20
+ up_block_types:
21
+ [
22
+ "CrossAttnUpBlock2D",
23
+ ]
24
+ attention_head_dim: 32
25
+ cross_attention_dim: 768
26
+
27
+ scheduler:
28
+ num_train_steps: 1000
29
+ beta_schedule: 'linear'
30
+ beta_start: 0.0001
31
+ beta_end: 0.02
32
+ num_infer_steps: 50
33
+ rescale_betas_zero_snr: true
34
+ timestep_spacing: "trailing"
35
+ clip_sample: false
36
+ prediction_type: 'v_prediction'
37
+ scale: 0.05
38
+ shift: -0.035
39
+
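
This plugin config describes a much smaller denoiser than the converters above: the "sample" is a 256-dimensional speaker embedding treated as a 1×1 feature map with 256 channels, and conditioning enters through cross-attention over 768-dimensional text-prompt embeddings. A hedged shape sketch with `diffusers` (random tensors stand in for real embeddings; the actual wrapper class may construct this differently):

```python
import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel(
    sample_size=1,                     # the config's [1, 1]: a single "pixel" per embedding
    in_channels=256,
    out_channels=256,
    layers_per_block=2,
    block_out_channels=(256,),
    down_block_types=("CrossAttnDownBlock2D",),
    up_block_types=("CrossAttnUpBlock2D",),
    attention_head_dim=32,
    cross_attention_dim=768,
)

spk = torch.randn(4, 256, 1, 1)   # noisy speaker embeddings (batch of 4)
txt = torch.randn(4, 77, 768)     # placeholder text-encoder outputs, 768-d per token
t = torch.randint(0, 1000, (4,))
out = unet(spk, t, encoder_hidden_states=txt).sample   # (4, 256, 1, 1) denoiser output
```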
dreamvoice/train_utils/prepare_freevc/freevc/src/debug.py ADDED
File without changes
dreamvoice/train_utils/prepare_freevc/freevc/src/extract_features.py ADDED
@@ -0,0 +1,103 @@
1
+ import os
2
+ import torch
3
+ import librosa
4
+ import numpy as np
5
+ import soundfile as sf
6
+ import pandas as pd
7
+ # from feats.hubert_model import get_soft_model, get_hubert_soft_content
8
+ from feats.contentvec_hf import get_content_model, get_content
9
+ # from modules.speaker_encoder.encoder import inference as spk_encoder
10
+ # from pathlib import Path
11
+ from tqdm import tqdm
12
+ from multiprocessing import Process
13
+ import pyworld as pw
14
+
15
+
16
+ def resample_save(infolder, audio_path, model,
17
+ audio_sr=24000, content_sr=16000, min_length=1.92,
18
+ content_resolution=50,
19
+ save_path='features'):
20
+ if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False:
21
+ audio, sr = librosa.load(infolder + audio_path, sr=content_sr)
22
+ final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution)
23
+ # final_length = final_length / content_sr
24
+
25
+ length = max(round(min_length*content_sr), round(final_length))
26
+ assert length % 10 == 0
27
+ audio = audio[:length]
28
+ audio_save = np.zeros(length, dtype=audio.dtype)
29
+ audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
30
+
31
+ # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0))
32
+ content = get_content(model, torch.tensor(audio_save).unsqueeze(0))
33
+ content = content.cpu()
34
+ os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True)
35
+ torch.save(content, save_path + '/' + 'content/' + audio_path+'.pt')
36
+ # print(audio_save.shape)
37
+ # print(content.shape)
38
+ os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True)
39
+ sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr))
40
+ # print(save_path + '/' + 'audio_16k/' + audio_path)
41
+
42
+ audio, sr = librosa.load(infolder + audio_path, sr=audio_sr)
43
+ length = max(round(min_length*audio_sr), round(final_length/content_sr*audio_sr))
44
+ assert length % 10 == 0
45
+ audio = audio[:length]
46
+ audio_save = np.zeros(length, dtype=audio.dtype)
47
+ audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
48
+ # print(audio_save.shape)
49
+ os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True)
50
+ sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr))
51
+
52
+
53
+ def extract_f0(in_folder, audio_path, save_path):
54
+ audio, sr = librosa.load(in_folder + audio_path, sr=None)
55
+ assert sr == 16000
56
+ if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False:
57
+ # wav = audio
58
+ # wav = np.pad(wav, int((1024-320)/2), mode='reflect')
59
+ # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr,
60
+ # fmin=librosa.note_to_hz('C2'),
61
+ # fmax=librosa.note_to_hz('C6'))
62
+
63
+ _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000)
64
+ f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1]
65
+
66
+ f0 = np.nan_to_num(f0)
67
+ os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True)
68
+ # print(save_path + '/' + 'f0/' + audio_path + '.pt')
69
+ torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt')
70
+
71
+
72
+ def chunks(arr, m):
73
+ result = [[] for i in range(m)]
74
+ for i in range(len(arr)):
75
+ result[i%m].append(arr[i])
76
+ return result
77
+
78
+
79
+ def extract_f0_main(in_folder, audio_paths, save_path):
80
+ for audio_path in tqdm(audio_paths):
81
+ extract_f0(in_folder, audio_path, save_path)
82
+
83
+
84
+ if __name__ == '__main__':
85
+ df = pd.read_csv('../test_data/vc_meta.csv')
86
+ # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda')
87
+ model = get_content_model().to('cuda')
88
+ # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda")
89
+ for i in tqdm(range(len(df))):
90
+ row = df.iloc[i]
91
+ in_path = row['path']
92
+ resample_save('../test_data/', in_path, model, save_path='../features/')
93
+
94
+ in_folder = '../features/audio_16k/'
95
+ audio_files = list(df['path'])
96
+ save_path = '../features/'
97
+ cores = 6
98
+
99
+ subsets = chunks(audio_files, cores)
100
+
101
+ for subset in subsets:
102
+ t = Process(target=extract_f0_main, args=(in_folder, subset, save_path))
103
+ t.start()
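
Everything the script saves stays frame-aligned: content features come out at 50 frames per second (16 kHz audio, 320-sample hop), `pw.dio` is run with a 20 ms frame period (`320 / sr * 1000`), and the 24 kHz copy is cropped to the same duration. A quick sanity check of that arithmetic, purely for illustration:

```python
content_sr, audio_sr, hop = 16000, 24000, 320

seconds = 1.92                        # the min_length used above
n16 = round(seconds * content_sr)     # 30720 samples at 16 kHz
n24 = round(seconds * audio_sr)       # 46080 samples at 24 kHz

content_frames = n16 // hop                      # 96 frames -> 50 per second
samples_per_frame_24k = n24 // content_frames    # 480 samples of 24 kHz audio per content frame

assert content_frames == round(seconds * 50)
assert samples_per_frame_24k * content_frames == n24
```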
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec.py ADDED
@@ -0,0 +1,42 @@
1
+ import torch
2
+ import librosa
3
+ from fairseq import checkpoint_utils
4
+ import torch.nn.functional as F
5
+
6
+
7
+ def get_model(vec_path):
8
+ print("load model(s) from {}".format(vec_path))
9
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
10
+ [vec_path],
11
+ suffix="",
12
+ )
13
+ model = models[0]
14
+ model.eval()
15
+ return model
16
+
17
+
18
+ @torch.no_grad()
19
+ def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12):
20
+ # print(layer)
21
+ wav_16k_tensor = wav_16k_tensor.to(device)
22
+ # so that the output shape will be len(audio//320)
23
+ wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
24
+ feats = wav_16k_tensor
25
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
26
+ inputs = {
27
+ "source": feats.to(wav_16k_tensor.device),
28
+ "padding_mask": padding_mask.to(wav_16k_tensor.device),
29
+ "output_layer": layer
30
+ }
31
+ logits = hmodel.extract_features(**inputs)[0]
32
+ # feats = hmodel.final_proj(logits[0])
33
+ return logits
34
+
35
+
36
+ if __name__ == '__main__':
37
+ audio, sr = librosa.load('test.wav', sr=16000)
38
+ audio = audio[:100*320]
39
+ model = get_model('../../ckpts/checkpoint_best_legacy_500.pt')
40
+ model = model.cuda()
41
+ content = get_content(model, torch.tensor([audio]))
42
+ print(content)
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/contentvec_hf.py ADDED
@@ -0,0 +1,40 @@
1
+ from transformers import HubertModel
2
+ import torch.nn as nn
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import librosa
6
+
7
+
8
+ class HubertModelWithFinalProj(HubertModel):
9
+ def __init__(self, config):
10
+ super().__init__(config)
11
+
12
+ # The final projection layer is only used for backward compatibility.
13
+ # Following https://github.com/auspicious3000/contentvec/issues/6
14
+ # Remove this layer is necessary to achieve the desired outcome.
15
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
16
+
17
+
18
+ def get_content_model(config='lengyue233/content-vec-best'):
19
+ model = HubertModelWithFinalProj.from_pretrained(config)
20
+ model.eval()
21
+ return model
22
+
23
+
24
+ @torch.no_grad()
25
+ def get_content(model, wav_16k_tensor, device='cuda'):
26
+ # print(layer)
27
+ wav_16k_tensor = wav_16k_tensor.to(device)
28
+ # so that the output shape will be len(audio//320)
29
+ wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
30
+ logits = model(wav_16k_tensor)['last_hidden_state']
31
+ return logits
32
+
33
+
34
+ if __name__ == '__main__':
35
+ model = get_content_model().cuda()
36
+ audio, sr = librosa.load('test.wav', sr=16000)
37
+ audio = audio[:100*320]
38
+ audio = torch.tensor([audio])
39
+ content = get_content(model, audio, 'cuda')
40
+ print(content)
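
The `(400 - 320) // 2` padding compensates for the model's 400-sample analysis window with a 320-sample hop, so the number of content frames equals `len(audio) // 320`. For the 100 × 320-sample clip used in the `__main__` block above, that works out to exactly 100 frames:

```python
n_samples = 100 * 320            # clip length from the example above
pad = (400 - 320) // 2           # 40 samples added on each side
padded = n_samples + 2 * pad     # 32080
frames = (padded - 400) // 320 + 1
assert frames == n_samples // 320 == 100
```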
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/.gitignore ADDED
@@ -0,0 +1,132 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # VSCode project settings
114
+ .vscode
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Benjamin van Niekerk
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/README.md ADDED
@@ -0,0 +1,161 @@
1
+ # HuBERT
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2111.02392)
4
+ [![demo](https://img.shields.io/static/v1?message=Audio%20Samples&logo=Github&labelColor=grey&color=blue&logoColor=white&label=%20&style=flat)](https://bshall.github.io/soft-vc/)
5
+ [![colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb)
6
+
7
+ Training and inference scripts for the HuBERT content encoders in [A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion](https://ieeexplore.ieee.org/abstract/document/9746484).
8
+ For more details see [soft-vc](https://github.com/bshall/soft-vc). Audio samples can be found [here](https://bshall.github.io/soft-vc/). Colab demo can be found [here](https://colab.research.google.com/github/bshall/soft-vc/blob/main/soft-vc-demo.ipynb).
9
+
10
+ <div align="center">
11
+ <img width="100%" alt="Soft-VC"
12
+ src="https://raw.githubusercontent.com/bshall/hubert/main/content-encoder.png">
13
+ </div>
14
+ <div>
15
+ <sup>
16
+ <strong>Fig 1:</strong> Architecture of the voice conversion system. a) The <strong>discrete</strong> content encoder clusters audio features to produce a sequence of discrete speech units. b) The <strong>soft</strong> content encoder is trained to predict the discrete units. The acoustic model transforms the discrete/soft speech units into a target spectrogram. The vocoder converts the spectrogram into an audio waveform.
17
+ </sup>
18
+ </div>
19
+
20
+ ## Example Usage
21
+
22
+ ### Programmatic Usage
23
+
24
+ ```python
25
+ import torch, torchaudio
26
+
27
+ # Load checkpoint (either hubert_soft or hubert_discrete)
28
+ hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda()
29
+
30
+ # Load audio
31
+ wav, sr = torchaudio.load("path/to/wav")
32
+ assert sr == 16000
33
+ wav = wav.unsqueeze(0).cuda()
34
+
35
+ # Extract speech units
36
+ units = hubert.units(wav)
37
+ ```
38
+
39
+ ### Script-Based Usage
40
+
41
+ ```
42
+ usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir
43
+
44
+ Encode an audio dataset.
45
+
46
+ positional arguments:
47
+ {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete)
48
+ in-dir path to the dataset directory.
49
+ out-dir path to the output directory.
50
+
51
+ optional arguments:
52
+ -h, --help show this help message and exit
53
+ --extension EXTENSION
54
+ extension of the audio files (defaults to .flac).
55
+ ```
56
+
57
+ ## Training
58
+
59
+ ### Step 1: Dataset Preparation
60
+
61
+ Download and extract the [LibriSpeech](https://www.openslr.org/12) corpus. The training script expects the following tree structure for the dataset directory:
62
+
63
+ ```
64
+ │ lengths.json
65
+
66
+ └───wavs
67
+ ├───dev-*
68
+ │ ├───84
69
+ │ ├───...
70
+ │ └───8842
71
+ └───train-*
72
+ ├───19
73
+ ├───...
74
+ └───8975
75
+ ```
76
+
77
+ The `train-*` and `dev-*` directories should contain the training and validation splits respectively. Note that there can be multiple `train` and `dev` folders e.g., `train-clean-100`, `train-other-500`, etc. Finally, the `lengths.json` file should contain key-value pairs with the file path and number of samples:
78
+
79
+ ```json
80
+ {
81
+ "dev-clean/1272/128104/1272-128104-0000": 93680,
82
+ "dev-clean/1272/128104/1272-128104-0001": 77040,
83
+ }
84
+ ```
85
+
86
+ ### Step 2: Extract Discrete Speech Units
87
+
88
+ Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script:
89
+
90
+ ```
91
+ usage: encode.py [-h] [--extension EXTENSION] {soft,discrete} in-dir out-dir
92
+
93
+ Encode an audio dataset.
94
+
95
+ positional arguments:
96
+ {soft,discrete} available models (HuBERT-Soft or HuBERT-Discrete)
97
+ in-dir path to the dataset directory.
98
+ out-dir path to the output directory.
99
+
100
+ optional arguments:
101
+ -h, --help show this help message and exit
102
+ --extension EXTENSION
103
+ extension of the audio files (defaults to .flac).
104
+ ```
105
+
106
+ for example:
107
+
108
+ ```
109
+ python encode.py discrete path/to/LibriSpeech/wavs path/to/LibriSpeech/discrete
110
+ ```
111
+
112
+ At this point the directory tree should look like:
113
+
114
+ ```
115
+ │ lengths.json
116
+
117
+ ├───discrete
118
+ │ ├───...
119
+ └───wavs
120
+ ├───...
121
+ ```
122
+
123
+ ### Step 3: Train the HuBERT-Soft Content Encoder
124
+
125
+ ```
126
+ usage: train.py [-h] [--resume RESUME] [--warmstart] [--mask] [--alpha ALPHA] dataset-dir checkpoint-dir
127
+
128
+ Train HuBERT soft content encoder.
129
+
130
+ positional arguments:
131
+ dataset-dir path to the data directory.
132
+ checkpoint-dir path to the checkpoint directory.
133
+
134
+ optional arguments:
135
+ -h, --help show this help message and exit
136
+ --resume RESUME path to the checkpoint to resume from.
137
+ --warmstart whether to initialize from the fairseq HuBERT checkpoint.
138
+ --mask whether to use input masking.
139
+ --alpha ALPHA weight for the masked loss.
140
+ ```
141
+
142
+ ## Links
143
+
144
+ - [Soft-VC repo](https://github.com/bshall/soft-vc)
145
+ - [Soft-VC paper](https://ieeexplore.ieee.org/abstract/document/9746484)
146
+ - [Official HuBERT repo](https://github.com/pytorch/fairseq)
147
+ - [HuBERT paper](https://arxiv.org/abs/2106.07447)
148
+
149
+ ## Citation
150
+
151
+ If you found this work helpful please consider citing our paper:
152
+
153
+ ```
154
+ @inproceedings{
155
+ soft-vc-2022,
156
+ author={van Niekerk, Benjamin and Carbonneau, Marc-André and Zaïdi, Julian and Baas, Matthew and Seuté, Hugo and Kamper, Herman},
157
+ booktitle={ICASSP},
158
+ title={A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion},
159
+ year={2022}
160
+ }
161
+ ```
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/cluster.py ADDED
@@ -0,0 +1,66 @@
1
+ from pathlib import Path
2
+ import logging
3
+ import argparse
4
+
5
+ import torch
6
+ import numpy as np
7
+ from sklearn.cluster import KMeans
8
+
9
+ logging.basicConfig(level=logging.INFO)
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ def cluster(args):
14
+ with open(args.subset) as file:
15
+ subset = [line.strip() for line in file]
16
+
17
+ logger.info(f"Loading features from {args.in_dir}")
18
+ features = []
19
+ for path in subset:
20
+ in_path = args.in_dir / path
21
+ features.append(np.load(in_path.with_suffix(".npy")))
22
+ features = np.concatenate(features, axis=0)
23
+
24
+ logger.info(f"Clustering features of shape: {features.shape}")
25
+ kmeans = KMeans(n_clusters=args.n_clusters).fit(features)
26
+
27
+ checkpoint_path = args.checkpoint_dir / f"kmeans_{args.n_clusters}.pt"
28
+ checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
29
+ torch.save(
30
+ checkpoint_path,
31
+ {
32
+ "n_features_in_": kmeans.n_features_in_,
33
+ "_n_threads": kmeans._n_threads,
34
+ "cluster_centers_": kmeans.cluster_centers_,
35
+ },
36
+ )
37
+
38
+
39
+ if __name__ == "__main__":
40
+ parser = argparse.ArgumentParser(description="Cluster speech features.")
41
+ parser.add_argument(
42
+ "in_dir",
43
+ metavar="in-dir",
44
+ help="path to the encoded dataset",
45
+ type=Path,
46
+ )
47
+ parser.add_argument(
48
+ "subset",
49
+ matavar="subset",
50
+ help="path to the .txt file containing the list of files to cluster",
51
+ type=Path,
52
+ )
53
+ parser.add_argument(
54
+ "checkpoint_dir",
55
+ metavar="checkpoint-dir",
56
+ help="path to the checkpoint directory",
57
+ type=Path,
58
+ )
59
+ parser.add_argument(
60
+ "--n-clusters",
61
+ help="number of clusters",
62
+ type=int,
63
+ default=100,
64
+ )
65
+ args = parser.parse_args()
66
+ cluster(args)
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/content-encoder.png ADDED
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/encode.py ADDED
@@ -0,0 +1,60 @@
1
+ import argparse
2
+ import logging
3
+ import numpy as np
4
+ from pathlib import Path
5
+ from tqdm import tqdm
6
+
7
+ import torch
8
+ import torchaudio
9
+ from torchaudio.functional import resample
10
+
11
+
12
+ def encode_dataset(args):
13
+ print(f"Loading hubert checkpoint")
14
+ hubert = torch.hub.load(
15
+ "bshall/hubert:main",
16
+ f"hubert_{args.model}",
17
+ trust_repo=True,
18
+ ).cuda()
19
+
20
+ print(f"Encoding dataset at {args.in_dir}")
21
+ for in_path in tqdm(list(args.in_dir.rglob(f"*{args.extension}"))):
22
+ wav, sr = torchaudio.load(in_path)
23
+ wav = resample(wav, sr, 16000)
24
+ wav = wav.unsqueeze(0).cuda()
25
+
26
+ with torch.inference_mode():
27
+ units = hubert.units(wav)
28
+
29
+ out_path = args.out_dir / in_path.relative_to(args.in_dir)
30
+ out_path.parent.mkdir(parents=True, exist_ok=True)
31
+ np.save(out_path.with_suffix(".npy"), units.squeeze().cpu().numpy())
32
+
33
+
34
+ if __name__ == "__main__":
35
+ parser = argparse.ArgumentParser(description="Encode an audio dataset.")
36
+ parser.add_argument(
37
+ "model",
38
+ help="available models (HuBERT-Soft or HuBERT-Discrete)",
39
+ choices=["soft", "discrete"],
40
+ )
41
+ parser.add_argument(
42
+ "in_dir",
43
+ metavar="in-dir",
44
+ help="path to the dataset directory.",
45
+ type=Path,
46
+ )
47
+ parser.add_argument(
48
+ "out_dir",
49
+ metavar="out-dir",
50
+ help="path to the output directory.",
51
+ type=Path,
52
+ )
53
+ parser.add_argument(
54
+ "--extension",
55
+ help="extension of the audio files (defaults to .flac).",
56
+ default=".flac",
57
+ type=str,
58
+ )
59
+ args = parser.parse_args()
60
+ encode_dataset(args)
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubconf.py ADDED
@@ -0,0 +1,80 @@
1
+ dependencies = ["torch", "torchaudio", "sklearn"]
2
+
3
+ URLS = {
4
+ "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-discrete-96b248c5.pt",
5
+ "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.2/hubert-soft-35d9f29f.pt",
6
+ "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.2/kmeans100-50f36a95.pt",
7
+ }
8
+
9
+ import torch
10
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
11
+
12
+ from sklearn.cluster import KMeans
13
+
14
+ from hubert import HubertDiscrete, HubertSoft
15
+
16
+
17
+ def hubert_discrete(
18
+ pretrained: bool = True,
19
+ progress: bool = True,
20
+ ) -> HubertDiscrete:
21
+ r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
22
+ Args:
23
+ pretrained (bool): load pretrained weights into the model
24
+ progress (bool): show progress bar when downloading model
25
+ """
26
+ kmeans = kmeans100(pretrained=pretrained, progress=progress)
27
+ hubert = HubertDiscrete(kmeans)
28
+ if pretrained:
29
+ checkpoint = torch.hub.load_state_dict_from_url(
30
+ URLS["hubert-discrete"], progress=progress
31
+ )
32
+ consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
33
+ hubert.load_state_dict(checkpoint["hubert"])
34
+ hubert.eval()
35
+ return hubert
36
+
37
+
38
+ def hubert_soft(
39
+ pretrained: bool = True,
40
+ progress: bool = True,
41
+ ) -> HubertSoft:
42
+ r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
43
+ Args:
44
+ pretrained (bool): load pretrained weights into the model.
45
+ progress (bool): show progress bar when downloading model.
46
+ """
47
+ hubert = HubertSoft()
48
+ if pretrained:
49
+ checkpoint = torch.hub.load_state_dict_from_url(
50
+ URLS["hubert-soft"],
51
+ progress=progress,
52
+ )
53
+ consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
54
+ hubert.load_state_dict(checkpoint["hubert"])
55
+ hubert.eval()
56
+ return hubert
57
+
58
+
59
+ def _kmeans(
60
+ num_clusters: int, pretrained: bool = True, progress: bool = True
61
+ ) -> KMeans:
62
+ kmeans = KMeans(num_clusters)
63
+ if pretrained:
64
+ checkpoint = torch.hub.load_state_dict_from_url(
65
+ URLS[f"kmeans{num_clusters}"], progress=progress
66
+ )
67
+ kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
68
+ kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
69
+ kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
70
+ return kmeans
71
+
72
+
73
+ def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans:
74
+ r"""
75
+ k-means checkpoint for HuBERT-Discrete with 100 clusters.
76
+ Args:
77
+ pretrained (bool): load pretrained weights into the model
78
+ progress (bool): show progress bar when downloading model
79
+ """
80
+ return _kmeans(100, pretrained, progress)
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .model import (
2
+ Hubert,
3
+ HubertDiscrete,
4
+ HubertSoft,
5
+ )
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/dataset.py ADDED
@@ -0,0 +1,91 @@
1
+ import random
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import json
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch.utils.data import Dataset
9
+ import torchaudio
10
+
11
+
12
+ class AcousticUnitsDataset(Dataset):
13
+ def __init__(
14
+ self,
15
+ root: Path,
16
+ sample_rate: int = 16000,
17
+ label_rate: int = 50,
18
+ min_samples: int = 32000,
19
+ max_samples: int = 250000,
20
+ train: bool = True,
21
+ ):
22
+ self.wavs_dir = root / "wavs"
23
+ self.units_dir = root / "discrete"
24
+
25
+ with open(root / "lengths.json") as file:
26
+ self.lengths = json.load(file)
27
+
28
+ pattern = "train-*/**/*.flac" if train else "dev-*/**/*.flac"
29
+ metadata = (
30
+ (path, path.relative_to(self.wavs_dir).with_suffix("").as_posix())
31
+ for path in self.wavs_dir.rglob(pattern)
32
+ )
33
+ metadata = ((path, key) for path, key in metadata if key in self.lengths)
34
+ self.metadata = [
35
+ path for path, key in metadata if self.lengths[key] > min_samples
36
+ ]
37
+
38
+ self.sample_rate = sample_rate
39
+ self.label_rate = label_rate
40
+ self.min_samples = min_samples
41
+ self.max_samples = max_samples
42
+ self.train = train
43
+
44
+ def __len__(self):
45
+ return len(self.metadata)
46
+
47
+ def __getitem__(self, index):
48
+ wav_path = self.metadata[index]
49
+ units_path = self.units_dir / wav_path.relative_to(self.wavs_dir)
50
+
51
+ wav, _ = torchaudio.load(wav_path)
52
+ wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
53
+ codes = np.load(units_path.with_suffix(".npy"))
54
+
55
+ return wav, torch.from_numpy(codes).long()
56
+
57
+ def collate(self, batch):
58
+ wavs, codes = zip(*batch)
59
+ wavs, codes = list(wavs), list(codes)
60
+
61
+ wav_lengths = [wav.size(-1) for wav in wavs]
62
+ code_lengths = [code.size(-1) for code in codes]
63
+
64
+ wav_frames = min(self.max_samples, *wav_lengths)
65
+
66
+ collated_wavs, wav_offsets = [], []
67
+ for wav in wavs:
68
+ wav_diff = wav.size(-1) - wav_frames
69
+ wav_offset = random.randint(0, wav_diff)
70
+ wav = wav[:, wav_offset : wav_offset + wav_frames]
71
+
72
+ collated_wavs.append(wav)
73
+ wav_offsets.append(wav_offset)
74
+
75
+ rate = self.label_rate / self.sample_rate
76
+ code_offsets = [round(wav_offset * rate) for wav_offset in wav_offsets]
77
+ code_frames = round(wav_frames * rate)
78
+ remaining_code_frames = [
79
+ length - offset for length, offset in zip(code_lengths, code_offsets)
80
+ ]
81
+ code_frames = min(code_frames, *remaining_code_frames)
82
+
83
+ collated_codes = []
84
+ for code, code_offset in zip(codes, code_offsets):
85
+ code = code[code_offset : code_offset + code_frames]
86
+ collated_codes.append(code)
87
+
88
+ wavs = torch.stack(collated_wavs, dim=0)
89
+ codes = torch.stack(collated_codes, dim=0)
90
+
91
+ return wavs, codes
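
The collate function crops every waveform and unit sequence to a shared random window; the `label_rate / sample_rate` ratio (50 / 16000, i.e. one discrete unit per 320 samples) converts waveform offsets into unit offsets. A small worked example of that conversion, with illustrative numbers:

```python
sample_rate, label_rate = 16000, 50
rate = label_rate / sample_rate            # 0.003125 units per sample

wav_offset, wav_frames = 4800, 32000       # start 0.3 s in, keep 2 s of audio
code_offset = round(wav_offset * rate)     # 15 units
code_frames = round(wav_frames * rate)     # 100 units
assert (code_offset, code_frames) == (15, 100)
```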
dreamvoice/train_utils/prepare_freevc/freevc/src/feats/hubert/hubert/model.py ADDED
@@ -0,0 +1,241 @@
1
+ import copy
2
+ from typing import Optional, Tuple
3
+ import random
4
+
5
+ from sklearn.cluster import KMeans
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+
12
+ class Hubert(nn.Module):
13
+ def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
14
+ super().__init__()
15
+ self._mask = mask
16
+ self.feature_extractor = FeatureExtractor()
17
+ self.feature_projection = FeatureProjection()
18
+ self.positional_embedding = PositionalConvEmbedding()
19
+ self.norm = nn.LayerNorm(768)
20
+ self.dropout = nn.Dropout(0.1)
21
+ self.encoder = TransformerEncoder(
22
+ nn.TransformerEncoderLayer(
23
+ 768, 12, 3072, activation="gelu", batch_first=True
24
+ ),
25
+ 12,
26
+ )
27
+ self.proj = nn.Linear(768, 256)
28
+
29
+ self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
30
+ self.label_embedding = nn.Embedding(num_label_embeddings, 256)
31
+
32
+ def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
33
+ mask = None
34
+ if self.training and self._mask:
35
+ mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
36
+ x[mask] = self.masked_spec_embed.to(x.dtype)
37
+ return x, mask
38
+
39
+ def encode(
40
+ self, x: torch.Tensor, layer: Optional[int] = None
41
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
42
+ x = self.feature_extractor(x)
43
+ x = self.feature_projection(x.transpose(1, 2))
44
+ x, mask = self.mask(x)
45
+ x = x + self.positional_embedding(x)
46
+ x = self.dropout(self.norm(x))
47
+ x = self.encoder(x, output_layer=layer)
48
+ return x, mask
49
+
50
+ def logits(self, x: torch.Tensor) -> torch.Tensor:
51
+ logits = torch.cosine_similarity(
52
+ x.unsqueeze(2),
53
+ self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
54
+ dim=-1,
55
+ )
56
+ return logits / 0.1
57
+
58
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
59
+ x, mask = self.encode(x)
60
+ x = self.proj(x)
61
+ logits = self.logits(x)
62
+ return logits, mask
63
+
64
+
65
+ class HubertSoft(Hubert):
66
+ """HuBERT-Soft content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`."""
67
+
68
+ def __init__(self):
69
+ super().__init__()
70
+
71
+ @torch.inference_mode()
72
+ def units(self, wav: torch.Tensor) -> torch.Tensor:
73
+ """Extract soft speech units.
74
+
75
+ Args:
76
+ wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples.
77
+
78
+ Returns:
79
+ Tensor: soft speech units of shape (1, N, D), where N is the number of frames and D is the unit dimensions.
80
+ """
81
+ wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
82
+ x, _ = self.encode(wav)
83
+ return self.proj(x)
84
+
85
+
86
+ class HubertDiscrete(Hubert):
87
+ """HuBERT-Discrete content encoder from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`."""
88
+
89
+ def __init__(self, kmeans: KMeans):
90
+ super().__init__(504)
91
+ self.kmeans = kmeans
92
+
93
+ @torch.inference_mode()
94
+ def units(self, wav: torch.Tensor) -> torch.LongTensor:
95
+ """Extract discrete speech units.
96
+
97
+ Args:
98
+ wav (Tensor): an audio waveform of shape (1, 1, T), where T is the number of samples.
99
+
100
+ Returns:
101
+ LongTensor: soft speech units of shape (N,), where N is the number of frames.
102
+ """
103
+ wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
104
+ x, _ = self.encode(wav, layer=7)
105
+ x = self.kmeans.predict(x.squeeze().cpu().numpy())
106
+ return torch.tensor(x, dtype=torch.long, device=wav.device)
107
+
108
+
109
+ class FeatureExtractor(nn.Module):
110
+ def __init__(self):
111
+ super().__init__()
112
+ self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
113
+ self.norm0 = nn.GroupNorm(512, 512)
114
+ self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
115
+ self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
116
+ self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
117
+ self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
118
+ self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
119
+ self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
120
+
121
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
122
+ x = F.gelu(self.norm0(self.conv0(x)))
123
+ x = F.gelu(self.conv1(x))
124
+ x = F.gelu(self.conv2(x))
125
+ x = F.gelu(self.conv3(x))
126
+ x = F.gelu(self.conv4(x))
127
+ x = F.gelu(self.conv5(x))
128
+ x = F.gelu(self.conv6(x))
129
+ return x
130
+
131
+
132
+ class FeatureProjection(nn.Module):
133
+ def __init__(self):
134
+ super().__init__()
135
+ self.norm = nn.LayerNorm(512)
136
+ self.projection = nn.Linear(512, 768)
137
+ self.dropout = nn.Dropout(0.1)
138
+
139
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
140
+ x = self.norm(x)
141
+ x = self.projection(x)
142
+ x = self.dropout(x)
143
+ return x
144
+
145
+
146
+ class PositionalConvEmbedding(nn.Module):
147
+ def __init__(self):
148
+ super().__init__()
149
+ self.conv = nn.Conv1d(
150
+ 768,
151
+ 768,
152
+ kernel_size=128,
153
+ padding=128 // 2,
154
+ groups=16,
155
+ )
156
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
157
+
158
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
159
+ x = self.conv(x.transpose(1, 2))
160
+ x = F.gelu(x[:, :, :-1])
161
+ return x.transpose(1, 2)
162
+
163
+
164
+ class TransformerEncoder(nn.Module):
165
+ def __init__(
166
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
167
+ ) -> None:
168
+ super(TransformerEncoder, self).__init__()
169
+ self.layers = nn.ModuleList(
170
+ [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
171
+ )
172
+ self.num_layers = num_layers
173
+
174
+ def forward(
175
+ self,
176
+ src: torch.Tensor,
177
+ mask: torch.Tensor = None,
178
+ src_key_padding_mask: torch.Tensor = None,
179
+ output_layer: Optional[int] = None,
180
+ ) -> torch.Tensor:
181
+ output = src
182
+ for layer in self.layers[:output_layer]:
183
+ output = layer(
184
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
185
+ )
186
+ return output
187
+
188
+
189
+ def _compute_mask(
190
+ shape: Tuple[int, int],
191
+ mask_prob: float,
192
+ mask_length: int,
193
+ device: torch.device,
194
+ min_masks: int = 0,
195
+ ) -> torch.Tensor:
196
+ batch_size, sequence_length = shape
197
+
198
+ if mask_length < 1:
199
+ raise ValueError("`mask_length` has to be bigger than 0.")
200
+
201
+ if mask_length > sequence_length:
202
+ raise ValueError(
203
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
204
+ )
205
+
206
+ # compute number of masked spans in batch
207
+ num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
208
+ num_masked_spans = max(num_masked_spans, min_masks)
209
+
210
+ # make sure num masked indices <= sequence_length
211
+ if num_masked_spans * mask_length > sequence_length:
212
+ num_masked_spans = sequence_length // mask_length
213
+
214
+ # SpecAugment mask to fill
215
+ mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
216
+
217
+ # uniform distribution to sample from, make sure that offset samples are < sequence_length
218
+ uniform_dist = torch.ones(
219
+ (batch_size, sequence_length - (mask_length - 1)), device=device
220
+ )
221
+
222
+ # get random indices to mask
223
+ mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
224
+
225
+ # expand masked indices to masked spans
226
+ mask_indices = (
227
+ mask_indices.unsqueeze(dim=-1)
228
+ .expand((batch_size, num_masked_spans, mask_length))
229
+ .reshape(batch_size, num_masked_spans * mask_length)
230
+ )
231
+ offsets = (
232
+ torch.arange(mask_length, device=device)[None, None, :]
233
+ .expand((batch_size, num_masked_spans, mask_length))
234
+ .reshape(batch_size, num_masked_spans * mask_length)
235
+ )
236
+ mask_idxs = mask_indices + offsets
237
+
238
+ # scatter indices to mask
239
+ mask = mask.scatter(1, mask_idxs, True)
240
+
241
+ return mask
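
`_compute_mask` builds the boolean SpecAugment-style mask consumed by `Hubert.mask` during training: spans of `mask_length` consecutive frames are drawn until roughly `mask_prob` of each sequence is covered (spans may overlap, so the realized fraction is usually a bit lower). A minimal usage sketch, assuming the file above is importable as `model`:

```python
import torch
from model import _compute_mask   # import path is an assumption for this sketch

mask = _compute_mask((4, 200), mask_prob=0.8, mask_length=10,
                     device=torch.device("cpu"), min_masks=2)
print(mask.shape)                  # torch.Size([4, 200])
print(mask.float().mean(dim=1))    # fraction of masked frames per sequence, at most 0.8 here
```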