Spaces: Running on Zero
Serhiy Stetskovych committed · Commit 086eb2f · Parent(s): 802f6d4

Add multi model
Files changed:
- .gitattributes +18 -0
- app.py +80 -27
- styletts_config.yml → config.yml +49 -43
- infer.py +100 -53
- models.py +5 -1
- voices/Анастасія Павленко.wav +3 -0
- voices/Вʼячеслав Дудко.wav +3 -0
- voices/Влада Муравець.wav +3 -0
- voices/Гаська Шиян.wav +3 -0
- voices/Катерина Потапенко.wav +3 -0
- voices/Марина Панас.wav +3 -0
- voices/Марися Нікітюк.wav +3 -0
- voices/Марта Мольфар.wav +3 -0
- voices/Марічка Штирбулова.wav +3 -0
- voices/Маслінка.wav +3 -0
- voices/Матвій Ніколаєв.wav +3 -0
- voices/Михайло Тишин.wav +3 -0
- voices/Наталія Калюжна.wav +3 -0
- voices/Олег Лепенець.wav +3 -0
- voices/Слава Красовська.wav +3 -0
- voices/Юрій Кудрявець.wav +3 -0
- voices/Яніна Соколова.wav +3 -0
.gitattributes
CHANGED
@@ -41,3 +41,21 @@ weights/plbert.bin filter=lfs diff=lfs merge=lfs -text
 weights/asr.bin filter=lfs diff=lfs merge=lfs -text
 weights/filatov.bin filter=lfs diff=lfs merge=lfs -text
 weights/jdc.bin filter=lfs diff=lfs merge=lfs -text
+weights/multi.bin filter=lfs diff=lfs merge=lfs -text
+voices/Олег[[:space:]]Лепенець.wav filter=lfs diff=lfs merge=lfs -text
+voices/Слава[[:space:]]Красовська.wav filter=lfs diff=lfs merge=lfs -text
+voices/Гаська[[:space:]]Шиян.wav filter=lfs diff=lfs merge=lfs -text
+voices/Марина[[:space:]]Панас.wav filter=lfs diff=lfs merge=lfs -text
+voices/Марися[[:space:]]Нікітюк.wav filter=lfs diff=lfs merge=lfs -text
+voices/Наталія[[:space:]]Калюжна.wav filter=lfs diff=lfs merge=lfs -text
+voices/Анастасія[[:space:]]Павленко.wav filter=lfs diff=lfs merge=lfs -text
+voices/Юрій[[:space:]]Кудрявець.wav filter=lfs diff=lfs merge=lfs -text
+voices/Марта[[:space:]]Мольфар.wav filter=lfs diff=lfs merge=lfs -text
+voices/Яніна[[:space:]]Соколова.wav filter=lfs diff=lfs merge=lfs -text
+voices/Вʼячеслав[[:space:]]Дудко.wav filter=lfs diff=lfs merge=lfs -text
+voices/Влада[[:space:]]Муравець.wav filter=lfs diff=lfs merge=lfs -text
+voices/Михайло[[:space:]]Тишин.wav filter=lfs diff=lfs merge=lfs -text
+voices/Марічка[[:space:]]Штирбулова.wav filter=lfs diff=lfs merge=lfs -text
+voices/Катерина[[:space:]]Потапенко.wav filter=lfs diff=lfs merge=lfs -text
+voices/Матвій[[:space:]]Ніколаєв.wav filter=lfs diff=lfs merge=lfs -text
+voices/Маслінка.wav filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -1,16 +1,30 @@
+import glob
+import os
 import gradio as gr
 from infer import inference


+prompts_dir = 'voices'
+prompts_list = sorted(glob.glob(os.path.join(prompts_dir, '*.wav')))
+prompts_list = ['.'.join(p.split('/')[-1].split('.')[:-1]) for p in prompts_list]
+

 description = f'''
+<h1 style="text-align:center;">StyleTTS2 ukrainian demo</h1><br>
 Програма може не коректно визначати деякі наголоси і не перетворює цифри, акроніми і різні скорочення в словесну форму.
 Якщо наголос не правильний, використовуйте символ + після наголошеного складу.
 Також дуже маленькі речення можуть крешати, тому пишіть щось більше а не одне-два слова.

 '''

-def synthesise(text, speed, progress=gr.Progress()):
+examples = [
+    ["Решта окупантів звернула на Вокзальну — центральну вулицю Бучі. Тільки уявіть їхній настрій, коли перед ними відкрилася ця пасторальна картина! Невеличкі котеджі й просторіші будинки шикуються обабіч, перед ними вивищуються голі липи та електро-стовпи, тягнуться газони й жовто-чорні бордюри. Доглянуті сади визирають із-поза зелених парканів, гавкотять собаки, співають птахи… На дверях будинку номер тридцять шість досі висить різдвяний вінок.", 1.0],
+    ["Одна дівчинка стала королевою Франції. Звали її Анна, і була вона донькою Ярослава Му+дрого, великого київського князя. Він опі+кувався літературою та культурою в Київській Русі+, а тоді переважно про таке не дбали – більше воювали і споруджували фортеці.", 1.0],
+    ["Одна дівчинка народилася і виросла в Америці, та коли стала дорослою, зрозуміла, що дуже любить українські вірші й найбільше хоче робити вистави про Україну. Звали її Вірляна. Дід Вірляни був український мовознавець і педагог Кость Кисілевський, котрий навчався в Лейпцизькому та Віденському університетах і, після Другої світової війни виїхавши до США, започаткував систему шкіл українознавства по всій Америці. Тож Вірляна зростала в українському середовищі, а окрім того – в середовищі вихідців з інших країн.", 1.0]
+]
+
+def synthesize_multi(text, voice_audio, speed, progress=gr.Progress()):
+    prompt_audio_path = os.path.join(prompts_dir, voice_audio+'.wav')
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
@@ -19,33 +33,72 @@ def synthesise(text, speed, progress=gr.Progress()):
     print(text)
     print("*** end ***")

-    return 24000, inference(text, progress, speed=speed, alpha=
+    return 24000, inference('multi', text, prompt_audio_path, progress, speed=speed, alpha=0, beta=0, diffusion_steps=20, embedding_scale=1.0)[0]



-[15 deleted lines not shown]
+def synthesize_single(text, speed, progress=gr.Progress()):
+    if text.strip() == "":
+        raise gr.Error("You must enter some text")
+    if len(text) > 50000:
+        raise gr.Error("Text must be <50k characters")
+    print("*** saying ***")
+    print(text)
+    print("*** end ***")
+
+    return 24000, inference('single', text, None, progress, speed=speed, alpha=1, beta=0, diffusion_steps=4, embedding_scale=1.0)[0]
+
+def select_example(df, evt: gr.SelectData):
+    return evt.row_value
+
+with gr.Blocks() as single:
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_text = gr.Text(label='Text:', lines=5, max_lines=10)
+            speed = gr.Slider(label='Швидкість:', maximum=1.3, minimum=0.7, value=1.0)
+            synthesise_button = gr.Button("Синтезувати")
+        with gr.Column(scale=1):
+            output_audio = gr.Audio(
+                label="Audio:",
+                autoplay=False,
+                streaming=False,
+                type="numpy",
+            )
+
+    synthesise_button.click(synthesize_single, inputs=[input_text, speed], outputs=[output_audio])
+
+    with gr.Row():
+        examples_table = gr.Dataframe(wrap=True, headers=["Текст", "Швидкість"], datatype=["str", "number"], value=examples, interactive=False)
+        examples_table.select(select_example, inputs=[examples_table], outputs=[input_text, speed])
+
+with gr.Blocks() as multy:
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_text = gr.Text(label='Text:', lines=5, max_lines=10)
+            speed = gr.Slider(label='Швидкість:', maximum=1.3, minimum=0.7, value=1.0)
+            speaker = gr.Dropdown(label="Голос:", choices=prompts_list, value=prompts_list[0])
+
+        with gr.Column(scale=1):
+            output_audio = gr.Audio(
+                label="Audio:",
+                autoplay=False,
+                streaming=False,
+                type="numpy",
+            )
+    synthesise_button = gr.Button("Синтезувати")

-[10 deleted lines not shown]
+    synthesise_button.click(synthesize_multi, inputs=[input_text, speaker, speed], outputs=[output_audio])
+    with gr.Row():
+        examples_table = gr.Dataframe(wrap=True, headers=["Текст", "Швидкість"], datatype=["str", "number"], value=examples, interactive=False)
+        examples_table.select(select_example, inputs=[examples_table], outputs=[input_text, speed])
+
+
+
+
+with gr.Blocks(title="StyleTTS2 ukrainian demo", css="") as demo:
+    gr.Markdown(description)
+    gr.TabbedInterface([multy, single], ['Multі speaker', 'Single speaker'])
+
+
+if __name__ == "__main__":
+    demo.queue(api_open=True, max_size=15).launch(show_api=True)
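For reference, the new entry point can be exercised outside Gradio as well. A minimal sketch, assuming the repo's weights and voices/ prompts are checked out; _NoProgress is a hypothetical stand-in for gr.Progress() (inference() only calls its .tqdm()):

from infer import inference

class _NoProgress:
    # Hypothetical stand-in for gr.Progress(); inference() only uses .tqdm().
    def tqdm(self, iterable):
        return iterable

# Mirrors synthesize_multi(): 'multi' selects the new model, the path picks a committed prompt.
wav, phonemes = inference('multi',
                          'Одна дівчинка стала королевою Франції. Звали її Анна, і була вона донькою Ярослава Му+дрого, великого київського князя.',
                          'voices/Олег Лепенець.wav', _NoProgress(),
                          speed=1.0, alpha=0, beta=0, diffusion_steps=20, embedding_scale=1.0)
# wav is the mono numpy array at 24000 Hz that synthesize_multi pairs with 24000 for gr.Audio.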
styletts_config.yml → config.yml
RENAMED
@@ -1,19 +1,58 @@
-[1 deleted line not shown]
 F0_path: "weights/jdc.bin"
 ASR_config: "Utils/ASR/config.yml"
 ASR_path: "weights/asr.bin"
-PLBERT_dir: 'Utils/PLBERT/'


+model_params_multi:
+  multispeaker: true
+
+  dim_in: 64
+  hidden_dim: 512
+  max_conv_dim: 512
+  n_layer: 3
+  n_mels: 80

-[6 deleted lines not shown]
+  n_token: 181 # number of phoneme tokens
+  max_dur: 50 # maximum duration of a single phoneme
+  style_dim: 128 # style vector size
+
+  dropout: 0.2
+
+  # config for decoder
+  decoder:
+    type: 'hifigan' # either hifigan or istftnet
+    resblock_kernel_sizes: [3,7,11]
+    upsample_rates: [10,5,3,2]
+    upsample_initial_channel: 512
+    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
+    upsample_kernel_sizes: [20,10,6,4]
+
+  # speech language model config
+  slm:
+    model: ''
+    sr: 16000 # sampling rate of SLM
+    hidden: 768 # hidden size of SLM
+    nlayers: 13 # number of layers of SLM
+    initial_channel: 64 # initial channels of SLM discriminator head
+
+  # style diffusion model config
+  diffusion:
+    embedding_mask_proba: 0.1
+    # transformer config
+    transformer:
+      num_layers: 3
+      num_heads: 8
+      head_features: 64
+      multiplier: 2

-model_params:
+    # diffusion distribution config
+    dist:
+      sigma_data: 0.19988229232390187 # placeholder for estimate_sigma_data set to false
+      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
+      mean: -3.0
+      std: 1.0
+
+model_params_single:
   multispeaker: false

   dim_in: 64
@@ -63,37 +102,4 @@ model_params:
       estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
       mean: -3.0
       std: 1.0
-
-loss_params:
-  lambda_mel: 5. # mel reconstruction loss
-  lambda_gen: 1. # generator loss
-  lambda_slm: 1. # slm feature matching loss
-
-  lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
-  lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
-  TMA_epoch: 50 # TMA starting epoch (1st stage)
-
-  lambda_F0: 1. # F0 reconstruction loss (2nd stage)
-  lambda_norm: 1. # norm reconstruction loss (2nd stage)
-  lambda_dur: 1. # duration loss (2nd stage)
-  lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
-  lambda_sty: 1. # style reconstruction loss (2nd stage)
-  lambda_diff: 1. # score matching loss (2nd stage)
-
-  diff_epoch: 10 # style diffusion starting epoch (2nd stage)
-  joint_epoch: 25 # joint training starting epoch (2nd stage)
-
-optimizer_params:
-  lr: 0.0001 # general learning rate
-  bert_lr: 0.00001 # learning rate for PLBERT
-  ft_lr: 0.00001 # learning rate for acoustic modules
-
-slmadv_params:
-  min_len: 400 # minimum length of samples
-  max_len: 500 # maximum length of samples
-  batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
-  iter: 10 # update the discriminator every this iterations of generator update
-  thresh: 5 # gradient norm above which the gradient is scaled
-  scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
-  sig: 1.5 # sigma for differentiable duration modeling
+
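Since infer.py now loads this file under its new name and builds one model per block, the two top-level keys are the contract. A minimal sketch of that consumption, using only names introduced in this commit:

import yaml

config = yaml.safe_load(open('config.yml'))

multi_args = config['model_params_multi']    # multispeaker: true, with its own decoder/slm/diffusion sections
single_args = config['model_params_single']  # multispeaker: false, the previous single-voice setup

assert multi_args['multispeaker'] and not single_args['multispeaker']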
infer.py
CHANGED
@@ -8,7 +8,8 @@ random.seed(0)

 import numpy as np
 np.random.seed(0)
-[1 deleted line not shown]
+import librosa
+from copy import deepcopy

 import spaces
 import yaml
@@ -42,8 +43,24 @@ def length_to_mask(lengths):
     return mask


+def load_state_dict(model, params):
+    for key in model:
+        if key in params:
+            print('%s loaded' % key)
+            try:
+                model[key].load_state_dict(params[key])
+            except:
+                from collections import OrderedDict
+                state_dict = params[key]
+                new_state_dict = OrderedDict()
+                for k, v in state_dict.items():
+                    name = k[7:]  # remove `module.` prefix added by nn.DataParallel
+                    new_state_dict[name] = v
+
+                model[key].load_state_dict(new_state_dict, strict=False)

-config = yaml.safe_load(open('styletts_config.yml'))
+
+config = yaml.safe_load(open('config.yml'))

 # load pretrained ASR model
 ASR_config = config.get('ASR_config', False)
@@ -59,39 +76,51 @@ from Utils.PLBERT.util import load_plbert

 plbert = load_plbert('weights/plbert.bin', 'Utils/PLBERT/config.yml')

-[33 deleted lines not shown]
+model_single = build_model(recursive_munch(config['model_params_single']), text_aligner, pitch_extractor, plbert)
+model_multi = build_model(recursive_munch(config['model_params_multi']), deepcopy(text_aligner), deepcopy(pitch_extractor), deepcopy(plbert))
+
+
+
+params_multi = torch.load('weights/multi.bin', map_location='cpu')
+params_single = torch.load('weights/filatov.bin', map_location='cpu')
+
+
+load_state_dict(model_single, params_single)
+_ = [model_single[key].eval() for key in model_single]
+_ = [model_single[key].to(device) for key in model_single]
+
+
+load_state_dict(model_multi, params_multi)
+_ = [model_multi[key].eval() for key in model_multi]
+_ = [model_multi[key].to(device) for key in model_multi]
+
+
+
+models = {
+    'multi': model_multi,
+    'single': model_single
+}
+
+
+
+def preprocess(wave):
+    wave_tensor = torch.from_numpy(wave).float()
+    mel_tensor = to_mel(wave_tensor)
+    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
+    return mel_tensor
+
+def compute_style(voice_audio):
+    wave, sr = librosa.load(voice_audio, sr=24000)
+    audio, index = librosa.effects.trim(wave, top_db=30)
+    if sr != 24000:
+        audio = librosa.resample(audio, sr, 24000)
+    mel_tensor = preprocess(audio).to(device)
+
+    with torch.no_grad():
+        ref_s = models['multi'].style_encoder(mel_tensor.unsqueeze(1))
+        ref_p = models['multi'].predictor_encoder(mel_tensor.unsqueeze(1))
+
+    return torch.cat([ref_s, ref_p], dim=1)


 def split_to_parts(text):
@@ -107,25 +136,23 @@ def split_to_parts(text):



-def _inf(text, speed, s_prev, noise, alpha, diffusion_steps, embedding_scale):
+def _inf(model, text, ref_s, speed, s_prev, noise, alpha, beta, diffusion_steps, embedding_scale):
+    model = models[model]
     text = text.strip()
     text = text.replace('"', '')
-    text = text.replace('+', '
+    text = text.replace('+', 'ˈ')
     text = normalize('NFKC', text)

     text = re.sub(r'[᠆‐‑‒–—―⁻₋−⸺⸻]', '-', text)
     text = re.sub(r' - ', ': ', text)
-[3 deleted lines not shown]
-    ps = ipa(stressed)
-[1 deleted line not shown]
-    print(stressed)
+    ps = ipa(stressify(text))
+    print(ps)

     tokens = textclenaer(ps)
     tokens.insert(0, 0)
+
     tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-[1 deleted line not shown]
+
     with torch.no_grad():
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
         text_mask = length_to_mask(input_lengths).to(tokens.device)
@@ -134,9 +161,17 @@ def _inf(text, speed, s_prev, noise, alpha, diffusion_steps, embedding_scale):
         bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
         d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

-[3 deleted lines not shown]
+
+        if ref_s is None:
+            s_pred = model.sampler(noise,
+                                   embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,
+                                   embedding_scale=embedding_scale).squeeze(0)
+        else:
+            s_pred = model.sampler(noise=noise,
+                                   embedding=bert_dur,
+                                   embedding_scale=embedding_scale,
+                                   features=ref_s,  # reference from the same speaker as the embedding
+                                   num_steps=diffusion_steps).squeeze(1)

         if s_prev is not None:
             # convex combination of previous and current style
@@ -144,13 +179,22 @@ def _inf(text, speed, s_prev, noise, alpha, diffusion_steps, embedding_scale):

         s = s_pred[:, 128:]
         ref = s_pred[:, :128]
+
+        if ref_s is not None:
+            ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
+            s = beta * s + (1 - beta) * ref_s[:, 128:]

         d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

         x, _ = model.predictor.lstm(d)
         duration = model.predictor.duration_proj(x)
+
         duration = torch.sigmoid(duration).sum(axis=-1)/speed
         pred_dur = torch.round(duration.squeeze()).clamp(min=1)
+
+        if ref_s is not None:
+            pred_dur[0] = 30
+

         pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
         c_frame = 0
@@ -161,26 +205,29 @@ def _inf(text, speed, s_prev, noise, alpha, diffusion_steps, embedding_scale):
         # encode prosody
         en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
         F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
-[3 deleted lines not shown]
+        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
+
+        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
+        if ref_s is not None:
+            out = out[:,:, 14500:]
     return out.squeeze().cpu().numpy(), s_pred, ps


 @spaces.GPU
-def inference(text,
+def inference(model, text, voice_audio, progress, speed=1, alpha=0.4, beta=0.4, diffusion_steps=10, embedding_scale=1.2):

     wavs = []
     s_prev = None

     #sentences = text.split('|')
     sentences = split_to_parts(text)
+
     phonemes = ''
     noise = torch.randn(1,1,256).to(device)
+    ref_s = compute_style(voice_audio) if voice_audio else None
     for text in progress.tqdm(sentences):
         if text.strip() == "": continue
-        wav, s_prev, ps = _inf(text, speed, s_prev, noise, alpha=alpha, diffusion_steps=diffusion_steps, embedding_scale=embedding_scale)
+        wav, s_prev, ps = _inf(model, text, ref_s, speed, s_prev, noise, alpha=alpha, beta=beta, diffusion_steps=diffusion_steps, embedding_scale=embedding_scale)
         wavs.append(wav)
         phonemes += ' ' + ps
     return np.concatenate(wavs), phonemes
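The alpha/beta knobs in _inf form a convex combination between the diffusion-predicted style and the reference style computed from the voice prompt. A toy illustration of just that blending step (style_dim=128, so the 256-dim vector splits into an acoustic half and a prosody half; the alpha=0, beta=0 that app.py passes for the multi model keeps the prompt's voice entirely):

import torch

alpha, beta = 0.4, 0.4            # the inference() defaults in this commit
s_pred = torch.randn(1, 256)      # sampler output: [:, :128] acoustic ref, [:, 128:] prosody style
ref_s = torch.randn(1, 256)       # compute_style() output for the prompt wav

ref = s_pred[:, :128]
s = s_pred[:, 128:]

# When a reference speaker is given, mix predicted and reference styles:
ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
s = beta * s + (1 - beta) * ref_s[:, 128:]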
models.py
CHANGED
@@ -15,7 +15,7 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from Utils.ASR.models import ASRCNN
 from Utils.JDC.model import JDCNet

-from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
+from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution, DiffusionSampler, ADPM2Sampler, KarrasSchedule
 from Modules.diffusion.modules import Transformer1d, StyleTransformer1d
 from Modules.diffusion.diffusion import AudioDiffusionConditional

@@ -689,6 +689,10 @@ def build_model(args, text_aligner, pitch_extractor, bert):

         # slm discriminator head
         wd = WavLMDiscriminator(args.slm.hidden, args.slm.nlayers, args.slm.initial_channel),
+        sampler = DiffusionSampler(diffusion.diffusion,
+                                   sampler=ADPM2Sampler(),
+                                   sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0),
+                                   clamp=False)
     )

     return nets
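The attached DiffusionSampler steps through a Karras et al. (2022) noise schedule. Written out under the published formula, this is a sketch of the sigma grid that KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0) interpolates, not the library internals:

import torch

def karras_sigmas(num_steps, sigma_min=0.0001, sigma_max=3.0, rho=9.0):
    # sigma_i = (max^(1/rho) + i/(n-1) * (min^(1/rho) - max^(1/rho)))^rho for i = 0..n-1
    i = torch.arange(num_steps, dtype=torch.float32)
    max_r, min_r = sigma_max ** (1 / rho), sigma_min ** (1 / rho)
    return (max_r + i / (num_steps - 1) * (min_r - max_r)) ** rho

print(karras_sigmas(20))  # descending noise levels, e.g. for the 20 steps the multi model uses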
voices/Анастасія Павленко.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22d2bec547900bf22f46fdaa82613c917876d885c8d295afa0dcd954d5f30530
+size 933388
voices/Вʼячеслав Дудко.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f31069a3f3c40e8a3fbba538298fc29b52b929981ede2f25440713571aca047
+size 878188
voices/Влада Муравець.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97e54da36c0be7127baa0a6a7ea6690f47037f0d094ef4e20ff786abfc8ef7cc
+size 916590
voices/Гаська Шиян.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e573f57599112d2faced434d37a2c5a664828203e523ca068ec220c232f6ed8
+size 909390
voices/Катерина Потапенко.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e7bca8d0b0dc6cb95e7456013bb463058fd37ddbc9cd5e10b0980c6a021ea4a
+size 945388
voices/Марина Панас.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:720d617b7b377e15ac710171670eae69437276ffacdc8339113dfb2717233a46
+size 918988
voices/Марися Нікітюк.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acd17f7cb865d102d7a811f98e45a7baf6f1665c222fd84045932612764f2eee
+size 926188
voices/Марта Мольфар.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52adad6d28328ab05a602dfcba768defcc8e7aa34d00c3e85416a4d6151b2535
+size 748588
voices/Марічка Штирбулова.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6ef6573f3c777d2f79d4afc1e8936da2c02ce47559e34962f97efc9242ba39b
+size 1600588
voices/Маслінка.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e513b693af42afe49efb170ffc4e3d07bfb609ccb8b4b7ea3f9932be57dad5f
+size 942988
voices/Матвій Ніколаєв.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d699c90a21afdd4014c88c25bc283f9c3221e453cdb1c79ad358e69429e4ee1
+size 885388
voices/Михайло Тишин.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:074124a743764ba7de24a0d1d3c320f2871b4324d74ba0073a7691cee6dee905
+size 897388
voices/Наталія Калюжна.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dda18ff38e4858e7f562b4975b48572d3f7f89b5ec367d2a9b04b90841d79ba
+size 1041388
voices/Олег Лепенець.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f86bb5aa40ec6d5c79fd22ced1bbe8da0b8c11fe89bbe5ebfd745ef90ac715a6
+size 880588
voices/Слава Красовська.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5648b8705b53f226c25474f48264ad88ca3b156cac9225ba8096b175b6b3a9ff
+size 947788
voices/Юрій Кудрявець.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7e8ec2de5c6c2dba550d9814a93796fb081d55a2eedbad2bcb864ba96b5c869
+size 954988
voices/Яніна Соколова.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82387f2d814df32a9bf50ebaab3616bdd5d74e57b0466339573e94ff2f3144a3
+size 906988
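Each added wav is stored as a Git LFS pointer, so a checkout without git-lfs sees only the three lines above. A small sketch for inspecting such pointer files:

def read_lfs_pointer(path):
    # A pointer file holds 'key value' lines: version, oid sha256:<hex>, size <bytes>.
    with open(path, encoding='utf-8') as f:
        return dict(line.strip().split(' ', 1) for line in f if line.strip())

meta = read_lfs_pointer('voices/Маслінка.wav')
print(meta['oid'], meta['size'])  # sha256:7e513b69... 942988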