Spaces: Running on Zero

Serhiy Stetskovych committed · ce548c0 · Parent: 77d64d5

Update up to work with new code

Browse files:
- app.py +64 -16
- config.yml +0 -105
- infer.py +0 -237
- requirements.txt +1 -11
app.py CHANGED
@@ -1,11 +1,21 @@
 import glob
 import os
+import re
 import gradio as gr
-from infer import inference, split_to_parts
 import onnxruntime
 from transformers import AutoTokenizer
 from huggingface_hub import hf_hub_download
 import numpy as np
+import torch
+from ipa_uk import ipa
+from unicodedata import normalize
+from styletts2_inference.models import StyleTTS2
+from ukrainian_word_stress import Stressifier
+stressify = Stressifier()
+from text_utils import TextCleaner
+textclenaer = TextCleaner()
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 prompts_dir = 'voices'
 prompts_list = sorted(glob.glob(os.path.join(prompts_dir, '*.wav')))
@@ -71,6 +81,26 @@ def init_verbalizer():
 tokenizer, encoder_session, decoder_session = init_verbalizer()
 
 
+def split_to_parts(text):
+    split_symbols = '.?!:'
+    parts = ['']
+    index = 0
+    for s in text:
+        parts[index] += s
+        if s in split_symbols and len(parts[index]) > 150:
+            index += 1
+            parts.append('')
+    return parts
+
+
+
+models = {
+    'multi': StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_multispeaker', device=device),
+    'single': StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_single', device=device)
+}
+
+
+
 def generate_text(text):
     """Generate text for a single input."""
     # Prepare input
@@ -137,8 +167,10 @@ examples = [
     ["Очікується, що цей застосунок буде запущено 22.08.2025.", 1.0],
 ]
 
-
-def synthesize_multi(text, voice_audio, speed, progress=gr.Progress()):
+
+
+def synthesize(model_name, text, speed, voice_audio = None, progress=gr.Progress()):
+
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
@@ -147,20 +179,35 @@ def synthesize_multi(text, voice_audio, speed, progress=gr.Progress()):
     print(text)
     print("*** end ***")
 
-
+    diffusion_steps = 4
+    voice = None
+    if voice_audio:
+        prompt_audio_path = os.path.join(prompts_dir, voice_audio+'.wav')
+        voice = models[model_name].compute_style(prompt_audio_path)
+        diffusion_steps = 10
 
+    s_prev = torch.tensor([[0]])
+    result_wav = []
+    for t in progress.tqdm(split_to_parts(text)):
+        if t:
+            t = t.strip()
+            t = t.replace('"', '')
+            t = t.replace('+', 'ˈ')
+            t = normalize('NFKC', t)
 
+            t = re.sub(r'[᠆‐‑‒–—―⁻₋−⸺⸻]', '-', t)
+            t = re.sub(r' - ', ': ', t)
+            ps = ipa(stressify(t))
+
+            tokens = textclenaer(ps)
+
+            wav, s_prev = models[model_name](torch.LongTensor(tokens), voice=voice, speed=speed, diffusion_steps=diffusion_steps, s_prev=s_prev)
+            result_wav.append(wav)
 
-def synthesize_single(text, speed, progress=gr.Progress()):
-    if text.strip() == "":
-        raise gr.Error("You must enter some text")
-    if len(text) > 50000:
-        raise gr.Error("Text must be <50k characters")
-    print("*** saying ***")
-    print(text)
-    print("*** end ***")
 
-    return 24000,
+    return 24000, torch.concatenate(result_wav).numpy()
+
+
 
 def select_example(df, evt: gr.SelectData):
     return evt.row_value
@@ -181,8 +228,8 @@ with gr.Blocks() as single:
         type="numpy",
     )
     synthesise_button = gr.Button("Синтезувати")
-
-    synthesise_button.click(
+    single_text = gr.Text(value='single', visible=False)
+    synthesise_button.click(synthesize, inputs=[single_text, input_text, speed], outputs=[output_audio])
 
     with gr.Row():
         examples_table = gr.Dataframe(wrap=True, headers=["Текст", "Швидкість"], datatype=["str", "number"], value=examples, interactive=False)
@@ -205,8 +252,9 @@ with gr.Blocks() as multy:
         type="numpy",
    )
     synthesise_button = gr.Button("Синтезувати")
+    multi = gr.Text(value='multi', visible=False)
 
-    synthesise_button.click(
+    synthesise_button.click(synthesize, inputs=[multi, input_text, speed, speaker], outputs=[output_audio])
    with gr.Row():
         examples_table = gr.Dataframe(wrap=True, headers=["Текст", "Швидкість"], datatype=["str", "number"], value=examples, interactive=False)
     examples_table.select(select_example, inputs=[examples_table], outputs=[input_text, speed])
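
Taken together, the new app.py drives the styletts2_inference package roughly as follows. This is a minimal orientation sketch, not part of the commit: the StyleTTS2 constructor, compute_style, and call signature are assumed from their usage in the diff above, and text_utils.TextCleaner comes from this repo.

# Minimal sketch of the new inference flow (signatures assumed from the diff above).
import torch
from ipa_uk import ipa
from ukrainian_word_stress import Stressifier
from styletts2_inference.models import StyleTTS2
from text_utils import TextCleaner

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_single', device=device)
stressify = Stressifier()
cleaner = TextCleaner()

text = 'Привіт! Це короткий тест синтезу мовлення.'
ps = ipa(stressify(text))      # stressed text -> IPA phoneme string
tokens = cleaner(ps)           # IPA string -> integer token ids

s_prev = torch.tensor([[0]])   # style state threaded between chunks
wav, s_prev = model(torch.LongTensor(tokens), voice=None, speed=1.0,
                    diffusion_steps=4, s_prev=s_prev)
# app.py returns (24000, wav.numpy()), i.e. 24 kHz audio for gr.Audio.

Note that split_to_parts only opens a new chunk at sentence-ending punctuation once the current chunk exceeds 150 characters, and that s_prev is threaded through successive model calls so the speaking style stays consistent across chunk boundaries.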
config.yml DELETED
@@ -1,105 +0,0 @@
-F0_path: "weights/jdc.bin"
-ASR_config: "Utils/ASR/config.yml"
-ASR_path: "weights/asr.bin"
-
-
-model_params_multi:
-  multispeaker: true
-
-  dim_in: 64
-  hidden_dim: 512
-  max_conv_dim: 512
-  n_layer: 3
-  n_mels: 80
-
-  n_token: 181 # number of phoneme tokens
-  max_dur: 50 # maximum duration of a single phoneme
-  style_dim: 128 # style vector size
-
-  dropout: 0.2
-
-  # config for decoder
-  decoder:
-    type: 'hifigan' # either hifigan or istftnet
-    resblock_kernel_sizes: [3,7,11]
-    upsample_rates: [10,5,3,2]
-    upsample_initial_channel: 512
-    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-    upsample_kernel_sizes: [20,10,6,4]
-
-  # speech language model config
-  slm:
-    model: ''
-    sr: 16000 # sampling rate of SLM
-    hidden: 768 # hidden size of SLM
-    nlayers: 13 # number of layers of SLM
-    initial_channel: 64 # initial channels of SLM discriminator head
-
-  # style diffusion model config
-  diffusion:
-    embedding_mask_proba: 0.1
-    # transformer config
-    transformer:
-      num_layers: 3
-      num_heads: 8
-      head_features: 64
-      multiplier: 2
-    # diffusion distribution config
-    dist:
-      sigma_data: 0.19988229232390187 # placeholder for estimate_sigma_data set to false
-      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
-      mean: -3.0
-      std: 1.0
-
-model_params_single:
-  multispeaker: false
-
-  dim_in: 64
-  hidden_dim: 512
-  max_conv_dim: 512
-  n_layer: 3
-  n_mels: 80
-
-  n_token: 181 # number of phoneme tokens
-  max_dur: 50 # maximum duration of a single phoneme
-  style_dim: 128 # style vector size
-
-  dropout: 0.2
-
-  # config for decoder
-  decoder:
-    type: 'istftnet' # either hifigan or istftnet
-    resblock_kernel_sizes: [3,7,11]
-    upsample_rates: [10, 6]
-    upsample_initial_channel: 512
-    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-    upsample_kernel_sizes: [20, 12]
-    gen_istft_n_fft: 20
-    gen_istft_hop_size: 5
-
-  # speech language model config
-  slm:
-    model: 'openai/whisper-medium'
-    sr: 16000 # sampling rate of SLM
-    hidden: 768 # hidden size of SLM
-    nlayers: 13 # number of layers of SLM
-    initial_channel: 64 # initial channels of SLM discriminator head
-
-  # style diffusion model config
-  diffusion:
-    embedding_mask_proba: 0.1
-    # transformer config
-    transformer:
-      num_layers: 3
-      num_heads: 8
-      head_features: 64
-      multiplier: 2
-    # diffusion distribution config
-    dist:
-      sigma_data: 0.18 # placeholder for estimate_sigma_data set to false
-      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
-      mean: -3.0
-      std: 1.0
-
infer.py DELETED
@@ -1,237 +0,0 @@
-import torch
-torch.manual_seed(0)
-torch.backends.cudnn.benchmark = False
-torch.backends.cudnn.deterministic = True
-
-import random
-random.seed(0)
-
-import numpy as np
-np.random.seed(0)
-import librosa
-from copy import deepcopy
-from huggingface_hub import hf_hub_download
-
-import spaces
-import yaml
-import re
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torchaudio
-from ipa_uk import ipa
-from unicodedata import normalize
-from ukrainian_word_stress import Stressifier, StressSymbol
-stressify = Stressifier()
-
-
-
-from models import *
-from utils import *
-from text_utils import TextCleaner
-textclenaer = TextCleaner()
-
-
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-to_mel = torchaudio.transforms.MelSpectrogram(
-    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
-mean, std = -4, 4
-
-def length_to_mask(lengths):
-    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-    mask = torch.gt(mask+1, lengths.unsqueeze(1))
-    return mask
-
-
-def load_state_dict(model, params):
-    for key in model:
-        if key in params:
-            print('%s loaded' % key)
-            try:
-                model[key].load_state_dict(params[key])
-            except:
-                from collections import OrderedDict
-                state_dict = params[key]
-                new_state_dict = OrderedDict()
-                for k, v in state_dict.items():
-                    name = k[7:] # remove `module.`
-                    new_state_dict[name] = v
-
-                model[key].load_state_dict(new_state_dict, strict=False)
-
-
-config = yaml.safe_load(open('config.yml'))
-
-# load pretrained ASR model
-ASR_config = config.get('ASR_config', False)
-ASR_path = config.get('ASR_path', False)
-text_aligner = load_ASR_models(ASR_path, ASR_config)
-
-# load pretrained F0 model
-F0_path = config.get('F0_path', False)
-pitch_extractor = load_F0_models(F0_path)
-
-# load BERT model
-from Utils.PLBERT.util import load_plbert
-
-plbert = load_plbert('weights/plbert.bin', 'Utils/PLBERT/config.yml')
-
-model_single = build_model(recursive_munch(config['model_params_single']), text_aligner, pitch_extractor, plbert)
-model_multi = build_model(recursive_munch(config['model_params_multi']), deepcopy(text_aligner), deepcopy(pitch_extractor), deepcopy(plbert))
-
-
-multi_path = hf_hub_download(repo_id='patriotyk/styletts2_ukrainian_multispeaker', filename="pytorch_model.bin")
-params_multi = torch.load(multi_path, map_location='cpu')
-
-
-single_path = hf_hub_download(repo_id='patriotyk/styletts2_ukrainian_single', filename="pytorch_model.bin")
-params_single = torch.load(single_path, map_location='cpu')
-
-
-load_state_dict(model_single, params_single)
-_ = [model_single[key].eval() for key in model_single]
-_ = [model_single[key].to(device) for key in model_single]
-
-
-load_state_dict(model_multi, params_multi)
-_ = [model_multi[key].eval() for key in model_multi]
-_ = [model_multi[key].to(device) for key in model_multi]
-
-
-
-models = {
-    'multi': model_multi,
-    'single': model_single
-}
-
-
-
-def preprocess(wave):
-    wave_tensor = torch.from_numpy(wave).float()
-    mel_tensor = to_mel(wave_tensor)
-    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
-    return mel_tensor
-
-def compute_style(voice_audio):
-    wave, sr = librosa.load(voice_audio, sr=24000)
-    audio, index = librosa.effects.trim(wave, top_db=30)
-    if sr != 24000:
-        audio = librosa.resample(audio, sr, 24000)
-    mel_tensor = preprocess(audio).to(device)
-
-    with torch.no_grad():
-        ref_s = models['multi'].style_encoder(mel_tensor.unsqueeze(1))
-        ref_p = models['multi'].predictor_encoder(mel_tensor.unsqueeze(1))
-
-    return torch.cat([ref_s, ref_p], dim=1)
-
-
-def split_to_parts(text):
-    split_symbols = '.?!:'
-    parts = ['']
-    index = 0
-    for s in text:
-        parts[index] += s
-        if s in split_symbols and len(parts[index]) > 150:
-            index += 1
-            parts.append('')
-    return parts
-
-
-
-def _inf(model, text, ref_s, speed, s_prev, noise, alpha, beta, diffusion_steps, embedding_scale):
-    model = models[model]
-    text = text.strip()
-    text = text.replace('"', '')
-    text = text.replace('+', 'ˈ')
-    text = normalize('NFKC', text)
-
-    text = re.sub(r'[᠆‐‑‒–—―⁻₋−⸺⸻]', '-', text)
-    text = re.sub(r' - ', ': ', text)
-    ps = ipa(stressify(text))
-    print(ps)
-
-    tokens = textclenaer(ps)
-    tokens.insert(0, 0)
-
-    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-
-    with torch.no_grad():
-        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
-        text_mask = length_to_mask(input_lengths).to(tokens.device)
-
-        t_en = model.text_encoder(tokens, input_lengths, text_mask)
-        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
-        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-
-
-        if ref_s is None:
-            s_pred = model.sampler(noise,
-                embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,
-                embedding_scale=embedding_scale).squeeze(0)
-        else:
-            s_pred = model.sampler(noise = noise,
-                embedding=bert_dur,
-                embedding_scale=embedding_scale,
-                features=ref_s, # reference from the same speaker as the embedding
-                num_steps=diffusion_steps).squeeze(1)
-
-        if s_prev is not None:
-            # convex combination of previous and current style
-            s_pred = alpha * s_prev + (1 - alpha) * s_pred
-
-        s = s_pred[:, 128:]
-        ref = s_pred[:, :128]
-
-        if ref_s is not None:
-            ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
-            s = beta * s + (1 - beta) * ref_s[:, 128:]
-
-        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
-
-        x, _ = model.predictor.lstm(d)
-        duration = model.predictor.duration_proj(x)
-
-        duration = torch.sigmoid(duration).sum(axis=-1)/speed
-        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
-
-        if ref_s is not None:
-            pred_dur[0] = 30
-
-
-        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
-        c_frame = 0
-        for i in range(pred_aln_trg.size(0)):
-            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
-            c_frame += int(pred_dur[i].data)
-
-        # encode prosody
-        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
-        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
-        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
-
-        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
-    if ref_s is not None:
-        out = out[:,:, 14500:]
-    return out.squeeze().cpu().numpy(), s_pred, ps
-
-
-@spaces.GPU
-def inference(model, text, voice_audio, progress, speed=1, alpha=0.4, beta=0.4, diffusion_steps=10, embedding_scale=1.2):
-
-    wavs = []
-    s_prev = None
-
-    #sentences = text.split('|')
-    sentences = split_to_parts(text)
-
-    phonemes = ''
-    noise = torch.randn(1,1,256).to(device)
-    ref_s = compute_style(voice_audio) if voice_audio else None
-    for text in progress.tqdm(sentences):
-        if text.strip() == "": continue
-        wav, s_prev, ps = _inf(model, text, ref_s, speed, s_prev, noise, alpha=alpha, beta=beta, diffusion_steps=diffusion_steps, embedding_scale=embedding_scale)
-        wavs.append(wav)
-        phonemes += ' ' + ps
-    return np.concatenate(wavs), phonemes
requirements.txt CHANGED
@@ -1,20 +1,10 @@
 SoundFile
 torchaudio==2.2.0
-munch
 torch==2.2.0
-pydub
-pyyaml
-librosa
-tqdm
-scipy
-gradio
-gruut
-einops
-einops_exts
-txtsplit
 transformers
 git+https://github.com/patriotyk/ukrainian-word-stress.git
 git+https://github.com/patriotyk/ipa-uk.git
+git+https://github.com/patriotyk/styletts2-inference
 spaces
 numpy<2
 huggingface_hub
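
As a quick local sanity check of the trimmed dependency set, the modules it provides plus the ones app.py imports can be probed as below. A sketch under one assumption: gradio and onnxruntime are supplied by the Space's base image, since app.py imports them but they are not listed here.

# Probe the trimmed dependencies and app.py's imports.
# Assumption: gradio and onnxruntime come from the Spaces base image.
import importlib

for mod in ('soundfile', 'torch', 'torchaudio', 'transformers',
            'ukrainian_word_stress', 'ipa_uk', 'styletts2_inference',
            'spaces', 'numpy', 'huggingface_hub', 'gradio', 'onnxruntime'):
    importlib.import_module(mod)   # raises ImportError if anything is missing
    print(f'{mod}: ok')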