Serhiy Stetskovych committed on
Commit ce548c0 · 1 Parent(s): 77d64d5

Update app to work with new code

Files changed (4):
  1. app.py +64 -16
  2. config.yml +0 -105
  3. infer.py +0 -237
  4. requirements.txt +1 -11
app.py CHANGED
@@ -1,11 +1,21 @@
 import glob
 import os
+import re
 import gradio as gr
-from infer import inference, split_to_parts
 import onnxruntime
 from transformers import AutoTokenizer
 from huggingface_hub import hf_hub_download
 import numpy as np
+import torch
+from ipa_uk import ipa
+from unicodedata import normalize
+from styletts2_inference.models import StyleTTS2
+from ukrainian_word_stress import Stressifier
+stressify = Stressifier()
+from text_utils import TextCleaner
+textclenaer = TextCleaner()
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 prompts_dir = 'voices'
 prompts_list = sorted(glob.glob(os.path.join(prompts_dir, '*.wav')))
@@ -71,6 +81,26 @@ def init_verbalizer():
 tokenizer, encoder_session, decoder_session = init_verbalizer()
 
 
+def split_to_parts(text):
+    split_symbols = '.?!:'
+    parts = ['']
+    index = 0
+    for s in text:
+        parts[index] += s
+        if s in split_symbols and len(parts[index]) > 150:
+            index += 1
+            parts.append('')
+    return parts
+
+
+
+models = {
+    'multi': StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_multispeaker', device=device),
+    'single': StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_single', device=device)
+}
+
+
+
 def generate_text(text):
     """Generate text for a single input."""
     # Prepare input
@@ -137,8 +167,10 @@ examples = [
     ["Очікується, що цей застосунок буде запущено 22.08.2025.", 1.0],
 ]
 
-def synthesize_multi(text, voice_audio, speed, progress=gr.Progress()):
-    prompt_audio_path = os.path.join(prompts_dir, voice_audio+'.wav')
+
+
+def synthesize(model_name, text, speed, voice_audio = None, progress=gr.Progress()):
+
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
@@ -147,20 +179,35 @@ def synthesize_multi(text, voice_audio, speed, progress=gr.Progress()):
     print(text)
     print("*** end ***")
 
-    return 24000, inference('multi', text, prompt_audio_path, progress, speed=speed, alpha=0, beta=0, diffusion_steps=20, embedding_scale=1.0)[0]
+    diffusion_steps = 4
+    voice = None
+    if voice_audio:
+        prompt_audio_path = os.path.join(prompts_dir, voice_audio+'.wav')
+        voice = models[model_name].compute_style(prompt_audio_path)
+        diffusion_steps = 10
 
-
-
-def synthesize_single(text, speed, progress=gr.Progress()):
-    if text.strip() == "":
-        raise gr.Error("You must enter some text")
-    if len(text) > 50000:
-        raise gr.Error("Text must be <50k characters")
-    print("*** saying ***")
-    print(text)
-    print("*** end ***")
+    s_prev = torch.tensor([[0]])
+    result_wav = []
+    for t in progress.tqdm(split_to_parts(text)):
+        if t:
+            t = t.strip()
+            t = t.replace('"', '')
+            t = t.replace('+', 'ˈ')
+            t = normalize('NFKC', t)
 
-    return 24000, inference('single', text, None, progress, speed=speed, alpha=1, beta=0, diffusion_steps=4, embedding_scale=1.0)[0]
+            t = re.sub(r'[᠆‐‑‒–—―⁻₋−⸺⸻]', '-', t)
+            t = re.sub(r' - ', ': ', t)
+            ps = ipa(stressify(t))
+
+            tokens = textclenaer(ps)
+
+            wav, s_prev = models[model_name](torch.LongTensor(tokens), voice=voice, speed=speed, diffusion_steps=diffusion_steps, s_prev=s_prev)
+            result_wav.append(wav)
 
 
+    return 24000, torch.concatenate(result_wav).numpy()
+
+
 
 def select_example(df, evt: gr.SelectData):
     return evt.row_value
@@ -181,8 +228,8 @@ with gr.Blocks() as single:
         type="numpy",
     )
    synthesise_button = gr.Button("Синтезувати")
-
-    synthesise_button.click(synthesize_single, inputs=[input_text, speed], outputs=[output_audio])
+    single_text = gr.Text(value='single', visible=False)
+    synthesise_button.click(synthesize, inputs=[single_text, input_text, speed], outputs=[output_audio])
 
    with gr.Row():
        examples_table = gr.Dataframe(wrap=True, headers=["Текст", "Швидкість"], datatype=["str", "number"], value=examples, interactive=False)
@@ -205,8 +252,9 @@ with gr.Blocks() as multy:
        type="numpy",
    )
    synthesise_button = gr.Button("Синтезувати")
+    multi = gr.Text(value='multi', visible=False)
 
-    synthesise_button.click(synthesize_multi, inputs=[input_text, speaker, speed], outputs=[output_audio])
+    synthesise_button.click(synthesize, inputs=[multi, input_text, speed, speaker], outputs=[output_audio])
    with gr.Row():
        examples_table = gr.Dataframe(wrap=True, headers=["Текст", "Швидкість"], datatype=["str", "number"], value=examples, interactive=False)
        examples_table.select(select_example, inputs=[examples_table], outputs=[input_text, speed])
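
Review note: the chunking logic that used to live in infer.py now sits directly in app.py. A quick standalone sketch of how split_to_parts behaves; the function is copied verbatim from the diff above, and the sample input is made up:

def split_to_parts(text):
    split_symbols = '.?!:'
    parts = ['']
    index = 0
    for s in text:
        parts[index] += s
        # Start a new chunk only at a sentence-like boundary, and only once
        # the running chunk has grown past 150 characters.
        if s in split_symbols and len(parts[index]) > 150:
            index += 1
            parts.append('')
    return parts

# Hypothetical ~220-character input: chunks are cut at the first '.', '?',
# '!' or ':' after 150 characters; the final remainder may be shorter.
sample = ("Перше речення. " * 15).strip()
print([len(c) for c in split_to_parts(sample)])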
 
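
For reviewers who want to exercise the new dependency in isolation, here is a minimal sketch of the inference path as this diff uses it. The API shape (StyleTTS2(hf_path=..., device=...), .compute_style(path), and calling the model with voice/speed/diffusion_steps/s_prev to get back a waveform plus the next style state) is taken from the diff itself; the prompt file name and token ids below are placeholders:

# Minimal sketch of the new styletts2-inference path, assuming the API
# exactly as used in app.py above. File name and tokens are hypothetical.
import torch
from styletts2_inference.models import StyleTTS2

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = StyleTTS2(hf_path='patriotyk/styletts2_ukrainian_multispeaker', device=device)

voice = model.compute_style('voices/speaker1.wav')  # hypothetical prompt wav
s_prev = torch.tensor([[0]])                        # initial style state, as in app.py
tokens = [1, 2, 3]                                  # placeholder ids from TextCleaner
wav, s_prev = model(torch.LongTensor(tokens), voice=voice, speed=1.0,
                    diffusion_steps=10, s_prev=s_prev)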
config.yml DELETED
@@ -1,105 +0,0 @@
-F0_path: "weights/jdc.bin"
-ASR_config: "Utils/ASR/config.yml"
-ASR_path: "weights/asr.bin"
-
-
-model_params_multi:
-  multispeaker: true
-
-  dim_in: 64
-  hidden_dim: 512
-  max_conv_dim: 512
-  n_layer: 3
-  n_mels: 80
-
-  n_token: 181 # number of phoneme tokens
-  max_dur: 50 # maximum duration of a single phoneme
-  style_dim: 128 # style vector size
-
-  dropout: 0.2
-
-  # config for decoder
-  decoder:
-    type: 'hifigan' # either hifigan or istftnet
-    resblock_kernel_sizes: [3,7,11]
-    upsample_rates : [10,5,3,2]
-    upsample_initial_channel: 512
-    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-    upsample_kernel_sizes: [20,10,6,4]
-
-  # speech language model config
-  slm:
-    model: ''
-    sr: 16000 # sampling rate of SLM
-    hidden: 768 # hidden size of SLM
-    nlayers: 13 # number of layers of SLM
-    initial_channel: 64 # initial channels of SLM discriminator head
-
-  # style diffusion model config
-  diffusion:
-    embedding_mask_proba: 0.1
-    # transformer config
-    transformer:
-      num_layers: 3
-      num_heads: 8
-      head_features: 64
-      multiplier: 2
-
-    # diffusion distribution config
-    dist:
-      sigma_data: 0.19988229232390187 # placeholder for estimate_sigma_data set to false
-      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
-      mean: -3.0
-      std: 1.0
-
-model_params_single:
-  multispeaker: false
-
-  dim_in: 64
-  hidden_dim: 512
-  max_conv_dim: 512
-  n_layer: 3
-  n_mels: 80
-
-  n_token: 181 # number of phoneme tokens
-  max_dur: 50 # maximum duration of a single phoneme
-  style_dim: 128 # style vector size
-
-  dropout: 0.2
-
-  # config for decoder
-  decoder:
-    type: 'istftnet' # either hifigan or istftnet
-    resblock_kernel_sizes: [3,7,11]
-    upsample_rates : [10, 6]
-    upsample_initial_channel: 512
-    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-    upsample_kernel_sizes: [20, 12]
-    gen_istft_n_fft: 20
-    gen_istft_hop_size: 5
-
-  # speech language model config
-  slm:
-    model: 'openai/whisper-medium'
-    sr: 16000 # sampling rate of SLM
-    hidden: 768 # hidden size of SLM
-    nlayers: 13 # number of layers of SLM
-    initial_channel: 64 # initial channels of SLM discriminator head
-
-  # style diffusion model config
-  diffusion:
-    embedding_mask_proba: 0.1
-    # transformer config
-    transformer:
-      num_layers: 3
-      num_heads: 8
-      head_features: 64
-      multiplier: 2
-
-    # diffusion distribution config
-    dist:
-      sigma_data: 0.18 # placeholder for estimate_sigma_data set to false
-      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
-      mean: -3.0
-      std: 1.0
-
infer.py DELETED
@@ -1,237 +0,0 @@
-import torch
-torch.manual_seed(0)
-torch.backends.cudnn.benchmark = False
-torch.backends.cudnn.deterministic = True
-
-import random
-random.seed(0)
-
-import numpy as np
-np.random.seed(0)
-import librosa
-from copy import deepcopy
-from huggingface_hub import hf_hub_download
-
-import spaces
-import yaml
-import re
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torchaudio
-from ipa_uk import ipa
-from unicodedata import normalize
-from ukrainian_word_stress import Stressifier, StressSymbol
-stressify = Stressifier()
-
-
-
-from models import *
-from utils import *
-from text_utils import TextCleaner
-textclenaer = TextCleaner()
-
-
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-to_mel = torchaudio.transforms.MelSpectrogram(
-    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
-mean, std = -4, 4
-
-def length_to_mask(lengths):
-    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-    mask = torch.gt(mask+1, lengths.unsqueeze(1))
-    return mask
-
-
-def load_state_dict(model, params):
-    for key in model:
-        if key in params:
-            print('%s loaded' % key)
-            try:
-                model[key].load_state_dict(params[key])
-            except:
-                from collections import OrderedDict
-                state_dict = params[key]
-                new_state_dict = OrderedDict()
-                for k, v in state_dict.items():
-                    name = k[7:] # remove `module.`
-                    new_state_dict[name] = v
-
-                model[key].load_state_dict(new_state_dict, strict=False)
-
-
-config = yaml.safe_load(open('config.yml'))
-
-# load pretrained ASR model
-ASR_config = config.get('ASR_config', False)
-ASR_path = config.get('ASR_path', False)
-text_aligner = load_ASR_models(ASR_path, ASR_config)
-
-# load pretrained F0 model
-F0_path = config.get('F0_path', False)
-pitch_extractor = load_F0_models(F0_path)
-
-# load BERT model
-from Utils.PLBERT.util import load_plbert
-
-plbert = load_plbert('weights/plbert.bin', 'Utils/PLBERT/config.yml')
-
-model_single = build_model(recursive_munch(config['model_params_single']), text_aligner, pitch_extractor, plbert)
-model_multi = build_model(recursive_munch(config['model_params_multi']), deepcopy(text_aligner), deepcopy(pitch_extractor), deepcopy(plbert))
-
-
-multi_path = hf_hub_download(repo_id='patriotyk/styletts2_ukrainian_multispeaker', filename="pytorch_model.bin")
-params_multi = torch.load(multi_path, map_location='cpu')
-
-
-single_path = hf_hub_download(repo_id='patriotyk/styletts2_ukrainian_single', filename="pytorch_model.bin")
-params_single = torch.load(single_path, map_location='cpu')
-
-
-load_state_dict(model_single, params_single)
-_ = [model_single[key].eval() for key in model_single]
-_ = [model_single[key].to(device) for key in model_single]
-
-
-load_state_dict(model_multi, params_multi)
-_ = [model_multi[key].eval() for key in model_multi]
-_ = [model_multi[key].to(device) for key in model_multi]
-
-
-
-models = {
-    'multi': model_multi,
-    'single': model_single
-}
-
-
-
-def preprocess(wave):
-    wave_tensor = torch.from_numpy(wave).float()
-    mel_tensor = to_mel(wave_tensor)
-    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
-    return mel_tensor
-
-def compute_style(voice_audio):
-    wave, sr = librosa.load(voice_audio, sr=24000)
-    audio, index = librosa.effects.trim(wave, top_db=30)
-    if sr != 24000:
-        audio = librosa.resample(audio, sr, 24000)
-    mel_tensor = preprocess(audio).to(device)
-
-    with torch.no_grad():
-        ref_s = models['multi'].style_encoder(mel_tensor.unsqueeze(1))
-        ref_p = models['multi'].predictor_encoder(mel_tensor.unsqueeze(1))
-
-    return torch.cat([ref_s, ref_p], dim=1)
-
-
-def split_to_parts(text):
-    split_symbols = '.?!:'
-    parts = ['']
-    index = 0
-    for s in text:
-        parts[index] += s
-        if s in split_symbols and len(parts[index]) > 150:
-            index += 1
-            parts.append('')
-    return parts
-
-
-
-def _inf(model, text, ref_s, speed, s_prev, noise, alpha, beta, diffusion_steps, embedding_scale):
-    model = models[model]
-    text = text.strip()
-    text = text.replace('"', '')
-    text = text.replace('+', 'ˈ')
-    text = normalize('NFKC', text)
-
-    text = re.sub(r'[᠆‐‑‒–—―⁻₋−⸺⸻]', '-', text)
-    text = re.sub(r' - ', ': ', text)
-    ps = ipa(stressify(text))
-    print(ps)
-
-    tokens = textclenaer(ps)
-    tokens.insert(0, 0)
-
-    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-
-    with torch.no_grad():
-        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
-        text_mask = length_to_mask(input_lengths).to(tokens.device)
-
-        t_en = model.text_encoder(tokens, input_lengths, text_mask)
-        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
-        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-
-
-        if ref_s is None:
-            s_pred = model.sampler(noise,
-                embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,
-                embedding_scale=embedding_scale).squeeze(0)
-        else:
-            s_pred = model.sampler(noise = noise,
-                embedding=bert_dur,
-                embedding_scale=embedding_scale,
-                features=ref_s, # reference from the same speaker as the embedding
-                num_steps=diffusion_steps).squeeze(1)
-
-        if s_prev is not None:
-            # convex combination of previous and current style
-            s_pred = alpha * s_prev + (1 - alpha) * s_pred
-
-        s = s_pred[:, 128:]
-        ref = s_pred[:, :128]
-
-        if ref_s is not None:
-            ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
-            s = beta * s + (1 - beta) * ref_s[:, 128:]
-
-        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
-
-        x, _ = model.predictor.lstm(d)
-        duration = model.predictor.duration_proj(x)
-
-        duration = torch.sigmoid(duration).sum(axis=-1)/speed
-        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
-
-        if ref_s is not None:
-            pred_dur[0] = 30
-
-
-        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
-        c_frame = 0
-        for i in range(pred_aln_trg.size(0)):
-            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
-            c_frame += int(pred_dur[i].data)
-
-        # encode prosody
-        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
-        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
-        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
-
-        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
-        if ref_s is not None:
-            out = out[:,:, 14500:]
-    return out.squeeze().cpu().numpy(), s_pred, ps
-
-
-@spaces.GPU
-def inference(model, text, voice_audio, progress, speed=1, alpha=0.4, beta=0.4, diffusion_steps=10, embedding_scale=1.2):
-
-    wavs = []
-    s_prev = None
-
-    #sentences = text.split('|')
-    sentences = split_to_parts(text)
-
-    phonemes = ''
-    noise = torch.randn(1,1,256).to(device)
-    ref_s = compute_style(voice_audio) if voice_audio else None
-    for text in progress.tqdm(sentences):
-        if text.strip() == "": continue
-        wav, s_prev, ps = _inf(model, text, ref_s, speed, s_prev, noise, alpha=alpha, beta=beta, diffusion_steps=diffusion_steps, embedding_scale=embedding_scale)
-        wavs.append(wav)
-        phonemes += ' ' + ps
-    return np.concatenate(wavs), phonemes
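
One behavioural detail worth noting in the deleted code: continuity between sentence chunks came from the convex combination s_pred = alpha * s_prev + (1 - alpha) * s_pred in _inf. A toy illustration of that blend, with invented values:

# Style carry-over in the old _inf: with alpha=0.4 (the old inference()
# default), the blended style keeps 40% of the previous chunk's style.
import torch

alpha = 0.4
s_prev = torch.tensor([1.0, 0.0])  # style of the previous chunk
s_pred = torch.tensor([0.0, 1.0])  # style predicted for this chunk
print(alpha * s_prev + (1 - alpha) * s_pred)  # tensor([0.4000, 0.6000])

In the new app.py this state-threading moves behind the wrapper: each model call returns s_prev, and the loop feeds it back into the next call.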
requirements.txt CHANGED
@@ -1,20 +1,10 @@
 SoundFile
 torchaudio==2.2.0
-munch
 torch==2.2.0
-pydub
-pyyaml
-librosa
-tqdm
-scipy
-gradio
-gruut
-einops
-einops_exts
-txtsplit
 transformers
 git+https://github.com/patriotyk/ukrainian-word-stress.git
 git+https://github.com/patriotyk/ipa-uk.git
+git+https://github.com/patriotyk/styletts2-inference
 spaces
 numpy<2
 huggingface_hub